def select_threshold(y_val, p_val):
    """
    Finds the best threshold to use for selecting outliers
    based on the results from a validation set (p_val)
    and the ground truth (y_val)

    An example is flagged as an outlier when its value in p_val is strictly
    below the candidate threshold. 1000 evenly spaced candidates in
    [min(p_val), max(p_val)) are tried and the first one reaching the highest
    F1 score is kept.

    Args:
        y_val (ndarray): Ground truth on validation set (1 marks an outlier)
        p_val (ndarray): Results on validation set

    Returns:
        epsilon (float): Threshold chosen
        F1 (float): F1 score by choosing epsilon as threshold

        Both are 0.0 when no candidate yields a true positive (for example
        empty input, constant p_val, or no positive labels).
    """
    # Flatten so that column vectors of shape (m, 1) and flat vectors of
    # shape (m,) are handled the same way.
    is_anomaly = np.asarray(y_val).ravel() == 1
    p_val = np.asarray(p_val, dtype=float).ravel()

    best_epsilon = 0.0
    best_F1 = 0.0
    if p_val.size == 0:
        return best_epsilon, best_F1

    candidates = np.linspace(p_val.min(), p_val.max(), num=1000, endpoint=False)
    for epsilon in candidates:
        flagged = p_val < epsilon

        tp = np.count_nonzero(flagged & is_anomaly)
        if tp == 0:
            # Precision/recall are 0 or undefined here; cannot beat best_F1.
            continue
        fp = np.count_nonzero(flagged & ~is_anomaly)
        fn = np.count_nonzero(~flagged & is_anomaly)

        # F1 = 2 * prec * rec / (prec + rec), rewritten to avoid the
        # intermediate divisions.
        F1 = 2 * tp / (2 * tp + fp + fn)
        if F1 > best_F1:
            best_F1 = F1
            best_epsilon = float(epsilon)

    return best_epsilon, best_F1
def multivariate_gaussian(X, mu, var):
    """
    Evaluates the multivariate Gaussian probability density with parameters
    mu and var at the examples in X.

    If var is a matrix, it is used as the covariance matrix. If var is a
    vector, its entries are taken as the per-dimension variances, i.e. the
    diagonal of a diagonal covariance matrix.

    Args:
        X (ndarray): Examples, one per row (the quadratic form is summed
            over axis 1)
        mu (ndarray): Mean of the distribution; its length sets the dimension
        var (ndarray): Variance vector or covariance matrix

    Returns:
        p (ndarray): Density value for each row of X
    """
    n_dims = len(mu)

    # A 1-D `var` holds only the variances; expand it to a diagonal covariance.
    covariance = np.diag(var) if var.ndim == 1 else var

    centered = X - mu

    # np.linalg.det is the matrix determinant; np.linalg.pinv is the
    # matrix pseudo-inverse.
    normalizer = (2 * np.pi) ** (-n_dims / 2) * np.linalg.det(covariance) ** (-0.5)
    quad_form = np.sum(
        np.matmul(centered, np.linalg.pinv(covariance)) * centered, axis=1
    )

    return normalizer * np.exp(-0.5 * quad_form)
# Usage:
# Estimate the Gaussian parameters from the same training set scored below
mu, var = estimate_gaussian(X_train)

# Evaluate the probabilities for the training set
p = multivariate_gaussian(X_train, mu, var)

# Evaluate the probabilities for the cross validation set
p_val = multivariate_gaussian(X_val, mu, var)

# Find the best threshold
epsilon, F1 = select_threshold(y_val, p_val)

print('Best epsilon found using cross-validation: %e' % epsilon)
print('Best F1 on Cross Validation Set: %f' % F1)
# Training examples whose density falls below the threshold count as anomalies
print('# Anomalies found: %d' % sum(p < epsilon))