PyOD: Python unsupervised outlier detection with autoencoders
I found this tutorial online that does outlier detection with PyOD in Python:
https://towardsdatascience.com/anomaly-detection-with-pyod-b523fc47db9
However, the tutorial does not show how to trace the outlier scores back to the original data. How can I actually determine which observations were the outliers? I have attached the code below, along with a rough sketch at the end of what I think might work.
Thanks!
import numpy as np
import pandas as pd
from pyod.models.knn import KNN
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import generate_data
contamination = 0.1  # percentage of outliers
n_train = 500        # number of training points
n_test = 500         # number of testing points
n_features = 25      # number of features
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train, n_test=n_test,
    n_features=n_features,
    contamination=contamination, random_state=1234)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only and reuse it for the test data,
# so both sets are scaled consistently
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
# AutoEncoder with a 25-2-2-25 hidden-layer architecture (2-neuron bottleneck)
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(X_train)
# Outlier scores for the training data
y_train_scores = clf1.decision_scores_
# Outlier scores for the test data
y_test_scores = clf1.decision_function(X_test)
y_test_scores = pd.Series(y_test_scores)
# Plot it!
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')
plt.title("Histogram for Model Clf1 Anomaly Scores")
plt.show()
df_test = X_test.copy()
df_test['score'] = y_test_scores
# Label points with a score above the cutoff (4, read off the histogram above) as outliers
df_test['cluster'] = np.where(df_test['score'] < 4, 0, 1)
df_test['cluster'].value_counts()
df_test.groupby('cluster').mean()
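In case it helps, here is a rough sketch of what I think might work to get back to the rows, using predict() and the DataFrame index, but I am not sure this is the intended way to do it in PyOD:

# Rough sketch (assumption): X_test keeps its pandas row index, so aligning
# the predictions on that index should let me look up the flagged observations.
y_test_pred = clf1.predict(X_test)               # 0 = inlier, 1 = outlier (PyOD's binary labels)
df_test['outlier'] = y_test_pred
outlier_rows = df_test[df_test['outlier'] == 1]  # flagged observations with their scores
print(outlier_rows.index.tolist())               # row positions in the original X_test
print(clf1.threshold_)                           # the cutoff PyOD itself derived from the training scores

Is relying on the index like this the right approach, or is there a built-in way to map the scores back to the original observations?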