add embedding visualization and domain encoding generator

This commit is contained in:
René Knaebel 2017-07-29 10:43:59 +02:00
parent c1535b941b
commit 2593131e9e
2 changed files with 36 additions and 0 deletions

View File

@@ -227,3 +227,21 @@ def load_or_generate_h5data(h5data, train_data, domain_length, window_size):
store_h5dataset(h5data, domain_tr, flow_tr, client_tr, server_tr) store_h5dataset(h5data, domain_tr, flow_tr, client_tr, server_tr)
logger.info("load h5 dataset") logger.info("load h5 dataset")
return load_h5dataset(h5data) return load_h5dataset(h5data)
# TODO: implement csv loading if already generated
def load_or_generate_domains(train_data, domain_length):
    """Encode every usable flow's domain and build its label pair.

    Loads the user-flow table for ``train_data``, keeps only the rows in
    which ``domain``, ``serverLabel``, ``trustedHits`` and ``virusTotalHits``
    are all present, encodes the domain of each remaining row and derives a
    binary ``clientLabel`` from the hit counters.

    Args:
        train_data: Passed through to ``get_user_flow_data``.
        domain_length: Passed through to ``get_domain_features``
            (presumably the fixed length of one encoded domain -- confirm
            against that helper).

    Returns:
        A tuple ``(domain_encs, labels)``:

        * ``domain_encs`` -- ``np.stack`` of the per-row domain encodings.
        * ``labels`` -- array with one ``[serverLabel, clientLabel]`` row per
          encoding, in the same row order as ``domain_encs``.

    Raises:
        ValueError: From ``np.stack`` when no row survives the filtering.
    """
    char_dict = get_character_dict()
    user_flow_df = get_user_flow_data(train_data)

    # Filter incomplete rows *before* encoding: encodings and labels must be
    # derived from exactly the same rows, otherwise the two returned arrays
    # differ in length as soon as a single row contains a missing value.
    user_flow_df = user_flow_df[
        ["domain", "serverLabel", "trustedHits", "virusTotalHits"]
    ].dropna(axis=0, how="any")
    # drop=True: the old index is not needed and must not become a column.
    user_flow_df = user_flow_df.reset_index(drop=True)

    encoded = user_flow_df.domain.apply(
        lambda d: get_domain_features(d, char_dict, domain_length))
    domain_encs = np.stack(encoded.tolist())

    # A client counts as positive when it has at least one trusted hit or
    # at least three VirusTotal hits.
    user_flow_df["clientLabel"] = np.where(
        np.logical_or(user_flow_df.trustedHits > 0,
                      user_flow_df.virusTotalHits >= 3),
        1.0, 0.0)

    # NOTE(review): rows are not aggregated per domain. A former
    # ``groupby(domain).mean()`` at this point threw its result away and had
    # no effect; if one row per unique domain is wanted, aggregate before
    # encoding so that both outputs stay aligned.

    # ``.values`` instead of ``as_matrix()``, which pandas 1.0 removed.
    return domain_encs, user_flow_df[["serverLabel", "clientLabel"]].values

View File

@ -3,6 +3,7 @@ import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
from keras.utils import plot_model from keras.utils import plot_model
from sklearn.decomposition import PCA
from sklearn.metrics import ( from sklearn.metrics import (
auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve, auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
roc_auc_score, roc_curve roc_auc_score, roc_curve
@@ -144,5 +145,22 @@ def plot_training_curve(logs, key, path, dpi=600):
plt.close() plt.close()
def plot_embedding(domain_embedding, labels, path, dpi=600):
    """Save a 2-D PCA scatter plot of ``domain_embedding`` to ``path``.

    The embedding is projected onto its first two principal components and
    the explained variance ratio of those components is printed. Points are
    coloured by a class id computed from ``labels``.

    Args:
        domain_embedding: Data accepted by ``PCA.fit_transform``; one row
            per point.
        labels: Array-like whose rows are weighted by ``(1, 2)`` and summed,
            i.e. expected to hold two columns per point (for 0/1 labels this
            yields class ids 0..3). Which label each column represents is
            decided by the caller.
        path: Destination handed to ``plt.savefig``.
        dpi: Resolution handed to ``plt.savefig``.
    """
    pca = PCA(n_components=2)
    domain_reduced = pca.fit_transform(domain_embedding)
    print(pca.explained_variance_ratio_)

    # Fold both label columns into a single integer per point; asarray lets
    # callers pass lists or DataFrame values as well as ndarrays.
    classes = (np.asarray(labels) * (1, 2)).sum(1).astype(int)

    # use if draw subset of predictions
    # idx = np.random.choice(np.arange(len(domain_reduced)), 10000)

    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=classes,
                cmap=plt.cm.plasma,
                s=3)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)
    # Release the figure so later plots start from a clean canvas, as
    # plot_training_curve does.
    plt.close()
def plot_model_as(model, path):
    """Render ``model`` to the file ``path`` via keras' ``plot_model``.

    Shapes and layer names are included in the drawing.
    """
    plot_model(model, to_file=path, show_shapes=True, show_layer_names=True)