add embedding visualization and domain encoding generator
This commit is contained in:
parent
c1535b941b
commit
2593131e9e
18
dataset.py
18
dataset.py
@ -227,3 +227,21 @@ def load_or_generate_h5data(h5data, train_data, domain_length, window_size):
|
|||||||
store_h5dataset(h5data, domain_tr, flow_tr, client_tr, server_tr)
|
store_h5dataset(h5data, domain_tr, flow_tr, client_tr, server_tr)
|
||||||
logger.info("load h5 dataset")
|
logger.info("load h5 dataset")
|
||||||
return load_h5dataset(h5data)
|
return load_h5dataset(h5data)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: load the data from CSV instead of regenerating it when it has already been generated
|
||||||
|
def load_or_generate_domains(train_data, domain_length):
    """Generate domain encodings and the matching label matrix.

    Despite the name, nothing is loaded from a cache here: the flow data is
    fetched through ``get_user_flow_data`` and re-encoded on every call.

    Rows with a missing ``domain``, ``serverLabel``, ``trustedHits`` or
    ``virusTotalHits`` value are dropped *before* the domains are encoded, so
    row ``i`` of the encodings always belongs to row ``i`` of the labels.

    :param train_data: forwarded unchanged to ``get_user_flow_data``.
    :param domain_length: forwarded unchanged to ``get_domain_features``.
    :return: tuple ``(domain_encs, labels)`` where ``domain_encs`` is the
        ``np.stack`` of the per-row ``get_domain_features`` results and
        ``labels`` is a 2-D array with the columns ``serverLabel`` and
        ``clientLabel`` (in that order).
    """
    char_dict = get_character_dict()
    user_flow_df = get_user_flow_data(train_data)

    # Filter incomplete rows first; encoding the unfiltered frame would leave
    # the encodings longer than (and misaligned with) the label matrix.
    # drop=True keeps the old index from being added as an extra column.
    user_flow_df = user_flow_df[
        ["domain", "serverLabel", "trustedHits", "virusTotalHits"]
    ].dropna(axis=0, how="any").reset_index(drop=True)

    domain_encs = np.stack([
        get_domain_features(domain, char_dict, domain_length)
        for domain in user_flow_df.domain
    ])

    # clientLabel is 1.0 when the row has any trusted hit or at least three
    # VirusTotal hits, otherwise 0.0.
    client_hit = np.logical_or(user_flow_df.trustedHits > 0,
                               user_flow_df.virusTotalHits >= 3)
    user_flow_df["clientLabel"] = np.where(client_hit, 1.0, 0.0)

    # NOTE(review): the previous code computed a per-domain groupby().mean()
    # here but discarded the result, so rows were never aggregated. The dead
    # call was removed; confirm whether per-domain aggregation is wanted.

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    return domain_encs, user_flow_df[["serverLabel", "clientLabel"]].values
|
||||||
|
18
visualize.py
18
visualize.py
@ -3,6 +3,7 @@ import os
|
|||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from keras.utils import plot_model
|
from keras.utils import plot_model
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
from sklearn.metrics import (
|
from sklearn.metrics import (
|
||||||
auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
|
auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
|
||||||
roc_auc_score, roc_curve
|
roc_auc_score, roc_curve
|
||||||
@ -144,5 +145,22 @@ def plot_training_curve(logs, key, path, dpi=600):
|
|||||||
plt.close()
|
plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_embedding(domain_embedding, labels, path, dpi=600):
    """Scatter-plot a 2-component PCA projection of ``domain_embedding``.

    The explained variance ratio of the two components is printed to stdout.
    Points are colored by an integer code derived from ``labels``: the first
    label column contributes 1 and the second contributes 2, and the row sum
    is truncated to ``int``.

    :param domain_embedding: sample matrix passed to ``PCA.fit_transform``.
    :param labels: array-like with two columns, one row per embedded sample.
    :param path: target file handed to ``plt.savefig``.
    :param dpi: resolution handed to ``plt.savefig``.
    """
    pca = PCA(n_components=2)
    domain_reduced = pca.fit_transform(domain_embedding)
    print(pca.explained_variance_ratio_)

    # Coerce so plain (nested) lists work too; list * tuple would raise.
    color_codes = (np.asarray(labels) * (1, 2)).sum(1).astype(int)

    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=color_codes,
                cmap=plt.cm.plasma,
                s=3)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)
    # Release the figure so the scatter and colorbar do not leak into the
    # next plot drawn through the pyplot state machine.
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
def plot_model_as(model, path):
    """Write a diagram of ``model`` to ``path`` via ``plot_model``.

    Layer shapes and layer names are both enabled in the rendered diagram.
    """
    render_options = {"show_shapes": True, "show_layer_names": True}
    plot_model(model, to_file=path, **render_options)
|
||||||
|
Loading…
Reference in New Issue
Block a user