From 1cf62423e19e24fe70ee615640a09e7694997d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Knaebel?= Date: Sun, 10 Sep 2017 18:06:40 +0200 Subject: [PATCH] add regularization to small networks, fix model name in args, fix visualizations --- arguments.py | 1 + fancy.sh | 23 ++++++++++++----------- main.py | 32 ++++++++++++++++++++++++++++---- models/pauls_networks.py | 29 +++++++++++++++++++++-------- run.sh | 9 +++++++-- visualize.py | 6 +++--- 6 files changed, 72 insertions(+), 28 deletions(-) diff --git a/arguments.py b/arguments.py index 7b03890..f7f0bdd 100644 --- a/arguments.py +++ b/arguments.py @@ -102,6 +102,7 @@ parser.add_argument("--new_model", action="store_true", dest="new_model") def get_model_args(args): return [{ "model_path": model_path, + "model_name": os.path.split(os.path.normpath(model_path))[1], "embedding_model": os.path.join(model_path, "embd.h5"), "clf_model": os.path.join(model_path, "clf.h5"), "train_log": os.path.join(model_path, "train.log.csv"), diff --git a/fancy.sh b/fancy.sh index d604926..00d571e 100644 --- a/fancy.sh +++ b/fancy.sh @@ -5,19 +5,20 @@ DATADIR=$2 python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_small_final --test ${DATADIR}/futureData.csv --model_output both python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_small_inter --test ${DATADIR}/futureData.csv --model_output both +python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_small_staggered --test ${DATADIR}/futureData.csv --model_output both python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_small_final --test ${DATADIR}/futureData.csv --model_output client python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_small_inter --test ${DATADIR}/futureData.csv --model_output client -python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_final --test ${DATADIR}/futureData.csv --model_output both -python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_inter --test ${DATADIR}/futureData.csv --model_output both -python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_final --test ${DATADIR}/futureData.csv --model_output client -python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_inter --test ${DATADIR}/futureData.csv --model_output client +#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_final --test ${DATADIR}/futureData.csv --model_output both +#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/both_medium_inter --test ${DATADIR}/futureData.csv --model_output both +#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_final --test ${DATADIR}/futureData.csv --model_output client +#python3 main.py --mode fancy --batch 1024 --model ${RESDIR}/client_medium_inter --test ${DATADIR}/futureData.csv --model_output client + +#python3 main.py --mode all_fancy --batch 256 --test ${DATADIR}/futureData.csv \ +# --models ${RESDIR}/*_small_*/ --out-prefix ${RESDIR}/small + +#python3 main.py --mode all_fancy --batch 256 --test ${DATADIR}/futureData.csv \ +# --models ${RESDIR}/*_medium_*/ --out-prefix ${RESDIR}/medium python3 main.py --mode all_fancy --batch 256 --test ${DATADIR}/futureData.csv \ - --models ${RESDIR}/*_small_*/ --out-prefix ${RESDIR}/small - -python3 main.py --mode all_fancy --batch 256 --test ${DATADIR}/futureData.csv \ - --models ${RESDIR}/*_medium_*/ --out-prefix ${RESDIR}/medium - -python3 main.py --mode all_fancy --batch 256 --test ${DATADIR}/futureData.csv \ - --models ${RESDIR}/*/ --out-prefix ${RESDIR}/all \ No newline at end of file + --models ${RESDIR}/*/ --out-prefix ${RESDIR}/all diff --git a/main.py b/main.py index 8c80708..61ad0c4 100644 --- a/main.py +++ b/main.py @@ -288,6 +288,14 @@ def main_visualization(): }) df["client_val"] = np.logical_or(df.hits_vt == 1.0, df.hits_trusted >= 3) df_user = df.groupby(df.names).max() + + paul = dataset.load_predictions("results/paul/") + df_paul = pd.DataFrame(data={ + "names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(), + "hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten() + }) + df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3) + df_paul_user = df_paul.groupby(df_paul.names).max() logger.info("plot model") model = load_model(args.clf_model, custom_objects=models.get_metrics()) @@ -306,22 +314,26 @@ def main_visualization(): logger.info("plot pr curve") visualize.plot_clf() visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), args.model_path) + visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save("{}/window_client_prc.png".format(args.model_path)) logger.info("plot roc curve") visualize.plot_clf() visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), args.model_path) + visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save("{}/window_client_roc.png".format(args.model_path)) visualize.plot_clf() visualize.plot_precision_recall(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix(), args.model_path) + visualize.plot_precision_recall(df_paul_user.client_val.as_matrix(), df_paul_user.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save("{}/user_client_prc.png".format(args.model_path)) visualize.plot_clf() visualize.plot_roc_curve(df_user.client_val.as_matrix(), df_user.client_pred.as_matrix(), args.model_path) + visualize.plot_roc_curve(df_paul_user.client_val.as_matrix(), df_paul_user.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save("{}/user_client_roc.png".format(args.model_path)) @@ -351,12 +363,21 @@ def main_visualize_all(): }) res["client_val"] = np.logical_or(res.hits_vt == 1.0, res.hits_trusted >= 3) return res + + paul = dataset.load_predictions("results/paul/") + df_paul = pd.DataFrame(data={ + "names": paul["testNames"].flatten(), "client_pred": paul["testScores"].flatten(), + "hits_vt": paul["testLabel"].flatten(), "hits_trusted": paul["testHits"].flatten() + }) + df_paul["client_val"] = np.logical_or(df_paul.hits_vt == 1.0, df_paul.hits_trusted >= 3) + df_paul_user = df_paul.groupby(df_paul.names).max() logger.info("plot pr curves") visualize.plot_clf() for model_args in get_model_args(args): df = load_df(model_args["model_path"]) - visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_path"]) + visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_name"]) + visualize.plot_precision_recall(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save(f"{args.output_prefix}_window_client_prc.png") @@ -364,7 +385,8 @@ def main_visualize_all(): visualize.plot_clf() for model_args in get_model_args(args): df = load_df(model_args["model_path"]) - visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_path"]) + visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_name"]) + visualize.plot_roc_curve(df_paul.client_val.as_matrix(), df_paul.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save(f"{args.output_prefix}_window_client_roc.png") @@ -373,7 +395,8 @@ def main_visualize_all(): for model_args in get_model_args(args): df = load_df(model_args["model_path"]) df = df.groupby(df.names).max() - visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_path"]) + visualize.plot_precision_recall(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_name"]) + visualize.plot_precision_recall(df_paul_user.client_val.as_matrix(), df_paul_user.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save(f"{args.output_prefix}_user_client_prc.png") @@ -382,7 +405,8 @@ def main_visualize_all(): for model_args in get_model_args(args): df = load_df(model_args["model_path"]) df = df.groupby(df.names).max() - visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_path"]) + visualize.plot_roc_curve(df.client_val.as_matrix(), df.client_pred.as_matrix(), model_args["model_name"]) + visualize.plot_roc_curve(df_paul_user.client_val.as_matrix(), df_paul_user.client_pred.as_matrix(), "paul") visualize.plot_legend() visualize.plot_save(f"{args.output_prefix}_user_client_roc.png") diff --git a/models/pauls_networks.py b/models/pauls_networks.py index fefc040..88978bc 100644 --- a/models/pauls_networks.py +++ b/models/pauls_networks.py @@ -1,11 +1,12 @@ +from collections import namedtuple + import keras from keras.engine import Input, Model as KerasModel -from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed +from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalMaxPooling1D, TimeDistributed +from keras.regularizers import l2 import dataset -from collections import namedtuple - Model = namedtuple("Model", ["in_domains", "in_flows", "out_client", "out_server"]) best_config = { @@ -33,10 +34,14 @@ best_config = { def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5) -> KerasModel: x = y = Input(shape=(input_length,)) y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y) - y = Conv1D(filter_size, kernel_size, activation='relu')(y) + y = Conv1D(filter_size, + kernel_size, + kernel_regularizer=l2(0.01), + activation='relu')(y) y = GlobalMaxPooling1D()(y) y = Dropout(drop_out)(y) - y = Dense(hidden_dims)(y) + y = Dense(hidden_dims, + kernel_regularizer=l2(0.01))(y) y = Activation('relu')(y) return KerasModel(x, y) @@ -50,12 +55,13 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le # CNN processing a small slides of flow windows y = Conv1D(cnn_dims, kernel_size, + kernel_regularizer=l2(0.01), activation='relu', input_shape=(window_size, domain_features + flow_features))(merged) # remove temporal dimension by global max pooling y = GlobalMaxPooling1D()(y) y = Dropout(cnnDropout)(y) - y = Dense(dense_dim, activation='relu')(y) + y = Dense(dense_dim, kernel_regularizer=l2(0.01), activation='relu')(y) out_client = Dense(1, activation='sigmoid', name="client")(y) out_server = Dense(1, activation='sigmoid', name="server")(y) @@ -68,18 +74,25 @@ def get_new_model(dropout, flow_features, domain_features, window_size, domain_l ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows") encoded = TimeDistributed(cnn)(ipt_domains) merged = keras.layers.concatenate([encoded, ipt_flows], -1) - y = Dense(dense_dim, activation="relu", name="dense_server")(merged) + y = Dense(dense_dim, + kernel_regularizer=l2(0.01), + activation="relu", + name="dense_server")(merged) out_server = Dense(1, activation="sigmoid", name="server")(y) merged = keras.layers.concatenate([merged, y], -1) # CNN processing a small slides of flow windows y = Conv1D(cnn_dims, kernel_size, + kernel_regularizer=l2(0.01), activation='relu', input_shape=(window_size, domain_features + flow_features))(merged) # remove temporal dimension by global max pooling y = GlobalMaxPooling1D()(y) y = Dropout(dropout)(y) - y = Dense(dense_dim, activation='relu', name="dense_client")(y) + y = Dense(dense_dim, + kernel_regularizer=l2(0.01), + activation='relu', + name="dense_client")(y) out_client = Dense(1, activation='sigmoid', name="client")(y) diff --git a/run.sh b/run.sh index cd25fd6..61df6d8 100644 --- a/run.sh +++ b/run.sh @@ -5,6 +5,8 @@ RESDIR=$1 mkdir -p /tmp/rk/${RESDIR} DATADIR=$2 +EPOCHS=100 + for output in client both do for depth in small @@ -15,7 +17,7 @@ do python main.py --mode train \ --train ${DATADIR}/currentData.csv \ --model ${RESDIR}/${output}_${depth}_${mtype} \ - --epochs 50 \ + --epochs $EPOCHS \ --embd 128 \ --filter_embd 256 --kernel_embd 8 --dense_embd 128 \ --domain_embd 32 \ @@ -35,7 +37,7 @@ do python main.py --mode train \ --train ${DATADIR}/currentData.csv \ --model ${RESDIR}/both_${depth}_staggered \ - --epochs 50 \ + --epochs $EPOCHS \ --embd 128 \ --filter_embd 256 --kernel_embd 8 --dense_embd 128 \ --domain_embd 32 \ @@ -46,3 +48,6 @@ do --type staggered \ --depth ${depth} done + +# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output ${output} --type ${mtype} --depth ${depth} --train ${DATADIR}/currentData.csv --model ${RESDIR}/${output}_${depth}_${mtype} +# python main.py --mode train --epochs 100 --embd 64 --filter_embd 128 --kernel_embd 5 --dense_embd 128 --domain_embd 32 --filter_main 32 --kernel_main 5 --dense_main 512 --batch 256 --balanced_weights --model_output ${output} --type ${mtype} --depth ${depth} --train ${DATADIR}/currentData.csv --model ${RESDIR}/${output}_${depth}_${mtype} diff --git a/visualize.py b/visualize.py index 9cd02e7..ab143bb 100644 --- a/visualize.py +++ b/visualize.py @@ -5,7 +5,7 @@ import numpy as np from sklearn.decomposition import TruncatedSVD from sklearn.metrics import ( auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve, - roc_auc_score, roc_curve, average_precision_score + roc_auc_score, roc_curve ) @@ -54,7 +54,7 @@ def plot_precision_recall(y, y_pred, label=""): # ax.hold(True) score = fbeta_score(y, y_pred.round(), 1) # prc_ap = average_precision_score(y, y_pred) - plt.plot(recall, precision, '--', label=f"{label} - {score}") + plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}") # ax.step(recall[::-1], decreasing_max_precision, '-r') plt.xlabel('Recall') plt.ylabel('Precision') @@ -90,7 +90,7 @@ def plot_roc_curve(mask, prediction, label=""): fpr, tpr, thresholds = roc_curve(y, y_pred) roc_auc = auc(fpr, tpr) plt.xscale('log') - plt.plot(fpr, tpr, label=f"{label} - {roc_auc}") + plt.plot(fpr, tpr, label=f"{label} - {roc_auc:5.4}") def plot_confusion_matrix(y_true, y_pred, path,