From 820a5d1a4d543a1aa29f50d721bfe6a96940ccba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Knaebel?= Date: Sat, 29 Jul 2017 19:42:36 +0200 Subject: [PATCH] add new network architecture - server label moves to the middle --- dataset.py | 3 +- main.py | 133 ++++++++++++++++++++++++++++----------- models/__init__.py | 5 +- models/pauls_networks.py | 26 +++++++- models/renes_networks.py | 24 +++++++ 5 files changed, 151 insertions(+), 40 deletions(-) diff --git a/dataset.py b/dataset.py index eb14ef7..2b28aab 100644 --- a/dataset.py +++ b/dataset.py @@ -120,7 +120,6 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10): server_tr = server_tr[idx] client_tr = np_utils.to_categorical(client_tr, 2) - server_tr = np_utils.to_categorical(server_tr, 2) return domain_tr, flow_tr, client_tr, server_tr @@ -166,7 +165,7 @@ def create_dataset_from_lists(chunks, vocab, max_len): logger.info(" select names") names = np.unique(np.stack(map(lambda f: f.user_hash, chunks))) logger.info(" select servers") - servers = np.max(np.stack(map(lambda f: f.serverLabel, chunks)), axis=1) + servers = np.stack(map(lambda f: f.serverLabel, chunks)) logger.info(" select trusted hits") trusted_hits = np.max(np.stack(map(lambda f: f.trustedHits, chunks)), axis=1) diff --git a/main.py b/main.py index 7464819..bb251e4 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ import pandas as pd import tensorflow as tf from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping from keras.models import load_model -from sklearn.decomposition import PCA +from keras.utils import np_utils from sklearn.utils import class_weight import arguments @@ -46,8 +46,6 @@ ch.setFormatter(formatter) # add ch to logger logger.addHandler(ch) -print = logger.info - args = arguments.parse() if args.gpu: @@ -104,8 +102,8 @@ def main_hyperband(): def get_custom_class_weights(client_tr, server_tr): - client = client_tr.value.argmax(1) - server = server_tr.value.argmax(1) + client = client_tr.value.argmax(1) if type(client_tr) != np.ndarray else client_tr.argmax(1) + server = server_tr.value.argmax(1) if type(server_tr) != np.ndarray else server_tr.argmax(1) client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client) server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server) return { @@ -118,8 +116,8 @@ def main_train(param=None): exists_or_make_path(args.model_path) char_dict = dataset.get_character_dict() - domain_tr, flow_tr, client_tr, server_tr = load_or_generate_h5data(args.train_h5data, args.train_data, - args.domain_length, args.window) + domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data, + args.domain_length, args.window) # parameter p = { @@ -133,10 +131,10 @@ def main_train(param=None): 'dropout': 0.5, 'domain_features': args.domain_embedding, 'embedding_size': args.embedding, - 'filter_main': 128, + 'filter_main': 64, 'flow_features': 3, # 'dense_main': 512, - 'dense_main': 128, + 'dense_main': 64, 'filter_embedding': args.hidden_char_dims, 'hidden_embedding': args.domain_embedding, 'kernel_embedding': 3, @@ -146,7 +144,7 @@ def main_train(param=None): if not param: param = p - embedding, model = models.get_models_by_params(param) + embedding, model, _ = models.get_models_by_params(param) embedding.summary() model.summary() logger.info("define callbacks") @@ -166,6 +164,8 @@ def main_train(param=None): loss='categorical_crossentropy', metrics=['accuracy'] + custom_metrics) + server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2) + if args.class_weights: logger.info("class weights: compute custom weights") custom_class_weights = get_custom_class_weights(client_tr, server_tr) @@ -200,7 +200,7 @@ def main_test(): char_dict = dataset.get_character_dict() user_flow_df = dataset.get_user_flow_data(args.test_data) - domains = user_flow_df.domain.unique() + domains = user_flow_df.domain.unique()[:-1] def get_domain_features_reduced(d): return dataset.get_domain_features(d[0], char_dict, args.domain_length) @@ -211,13 +211,93 @@ def main_test(): model = load_model(args.embedding_model) domain_features = np.stack(domain_features).reshape((-1, 40)) - pred = model.predict(domains, batch_size=args.batch_size, verbose=1) + pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1) np.save("/tmp/rk/domains.npy", domains) np.save("/tmp/rk/domain_features.npy", domain_features) np.save("/tmp/rk/domain_embd.npy", pred) +def main_new_model(): + exists_or_make_path(args.model_path) + + char_dict = dataset.get_character_dict() + domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data, + args.domain_length, args.window) + + # parameter + p = { + "type": args.model_type, + "batch_size": 64, + "window_size": args.window, + "domain_length": args.domain_length, + "flow_features": 3, + "vocab_size": len(char_dict) + 1, + # + 'dropout': 0.5, + 'domain_features': args.domain_embedding, + 'embedding_size': args.embedding, + 'filter_main': 64, + 'flow_features': 3, + # 'dense_main': 512, + 'dense_main': 64, + 'filter_embedding': args.hidden_char_dims, + 'hidden_embedding': args.domain_embedding, + 'kernel_embedding': 3, + 'kernels_main': 3, + 'input_length': 40 + } + + embedding, _, model = models.get_models_by_params(p) + embedding.summary() + model.summary() + logger.info("define callbacks") + callbacks = [] + callbacks.append(ModelCheckpoint(filepath=args.clf_model, + monitor='val_loss', + verbose=False, + save_best_only=True)) + callbacks.append(CSVLogger(args.train_log)) + if args.stop_early: + callbacks.append(EarlyStopping(monitor='val_loss', + patience=5, + verbose=False)) + logger.info("compile model") + custom_metrics = models.get_metric_functions() + model.compile(optimizer='adam', + loss='categorical_crossentropy', + metrics=['accuracy'] + custom_metrics) + + server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2) + + if args.class_weights: + logger.info("class weights: compute custom weights") + custom_class_weights = get_custom_class_weights(client_tr, server_tr) + logger.info(custom_class_weights) + else: + logger.info("class weights: set default") + custom_class_weights = None + logger.info("start training") + server_tr = np.stack(np_utils.to_categorical(s, 2) for s in server_windows_tr) + model.fit([domain_tr, flow_tr], + [client_tr, server_tr], + batch_size=args.batch_size, + epochs=args.epochs, + callbacks=callbacks, + shuffle=True, + validation_split=0.2, + class_weight=custom_class_weights) + logger.info("save embedding") + embedding.save(args.embedding_model) + + +def main_embedding(): + model = load_model(args.embedding_model) + domain_encs, labels = dataset.load_or_generate_domains(args.train_data, args.domain_length) + domain_embedding = model.predict(domain_encs, batch_size=args.batch_size, verbose=1) + visualize.plot_embedding(domain_embedding, labels, path="results/pp3/embd.png") + + def main_visualization(): domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data, args.domain_length, args.window) @@ -248,27 +328,6 @@ def main_visualization(): "{}/server_cov.png".format(args.model_path), normalize=False, title="Server Confusion Matrix") - # embedding visi - import matplotlib.pyplot as plt - - model = load_model(args.embedding_model) - domains = np.reshape(domain_val, (domain_val.shape[0] * domain_val.shape[1], 40)) - domain_embedding = model.predict(domains, batch_size=args.batch_size, verbose=1) - - pca = PCA(n_components=2) - domain_reduced = pca.fit_transform(domain_embedding) - print(pca.explained_variance_ratio_) - - clients = np.repeat(client_val, 10, axis=0) - clients = clients.argmax(1) - servers = np.repeat(server_val, 10, axis=0) - servers = servers.argmax(1) - - plt.scatter(domain_reduced[:, 0], domain_reduced[:, 1], c=clients, cmap=plt.cm.bwr, s=2) - plt.show() - plt.scatter(domain_reduced[:, 0], domain_reduced[:, 1], c=servers, cmap=plt.cm.bwr, s=2) - plt.show() - def main_score(): # mask = dataset.load_mask_eval(args.data, args.test_image) @@ -281,9 +340,9 @@ def main_data(): char_dict = dataset.get_character_dict() user_flow_df = dataset.get_user_flow_data(args.train_data) logger.info("create training dataset") - domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows(user_flow_df, char_dict, - max_len=args.domain_length, - window_size=args.window) + domain_tr, flow_tr, client_tr, server_tr, _ = dataset.create_dataset_from_flows(user_flow_df, char_dict, + max_len=args.domain_length, + window_size=args.window) print(f"domain shape {domain_tr.shape}") print(f"flow shape {flow_tr.shape}") print(f"client shape {client_tr.shape}") @@ -305,6 +364,8 @@ def main(): main_paul_best() if "data" in args.modes: main_data() + if "train_new" in args.modes: + main_new_model() if __name__ == "__main__": diff --git a/models/__init__.py b/models/__init__.py index 84228b9..49ba780 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -32,7 +32,10 @@ def get_models_by_params(params: dict): predict_model = networks.get_model(dropout, flow_features, domain_features, window_size, domain_length, filter_main, kernel_main, dense_dim, embedding_model) - return embedding_model, predict_model + new_model = networks.get_new_model(dropout, flow_features, domain_features, window_size, domain_length, + filter_main, kernel_main, dense_dim, embedding_model) + + return embedding_model, predict_model, new_model def get_metrics(): diff --git a/models/pauls_networks.py b/models/pauls_networks.py index 363e904..4919cf6 100644 --- a/models/pauls_networks.py +++ b/models/pauls_networks.py @@ -30,8 +30,8 @@ def get_embedding(vocab_size, embedding_size, input_length, y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y) y = Conv1D(filters, kernel_size, activation='relu')(y) y = GlobalMaxPooling1D()(y) - y = Dense(hidden_dims)(y) y = Dropout(drop_out)(y) + y = Dense(hidden_dims)(y) y = Activation('relu')(y) return Model(x, y) @@ -56,3 +56,27 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le y2 = Dense(2, activation='softmax', name="server")(y) return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2)) + + +def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size, + dense_dim, cnn): + ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains") + ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows") + encoded = TimeDistributed(cnn)(ipt_domains) + + y2 = Dense(2, activation="softmax", name="server")(encoded) + merged = keras.layers.concatenate([encoded, ipt_flows, y2], -1) + + y = Conv1D(cnn_dims, + kernel_size, + activation='relu', + input_shape=(window_size, domain_features + flow_features))(merged) + # remove temporal dimension by global max pooling + y = GlobalMaxPooling1D()(y) + y = Dropout(dropout)(y) + y = Dense(dense_dim, activation='relu')(y) + + y1 = Dense(2, activation='softmax', name="client")(y) + model = Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2)) + + return model diff --git a/models/renes_networks.py b/models/renes_networks.py index 0d72c07..08ae13b 100644 --- a/models/renes_networks.py +++ b/models/renes_networks.py @@ -42,3 +42,27 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le y2 = Dense(2, activation='softmax', name="server")(y) return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2)) + + +def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size, + dense_dim, cnn): + ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains") + ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows") + encoded = TimeDistributed(cnn)(ipt_domains) + + y2 = Dense(2, activation="softmax", name="server")(encoded) + merged = keras.layers.concatenate([encoded, ipt_flows, y2], -1) + + y = Conv1D(cnn_dims, + kernel_size, + activation='relu', + input_shape=(window_size, domain_features + flow_features))(merged) + # remove temporal dimension by global max pooling + y = GlobalMaxPooling1D()(y) + y = Dropout(dropout)(y) + y = Dense(dense_dim, activation='relu')(y) + + y1 = Dense(2, activation='softmax', name="client")(y) + model = Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2)) + + return model