add new network architecture - server label moves to the middle
parent 8cd1023165
commit 820a5d1a4d
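In outline: the domain CNN is still applied to every domain in a window, but the server label is now predicted per window position directly from those embeddings, in the middle of the network, and that prediction is fed back in alongside the embeddings and flow features before the client head. A minimal sketch of this graph (assuming the Keras 2 functional API used in this repo; the name new_model_sketch and the default sizes are illustrative, the actual implementation is get_new_model in the diff below):

    import keras
    from keras.layers import Conv1D, Dense, Dropout, GlobalMaxPooling1D, Input, TimeDistributed
    from keras.models import Model

    def new_model_sketch(domain_cnn, window_size=10, domain_length=40, flow_features=3,
                         cnn_dims=64, kernel_size=3, dense_dim=64, dropout=0.5):
        # domain_cnn: the embedding model from get_embedding, mapping one
        # encoded domain of length domain_length to a feature vector
        ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
        ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
        # embed each of the window_size domains with the shared CNN
        encoded = TimeDistributed(domain_cnn)(ipt_domains)
        # server head sits "in the middle": one softmax per window position
        y2 = Dense(2, activation="softmax", name="server")(encoded)
        # feed the server predictions back in next to embeddings and flow features
        merged = keras.layers.concatenate([encoded, ipt_flows, y2], -1)
        y = Conv1D(cnn_dims, kernel_size, activation="relu")(merged)
        y = GlobalMaxPooling1D()(y)  # collapse the window (temporal) dimension
        y = Dropout(dropout)(y)
        y = Dense(dense_dim, activation="relu")(y)
        y1 = Dense(2, activation="softmax", name="client")(y)
        return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))

Because the server output now carries a window dimension, the training targets change accordingly: main.py builds one categorical label per flow in the window from server_windows_tr, as the diff below shows.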
@@ -120,7 +120,6 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
     server_tr = server_tr[idx]
 
     client_tr = np_utils.to_categorical(client_tr, 2)
-    server_tr = np_utils.to_categorical(server_tr, 2)
 
     return domain_tr, flow_tr, client_tr, server_tr
 
@@ -166,7 +165,7 @@ def create_dataset_from_lists(chunks, vocab, max_len):
     logger.info(" select names")
     names = np.unique(np.stack(map(lambda f: f.user_hash, chunks)))
     logger.info(" select servers")
-    servers = np.max(np.stack(map(lambda f: f.serverLabel, chunks)), axis=1)
+    servers = np.stack(map(lambda f: f.serverLabel, chunks))
     logger.info(" select trusted hits")
     trusted_hits = np.max(np.stack(map(lambda f: f.trustedHits, chunks)), axis=1)
 
main.py
@@ -7,7 +7,7 @@ import pandas as pd
 import tensorflow as tf
 from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
 from keras.models import load_model
-from sklearn.decomposition import PCA
+from keras.utils import np_utils
 from sklearn.utils import class_weight
 
 import arguments
@@ -46,8 +46,6 @@ ch.setFormatter(formatter)
 # add ch to logger
 logger.addHandler(ch)
 
-print = logger.info
-
 args = arguments.parse()
 
 if args.gpu:
@@ -104,8 +102,8 @@ def main_hyperband():
 
 
 def get_custom_class_weights(client_tr, server_tr):
-    client = client_tr.value.argmax(1)
-    server = server_tr.value.argmax(1)
+    client = client_tr.value.argmax(1) if type(client_tr) != np.ndarray else client_tr.argmax(1)
+    server = server_tr.value.argmax(1) if type(server_tr) != np.ndarray else server_tr.argmax(1)
     client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
     server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
     return {
@@ -118,7 +116,7 @@ def main_train(param=None):
     exists_or_make_path(args.model_path)
 
     char_dict = dataset.get_character_dict()
-    domain_tr, flow_tr, client_tr, server_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
+    domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
                                                                        args.domain_length, args.window)
 
     # parameter
@@ -133,10 +131,10 @@ def main_train(param=None):
         'dropout': 0.5,
         'domain_features': args.domain_embedding,
         'embedding_size': args.embedding,
-        'filter_main': 128,
+        'filter_main': 64,
         'flow_features': 3,
         # 'dense_main': 512,
-        'dense_main': 128,
+        'dense_main': 64,
         'filter_embedding': args.hidden_char_dims,
         'hidden_embedding': args.domain_embedding,
         'kernel_embedding': 3,
@@ -146,7 +144,7 @@ def main_train(param=None):
     if not param:
         param = p
 
-    embedding, model = models.get_models_by_params(param)
+    embedding, model, _ = models.get_models_by_params(param)
     embedding.summary()
     model.summary()
     logger.info("define callbacks")
@@ -166,6 +164,8 @@ def main_train(param=None):
                   loss='categorical_crossentropy',
                   metrics=['accuracy'] + custom_metrics)
 
+    server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2)
+
     if args.class_weights:
         logger.info("class weights: compute custom weights")
         custom_class_weights = get_custom_class_weights(client_tr, server_tr)
@@ -200,7 +200,7 @@ def main_test():
 
     char_dict = dataset.get_character_dict()
     user_flow_df = dataset.get_user_flow_data(args.test_data)
-    domains = user_flow_df.domain.unique()
+    domains = user_flow_df.domain.unique()[:-1]
 
     def get_domain_features_reduced(d):
         return dataset.get_domain_features(d[0], char_dict, args.domain_length)
@@ -211,13 +211,93 @@ def main_test():
 
     model = load_model(args.embedding_model)
-    pred = model.predict(domains, batch_size=args.batch_size, verbose=1)
+    domain_features = np.stack(domain_features).reshape((-1, 40))
+    pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1)
 
     np.save("/tmp/rk/domains.npy", domains)
+    np.save("/tmp/rk/domain_features.npy", domain_features)
     np.save("/tmp/rk/domain_embd.npy", pred)
 
 
+def main_new_model():
+    exists_or_make_path(args.model_path)
+
+    char_dict = dataset.get_character_dict()
+    domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
+                                                                               args.domain_length, args.window)
+
+    # parameter
+    p = {
+        "type": args.model_type,
+        "batch_size": 64,
+        "window_size": args.window,
+        "domain_length": args.domain_length,
+        "flow_features": 3,
+        "vocab_size": len(char_dict) + 1,
+        #
+        'dropout': 0.5,
+        'domain_features': args.domain_embedding,
+        'embedding_size': args.embedding,
+        'filter_main': 64,
+        'flow_features': 3,
+        # 'dense_main': 512,
+        'dense_main': 64,
+        'filter_embedding': args.hidden_char_dims,
+        'hidden_embedding': args.domain_embedding,
+        'kernel_embedding': 3,
+        'kernels_main': 3,
+        'input_length': 40
+    }
+
+    embedding, _, model = models.get_models_by_params(p)
+    embedding.summary()
+    model.summary()
+    logger.info("define callbacks")
+    callbacks = []
+    callbacks.append(ModelCheckpoint(filepath=args.clf_model,
+                                     monitor='val_loss',
+                                     verbose=False,
+                                     save_best_only=True))
+    callbacks.append(CSVLogger(args.train_log))
+    if args.stop_early:
+        callbacks.append(EarlyStopping(monitor='val_loss',
+                                       patience=5,
+                                       verbose=False))
+    logger.info("compile model")
+    custom_metrics = models.get_metric_functions()
+    model.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'] + custom_metrics)
+
+    server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2)
+
+    if args.class_weights:
+        logger.info("class weights: compute custom weights")
+        custom_class_weights = get_custom_class_weights(client_tr, server_tr)
+        logger.info(custom_class_weights)
+    else:
+        logger.info("class weights: set default")
+        custom_class_weights = None
+    logger.info("start training")
+    server_tr = np.stack(np_utils.to_categorical(s, 2) for s in server_windows_tr)
+    model.fit([domain_tr, flow_tr],
+              [client_tr, server_tr],
+              batch_size=args.batch_size,
+              epochs=args.epochs,
+              callbacks=callbacks,
+              shuffle=True,
+              validation_split=0.2,
+              class_weight=custom_class_weights)
+    logger.info("save embedding")
+    embedding.save(args.embedding_model)
+
+
 def main_embedding():
     model = load_model(args.embedding_model)
     domain_encs, labels = dataset.load_or_generate_domains(args.train_data, args.domain_length)
     domain_embedding = model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
     visualize.plot_embedding(domain_embedding, labels, path="results/pp3/embd.png")
 
 
 def main_visualization():
     domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
                                                                            args.domain_length, args.window)
@@ -248,27 +328,6 @@ def main_visualization():
                           "{}/server_cov.png".format(args.model_path),
                           normalize=False, title="Server Confusion Matrix")
 
-    # embedding visi
-    import matplotlib.pyplot as plt
-
-    model = load_model(args.embedding_model)
-    domains = np.reshape(domain_val, (domain_val.shape[0] * domain_val.shape[1], 40))
-    domain_embedding = model.predict(domains, batch_size=args.batch_size, verbose=1)
-
-    pca = PCA(n_components=2)
-    domain_reduced = pca.fit_transform(domain_embedding)
-    print(pca.explained_variance_ratio_)
-
-    clients = np.repeat(client_val, 10, axis=0)
-    clients = clients.argmax(1)
-    servers = np.repeat(server_val, 10, axis=0)
-    servers = servers.argmax(1)
-
-    plt.scatter(domain_reduced[:, 0], domain_reduced[:, 1], c=clients, cmap=plt.cm.bwr, s=2)
-    plt.show()
-    plt.scatter(domain_reduced[:, 0], domain_reduced[:, 1], c=servers, cmap=plt.cm.bwr, s=2)
-    plt.show()
-
 
 def main_score():
     # mask = dataset.load_mask_eval(args.data, args.test_image)
@@ -281,7 +340,7 @@ def main_data():
     char_dict = dataset.get_character_dict()
     user_flow_df = dataset.get_user_flow_data(args.train_data)
     logger.info("create training dataset")
-    domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows(user_flow_df, char_dict,
+    domain_tr, flow_tr, client_tr, server_tr, _ = dataset.create_dataset_from_flows(user_flow_df, char_dict,
                                                                                  max_len=args.domain_length,
                                                                                  window_size=args.window)
     print(f"domain shape {domain_tr.shape}")
@@ -305,6 +364,8 @@ def main():
         main_paul_best()
     if "data" in args.modes:
         main_data()
+    if "train_new" in args.modes:
+        main_new_model()
 
 
 if __name__ == "__main__":
@@ -32,7 +32,10 @@ def get_models_by_params(params: dict):
     predict_model = networks.get_model(dropout, flow_features, domain_features, window_size, domain_length,
                                        filter_main, kernel_main, dense_dim, embedding_model)
 
-    return embedding_model, predict_model
+    new_model = networks.get_new_model(dropout, flow_features, domain_features, window_size, domain_length,
+                                       filter_main, kernel_main, dense_dim, embedding_model)
+
+    return embedding_model, predict_model, new_model
 
 
 def get_metrics():
@@ -30,8 +30,8 @@ def get_embedding(vocab_size, embedding_size, input_length,
     y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
     y = Conv1D(filters, kernel_size, activation='relu')(y)
     y = GlobalMaxPooling1D()(y)
-    y = Dense(hidden_dims)(y)
     y = Dropout(drop_out)(y)
+    y = Dense(hidden_dims)(y)
     y = Activation('relu')(y)
     return Model(x, y)
@@ -56,3 +56,27 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_length,
     y2 = Dense(2, activation='softmax', name="server")(y)
 
     return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
+
+
+def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                  dense_dim, cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    encoded = TimeDistributed(cnn)(ipt_domains)
+
+    y2 = Dense(2, activation="softmax", name="server")(encoded)
+    merged = keras.layers.concatenate([encoded, ipt_flows, y2], -1)
+
+    y = Conv1D(cnn_dims,
+               kernel_size,
+               activation='relu',
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
+    y = GlobalMaxPooling1D()(y)
+    y = Dropout(dropout)(y)
+    y = Dense(dense_dim, activation='relu')(y)
+
+    y1 = Dense(2, activation='softmax', name="client")(y)
+    model = Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
+
+    return model
@@ -42,3 +42,27 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_length,
     y2 = Dense(2, activation='softmax', name="server")(y)
 
     return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
+
+
+def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
+                  dense_dim, cnn):
+    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
+    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
+    encoded = TimeDistributed(cnn)(ipt_domains)
+
+    y2 = Dense(2, activation="softmax", name="server")(encoded)
+    merged = keras.layers.concatenate([encoded, ipt_flows, y2], -1)
+
+    y = Conv1D(cnn_dims,
+               kernel_size,
+               activation='relu',
+               input_shape=(window_size, domain_features + flow_features))(merged)
+    # remove temporal dimension by global max pooling
+    y = GlobalMaxPooling1D()(y)
+    y = Dropout(dropout)(y)
+    y = Dense(dense_dim, activation='relu')(y)
+
+    y1 = Dense(2, activation='softmax', name="client")(y)
+    model = Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
+
+    return model