refactor dataset creation, split up functions
This commit is contained in:
parent 528829bb33
commit edc75f4f44
Makefile (66 lines changed)
@@ -1,37 +1,69 @@
 run:
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test1 --epochs 2 --depth small \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_1 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test2 --epochs 2 --depth small \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_2 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test3 --epochs 2 --depth medium \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_3 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test4 --epochs 2 --depth medium \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_4 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test5 --epochs 2 --depth small \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type staggered
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_5 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type staggered --model_output both
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_1 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output client
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_2 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output client
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_3 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output client
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_4 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output client
 
 test:
-	python3 main.py --mode test --batch 128 --models results/test* --test data/rk_mini.csv.gz
+	python3 main.py --mode test --batch 128 --models results/test/test_both_* --test data/rk_mini.csv.gz --model_output both
+	python3 main.py --mode test --batch 128 --models results/test/test_client_* --test data/rk_mini.csv.gz --model_output client
 
 fancy:
-	python3 main.py --mode fancy --batch 128 --model results/test1 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_1 --test data/rk_mini.csv.gz
 
-	python3 main.py --mode fancy --batch 128 --model results/test2 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_2 --test data/rk_mini.csv.gz
 
-	python3 main.py --mode fancy --batch 128 --model results/test3 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_3 --test data/rk_mini.csv.gz
 
-	python3 main.py --mode fancy --batch 128 --model results/test4 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_4 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_5 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_1 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_2 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_3 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_4 --test data/rk_mini.csv.gz
 
 all-fancy:
-	python3 main.py --mode all_fancy --batch 128 --models results/test* --test data/rk_mini.csv.gz
+	python3 main.py --mode all_fancy --batch 128 --models results/test/test* --test data/rk_mini.csv.gz
 
 hyper:
 	python3 main.py --mode hyperband --batch 64 --train data/rk_data.csv.gz
 
 clean:
-	rm -r results/test*
+	rm -r results/test/test*
 	rm data/rk_mini.csv.gz.h5
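The new --model_output switch selects whether to train and evaluate the client output alone or the client and server heads together. A minimal sketch of how such a flag could be declared, assuming main.py uses argparse; the parser itself is not part of this diff, and the choice names are taken from the Makefile above:

    import argparse

    # Hypothetical sketch only; the real parser definition is not shown in this commit.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_output", choices=["both", "client"], default="both",
                        help="which prediction heads to train/evaluate")

    args = parser.parse_args(["--model_output", "client"])
    print(args.model_output)  # -> client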
dataset.py (61 lines changed)
@@ -99,18 +99,13 @@ def get_all_flow_features(features):
 
 
 def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
-    logger.info("get chunks from user data frames")
-    with Pool() as pool:
-        results = []
-        for user_flow in tqdm(get_flow_per_user(user_flow_df), total=len(user_flow_df['user_hash'].unique().tolist())):
-            results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
-        windows = [window for res in results for window in res.get()]
-    logger.info("create training dataset")
-    domain, flow, hits, names, server, trusted_hits = create_dataset_from_lists(chunks=windows,
-                                                                                vocab=char_dict,
-                                                                                max_len=max_len)
-    # make client labels discrete with 4 different values
-    hits = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits))
+    domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, char_dict,
+                                                                                   max_len, window_size)
+    domain, flow, name, client, server = filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server)
+    return domain, flow, name, client, server
+
+
+def filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server):
     # select only 1.0 and 0.0 from training data
     pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
     neg_idx = np.where(hits == 0.0)[0]
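create_dataset_from_flows is now a thin composition: create_raw_dataset_from_flows produces the raw windows and labels, and filter_window_dataset_by_hits keeps only the clearly positive or negative windows. A toy run of the selection logic it inherits (numpy only, values invented for illustration):

    import numpy as np

    # keep windows whose discretized client label is exactly 1.0 or 0.0,
    # or that carry at least one trusted hit
    hits = np.array([1.0, 0.5, 0.0, 0.0, 1.0])
    trusted_hits = np.array([0.0, 1.0, 0.0, 0.0, 0.0])
    pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]  # [0 1 4]
    neg_idx = np.where(hits == 0.0)[0]                                      # [2 3]
    idx = np.concatenate((pos_idx, neg_idx))
    client = np.zeros_like(idx, float)
    client[:pos_idx.shape[-1]] = 1.0  # positives come first, then negatives
    print(idx, client)  # -> [0 1 4 2 3] [1. 1. 1. 0. 0.]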
@@ -118,15 +113,15 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
     # choose selected sample to train on
     domain = domain[idx]
     flow = flow[idx]
-    client_tr = np.zeros_like(idx, float)
-    client_tr[:pos_idx.shape[-1]] = 1.0
+    client = np.zeros_like(idx, float)
+    client[:pos_idx.shape[-1]] = 1.0
     server = server[idx]
-    names = names[idx]
+    name = name[idx]
 
-    return domain, flow, names, client_tr, server
+    return domain, flow, name, client, server
 
 
-def create_testset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
+def create_raw_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
     logger.info("get chunks from user data frames")
     with Pool() as pool:
         results = []
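create_raw_dataset_from_flows keeps the old chunking body: each user's flows are windowed in parallel via a multiprocessing Pool and the per-user results are flattened. A self-contained sketch of that pattern, with a stand-in chunker since get_user_chunks itself is not shown in this diff:

    from multiprocessing import Pool

    def get_user_chunks_stub(user_flow, window_size):
        # stand-in for dataset.get_user_chunks: split one user's flows into windows
        return [user_flow[i:i + window_size] for i in range(0, len(user_flow), window_size)]

    if __name__ == "__main__":
        user_flows = [list(range(25)), list(range(12))]  # two fake users
        with Pool() as pool:
            results = [pool.apply_async(get_user_chunks_stub, (uf, 10)) for uf in user_flows]
            windows = [window for res in results for window in res.get()]
        print(len(windows))  # -> 5 windows (3 + 2)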
@@ -134,24 +129,13 @@ def create_testset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
             results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
         windows = [window for res in results for window in res.get()]
     logger.info("create training dataset")
-    domain, flow, hits, names, server, trusted_hits = create_dataset_from_lists(chunks=windows,
-                                                                                vocab=char_dict,
-                                                                                max_len=max_len)
+    domain, flow, hits, name, server, trusted_hits = create_dataset_from_windows(chunks=windows,
+                                                                                 vocab=char_dict,
+                                                                                 max_len=max_len)
     # make client labels discrete with 4 different values
-    hits = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits))
-    # select only 1.0 and 0.0 from training data
-    pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
-    neg_idx = np.where(hits == 0.0)[0]
-    idx = np.concatenate((pos_idx, neg_idx))
-    # choose selected sample to train on
-    domain = domain[idx]
-    flow = flow[idx]
-    client_tr = np.zeros_like(idx, float)
-    client_tr[:pos_idx.shape[-1]] = 1.0
-    server = server[idx]
-    names = names[idx]
+    hits = np.apply_along_axis(lambda x: make_label_discrete(x, 3), 0, np.atleast_2d(hits))
 
-    return domain, flow, names, client_tr, server
+    return domain, flow, name, hits, trusted_hits, server
 
 
 def store_h5dataset(path, data: dict):
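The np.apply_along_axis call runs the label function once per window: np.atleast_2d(hits) has shape (1, n), so applying along axis 0 hands the function each single-element column. A toy demonstration, with a simplified threshold lambda standing in for make_label_discrete:

    import numpy as np

    hits = np.array([0.0, 2.0, 5.0])
    # simplified stand-in for make_label_discrete(x, 3)
    out = np.apply_along_axis(lambda x: 1.0 if x.max() >= 3 else 0.0,
                              0, np.atleast_2d(hits))
    print(out)  # -> [0. 0. 1.]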
@@ -163,14 +147,13 @@ def store_h5dataset(path, data: dict):
 
 def load_h5dataset(path):
     f = h5py.File(path, "r")
-    keys = f.keys()
     data = {}
-    for k in keys:
+    for k in f.keys():
         data[k] = f[k]
     return data
 
 
-def create_dataset_from_lists(chunks, vocab, max_len):
+def create_dataset_from_windows(chunks, vocab, max_len):
     """
     combines domain and feature windows to sequential training data
     :param chunks: list of flow feature windows
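Note that load_h5dataset stores f[k] handles, so the returned dict holds lazy h5py datasets and the file stays open until the values are actually read. A minimal round-trip, assuming store_h5dataset writes one dataset per dict key:

    import h5py
    import numpy as np

    with h5py.File("/tmp/example.h5", "w") as f:
        f.create_dataset("domain", data=np.arange(10, dtype=np.int8))

    f = h5py.File("/tmp/example.h5", "r")
    data = {k: f[k] for k in f.keys()}  # values are lazy h5py Dataset objects
    arr = data["domain"][()]            # slicing/reading materializes the array
    print(arr.dtype, arr.shape)         # -> int8 (10,)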
@@ -204,7 +187,7 @@ def create_dataset_from_lists(chunks, vocab, max_len):
                                hits, names, servers, trusted_hits)
 
 
-def discretize_label(values, threshold):
+def make_label_discrete(values, threshold):
     max_val = np.max(values)
     if max_val >= threshold:
         return 1.0
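The rename from discretize_label to make_label_discrete is behavior-preserving; only the first branch is visible in this hunk. A sketch of the visible part, with the remaining branches (which produce the other discrete values) stubbed out because the diff elides them:

    import numpy as np

    def make_label_discrete(values, threshold):
        max_val = np.max(values)
        if max_val >= threshold:
            return 1.0
        # the branches for the remaining discrete values are not shown in this diff
        raise NotImplementedError

    print(make_label_discrete(np.array([1.0, 4.0]), 3))  # -> 1.0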
@@ -251,8 +234,8 @@ def load_or_generate_h5data(h5data, train_data, domain_length, window_size):
     user_flow_df = get_user_flow_data(train_data)
     logger.info("create training dataset")
     domain, flow, name, client, server = create_dataset_from_flows(user_flow_df, char_dict,
                                                                    max_len=domain_length,
                                                                    window_size=window_size)
     logger.info("store training dataset as h5 file")
     data = {
         "domain": domain.astype(np.int8),
main.py (2 lines changed)
@@ -275,7 +275,7 @@ def main_test():
     # np.save(model_args["model_path"] + "/domain_embds.npy", domain_embeddings)
 
     results["domain_embds"] = domain_embeddings
-    joblib.dump(results, model_args["model_path"] + "results.joblib", compress=3)
+    joblib.dump(results, model_args["model_path"] + "/results.joblib", compress=3)
 
 
 def main_visualization():
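The main.py change fixes a path bug: without the leading slash, the results file was written next to the model directory as <model_path>results.joblib. os.path.join would sidestep the issue entirely (shown as an alternative, not what the commit does):

    import os.path

    model_path = "results/test/test_both_1"
    print(model_path + "results.joblib")               # old, broken: ...test_both_1results.joblib
    print(model_path + "/results.joblib")              # fixed: ...test_both_1/results.joblib
    print(os.path.join(model_path, "results.joblib"))  # equivalent alternative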