refactor dataset creation, split up functions
This commit is contained in:
parent 528829bb33
commit edc75f4f44
Makefile (66 lines changed)
@@ -1,37 +1,69 @@
 run:
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test1 --epochs 2 --depth small \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_1 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test2 --epochs 2 --depth small \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_2 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test3 --epochs 2 --depth medium \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_3 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test4 --epochs 2 --depth medium \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_4 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output both
 
-	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test5 --epochs 2 --depth small \
-	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type staggered
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_both_5 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type staggered --model_output both
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_1 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output client
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_2 --epochs 2 --depth small \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output client
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_3 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type final --model_output client
+
+	python3 main.py --mode train --train data/rk_mini.csv.gz --model results/test/test_client_4 --epochs 2 --depth medium \
+	--filter_embd 32 --kernel_embd 3 --filter_main 16 --kernel_main 3 --dense_main 128 \
+	--dense_embd 16 --domain_embd 8 --batch 64 --balanced_weights --type inter --model_output client
 
 test:
-	python3 main.py --mode test --batch 128 --models results/test* --test data/rk_mini.csv.gz
+	python3 main.py --mode test --batch 128 --models results/test/test_both_* --test data/rk_mini.csv.gz --model_output both
+	python3 main.py --mode test --batch 128 --models results/test/test_client_* --test data/rk_mini.csv.gz --model_output client
 
 fancy:
-	python3 main.py --mode fancy --batch 128 --model results/test1 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_1 --test data/rk_mini.csv.gz
 
-	python3 main.py --mode fancy --batch 128 --model results/test2 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_2 --test data/rk_mini.csv.gz
 
-	python3 main.py --mode fancy --batch 128 --model results/test3 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_3 --test data/rk_mini.csv.gz
 
-	python3 main.py --mode fancy --batch 128 --model results/test4 --test data/rk_mini.csv.gz
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_4 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_both_5 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_1 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_2 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_3 --test data/rk_mini.csv.gz
+
+	python3 main.py --mode fancy --batch 128 --model results/test/test_client_4 --test data/rk_mini.csv.gz
 
 all-fancy:
-	python3 main.py --mode all_fancy --batch 128 --models results/test* --test data/rk_mini.csv.gz
+	python3 main.py --mode all_fancy --batch 128 --models results/test/test* --test data/rk_mini.csv.gz
 
 hyper:
 	python3 main.py --mode hyperband --batch 64 --train data/rk_data.csv.gz
 
 clean:
-	rm -r results/test*
+	rm -r results/test/test*
 	rm data/rk_mini.csv.gz.h5
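The new --model_output switch selects whether to train and evaluate the client output alone or the client and server heads together. A minimal sketch of how such a flag could be declared, assuming main.py uses argparse; the parser itself is not part of this diff, and the choice names are taken from the Makefile above:

    import argparse

    # Hypothetical sketch only; the real parser definition is not shown in this commit.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_output", choices=["both", "client"], default="both",
                        help="which prediction heads to train/evaluate")

    args = parser.parse_args(["--model_output", "client"])
    print(args.model_output)  # -> client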
dataset.py (61 lines changed)
@@ -99,18 +99,13 @@ def get_all_flow_features(features):
 
 
 def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
-    logger.info("get chunks from user data frames")
-    with Pool() as pool:
-        results = []
-        for user_flow in tqdm(get_flow_per_user(user_flow_df), total=len(user_flow_df['user_hash'].unique().tolist())):
-            results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
-        windows = [window for res in results for window in res.get()]
-    logger.info("create training dataset")
-    domain, flow, hits, names, server, trusted_hits = create_dataset_from_lists(chunks=windows,
-                                                                                vocab=char_dict,
-                                                                                max_len=max_len)
-    # make client labels discrete with 4 different values
-    hits = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits))
+    domain, flow, name, hits, trusted_hits, server = create_raw_dataset_from_flows(user_flow_df, char_dict,
+                                                                                   max_len, window_size)
+    domain, flow, name, client, server = filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server)
+    return domain, flow, name, client, server
+
+
+def filter_window_dataset_by_hits(domain, flow, name, hits, trusted_hits, server):
     # select only 1.0 and 0.0 from training data
     pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
     neg_idx = np.where(hits == 0.0)[0]
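create_dataset_from_flows is now a thin composition: create_raw_dataset_from_flows produces the raw windows and labels, and filter_window_dataset_by_hits keeps only the clearly positive or negative windows. A toy run of the selection logic it inherits (numpy only, values invented for illustration):

    import numpy as np

    # keep windows whose discretized client label is exactly 1.0 or 0.0,
    # or that carry at least one trusted hit
    hits = np.array([1.0, 0.5, 0.0, 0.0, 1.0])
    trusted_hits = np.array([0.0, 1.0, 0.0, 0.0, 0.0])
    pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]  # [0 1 4]
    neg_idx = np.where(hits == 0.0)[0]                                      # [2 3]
    idx = np.concatenate((pos_idx, neg_idx))
    client = np.zeros_like(idx, float)
    client[:pos_idx.shape[-1]] = 1.0  # positives come first, then negatives
    print(idx, client)  # -> [0 1 4 2 3] [1. 1. 1. 0. 0.]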
@@ -118,15 +113,15 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
     # choose selected sample to train on
     domain = domain[idx]
     flow = flow[idx]
-    client_tr = np.zeros_like(idx, float)
-    client_tr[:pos_idx.shape[-1]] = 1.0
+    client = np.zeros_like(idx, float)
+    client[:pos_idx.shape[-1]] = 1.0
     server = server[idx]
-    names = names[idx]
+    name = name[idx]
 
-    return domain, flow, names, client_tr, server
+    return domain, flow, name, client, server
 
 
-def create_testset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
+def create_raw_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
     logger.info("get chunks from user data frames")
     with Pool() as pool:
         results = []
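create_raw_dataset_from_flows keeps the old chunking body: each user's flows are windowed in parallel via a multiprocessing Pool and the per-user results are flattened. A self-contained sketch of that pattern, with a stand-in chunker since get_user_chunks itself is not shown in this diff:

    from multiprocessing import Pool

    def get_user_chunks_stub(user_flow, window_size):
        # stand-in for dataset.get_user_chunks: split one user's flows into windows
        return [user_flow[i:i + window_size] for i in range(0, len(user_flow), window_size)]

    if __name__ == "__main__":
        user_flows = [list(range(25)), list(range(12))]  # two fake users
        with Pool() as pool:
            results = [pool.apply_async(get_user_chunks_stub, (uf, 10)) for uf in user_flows]
            windows = [window for res in results for window in res.get()]
        print(len(windows))  # -> 5 windows (3 + 2)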
@@ -134,24 +129,13 @@ def create_testset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
             results.append(pool.apply_async(get_user_chunks, (user_flow, window_size)))
         windows = [window for res in results for window in res.get()]
     logger.info("create training dataset")
-    domain, flow, hits, names, server, trusted_hits = create_dataset_from_lists(chunks=windows,
-                                                                                vocab=char_dict,
-                                                                                max_len=max_len)
+    domain, flow, hits, name, server, trusted_hits = create_dataset_from_windows(chunks=windows,
+                                                                                 vocab=char_dict,
+                                                                                 max_len=max_len)
     # make client labels discrete with 4 different values
-    hits = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits))
-    # select only 1.0 and 0.0 from training data
-    pos_idx = np.where(np.logical_or(hits == 1.0, trusted_hits >= 1.0))[0]
-    neg_idx = np.where(hits == 0.0)[0]
-    idx = np.concatenate((pos_idx, neg_idx))
-    # choose selected sample to train on
-    domain = domain[idx]
-    flow = flow[idx]
-    client_tr = np.zeros_like(idx, float)
-    client_tr[:pos_idx.shape[-1]] = 1.0
-    server = server[idx]
-    names = names[idx]
+    hits = np.apply_along_axis(lambda x: make_label_discrete(x, 3), 0, np.atleast_2d(hits))
 
-    return domain, flow, names, client_tr, server
+    return domain, flow, name, hits, trusted_hits, server
 
 
 def store_h5dataset(path, data: dict):
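The np.apply_along_axis call runs the label function once per window: np.atleast_2d(hits) has shape (1, n), so applying along axis 0 hands the function each single-element column. A toy demonstration, with a simplified threshold lambda standing in for make_label_discrete:

    import numpy as np

    hits = np.array([0.0, 2.0, 5.0])
    # simplified stand-in for make_label_discrete(x, 3)
    out = np.apply_along_axis(lambda x: 1.0 if x.max() >= 3 else 0.0,
                              0, np.atleast_2d(hits))
    print(out)  # -> [0. 0. 1.]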
@@ -163,14 +147,13 @@ def store_h5dataset(path, data: dict):
 
 def load_h5dataset(path):
     f = h5py.File(path, "r")
-    keys = f.keys()
     data = {}
-    for k in keys:
+    for k in f.keys():
         data[k] = f[k]
     return data
 
 
-def create_dataset_from_lists(chunks, vocab, max_len):
+def create_dataset_from_windows(chunks, vocab, max_len):
     """
     combines domain and feature windows to sequential training data
     :param chunks: list of flow feature windows
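Note that load_h5dataset stores f[k] handles, so the returned dict holds lazy h5py datasets and the file stays open until the values are actually read. A minimal round-trip, assuming store_h5dataset writes one dataset per dict key:

    import h5py
    import numpy as np

    with h5py.File("/tmp/example.h5", "w") as f:
        f.create_dataset("domain", data=np.arange(10, dtype=np.int8))

    f = h5py.File("/tmp/example.h5", "r")
    data = {k: f[k] for k in f.keys()}  # values are lazy h5py Dataset objects
    arr = data["domain"][()]            # slicing/reading materializes the array
    print(arr.dtype, arr.shape)         # -> int8 (10,)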
@@ -204,7 +187,7 @@ def create_dataset_from_lists(chunks, vocab, max_len):
                                hits, names, servers, trusted_hits)
 
 
-def discretize_label(values, threshold):
+def make_label_discrete(values, threshold):
     max_val = np.max(values)
     if max_val >= threshold:
         return 1.0
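The rename from discretize_label to make_label_discrete is behavior-preserving; only the first branch is visible in this hunk. A sketch of the visible part, with the remaining branches (which produce the other discrete values) stubbed out because the diff elides them:

    import numpy as np

    def make_label_discrete(values, threshold):
        max_val = np.max(values)
        if max_val >= threshold:
            return 1.0
        # the branches for the remaining discrete values are not shown in this diff
        raise NotImplementedError

    print(make_label_discrete(np.array([1.0, 4.0]), 3))  # -> 1.0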
@@ -251,8 +234,8 @@ def load_or_generate_h5data(h5data, train_data, domain_length, window_size):
     user_flow_df = get_user_flow_data(train_data)
     logger.info("create training dataset")
     domain, flow, name, client, server = create_dataset_from_flows(user_flow_df, char_dict,
                                                                    max_len=domain_length,
                                                                    window_size=window_size)
     logger.info("store training dataset as h5 file")
     data = {
         "domain": domain.astype(np.int8),
main.py (2 lines changed)
@@ -275,7 +275,7 @@ def main_test():
     # np.save(model_args["model_path"] + "/domain_embds.npy", domain_embeddings)
 
     results["domain_embds"] = domain_embeddings
-    joblib.dump(results, model_args["model_path"] + "results.joblib", compress=3)
+    joblib.dump(results, model_args["model_path"] + "/results.joblib", compress=3)
 
 
 def main_visualization():
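The main.py change fixes a path bug: without the leading slash, the results file was written next to the model directory as <model_path>results.joblib. os.path.join would sidestep the issue entirely (shown as an alternative, not what the commit does):

    import os.path

    model_path = "results/test/test_both_1"
    print(model_path + "results.joblib")               # old, broken: ...test_both_1results.joblib
    print(model_path + "/results.joblib")              # fixed: ...test_both_1/results.joblib
    print(os.path.join(model_path, "results.joblib"))  # equivalent alternative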