From fdc03c99221dca831f3550e10cc55f64e87cbf20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20Knaebel?=
Date: Sat, 8 Jul 2017 17:46:07 +0200
Subject: [PATCH] add h5py example

---
 dataset.py                  | 91 +++++++++++++++++--------------------
 main.py                     | 39 +++++++++++++++-
 scripts/make_csv_dataset.py |  8 ++++
 3 files changed, 86 insertions(+), 52 deletions(-)

diff --git a/dataset.py b/dataset.py
index 2019263..c0df04a 100644
--- a/dataset.py
+++ b/dataset.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import string
 
+import h5py
 import numpy as np
 import pandas as pd
 from keras.utils import np_utils
@@ -91,39 +92,24 @@ def get_flow_features(flow):
     return features
 
 
-# NOT USED ATM
-def get_cisco_features(curDataLine, urlSIPDict):
-    numCiscoFeatures = 30
-    try:
-        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
-        # log transform
-        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
-        return ciscoFeatures.ravel()
-    except:
-        return np.zeros([numCiscoFeatures, ]).ravel()
-
-
 def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, use_cisco_features=False):
     domains = []
     features = []
     print("get chunks from user data frames")
-    for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
+    for i, user_flow in tqdm(list(enumerate(get_flow_per_user(user_flow_df)))):
         (domain_windows, feature_windows) = get_user_chunks(user_flow,
                                                             windowSize=window_size,
                                                             overlapping=False,
                                                             maxLengthInSeconds=-1)
         domains += domain_windows
         features += feature_windows
-        # TODO: remove later
-        if i >= 50:
-            break
 
     print("create training dataset")
-    domain_tr, flow_tr, hits_tr, names_tr, server_tr, trusted_hits_tr = create_dataset_from_lists(
-        domains=domains, features=features, vocab=char_dict,
-        max_len=max_len,
-        use_cisco_features=use_cisco_features, urlSIPDIct=dict(),
-        window_size=window_size)
+    domain_tr, flow_tr, hits_tr, names_tr, server_tr, trusted_hits_tr = create_dataset_from_lists(domains=domains,
+                                                                                                  flows=features,
+                                                                                                  vocab=char_dict,
+                                                                                                  max_len=max_len,
+                                                                                                  window_size=window_size)
 
     # make client labels discrete with 4 different values
     hits_tr = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
@@ -144,32 +130,29 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10,
     return domain_tr, flow_tr, client_tr, server_tr
 
 
-def create_dataset_from_lists(domains, features, vocab, max_len,
-                              use_cisco_features=False, urlSIPDIct=dict(),
-                              window_size=10):
+def store_h5dataset(domain_tr, flow_tr, client_tr, server_tr):
+    f = h5py.File("data/full_dataset.h5", "w")
+    domain_tr = domain_tr.astype(np.int8)
+    f.create_dataset("domain", data=domain_tr)
+    f.create_dataset("flow", data=flow_tr)
+    server_tr = server_tr.astype(np.bool)
+    client_tr = client_tr.astype(np.bool)
+    f.create_dataset("client", data=client_tr)
+    f.create_dataset("server", data=server_tr)
+    f.close()
+
+
+def create_dataset_from_lists(domains, flows, vocab, max_len, window_size=10):
     """
     combines domain and feature windows to sequential training data
     :param domains: list of domain windows
-    :param features: list of feature windows
+    :param flows: list of flow feature windows
     :param vocab:
    :param max_len:
-    :param use_cisco_features: idk
-    :param urlSIPDIct: idk
     :param window_size: size of the flow window
     :return:
     """
-    # TODO: check for hits vs vth consistency
-    # if 'hits' in dfs[0].keys():
-    #     hits_col = 'hits'
-    # elif 'virusTotalHits' in dfs[0].keys():
-    #     hits_col = 'virusTotalHits'
-    hits_col = "virusTotalHits"
-
-    numFlowFeatures = 3
-    numCiscoFeatures = 30
-    numFeatures = numFlowFeatures
-    if use_cisco_features:
-        numFeatures += numCiscoFeatures
+    numFeatures = 3
     sample_size = len(domains)
     hits = []
     names = []
@@ -181,14 +164,13 @@ def create_dataset_from_lists(domains, features, vocab, max_len,
 
     for i in tqdm(np.arange(sample_size), miniters=10):
         for j in range(window_size):
-            domain_features[i, j] = get_domain_features(domains[i][j], vocab, max_len)
-            flow_features[i, j] = get_flow_features(features[i].iloc[j])
-            # TODO: cisco features?
+            domain_features[i, j, :] = get_domain_features(domains[i][j], vocab, max_len)
+            flow_features[i, j, :] = get_flow_features(flows[i].iloc[j])
 
-        hits.append(np.max(features[i][hits_col]))
-        names.append(np.unique(features[i]['user_hash']))
-        servers.append(np.max(features[i]['serverLabel']))
-        trusted_hits.append(np.max(features[i]['trustedHits']))
+        hits.append(np.max(flows[i]['virusTotalHits']))
+        names.append(np.unique(flows[i]['user_hash']))
+        servers.append(np.max(flows[i]['serverLabel']))
+        trusted_hits.append(np.max(flows[i]['trustedHits']))
 
     return (domain_features, flow_features, np.array(hits), np.array(names),
             np.array(servers), np.array(trusted_hits))
@@ -206,11 +188,20 @@ def discretize_label(values, threshold):
 
 
 def get_user_flow_data(csv_file):
+    types = {
+        "duration": int,
+        "bytes_down": int,
+        "bytes_up": int,
+        "domain": object,
+        "timeStamp": float,
+        "server_ip": object,
+        "user_hash": float,
+        "virusTotalHits": int,
+        "serverLabel": int,
+        "trustedHits": int
+    }
     df = pd.read_csv(csv_file)
-    keys = ["duration", "bytes_down", "bytes_up", "domain",
-            "timeStamp", "server_ip", "user_hash", "virusTotalHits",
-            "serverLabel", "trustedHits"]
-    df = df[keys]
+    df = df[list(types.keys())]
     df.set_index(keys=['user_hash'], drop=False, inplace=True)
     return df
 
diff --git a/main.py b/main.py
index a7b0ee6..e63fdb7 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,7 @@
 import argparse
 
+import h5py
+from keras.models import load_model
 from keras.utils import np_utils
 
 import dataset
@@ -8,7 +10,8 @@ import models
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument("--modes", action="store", dest="modes", nargs="+")
+parser.add_argument("--modes", action="store", dest="modes", nargs="+",
+                    default=[])
 
 parser.add_argument("--train", action="store", dest="train_data",
                     default="data/full_dataset.csv.tar.bz2")
@@ -193,7 +196,39 @@ def main_train():
     model.save(args.clf_model)
 
 
-from keras.models import load_model
+def main_train_h5():
+    # parameter
+    dropout_main = 0.5
+    dense_main = 512
+    kernel_main = 3
+    filter_main = 128
+    network = models.pauls_networks if args.model_type == "paul" else models.renes_networks
+
+    char_dict = dataset.get_character_dict()
+    data = h5py.File("data/full_dataset.h5", "r")
+
+    embedding = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
+                                      args.hidden_char_dims, kernel_main, args.domain_embedding, 0.5)
+    embedding.summary()
+
+    model = network.get_model(dropout_main, data["flow"].shape[-1], args.domain_embedding,
+                              args.window, args.domain_length, filter_main, kernel_main,
+                              dense_main, embedding)
+    model.summary()
+
+    model.compile(optimizer='adam',
+                  loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit([data["domain"], data["flow"]],
+              [data["client"], data["server"]],
+              batch_size=args.batch_size,
+              epochs=args.epochs,
+              shuffle=True,
+              validation_split=0.2)
+
+    embedding.save(args.embedding_model)
+    model.save(args.clf_model)
 
 
 def main_test():
diff --git a/scripts/make_csv_dataset.py b/scripts/make_csv_dataset.py
index 2b2a92d..479582b 100644
--- a/scripts/make_csv_dataset.py
+++ b/scripts/make_csv_dataset.py
@@ -1,9 +1,17 @@
 #!/usr/bin/python2
 
 import joblib
+import numpy as np
 import pandas as pd
 
 df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
 df = pd.concat(df["data"])
 df.reset_index(inplace=True)
+df.dropna(axis=0, how="any", inplace=True)
+df[["duration", "bytes_down", "bytes_up"]] = df[["duration", "bytes_down", "bytes_up"]].astype(np.int)
+df[["domain", "server_ip"]] = df[["domain", "server_ip"]].astype(str)
+df.serverLabel = df.serverLabel.astype(np.bool)
+df.virusTotalHits = df.virusTotalHits.astype(np.int)
+df.trustedHits = df.trustedHits.astype(np.int)
+
 df.to_csv("/tmp/rk/full_future_dataset.csv.gz", compression="gzip")
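
Note on the new HDF5 round trip: store_h5dataset() writes four datasets named
"domain", "flow", "client" and "server" to data/full_dataset.h5, and
main_train_h5() then passes the open h5py datasets directly to model.fit().
A minimal sketch of the matching reader, assuming exactly that layout
(load_h5dataset is an illustrative name, not part of the patch); copying the
datasets into numpy arrays up front avoids repeated small disk reads once
Keras starts shuffling batches:

import h5py
import numpy as np

def load_h5dataset(path="data/full_dataset.h5"):
    # mirror of store_h5dataset(): "domain" was stored as int8,
    # "client"/"server" as bool, "flow" keeps its original dtype
    with h5py.File(path, "r") as f:
        domain = f["domain"][()]  # dataset[()] reads it fully into memory
        flow = f["flow"][()]
        client = f["client"][()]
        server = f["server"][()]
    return domain, flow, client, server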
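
Similarly, get_user_flow_data() builds a types dict but only uses its keys to
pick columns; the dtype values are never applied. pandas can enforce them while
parsing via read_csv's dtype and usecols parameters. A variant sketch under
that assumption, not part of the patch:

import pandas as pd

def get_user_flow_data(csv_file):
    types = {
        "duration": int,
        "bytes_down": int,
        "bytes_up": int,
        "domain": object,
        "timeStamp": float,
        "server_ip": object,
        "user_hash": float,
        "virusTotalHits": int,
        "serverLabel": int,
        "trustedHits": int
    }
    # usecols limits parsing to the listed columns and dtype casts them
    # while reading, instead of selecting and casting afterwards
    df = pd.read_csv(csv_file, usecols=list(types.keys()), dtype=types)
    df.set_index(keys=['user_hash'], drop=False, inplace=True)
    return df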