From 933f6bf1d701e9a3878072a18981ba0ed193f008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Knaebel?= Date: Thu, 6 Jul 2017 16:27:47 +0200 Subject: [PATCH] add feature to use both hits information from dataset --- dataset.py | 26 ++++++++++++++------------ main.py | 17 +++++++++++------ models/renes_networks.py | 3 ++- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/dataset.py b/dataset.py index ae06ecb..0aa8368 100644 --- a/dataset.py +++ b/dataset.py @@ -28,7 +28,7 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False, maxMilliSeconds = maxLengthInSeconds * 1000 outDomainLists = [] outDFFrames = [] - if overlapping == False: + if not overlapping: numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize))) userIDs = np.arange(len(dataFrame)) for blockID in np.arange(numBlocks): @@ -70,9 +70,9 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False, def get_domain_features(domain, vocab, max_length=40): encoding = np.zeros((max_length,)) for j in range(np.min([len(domain), max_length])): - curCharacter = domain[-j] - if curCharacter in vocab: - encoding[j] = vocab[curCharacter] + char = domain[-j] + if char in vocab: + encoding[j] = vocab[char] return encoding @@ -90,6 +90,7 @@ def get_flow_features(flow): return features +# NOT USED ATM def get_cisco_features(curDataLine, urlSIPDict): numCiscoFeatures = 30 try: @@ -124,19 +125,19 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10, window_size=window_size) # make client labels discrete with 4 different values - # TODO: use trusted_hits_tr for client classification too - client_labels = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr)) + hits_tr = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr)) # select only 1.0 and 0.0 from training data - pos_idx = np.where(client_labels == 1.0)[0] - neg_idx = np.where(client_labels == 0.0)[0] + pos_idx = np.where(np.logical_or(hits_tr == 1.0, trusted_hits_tr >= 1.0))[0] + neg_idx = np.where(hits_tr == 0.0)[0] idx = np.concatenate((pos_idx, neg_idx)) # choose selected sample to train on domain_tr = domain_tr[idx] flow_tr = flow_tr[idx] - client_labels = client_labels[idx] - server_labels = server_tr[idx] + client_tr = np.zeros_like(idx, float) + client_tr[:pos_idx.shape[-1]] = 1.0 + server_tr = server_tr[idx] - return domain_tr, flow_tr, client_labels, server_labels + return domain_tr, flow_tr, client_tr, server_tr def create_dataset_from_lists(domains, features, vocab, max_len, @@ -202,7 +203,8 @@ def discretize_label(values, threshold): def get_user_flow_data(csv_file): df = pd.read_csv(csv_file) - keys = ["duration", "bytes_down", "bytes_up", "domain", "timeStamp", "server_ip", "user_hash", "virusTotalHits", + keys = ["duration", "bytes_down", "bytes_up", "domain", + "timeStamp", "server_ip", "user_hash", "virusTotalHits", "serverLabel", "trustedHits"] df = df[keys] df.set_index(keys=['user_hash'], drop=False, inplace=True) diff --git a/main.py b/main.py index bda8ff1..6fa8c28 100644 --- a/main.py +++ b/main.py @@ -87,6 +87,7 @@ def main(): kernel_size = 3 drop_out = 0.5 filters = 128 + network = models.pauls_networks char_dict = dataset.get_character_dict() user_flow_df = dataset.get_user_flow_data(args.train_data) @@ -96,13 +97,13 @@ def main(): user_flow_df, char_dict, max_len=args.domain_length, window_size=args.window) - shared_cnn = models.renes_networks.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length, - args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5) + shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length, + args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5) shared_cnn.summary() - model = models.renes_networks.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding, - args.window, args.domain_length, filters, kernel_size, - cnnHiddenDims, shared_cnn) + model = network.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding, + args.window, args.domain_length, filters, kernel_size, + cnnHiddenDims, shared_cnn) model.summary() model.compile(optimizer='adam', @@ -119,7 +120,11 @@ def main(): validation_split=0.2) -def test(): +def main_train(): + pass + + +def main_test(): char_dict = dataset.get_character_dict() user_flow_df = dataset.get_user_flow_data(args.test_data) domain_val, flow_val, client_val, server_val = dataset.create_dataset_from_flows( diff --git a/models/renes_networks.py b/models/renes_networks.py index aaaefa6..5122720 100644 --- a/models/renes_networks.py +++ b/models/renes_networks.py @@ -25,11 +25,12 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows") merged = keras.layers.concatenate([encoded, ipt_flows], -1) # CNN processing a small slides of flow windows - # TODO: add more layers? y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu', input_shape=(window_size, domain_features + flow_features))(merged) y = MaxPool1D(pool_size=3, strides=1)(y) y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y) + y = MaxPool1D(pool_size=3, strides=1)(y) + y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y) # remove temporal dimension by global max pooling y = GlobalMaxPooling1D()(y) y = Dropout(cnnDropout)(y)