Add feature to use both hit signals (virusTotalHits and trustedHits) from the dataset
This commit is contained in:
parent
b2f5c56019
commit
933f6bf1d7
26
dataset.py
26
dataset.py
@ -28,7 +28,7 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
|
||||
maxMilliSeconds = maxLengthInSeconds * 1000
|
||||
outDomainLists = []
|
||||
outDFFrames = []
|
||||
if overlapping == False:
|
||||
if not overlapping:
|
||||
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
|
||||
userIDs = np.arange(len(dataFrame))
|
||||
for blockID in np.arange(numBlocks):
|
||||
@ -70,9 +70,9 @@ def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
|
||||
def get_domain_features(domain, vocab, max_length=40):
|
||||
encoding = np.zeros((max_length,))
|
||||
for j in range(np.min([len(domain), max_length])):
|
||||
curCharacter = domain[-j]
|
||||
if curCharacter in vocab:
|
||||
encoding[j] = vocab[curCharacter]
|
||||
char = domain[-j]
|
||||
if char in vocab:
|
||||
encoding[j] = vocab[char]
|
||||
return encoding
|
||||
|
||||
|
||||
@ -90,6 +90,7 @@ def get_flow_features(flow):
|
||||
return features
|
||||
|
||||
|
||||
# NOT USED ATM
|
||||
def get_cisco_features(curDataLine, urlSIPDict):
|
||||
numCiscoFeatures = 30
|
||||
try:
|
||||
@ -124,19 +125,19 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10,
|
||||
window_size=window_size)
|
||||
|
||||
# make client labels discrete with 4 different values
|
||||
# TODO: use trusted_hits_tr for client classification too
|
||||
client_labels = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
|
||||
hits_tr = np.apply_along_axis(lambda x: discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
|
||||
# select only 1.0 and 0.0 from training data
|
||||
pos_idx = np.where(client_labels == 1.0)[0]
|
||||
neg_idx = np.where(client_labels == 0.0)[0]
|
||||
pos_idx = np.where(np.logical_or(hits_tr == 1.0, trusted_hits_tr >= 1.0))[0]
|
||||
neg_idx = np.where(hits_tr == 0.0)[0]
|
||||
idx = np.concatenate((pos_idx, neg_idx))
|
||||
# choose selected sample to train on
|
||||
domain_tr = domain_tr[idx]
|
||||
flow_tr = flow_tr[idx]
|
||||
client_labels = client_labels[idx]
|
||||
server_labels = server_tr[idx]
|
||||
client_tr = np.zeros_like(idx, float)
|
||||
client_tr[:pos_idx.shape[-1]] = 1.0
|
||||
server_tr = server_tr[idx]
|
||||
|
||||
return domain_tr, flow_tr, client_labels, server_labels
|
||||
return domain_tr, flow_tr, client_tr, server_tr
|
||||
|
||||
|
||||
def create_dataset_from_lists(domains, features, vocab, max_len,
|
||||
@ -202,7 +203,8 @@ def discretize_label(values, threshold):
|
||||
|
||||
def get_user_flow_data(csv_file):
|
||||
df = pd.read_csv(csv_file)
|
||||
keys = ["duration", "bytes_down", "bytes_up", "domain", "timeStamp", "server_ip", "user_hash", "virusTotalHits",
|
||||
keys = ["duration", "bytes_down", "bytes_up", "domain",
|
||||
"timeStamp", "server_ip", "user_hash", "virusTotalHits",
|
||||
"serverLabel", "trustedHits"]
|
||||
df = df[keys]
|
||||
df.set_index(keys=['user_hash'], drop=False, inplace=True)
|
||||
|
11
main.py
11
main.py
@ -87,6 +87,7 @@ def main():
|
||||
kernel_size = 3
|
||||
drop_out = 0.5
|
||||
filters = 128
|
||||
network = models.pauls_networks
|
||||
|
||||
char_dict = dataset.get_character_dict()
|
||||
user_flow_df = dataset.get_user_flow_data(args.train_data)
|
||||
@ -96,11 +97,11 @@ def main():
|
||||
user_flow_df, char_dict,
|
||||
max_len=args.domain_length, window_size=args.window)
|
||||
|
||||
shared_cnn = models.renes_networks.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
|
||||
shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
|
||||
args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5)
|
||||
shared_cnn.summary()
|
||||
|
||||
model = models.renes_networks.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding,
|
||||
model = network.get_model(cnnDropout, flow_tr.shape[-1], args.domain_embedding,
|
||||
args.window, args.domain_length, filters, kernel_size,
|
||||
cnnHiddenDims, shared_cnn)
|
||||
model.summary()
|
||||
@ -119,7 +120,11 @@ def main():
|
||||
validation_split=0.2)
|
||||
|
||||
|
||||
def test():
|
||||
def main_train():
|
||||
pass
|
||||
|
||||
|
||||
def main_test():
|
||||
char_dict = dataset.get_character_dict()
|
||||
user_flow_df = dataset.get_user_flow_data(args.test_data)
|
||||
domain_val, flow_val, client_val, server_val = dataset.create_dataset_from_flows(
|
||||
|
@ -25,11 +25,12 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le
|
||||
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
|
||||
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
|
||||
# CNN processing small slices of the flow windows
|
||||
# TODO: add more layers?
|
||||
y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
|
||||
input_shape=(window_size, domain_features + flow_features))(merged)
|
||||
y = MaxPool1D(pool_size=3, strides=1)(y)
|
||||
y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
|
||||
y = MaxPool1D(pool_size=3, strides=1)(y)
|
||||
y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu')(y)
|
||||
# remove temporal dimension by global max pooling
|
||||
y = GlobalMaxPooling1D()(y)
|
||||
y = Dropout(cnnDropout)(y)
|
||||
|
Loading…
Reference in New Issue
Block a user