refactoring; add standard files .gitignore and data/.keep

This commit is contained in:
René Knaebel 2017-06-30 09:04:24 +02:00
parent 87b927cdc9
commit be273d9247
5 changed files with 382 additions and 325 deletions

.gitignore (vendored, new file, 99 lines)

@@ -0,0 +1,99 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
.cache/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# intelliJ
.idea/

# Apple?
.DS_Store

# data
*.tif

@@ -17,7 +17,7 @@ import random
from keras.models import model_from_json
import time
import re
-import mongoDBConnector as mongoDBConnector
+# import mongoDBConnector as mongoDBConnector
import stackedNeuralModels as stackedNeuralModels
from tqdm import tqdm


@@ -1,20 +1,199 @@
# -*- coding: utf-8 -*-
-import joblib
+import string
import keras
import numpy as np
-import tensorflow as tf
+import pandas as pd
-from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape
-from keras.layers import Input
+from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm
-import stackedNeuralModels as stackedNeuralModels
-config = tf.ConfigProto(log_device_placement=True)
-config.gpu_options.per_process_gpu_memory_fraction = 0.5
-config.gpu_options.allow_growth = True
-session = tf.Session(config=config)
+# config = tf.ConfigProto(log_device_placement=True)
+# config.gpu_options.per_process_gpu_memory_fraction = 0.5
+# config.gpu_options.allow_growth = True
+# session = tf.Session(config=config)
def get_character_dict():
    return dict((char, idx) for (idx, char) in
                enumerate(string.ascii_lowercase + string.punctuation))


def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
            hidden_dims, drop_out):
    x = y = Input(shape=(input_length,))
    y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
    y = Conv1D(filters, kernel_size, activation='relu')(y)
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims)(y)
    y = Dropout(drop_out)(y)
    y = Activation('relu')(y)
    return Model(x, y)
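
# Editor's sketch (not part of the original commit): how the shared domain CNN
# is typically wired up. The hyperparameter values here are assumptions
# borrowed from the old stackedNeuralModels defaults further below
# (maxLen=40, embeddingSize=100, domainFeatures=512, kernel_size=2).
def _example_build_domain_cnn():
    char_dict = get_character_dict()
    cnn = get_cnn(vocabSize=len(char_dict) + 1, embeddingSize=100,
                  input_length=40, filters=512, kernel_size=2,
                  hidden_dims=512, drop_out=0.5)
    # maps a (batch, 40) matrix of character IDs to a (batch, 512) embedding
    return cnn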
def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
                    maxLengthInSeconds=300):
    # print('maxLength: ' + str(maxLengthInSeconds))
    maxMilliSeconds = maxLengthInSeconds * 1000
    outDomainLists = []
    outDFFrames = []
    if overlapping == False:
        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
        userIDs = np.arange(len(dataFrame))
        for blockID in np.arange(numBlocks):
            curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
            # print(curIDs)
            useData = dataFrame.iloc[curIDs]
            curDomains = useData['domain']
            if maxLengthInSeconds != -1:
                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
                # np.where returns a tuple; take the index array
                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
                if len(underTimeOutIDs) != len(curIDs):
                    curIDs = curIDs[underTimeOutIDs]
                    useData = dataFrame.iloc[curIDs]
                    curDomains = useData['domain']
            outDomainLists.append(list(curDomains))
            outDFFrames.append(useData)
    else:
        numBlocks = len(dataFrame) + 1 - windowSize
        userIDs = np.arange(len(dataFrame))
        for blockID in np.arange(numBlocks):
            curIDs = userIDs[blockID:blockID + windowSize]
            # print(curIDs)
            useData = dataFrame.iloc[curIDs]
            curDomains = useData['domain']
            if maxLengthInSeconds != -1:
                curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
                # np.where returns a tuple; take the index array
                underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)[0]
                if len(underTimeOutIDs) != len(curIDs):
                    curIDs = curIDs[underTimeOutIDs]
                    useData = dataFrame.iloc[curIDs]
                    curDomains = useData['domain']
            outDomainLists.append(list(curDomains))
            outDFFrames.append(useData)
    return (outDomainLists, outDFFrames)
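
# Editor's sketch (not part of the original commit): chunking a toy single-user
# frame into non-overlapping windows of three flows. Column names follow the
# code above; the values are made up.
def _example_user_chunks():
    df = pd.DataFrame({
        'domain': ['a.com', 'b.com', 'c.com', 'd.com', 'e.com'],
        'timeStamp': [0, 1000, 2000, 3000, 4000],  # milliseconds
    })
    domain_lists, df_frames = get_user_chunks(df, windowSize=3, overlapping=False,
                                              maxLengthInSeconds=-1)
    # domain_lists == [['a.com', 'b.com', 'c.com'], ['d.com', 'e.com']]
    return domain_lists, df_frames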
def getFeatureVecForDomain(domain, characterDict, maxLen=40):
    curFeature = np.zeros([maxLen, ])
    for j in range(np.min([len(domain), maxLen])):
        # print(j)
        curCharacter = domain[-j]
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
    return curFeature
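
# Editor's note (not part of the original commit): domains are read with
# negative indices, but j starts at 0 and domain[-0] == domain[0], so slot 0
# holds the first character while slots 1..len-1 hold the domain reversed
# from the end. This sketch just exercises that behaviour.
def _example_encode_domain():
    char_dict = get_character_dict()
    vec = getFeatureVecForDomain('example.com', char_dict, maxLen=40)
    # vec[0] encodes 'e' (domain[-0]), vec[1] encodes 'm' (domain[-1]),
    # vec[2] encodes 'o' (domain[-2]); positions 11..39 remain 0.
    return vec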
def getFlowFeatures(curDataLine):
    useKeys = ['duration', 'bytes_down', 'bytes_up']
    curFeature = np.zeros([len(useKeys), ])
    for i in range(len(useKeys)):
        curKey = useKeys[i]
        try:
            curFeature[i] = np.log1p(curDataLine[curKey]).astype(float)
        except:
            pass
    return curFeature


def getCiscoFeatures(curDataLine, urlSIPDict):
    numCiscoFeatures = 30
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # print('cisco features: ' + str(ciscoFeatures))
        # log transform
        ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
        # print('log transformed: ' + str(ciscoFeatures))
        return ciscoFeatures.ravel()
    except:
        return np.zeros([numCiscoFeatures, ]).ravel()
def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3,
                              windowSize=10, maxLengthInSeconds=-1):  # -1 disables the time-based cut-off
    domainLists = []
    dfLists = []
    print("get chunks from user data frames")
    for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
        (domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
                                                       overlapping=False,
                                                       maxLengthInSeconds=maxLengthInSeconds)
        domainLists += domainListsTmp
        dfLists += dfListsTmp
        if i >= 10:
            break
    print("create training dataset")
    return create_dataset_from_lists(
        domainLists=domainLists, dfLists=dfLists, charachterDict=char_dict,
        maxLen=maxLen, threshold=threshold,
        flagUseCiscoFeatures=False, urlSIPDIct=dict(),
        windowSize=windowSize)
def create_dataset_from_lists(domainLists, dfLists, charachterDict, maxLen, threshold=3,
                              flagUseCiscoFeatures=False, urlSIPDIct=dict(),
                              windowSize=10):
    if 'hits' in dfLists[0].keys():
        hitName = 'hits'
    elif 'virusTotalHits' in dfLists[0].keys():
        hitName = 'virusTotalHits'
    numFlowFeatures = 3
    numCiscoFeatures = 30
    numFeatures = numFlowFeatures
    if flagUseCiscoFeatures:
        numFeatures += numCiscoFeatures
    outputFeatures = []
    label = []
    hits = []
    trainNames = []
    for i in range(windowSize):
        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
    for i in tqdm(np.arange(len(domainLists)), miniters=10):
        curCounter = 0
        # print('len domainList: ' + str(len(domainLists[i])))
        # print('len df: ' + str(len(dfLists[i])))
        for j in range(np.min([windowSize, len(domainLists[i])])):
            outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen)
            curCounter += 1
            if flagUseCiscoFeatures:
                outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
                outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct)
            else:
                outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
            curCounter += 1
        curLabel = 0.0
        if np.max(dfLists[i][hitName]) >= threshold:
            curLabel = 1.0
        elif np.max(dfLists[i][hitName]) == -1:
            curLabel = -1.0
        elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
            curLabel = -2.0
        label.append(curLabel)
        hits.append(np.max(dfLists[i][hitName]))
        trainNames.append(np.unique(dfLists[i]['user_hash']))
    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
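
# Editor's note (not part of the original commit): the features returned above
# are a list of 2 * windowSize arrays, alternating per window position between
# a (numChunks, maxLen) domain matrix and a (numChunks, numFeatures)
# flow-feature matrix -- one pair per flow in the window, matching the
# windowSize * 2 input layers assembled in __main__ below.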
def get_user_flow_data():
    # load train and test data from joblib
    # created with createTrainDataMultipleTaskLearning.py
    # rk: changed to csv file
    trainDFs = pd.read_csv("data/rk_data.csv.gz")
    trainDFs.drop("Unnamed: 0", 1, inplace=True)
    trainDFs.set_index(keys=['user_hash'], drop=False, inplace=True)
    users = trainDFs['user_hash'].unique().tolist()
    u0 = trainDFs.loc[trainDFs.user_hash == users[0]]
    return trainDFs


def get_flow_per_user(df):
    users = df['user_hash'].unique().tolist()
    for user in users:
        yield df.loc[df.user_hash == user]
if __name__ == "__main__":
    # parameter
@@ -39,51 +218,28 @@ if __name__ == "__main__":
    maxLengthInSeconds = -1
    timesNeg = -1
-    trainDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/currentData.joblib'
-    testDataPath = '/mnt/projekte/pmlcluster/cisco/trainData/equalClass/futureData.joblib'
-    if 'characterDict' not in locals():
-        characterDictPath = 'trainData/characterIDDict.joblib'
-        characterDict = joblib.load(characterDictPath)['characterIDDict']
-    # load train and test data from joblib
-    # created with createTrainDataMultipleTaskLearning.py
-    if 'trainDFs' not in locals():
-        tmpLoad = joblib.load(trainDataPath)
-        trainDFs = tmpLoad['data']
-    if 'testDFs' not in locals():
-        tmpLoad = joblib.load(testDataPath)
-    sharedCNNFun = stackedNeuralModels.getCNNWitoutLastLayerFunctional(len(characterDict) + 1, embeddingSize, maxLen,
-                                                                       domainFeatures, kernel_size, domainFeatures, 0.5)
-    domainLists = []
-    dfLists = []
-    for i in tqdm(np.arange(len(trainDFs)), miniters=10):
-        (domainListsTmp, dfListsTmp) = stackedNeuralModels.getChunksFromUserDataFrame(trainDFs[i],
-                                                                                      windowSize=windowSize,
-                                                                                      overlapping=False,
-                                                                                      maxLengthInSeconds=maxLengthInSeconds)
-        domainLists += domainListsTmp
-        dfLists += dfListsTmp
-        if i == 100:
-            break
-    (testData, testLabel, testHits, testNames) = stackedNeuralModels.createTrainData(
-        domainLists=domainLists, dfLists=dfLists, charachterDict=characterDict,
-        maxLen=maxLen, threshold=threshold,
-        flagUseCiscoFeatures=False, urlSIPDIct=dict(),
-        windowSize=windowSize)
-    useIDs = np.where(testLabel == 1.0)[0]
-    useIDs = np.concatenate([useIDs, np.where(testLabel == 0.0)[0]])
-    testLabel = testLabel[useIDs]
-    testHits = testHits[useIDs]
-    testNames = testNames[useIDs]
-    for i in range(len(testData)):
-        testData[i] = testData[i][useIDs]
+    char_dict = get_character_dict()
+    user_flow_df = get_user_flow_data()
+    print("create training dataset")
+    (X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
+        user_flow_df, char_dict,
+        maxLen=maxLen, threshold=threshold, windowSize=windowSize)
+    pos_idx = np.where(y_tr == 1.0)[0]
+    neg_idx = np.where(y_tr == 0.0)[0]
+    use_idx = np.concatenate((pos_idx, neg_idx))
+    y_tr = y_tr[use_idx]
+    # hits_tr = hits_tr[use_idx]
+    # names_tr = names_tr[use_idx]
+    for i in range(len(X_tr)):
+        X_tr[i] = X_tr[i][use_idx]
+    # TODO: WTF? I don't get it...
+    sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
+                           domainFeatures, kernel_size, domainFeatures, 0.5)
    inputList = []
    encodedList = []
@@ -102,7 +258,6 @@ if __name__ == "__main__":
    merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
    reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
    # add second cnn
    cnn = Conv1D(filters,
                 kernel_size,
                 activation='relu',
@@ -121,7 +276,7 @@ if __name__ == "__main__":
                  metrics=['accuracy'])
    epochNumber = 0
-    trainLabel = np_utils.to_categorical(testLabel, 2)
+    trainLabel = np_utils.to_categorical(y_tr, 2)
-    model.fit(x=testData, y=trainLabel,
+    model.fit(x=X_tr, y=trainLabel, batch_size=128,
              epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber)  # ,
    # validation_data=(testData,testLabel))

data/.keep (new file, 0 lines)

@@ -1,63 +1,52 @@
# -*- coding: utf-8 -*-
-from keras.models import Sequential
-from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
-from keras.layers import Convolution1D
-import ciscoProcessing as ciscoProcessing
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd
-import joblib
import csv
-import keras
+import numpy as np
+from keras.layers import Dense, Activation, Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Lambda
from keras.layers import Input
from keras.models import Model
-from keras.utils import np_utils
+from keras.models import Sequential
-from sklearn.metrics import precision_recall_curve
-from sklearn.metrics import auc, roc_curve
from tqdm import tqdm
-import os
def getCiscoFeatures(curDataLine, urlSIPDict):
    numCiscoFeatures = 30
    try:
        ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
        # print('cisco features: ' + str(ciscoFeatures))
        # log transform
        ciscoFeatures = np.log1p(ciscoFeatures, dtype='float32')
        # print('log transformed: ' + str(ciscoFeatures))
        return ciscoFeatures.ravel()
    except:
        return np.zeros([numCiscoFeatures, ]).ravel()


def getCNNWithoutLastLayer(vocabSize, embeddingSize, input_length, filters, kernel_size,
                           hidden_dims, drop_out):
    model = Sequential()
    model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
                        input_length=input_length))
    model.add(Conv1D(filters,
                     kernel_size,
                     activation='relu'))
    # we use max pooling:
    model.add(GlobalMaxPooling1D())
    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(drop_out))
    model.add(Activation('relu'))
    return model


def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size,
                                    hidden_dims, drop_out):
    a = Input(shape=(input_length,))
    embedding = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(a)
    conv1 = Conv1D(filters, kernel_size, activation='relu')(embedding)
    glob = GlobalMaxPooling1D()(conv1)
    dense = Dense(hidden_dims)(glob)
    drop = Dropout(drop_out)(dense)
@@ -65,55 +54,58 @@ def getCNNWitoutLastLayerFunctional(vocabSize, embeddingSize, input_length, filters, kernel_size,
    model = Model(a, model)
    return model
def getFlowFeatureLayer(numFeatures):
    model = Sequential()
    # slpModel.add(Dense(1, input_shape=(1,)))
    model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
    return model


def createCNNDataSet(domains, label, characterDict, maxLen=40):
    # process domains in reverse order
    outFeature = np.zeros([len(domains), maxLen])
    outLabel = np.zeros([len(domains), ])
    for i in range(len(domains)):
        domain = domains[i]
        curLabel = label[i]
        curFeature = np.zeros([maxLen, ])
        # print(domain + ' ' + str(len(domain)))
        for j in range(np.min([len(domain), maxLen])):
            # print(j)
            curCharacter = domain[-j]
            if curCharacter in characterDict:
                curFeature[j] = characterDict[curCharacter]
        outFeature[i] = curFeature
        outLabel[i] = curLabel
    return (outFeature, outLabel)


def getFeatureVecForDomain(domain, characterDict, maxLen=40):
    curFeature = np.zeros([maxLen, ])
    for j in range(np.min([len(domain), maxLen])):
        # print(j)
        curCharacter = domain[-j]
        if curCharacter in characterDict:
            curFeature[j] = characterDict[curCharacter]
    return curFeature


def getFlowFeatures(curDataLine):
    useKeys = ['duration', 'bytes_down', 'bytes_up']
    curFeature = np.zeros([len(useKeys), ])
    for i in range(len(useKeys)):
        curKey = useKeys[i]
        try:
            curFeature[i] = np.log1p(curDataLine[curKey], dtype='float32')
        except:
            pass
    return curFeature


def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
                               maxLengthInSeconds=300):
    # print('maxLength: ' + str(maxLengthInSeconds))
    maxMilliSeconds = maxLengthInSeconds * 1000
    outDomainLists = []
    outDFFrames = []
@@ -121,8 +113,8 @@ def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
        numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
        userIDs = np.arange(len(dataFrame))
        for blockID in np.arange(numBlocks):
            curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
            # print(curIDs)
            useData = dataFrame.iloc[curIDs]
            curDomains = useData['domain']
            if maxLengthInSeconds != -1:
@@ -138,8 +130,8 @@ def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
        numBlocks = len(dataFrame) + 1 - windowSize
        userIDs = np.arange(len(dataFrame))
        for blockID in np.arange(numBlocks):
            curIDs = userIDs[blockID:blockID + windowSize]
            # print(curIDs)
            useData = dataFrame.iloc[curIDs]
            curDomains = useData['domain']
            if maxLengthInSeconds != -1:
@@ -151,11 +143,11 @@ def getChunksFromUserDataFrame(dataFrame, windowSize=10, overlapping=False,
                    curDomains = useData['domain']
            outDomainLists.append(list(curDomains))
            outDFFrames.append(useData)
    return (outDomainLists, outDFFrames)


def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
                    flagUseCiscoFeatures=False, urlSIPDIct=dict(),
                    windowSize=10):
    if 'hits' in dfLists[0].keys():
        hitName = 'hits'
@@ -171,21 +163,21 @@ def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
    hits = []
    trainNames = []
    for i in range(windowSize):
        outputFeatures.append(np.zeros([len(domainLists), maxLen]))
        outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
    for i in tqdm(np.arange(len(domainLists)), miniters=10):
        curCounter = 0
        # print('len domainList: ' + str(len(domainLists[i])))
        # print('len df: ' + str(len(dfLists[i])))
        for j in range(np.min([windowSize, len(domainLists[i])])):
            outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen)
            curCounter += 1
            if flagUseCiscoFeatures:
                outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
                outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct)
            else:
                outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
            curCounter += 1
        curLabel = 0.0
        if np.max(dfLists[i][hitName]) >= threshold:
@@ -198,215 +190,26 @@ def createTrainData(domainLists, dfLists, charachterDict, maxLen, threshold=3,
        hits.append(np.max(dfLists[i][hitName]))
        trainNames.append(np.unique(dfLists[i]['user_hash']))
    return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
def transformStringListToNumpyArray(listString):
    listString = listString.replace('[', '').replace(']', '')
    return np.array(listString.split(','), dtype='float32')
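
# Editor's note (not part of the original commit): this parses the bracketed
# vector strings stored in the Cisco feature CSV exports, e.g.
#     transformStringListToNumpyArray('[1.0,2.5,3]')
#     -> np.array([1.0, 2.5, 3.0], dtype='float32')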
def getCiscoFeatureDict(csvPathList):
    outDict = dict()
    for path in tqdm(csvPathList, miniters=1):
        fobj = open(path, 'r')
        csvReader = csv.DictReader(fobj, delimiter=',')
        for row in csvReader:
            urlSIPString = row['Domain'] + row['ServerIP']
            ciscoFeatures = row['CiscoFeature']
            outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures)
            # if len(outDict) % 10000 == 0:
            #     print('numbers in dict: ' + str(len(outDict)))
    return outDict
if __name__ == "__main__":
+    pass
-    # get data
-    trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/',
-                          'trainData/joblib2016-07-annomalous-stg-new/09/',
-                          'trainData/joblib2016-07-annomalous-stg-new/08/',
-                          'trainData/joblib2016-07-annomalous-stg-new/07/',
-                          'trainData/joblib2016-07-annomalous-stg-new/06/']
-    testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',
-                         'trainData/joblib2016-09-annomalous-stg-new/08/',
-                         'trainData/joblib2016-09-annomalous-stg-new/09/',
-                         'trainData/joblib2016-09-annomalous-stg-new/10/',
-                         'trainData/joblib2016-09-annomalous-stg-new/11/',
-                         'trainData/joblib2016-09-annomalous-stg-new/12/',
-                         'trainData/joblib2016-09-annomalous-stg-new/13/',
-                         'trainData/joblib2016-09-annomalous-stg-new/14/']
-    trainCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv',
-                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv',
-                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv',
-                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv',
-                                 'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv']
-    testCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv',
-                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv',
-                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv',
-                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv',
-                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv',
-                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv',
-                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv',
-                                'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv']
-    # parameter
-    numNegPerDay = 5000
-    numEpochs = 10
-    domainFeatures = 512
-    flowFeatures = 3
-    numCiscoFeatures = 30
-    windowSize = 10
-    maxLen = 40
-    lstmUnits = 32
-    lstmDenseSize = 128
-    embeddingSize = 100
-    kernel_size = 2
-    drop_out = 0.5
-    filters = 2
-    hidden_dims = 100
-    vocabSize = 40
-    flagUseCiscoFeatures = True
-    threshold = 3
-    resultStoreDir = 'results/201705/'
-    if flagUseCiscoFeatures:
-        resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
-        resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay)
-    else:
-        resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
-        resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay)
-    flagRedo = True
-    if flagUseCiscoFeatures:
-        if 'trainCiscoFeatureDict' not in locals():
-            trainCiscoFeatureDict = getCiscoFeatureDict(trainCiscoFeatureCSVPaths)
-        if 'testCiscoFeatureDict' not in locals():
-            testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths)
-    else:
-        trainCiscoFeatureDict = dict()
-        testCiscoFeatureDict = dict()
-    if flagRedo or not os.path.exists(resultStorePath):
-        if 'characterDict' not in locals():
-            characterDictPath = 'trainData/characterIDDict.joblib'
-            characterDict = joblib.load(characterDictPath)['characterIDDict']
-        print('create train data')
-        if 'dataFrameList' not in locals():
-            (dataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(
-                trainDirsUserLevel, numNegPerDay=numNegPerDay)
-            maxHits = []
-            for i in range(len(dataFrameList)):
-                maxHits.append(np.max(dataFrameList[i]['hits']))
-        print('create test data')
-        # validation error
-        if 'testDataFrameList' not in locals():
-            (testDataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(
-                [testDirsUserLevel[0]], numNegPerDay=numNegPerDay)
-            maxHits = []
-            for i in range(len(testDataFrameList)):
-                maxHits.append(np.max(testDataFrameList[i]['hits']))
-        sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict) + 1, embeddingSize, maxLen, domainFeatures, kernel_size, domainFeatures, 0.5)
-        inputList = []
-        encodedList = []
-        numFeatures = flowFeatures
-        if flagUseCiscoFeatures:
-            numFeatures += numCiscoFeatures
-        for i in range(windowSize):
-            inputList.append(Input(shape=(maxLen,)))
-            encodedList.append(sharedCNNFun(inputList[-1]))  # add shared domain model
-            inputList.append(Input(shape=(numFeatures,)))
-        merge_layer_input = []
-        for i in range(windowSize):
-            merge_layer_input.append(encodedList[i])
-            merge_layer_input.append(inputList[(2 * i) + 1])
-        # We can then concatenate the two vectors:
-        merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
-        reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
-        lstm = LSTM(lstmUnits, input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
-        dense = Dense(lstmDenseSize, activation='relu')(lstm)
-        dropout = Dropout(0.5)(dense)
-        # And add a logistic regression on top
-        predictions = Dense(2, activation='softmax')(dropout)
-        # We define a trainable model linking the
-        # tweet inputs to the predictions
-        model = Model(inputs=inputList, outputs=predictions)
-        model.compile(optimizer='adam',
-                      loss='binary_crossentropy',
-                      metrics=['accuracy'])
-        # get train data
-        domainLists = []
-        dfLists = []
-        for i in tqdm(np.arange(len(dataFrameList)), miniters=10):
-            (domainListsTmp, dfListsTmp) = getChunksFromUserDataFrame(dataFrameList[i], windowSize=windowSize, overlapping=False)
-            domainLists += domainListsTmp
-            dfLists += dfListsTmp
-        (trainData, trainLabel, trainHits, trainNames) = createTrainData(domainLists, dfLists, characterDict,
-                                                                         maxLen, threshold=threshold,
-                                                                         flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDIct=trainCiscoFeatureDict)
-        useIDs = np.where(trainHits == 0)[0]
-        useIDs = np.concatenate([useIDs, np.where(trainHits >= threshold)[0]])
-        for i in range(len(trainData)):
-            trainData[i] = np.array(trainData[i])[useIDs]
-        trainLabel = trainLabel[useIDs]
-        trainHits = trainHits[useIDs]
-        trainNames = trainNames[useIDs]
-        # get test data
-        domainLists = []
-        dfLists = []
-        for i in tqdm(np.arange(len(testDataFrameList)), miniters=10):
-            (domainListsTmp, dfListsTmp) = getChunksFromUserDataFrame(testDataFrameList[i], windowSize=windowSize, overlapping=False)
-            domainLists += domainListsTmp
-            dfLists += dfListsTmp
-        (testData, testLabel, testHits, testNames) = createTrainData(domainLists, dfLists, characterDict,
-                                                                     maxLen, threshold=threshold,
-                                                                     flagUseCiscoFeatures=flagUseCiscoFeatures, urlSIPDIct=testCiscoFeatureDict)
-        useIDs = np.where(testHits == 0)[0]
-        useIDs = np.concatenate([useIDs, np.where(testHits >= threshold)[0]])
-        for i in range(len(testData)):
-            testData[i] = np.array(testData[i])[useIDs]
-        testLabel = testLabel[useIDs]
-        testHits = testHits[useIDs]
-        testNames = testNames[useIDs]
-        numPos = len(np.where(trainLabel == 1.0)[0])
-        numNeg = len(np.where(trainLabel == 0.0)[0])
-        print('major class: ' + str(float(numNeg) / float(numNeg + numPos)))
-        lstmLabel = np_utils.to_categorical(trainLabel, 2)
-        lstmTestLabel = np_utils.to_categorical(testLabel, 2)
-        trainHist = model.fit(trainData, lstmLabel, epochs=numEpochs, batch_size=128, validation_data=(testData, lstmTestLabel))
-        # save lstm model
-        ciscoProcessing.save_model(model, resultModelPath + '.json',
-                                   resultModelPath + '.h5')
-        # classify train and test
-        trainScores = model.predict(trainData)[:, 1]
-        testScores = model.predict(testData)[:, 1]
-        joblib.dump({'testLabel': testLabel,
-                     'testHits': testHits,
-                     'testNames': testNames,
-                     'testScores': testScores,
-                     'trainLabel': trainLabel,
-                     'trainScores': trainScores}, resultStorePath, compress=3)