ma_cisco_malware/stackedNeuralModels.py
2017-06-27 20:29:19 +02:00

412 lines
17 KiB
Python

# -*- coding: utf-8 -*-
from keras.models import Sequential
from keras.layers import Dense, Activation,LSTM,Embedding,Dropout,Conv1D, GlobalMaxPooling1D, Merge, Reshape, Lambda
from keras.layers import Convolution1D
import ciscoProcessing as ciscoProcessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib
import csv
import keras
from keras.layers import Input
from keras.models import Model
from keras.utils import np_utils
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc, roc_curve
from tqdm import tqdm
import os
def getCiscoFeatures(curDataLine,urlSIPDict):
numCiscoFeatures = 30
try:
ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
#print('cisco features: ' + str(ciscoFeatures))
# log transform
ciscoFeatures = np.log1p(ciscoFeatures,dtype='float32')
#print('log transformed: ' + str(ciscoFeatures))
return ciscoFeatures.ravel()
except:
return np.zeros([numCiscoFeatures,]).ravel()
def getCNNWithoutLastLayer(vocabSize,embeddingSize,input_length,filters,kernel_size,
hidden_dims,drop_out):
model = Sequential()
model.add(Embedding(input_dim=vocabSize, output_dim=embeddingSize,
input_length=input_length))
model.add(Conv1D(filters,
kernel_size,
activation='relu'))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(drop_out))
model.add(Activation('relu'))
return model
def getCNNWitoutLastLayerFunctional(vocabSize,embeddingSize,input_length,filters,kernel_size,
hidden_dims,drop_out):
a = Input(shape=(input_length,))
embedding = Embedding(input_dim=vocabSize,output_dim=embeddingSize)(a)
conv1 = Conv1D(filters,kernel_size,activation='relu')(embedding)
glob = GlobalMaxPooling1D()(conv1)
dense = Dense(hidden_dims)(glob)
drop = Dropout(drop_out)(dense)
model = Activation('relu')(drop)
model = Model(a, model)
return model
def getFlowFeatureLayer(numFeatures):
model = Sequential()
#slpModel.add(Dense(1, input_shape=(1,)))
model.add(Lambda(lambda x: x + 0.0, input_shape=(numFeatures,)))
return model
def createCNNDataSet(domains,label,characterDict,maxLen=40):
# process domains in reverse order
outFeature = np.zeros([len(domains),maxLen])
outLabel = np.zeros([len(domains),])
for i in range(len(domains)):
domain = domains[i]
curLabel = label[i]
curFeature = np.zeros([maxLen,])
# print(domain + ' ' + str(len(domain)))
for j in range(np.min([len(domain),maxLen])):
#print(j)
curCharacter = domain[-j]
if curCharacter in characterDict:
curFeature[j] = characterDict[curCharacter]
outFeature[i] = curFeature
outLabel[i] = curLabel
return (outFeature,outLabel)
def getFeatureVecForDomain(domain,characterDict,maxLen=40):
curFeature = np.zeros([maxLen,])
for j in range(np.min([len(domain),maxLen])):
#print(j)
curCharacter = domain[-j]
if curCharacter in characterDict:
curFeature[j] = characterDict[curCharacter]
return curFeature
def getFlowFeatures(curDataLine):
useKeys = ['duration','bytes_down','bytes_up']
curFeature = np.zeros([len(useKeys),])
for i in range(len(useKeys)):
curKey = useKeys[i]
try:
curFeature[i] = np.log1p(curDataLine[curKey],dtype='float32')
except:
pass
return curFeature
def getChunksFromUserDataFrame(dataFrame,windowSize=10,overlapping=False,
maxLengthInSeconds=300):
#print('maxLength: ' + str(maxLengthInSeconds))
maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = []
outDFFrames = []
if overlapping == False:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[(blockID * windowSize):((blockID+1)*windowSize)]
#print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
else:
numBlocks = len(dataFrame) + 1 - windowSize
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[blockID:blockID+windowSize]
#print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
return (outDomainLists,outDFFrames)
def createTrainData(domainLists,dfLists,charachterDict,maxLen,threshold = 3,
flagUseCiscoFeatures=False,urlSIPDIct=dict,
windowSize=10):
if 'hits' in dfLists[0].keys():
hitName = 'hits'
elif 'virusTotalHits' in dfLists[0].keys():
hitName = 'virusTotalHits'
numFlowFeatures = 3
numCiscoFeatures = 30
numFeatures = numFlowFeatures
if flagUseCiscoFeatures:
numFeatures += numCiscoFeatures
outputFeatures = []
label = []
hits = []
trainNames = []
for i in range(windowSize):
outputFeatures.append(np.zeros([len(domainLists),maxLen]))
outputFeatures.append(np.zeros([len(domainLists),numFeatures]))
for i in tqdm(np.arange(len(domainLists)), miniters=10):
curCounter = 0
#print('len domainList: ' + str(len(domainLists[i])))
#print('len df: ' + str(len(dfLists[i])))
for j in range(np.min([windowSize,len(domainLists[i])])):
outputFeatures[curCounter][i,:] = getFeatureVecForDomain(domainLists[i][j],charachterDict,maxLen)
curCounter += 1
if flagUseCiscoFeatures:
outputFeatures[curCounter][i,0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
outputFeatures[curCounter][i,numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j],urlSIPDIct)
else:
outputFeatures[curCounter][i,:] = getFlowFeatures(dfLists[i].iloc[j])
curCounter += 1
curLabel = 0.0
if np.max(dfLists[i][hitName]) >= threshold:
curLabel = 1.0
elif np.max(dfLists[i][hitName]) == -1:
curLabel = -1.0
elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
curLabel = -2.0
label.append(curLabel)
hits.append(np.max(dfLists[i][hitName]))
trainNames.append(np.unique(dfLists[i]['user_hash']))
return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
def transformStringListToNumpyArray(listString):
listString = listString.replace('[','').replace(']','')
return np.array(listString.split(','),dtype='float32')
def getCiscoFeatureDict(csvPathList):
outDict = dict()
for path in tqdm(csvPathList, miniters=1):
fobj = open(path,'r')
csvReader = csv.DictReader(fobj,delimiter=',')
for row in csvReader:
urlSIPString = row['Domain'] + row['ServerIP']
ciscoFeatures = row['CiscoFeature']
outDict[urlSIPString] = transformStringListToNumpyArray(ciscoFeatures)
#if len(outDict) % 10000 == 0:
# print('numbers in dict: ' + str(len(outDict)))
return outDict
if __name__ == "__main__":
# get data
trainDirsUserLevel = ['trainData/joblib2016-07-annomalous-stg-new/10/',
'trainData/joblib2016-07-annomalous-stg-new/09/',
'trainData/joblib2016-07-annomalous-stg-new/08/',
'trainData/joblib2016-07-annomalous-stg-new/07/',
'trainData/joblib2016-07-annomalous-stg-new/06/']
testDirsUserLevel = ['trainData/joblib2016-09-annomalous-stg-new/07/',\
'trainData/joblib2016-09-annomalous-stg-new/08/',\
'trainData/joblib2016-09-annomalous-stg-new/09/',\
'trainData/joblib2016-09-annomalous-stg-new/10/',\
'trainData/joblib2016-09-annomalous-stg-new/11/',\
'trainData/joblib2016-09-annomalous-stg-new/12/',\
'trainData/joblib2016-09-annomalous-stg-new/13/',\
'trainData/joblib2016-09-annomalous-stg-new/14/']
trainCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_07.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_06.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_08.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_10.csv',
'trainData/ciscoDomainFeatueres_joblib2016-07-annomalous-stg-new_09.csv']
testCiscoFeatureCSVPaths = ['trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_12.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_08.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_07.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_09.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_13.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_14.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_10.csv',
'trainData/ciscoDomainFeatueres_joblib2016-09-annomalous-stg-new_11.csv']
# parameter
numNegPerDay = 5000
numEpochs = 10
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures= 30
windowSize = 10
maxLen = 40
lstmUnits = 32
lstmDenseSize = 128
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
flagUseCiscoFeatures = True
threshold = 3
resultStoreDir = 'results/201705/'
if flagUseCiscoFeatures:
resultStorePath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
resultModelPath = resultStoreDir + 'cnn_plus_cisco_plus_lstm_numNegPerDay' + str(numNegPerDay)
else:
resultStorePath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay) + '.joblib'
resultModelPath = resultStoreDir + 'cnn_plus_lstm_numNegPerDay' + str(numNegPerDay)
flagRedo = True
if flagUseCiscoFeatures:
if 'trainCiscoFeatureDict' not in locals():
trainCiscoFeatureDict = getCiscoFeatureDict(trainCiscoFeatureCSVPaths)
if 'testCiscoFeatureDict' not in locals():
testCiscoFeatureDict = getCiscoFeatureDict(testCiscoFeatureCSVPaths)
else:
trainCiscoFeatureDict = dict()
testCiscoFeatureDict = dict()
if flagRedo or not os.path.exists(resultStorePath):
if 'characterDict' not in locals():
characterDictPath = 'trainData/characterIDDict.joblib'
characterDict = joblib.load(characterDictPath)['characterIDDict']
print('create train data')
if 'dataFrameList' not in locals():
(dataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
trainDirsUserLevel,numNegPerDay = numNegPerDay)
maxHits = []
for i in range(len(dataFrameList)):
maxHits.append(np.max(dataFrameList[i]['hits']))
print('create test data')
# validation error
if 'testDataFrameList' not in locals():
(testDataFrameList) = ciscoProcessing.loadRawDataSetFromJoblibPerUser(\
[testDirsUserLevel[0]],numNegPerDay = numNegPerDay)
maxHits = []
for i in range(len(testDataFrameList)):
maxHits.append(np.max(testDataFrameList[i]['hits']))
sharedCNNFun = getCNNWitoutLastLayerFunctional(len(characterDict)+1,embeddingSize,maxLen,domainFeatures,kernel_size,domainFeatures,0.5)
inputList = []
encodedList = []
numFeatures = flowFeatures
if flagUseCiscoFeatures:
numFeatures += numCiscoFeatures
for i in range(windowSize):
inputList.append(Input(shape=(maxLen,)))
encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
inputList.append(Input(shape=(numFeatures,)))
merge_layer_input = []
for i in range(windowSize):
merge_layer_input.append(encodedList[i])
merge_layer_input.append(inputList[(2*i)+1])
# We can then concatenate the two vectors:
merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
reshape = Reshape((windowSize, domainFeatures+numFeatures))(merged_vector)
lstm = LSTM(lstmUnits, input_shape=(windowSize,domainFeatures+numFeatures))(reshape)
dense = Dense(lstmDenseSize, activation='relu')(lstm)
dropout = Dropout(0.5)(dense)
# And add a logistic regression on top
predictions = Dense(2, activation='softmax')(dropout)
# We define a trainable model linking the
# tweet inputs to the predictions
model = Model(inputs=inputList, outputs=predictions)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
# get train data
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(dataFrameList)), miniters=10):
(domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(dataFrameList[i],windowSize=windowSize,overlapping=False)
domainLists += domainListsTmp
dfLists += dfListsTmp
(trainData,trainLabel,trainHits,trainNames) = createTrainData(domainLists,dfLists,characterDict,
maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=trainCiscoFeatureDict)
useIDs = np.where(trainHits == 0)[0]
useIDs = np.concatenate([useIDs,np.where(trainHits >= threshold)[0]])
for i in range(len(trainData)):
trainData[i] = np.array(trainData[i])[useIDs]
trainLabel = trainLabel[useIDs]
trainHits = trainHits[useIDs]
trainNames = trainNames[useIDs]
# get test data
domainLists = []
dfLists = []
for i in tqdm(np.arange(len(testDataFrameList)), miniters=10):
(domainListsTmp,dfListsTmp) = getChunksFromUserDataFrame(testDataFrameList[i],windowSize=windowSize,overlapping=False)
domainLists += domainListsTmp
dfLists += dfListsTmp
(testData,testLabel,testHits,testNames) = createTrainData(domainLists,dfLists,characterDict,
maxLen,threshold = threshold,
flagUseCiscoFeatures=flagUseCiscoFeatures,urlSIPDIct=testCiscoFeatureDict)
useIDs = np.where(testHits == 0)[0]
useIDs = np.concatenate([useIDs,np.where(testHits >= threshold)[0]])
for i in range(len(testData)):
testData[i] = np.array(testData[i])[useIDs]
testLabel = testLabel[useIDs]
testHits = testHits[useIDs]
testNames = testNames[useIDs]
numPos = len(np.where(trainLabel == 1.0)[0])
numNeg = len(np.where(trainLabel == 0.0)[0])
print('major class: ' + str(float(numNeg) / float(numNeg + numPos)))
lstmLabel = np_utils.to_categorical(trainLabel, 2)
lstmTestLabel = np_utils.to_categorical(testLabel, 2)
trainHist = model.fit(trainData,lstmLabel,epochs=numEpochs,batch_size=128, validation_data=(testData,lstmTestLabel))
# save lstm model
ciscoProcessing.save_model(model,resultModelPath+'.json',
resultModelPath + '.h5')
# classify train and test
trainScores = model.predict(trainData)[:,1]
testScores = model.predict(testData)[:,1]
joblib.dump({'testLabel':testLabel,
'testHits':testHits,
'testNames':testNames,
'testScores':testScores,
'trainLabel':trainLabel,
'trainScores':trainScores},resultStorePath,compress=3)