ma_cisco_malware/cnnOnCnnParameterSelection.py

283 lines
10 KiB
Python

# -*- coding: utf-8 -*-
import string
import keras
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape, Embedding, Input, Activation
from keras.models import Model
from keras.utils import np_utils
from tqdm import tqdm
# config = tf.ConfigProto(log_device_placement=True)
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
# config.gpu_options.allow_growth = True
# session = tf.Session(config=config)
def get_character_dict():
return dict((char, idx) for (idx, char) in
enumerate(string.ascii_lowercase + string.punctuation))
def get_cnn(vocabSize, embeddingSize, input_length, filters, kernel_size,
hidden_dims, drop_out):
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=vocabSize, output_dim=embeddingSize)(y)
y = Conv1D(filters, kernel_size, activation='relu')(y)
y = GlobalMaxPooling1D()(y)
y = Dense(hidden_dims)(y)
y = Dropout(drop_out)(y)
y = Activation('relu')(y)
return Model(x, y)
def get_user_chunks(dataFrame, windowSize=10, overlapping=False,
maxLengthInSeconds=300):
# print('maxLength: ' + str(maxLengthInSeconds))
maxMilliSeconds = maxLengthInSeconds * 1000
outDomainLists = []
outDFFrames = []
if overlapping == False:
numBlocks = int(np.ceil(float(len(dataFrame)) / float(windowSize)))
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[(blockID * windowSize):((blockID + 1) * windowSize)]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
else:
numBlocks = len(dataFrame) + 1 - windowSize
userIDs = np.arange(len(dataFrame))
for blockID in np.arange(numBlocks):
curIDs = userIDs[blockID:blockID + windowSize]
# print(curIDs)
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
if maxLengthInSeconds != -1:
curMinMilliSeconds = np.min(useData['timeStamp']) + maxMilliSeconds
underTimeOutIDs = np.where(np.array(useData['timeStamp']) <= curMinMilliSeconds)
if len(underTimeOutIDs) != len(curIDs):
curIDs = curIDs[underTimeOutIDs]
useData = dataFrame.iloc[curIDs]
curDomains = useData['domain']
outDomainLists.append(list(curDomains))
outDFFrames.append(useData)
return (outDomainLists, outDFFrames)
def getFeatureVecForDomain(domain, characterDict, maxLen=40):
curFeature = np.zeros([maxLen, ])
for j in range(np.min([len(domain), maxLen])):
# print(j)
curCharacter = domain[-j]
if curCharacter in characterDict:
curFeature[j] = characterDict[curCharacter]
return curFeature
def getFlowFeatures(curDataLine):
useKeys = ['duration', 'bytes_down', 'bytes_up']
curFeature = np.zeros([len(useKeys), ])
for i in range(len(useKeys)):
curKey = useKeys[i]
try:
curFeature[i] = np.log1p(curDataLine[curKey]).astype(float)
except:
pass
return curFeature
def getCiscoFeatures(curDataLine, urlSIPDict):
numCiscoFeatures = 30
try:
ciscoFeatures = urlSIPDict[str(curDataLine['domain']) + str(curDataLine['server_ip'])]
# print('cisco features: ' + str(ciscoFeatures))
# log transform
ciscoFeatures = np.log1p(ciscoFeatures).astype(float)
# print('log transformed: ' + str(ciscoFeatures))
return ciscoFeatures.ravel()
except:
return np.zeros([numCiscoFeatures, ]).ravel()
def create_dataset_from_flows(user_flow_df, char_dict, maxLen, threshold=3, windowSize=10):
domainLists = []
dfLists = []
print("get chunks from user data frames")
for i, user_flow in enumerate(get_flow_per_user(user_flow_df)):
(domainListsTmp, dfListsTmp) = get_user_chunks(user_flow, windowSize=windowSize,
overlapping=False, maxLengthInSeconds=maxLengthInSeconds)
domainLists += domainListsTmp
dfLists += dfListsTmp
if i >= 10:
break
print("create training dataset")
return create_dataset_from_lists(
domainLists=domainLists, dfLists=dfLists, charachterDict=char_dict,
maxLen=maxLen, threshold=threshold,
flagUseCiscoFeatures=False, urlSIPDIct=dict(),
windowSize=windowSize)
def create_dataset_from_lists(domainLists, dfLists, charachterDict, maxLen, threshold=3,
flagUseCiscoFeatures=False, urlSIPDIct=dict(),
windowSize=10):
if 'hits' in dfLists[0].keys():
hitName = 'hits'
elif 'virusTotalHits' in dfLists[0].keys():
hitName = 'virusTotalHits'
numFlowFeatures = 3
numCiscoFeatures = 30
numFeatures = numFlowFeatures
if flagUseCiscoFeatures:
numFeatures += numCiscoFeatures
outputFeatures = []
label = []
hits = []
trainNames = []
for i in range(windowSize):
outputFeatures.append(np.zeros([len(domainLists), maxLen]))
outputFeatures.append(np.zeros([len(domainLists), numFeatures]))
for i in tqdm(np.arange(len(domainLists)), miniters=10):
curCounter = 0
# print('len domainList: ' + str(len(domainLists[i])))
# print('len df: ' + str(len(dfLists[i])))
for j in range(np.min([windowSize, len(domainLists[i])])):
outputFeatures[curCounter][i, :] = getFeatureVecForDomain(domainLists[i][j], charachterDict, maxLen)
curCounter += 1
if flagUseCiscoFeatures:
outputFeatures[curCounter][i, 0:numFlowFeatures] = getFlowFeatures(dfLists[i].iloc[j])
outputFeatures[curCounter][i, numFlowFeatures:] = getCiscoFeatures(dfLists[i].iloc[j], urlSIPDIct)
else:
outputFeatures[curCounter][i, :] = getFlowFeatures(dfLists[i].iloc[j])
curCounter += 1
curLabel = 0.0
if np.max(dfLists[i][hitName]) >= threshold:
curLabel = 1.0
elif np.max(dfLists[i][hitName]) == -1:
curLabel = -1.0
elif np.max(dfLists[i][hitName]) > 0 and np.max(dfLists[i][hitName]) < threshold:
curLabel = -2.0
label.append(curLabel)
hits.append(np.max(dfLists[i][hitName]))
trainNames.append(np.unique(dfLists[i]['user_hash']))
return (outputFeatures, np.array(label), np.array(hits), np.array(trainNames))
def get_user_flow_data():
# load train and test data from joblib
# created with createTrainDataMultipleTaskLearning.py
# rk: changed to csv file
trainDFs = pd.read_csv("data/rk_data.csv.gz")
trainDFs.drop("Unnamed: 0", 1, inplace=True)
trainDFs.set_index(keys=['user_hash'], drop=False, inplace=True)
users = trainDFs['user_hash'].unique().tolist()
u0 = trainDFs.loc[trainDFs.user_hash == users[0]]
return trainDFs
def get_flow_per_user(df):
users = df['user_hash'].unique().tolist()
for user in users:
yield df.loc[df.user_hash == user]
if __name__ == "__main__":
# parameter
innerCNNFilters = 512
innerCNNKernelSize = 2
cnnDropout = 0.5
cnnHiddenDims = 1024
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures = 30
windowSize = 10
maxLen = 40
embeddingSize = 100
kernel_size = 2
drop_out = 0.5
filters = 2
hidden_dims = 100
vocabSize = 40
threshold = 3
minFlowsPerUser = 10
numEpochs = 100
maxLengthInSeconds = -1
timesNeg = -1
char_dict = get_character_dict()
user_flow_df = get_user_flow_data()
print("create training dataset")
(X_tr, y_tr, hits_tr, names_tr) = create_dataset_from_flows(
user_flow_df, char_dict,
maxLen=maxLen, threshold=threshold, windowSize=windowSize)
pos_idx = np.where(y_tr == 1.0)[0]
neg_idx = np.where(y_tr == 0.0)[0]
use_idx = np.concatenate((pos_idx, neg_idx))
y_tr = y_tr[use_idx]
# hits_tr = hits_tr[use_idx]
# names_tr = names_tr[use_idx]
for i in range(len(X_tr)):
X_tr[i] = X_tr[i][use_idx]
# TODO: WTF? I don't get it...
sharedCNNFun = get_cnn(len(char_dict) + 1, embeddingSize, maxLen,
domainFeatures, kernel_size, domainFeatures, 0.5)
inputList = []
encodedList = []
numFeatures = flowFeatures
for i in range(windowSize):
inputList.append(Input(shape=(maxLen,)))
encodedList.append(sharedCNNFun(inputList[-1])) # add shared domain model
inputList.append(Input(shape=(numFeatures,)))
merge_layer_input = []
for i in range(windowSize):
merge_layer_input.append(encodedList[i])
merge_layer_input.append(inputList[(2 * i) + 1])
# We can then concatenate the two vectors:
merged_vector = keras.layers.concatenate(merge_layer_input, axis=-1)
reshape = Reshape((windowSize, domainFeatures + numFeatures))(merged_vector)
# add second cnn
cnn = Conv1D(filters,
kernel_size,
activation='relu',
input_shape=(windowSize, domainFeatures + numFeatures))(reshape)
# we use max pooling:
maxPool = GlobalMaxPooling1D()(cnn)
cnnDropout = Dropout(cnnDropout)(maxPool)
cnnDense = Dense(cnnHiddenDims, activation='relu')(cnnDropout)
cnnOutput = Dense(2, activation='softmax')(cnnDense)
# We define a trainable model linking the
# tweet inputs to the predictions
model = Model(inputs=inputList, outputs=cnnOutput)
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
epochNumber = 0
trainLabel = np_utils.to_categorical(y_tr, 2)
model.fit(x=X_tr, y=trainLabel, batch_size=128,
epochs=epochNumber + 1, shuffle=True, initial_epoch=epochNumber) # ,
# validation_data=(testData,testLabel))