ma_cisco_malware/main.py
2017-08-05 09:40:40 +02:00

276 lines
10 KiB
Python

import json
import logging
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
from keras.models import load_model
import arguments
import dataset
import hyperband
import models
# create logger
import visualize
from dataset import load_or_generate_h5data
from utils import exists_or_make_path, get_custom_class_weights
logger = logging.getLogger('logger')
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
ch = logging.FileHandler("info.log")
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
args = arguments.parse()
if args.gpu:
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# default parameter
PARAMS = {
"type": args.model_type,
"batch_size": 64,
"window_size": args.window,
"domain_length": args.domain_length,
"flow_features": 3,
#
'dropout': 0.5,
'domain_features': args.domain_embedding,
'embedding_size': args.embedding,
'filter_main': 64,
'flow_features': 3,
# 'dense_main': 512,
'dense_main': 64,
'filter_embedding': args.hidden_char_dims,
'hidden_embedding': args.domain_embedding,
'kernel_embedding': 3,
'kernels_main': 3,
'input_length': 40,
'model_output': args.model_output
}
def main_paul_best():
pauls_best_params = models.pauls_networks.best_config
main_train(pauls_best_params)
def main_hyperband():
params = {
# static params
"type": ["paul"],
"batch_size": [args.batch_size],
"window_size": [10],
"domain_length": [40],
"flow_features": [3],
"input_length": [40],
# model params
"embedding_size": [8, 16, 32, 64, 128, 256],
"filter_embedding": [8, 16, 32, 64, 128, 256],
"kernel_embedding": [1, 3, 5, 7, 9],
"hidden_embedding": [8, 16, 32, 64, 128, 256],
"dropout": [0.5],
"domain_features": [8, 16, 32, 64, 128, 256],
"filter_main": [8, 16, 32, 64, 128, 256],
"kernels_main": [1, 3, 5, 7, 9],
"dense_main": [8, 16, 32, 64, 128, 256],
}
logger.info("create training dataset")
domain_tr, flow_tr, client_tr, server_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
args.domain_length, args.window)
hp = hyperband.Hyperband(params,
[domain_tr, flow_tr],
[client_tr, server_tr])
results = hp.run()
json.dump(results, open("hyperband.json"))
def main_train(param=None):
logger.info(f"Create model path {args.model_path}")
exists_or_make_path(args.model_path)
logger.info(f"Use command line arguments: {args}")
domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
args.domain_length, args.window)
if not param:
param = PARAMS
logger.info(f"Generator model with params: {param}")
embedding, model, new_model = models.get_models_by_params(param)
logger.info("define callbacks")
callbacks = []
callbacks.append(ModelCheckpoint(filepath=args.clf_model,
monitor='val_loss',
verbose=False,
save_best_only=True))
callbacks.append(CSVLogger(args.train_log))
logger.info(f"Use early stopping: {args.stop_early}")
if args.stop_early:
callbacks.append(EarlyStopping(monitor='val_loss',
patience=5,
verbose=False))
custom_metrics = models.get_metric_functions()
server_tr = np.max(server_windows_tr, axis=1)
if args.class_weights:
logger.info("class weights: compute custom weights")
custom_class_weights = get_custom_class_weights(client_tr.value, server_tr)
logger.info(custom_class_weights)
else:
logger.info("class weights: set default")
custom_class_weights = None
logger.info(f"select model: {'new' if args.new_model else 'old'}")
if args.new_model:
server_tr = np.expand_dims(server_windows_tr, 2)
model = new_model
logger.info("compile and train model")
embedding.summary()
model.summary()
logger.info(model.get_config())
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'] + custom_metrics)
if args.model_output == "both":
labels = [client_tr, server_tr]
elif args.model_output == "client":
labels = [client_tr]
elif args.model_output == "server":
labels = [server_tr]
model.fit([domain_tr, flow_tr],
labels,
batch_size=args.batch_size,
epochs=args.epochs,
callbacks=callbacks,
shuffle=True,
validation_split=0.2,
class_weight=custom_class_weights)
logger.info("save embedding")
embedding.save(args.embedding_model)
def main_test():
domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
args.domain_length, args.window)
clf = load_model(args.clf_model, custom_objects=models.get_metrics())
pred = clf.predict([domain_val, flow_val],
batch_size=args.batch_size,
verbose=1)
if args.model_output == "both":
c_pred, s_pred = pred
elif args.model_output == "client":
c_pred = pred
s_pred = np.zeros(0)
else:
c_pred = np.zeros(0)
s_pred = pred
dataset.save_predictions(args.future_prediction, c_pred, s_pred)
model = load_model(args.embedding_model)
domain_encs, labels = dataset.load_or_generate_domains(args.test_data, args.domain_length)
domain_embedding = model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
np.save(args.model_path + "/domain_embds.npy", domain_embedding)
def main_visualization():
domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
args.domain_length, args.window)
client_val, server_val = client_val.value, server_val.value
logger.info("plot model")
model = load_model(args.clf_model, custom_objects=models.get_metrics())
visualize.plot_model(model, os.path.join(args.model_path, "model.png"))
try:
logger.info("plot training curve")
logs = pd.read_csv(args.train_log)
if args.model_output == "client":
visualize.plot_training_curve(logs, "", "{}/client_train.png".format(args.model_path))
else:
visualize.plot_training_curve(logs, "client_", "{}/client_train.png".format(args.model_path))
visualize.plot_training_curve(logs, "server_", "{}/server_train.png".format(args.model_path))
except Exception as e:
logger.warning(f"could not generate training curves: {e}")
client_pred, server_pred = dataset.load_predictions(args.future_prediction)
client_pred, server_pred = client_pred.value, server_pred.value
logger.info("plot pr curve")
visualize.plot_precision_recall(client_val, client_pred.flatten(), "{}/client_prc.png".format(args.model_path))
# visualize.plot_precision_recall(server_val, server_pred, "{}/server_prc.png".format(args.model_path))
# visualize.plot_precision_recall_curves(client_val, client_pred, "{}/client_prc2.png".format(args.model_path))
# visualize.plot_precision_recall_curves(server_val, server_pred, "{}/server_prc2.png".format(args.model_path))
logger.info("plot roc curve")
visualize.plot_roc_curve(client_val, client_pred.flatten(), "{}/client_roc.png".format(args.model_path))
# visualize.plot_roc_curve(server_val, server_pred, "{}/server_roc.png".format(args.model_path))
visualize.plot_confusion_matrix(client_val, client_pred.flatten().round(),
"{}/client_cov.png".format(args.model_path),
normalize=False, title="Client Confusion Matrix")
# visualize.plot_confusion_matrix(server_val.argmax(1), server_pred.argmax(1),
# "{}/server_cov.png".format(args.model_path),
# normalize=False, title="Server Confusion Matrix")
logger.info("visualize embedding")
domain_encs, labels = dataset.load_or_generate_domains(args.test_data, args.domain_length)
domain_embedding = np.load(args.model_path + "/domain_embds.npy")
visualize.plot_embedding(domain_embedding, labels, path="{}/embd.png".format(args.model_path))
def main_data():
char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data(args.train_data)
logger.info("create training dataset")
domain_tr, flow_tr, client_tr, server_tr, _ = dataset.create_dataset_from_flows(user_flow_df, char_dict,
max_len=args.domain_length,
window_size=args.window)
print(f"domain shape {domain_tr.shape}")
print(f"flow shape {flow_tr.shape}")
print(f"client shape {client_tr.shape}")
print(f"server shape {server_tr.shape}")
def main():
if "train" == args.mode:
main_train()
if "hyperband" == args.mode:
main_hyperband()
if "test" == args.mode:
main_test()
if "fancy" == args.mode:
main_visualization()
if "paul" == args.mode:
main_paul_best()
if "data" == args.mode:
main_data()
if __name__ == "__main__":
main()