added params

This commit is contained in:
René Knaebel 2017-07-07 16:48:10 +02:00
parent 3c4be52bb6
commit be56112b33
4 changed files with 191 additions and 72 deletions

View File

@ -1,3 +1,5 @@
test: test:
python3 main.py --epochs 1 --batch 64 --train data/rk_data.csv.gz --test data/rk_data.csv.gz python3 main.py --modes train --epochs 1 --batch 64 --train data/rk_data.csv.gz
hyper:
python3 main.py --modes hyperband --epochs 1 --batch 64 --train data/rk_data.csv.gz

View File

@ -1,76 +1,128 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# implementation of hyperband: # implementation of hyperband:
# https://arxiv.org/pdf/1603.06560.pdf # https://arxiv.org/pdf/1603.06560.pdf
import random
from math import log, ceil
from random import random as rng
from time import time, ctime
import numpy as np import numpy as np
import models
def get_hyperparameter_configuration(configGenerator, n):
    """Draw ``n`` configurations by calling ``configGenerator`` repeatedly.

    Args:
        configGenerator: zero-argument callable returning one configuration.
        n: number of configurations to draw. It is passed to ``np.arange``
            as the stop value, so a float count is accepted as well.

    Returns:
        A list holding the result of each ``configGenerator()`` call, in
        call order. Empty when ``n`` is 0.
    """
    # np.arange (rather than range) is kept so that float counts still work.
    return [configGenerator() for _ in np.arange(0, n, 1)]
def run_then_return_val_loss(config, r_i, modelGenerator, trainData, trainLabel, def sample_params(param_distribution: dict):
testData, testLabel): p = {}
# parameter for key, val in param_distribution.items():
batch_size = 128 p[key] = random.choice(val)
model = modelGenerator(config) return p
if model != None:
model.fit(x=trainData, y=trainLabel,
epochs=int(r_i), shuffle=True, initial_epoch=0, class Hyperband:
batch_size=batch_size) def __init__(self, param_distribution, X, y):
score = model.evaluate(testData, testLabel, self.get_params = lambda: sample_params(param_distribution)
batch_size=batch_size)
score = score[0] self.max_iter = 81 # maximum iterations per configuration
self.eta = 3 # defines configuration downsampling rate (default = 3)
self.logeta = lambda x: log(x) / log(self.eta)
self.s_max = int(self.logeta(self.max_iter))
self.B = (self.s_max + 1) * self.max_iter
self.results = [] # list of dicts
self.counter = 0
self.best_loss = np.inf
self.best_counter = -1
self.X = X
self.y = y
def try_params(self, n_iterations, params):
n_iterations = int(round(n_iterations))
embedding, model = models.get_models_by_params(params)
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
history = model.fit(self.X,
self.y,
batch_size=params["batch_size"],
epochs=n_iterations,
shuffle=True,
validation_split=0.2)
return {"loss": history.history['loss'][-1]}
# can be called multiple times
def run(self, skip_last=0, dry_run=False):
for s in reversed(range(self.s_max + 1)):
# initial number of configurations
n = int(ceil(self.B / self.max_iter / (s + 1) * self.eta ** s))
# initial number of iterations per config
r = self.max_iter * self.eta ** (-s)
# n random configurations
T = [self.get_params() for i in range(n)]
for i in range((s + 1) - int(skip_last)): # changed from s + 1
# Run each of the n configs for <iterations>
# and keep best (n_configs / eta) configurations
n_configs = n * self.eta ** (-i)
n_iterations = r * self.eta ** (i)
print("\n*** {} configurations x {:.1f} iterations each".format(
n_configs, n_iterations))
val_losses = []
early_stops = []
for t in T:
self.counter += 1
print("\n{} | {} | lowest loss so far: {:.4f} (run {})\n".format(
self.counter, ctime(), self.best_loss, self.best_counter))
start_time = time()
if dry_run:
result = {'loss': rng(), 'log_loss': rng(), 'auc': rng()}
else: else:
score = np.infty result = self.try_params(n_iterations, t) # <---
return score
assert (type(result) == dict)
assert ('loss' in result)
def top_k(configurations, L, k): seconds = int(round(time() - start_time))
outConfigs = [] print("\n{} seconds.".format(seconds))
sortIDs = np.argsort(np.array(L))
for i in np.arange(0, k, 1):
outConfigs.append(configurations[sortIDs[i]])
return outConfigs
loss = result['loss']
val_losses.append(loss)
def hyperband(R, nu, modelGenerator, early_stop = result.get('early_stop', False)
configGenerator, early_stops.append(early_stop)
trainData, trainLabel,
testData, testLabel,
outputFile=''):
allLosses = []
allConfigs = []
# input
# initialization # keeping track of the best result so far (for display only)
s_max = np.floor(np.log(R) / np.log(nu)) # could do it be checking results each time, but hey
B = (s_max + 1) * R if loss < self.best_loss:
self.best_loss = loss
self.best_counter = self.counter
for s in np.arange(s_max, -1, -1): result['counter'] = self.counter
n = np.ceil(np.float(B) / np.float(R) * (np.float(np.power(nu, s)) / np.float(s + 1))) result['seconds'] = seconds
r = np.float(R) * np.power(nu, -s) result['params'] = t
configurations = get_hyperparameter_configuration(configGenerator, n) result['iterations'] = n_iterations
for i in np.arange(0, s + 1, 1):
n_i = np.floor(np.float(n) * np.power(nu, -i))
r_i = np.float(r) * np.power(nu, i)
L = []
for config in configurations:
curLoss = run_then_return_val_loss(config, r_i, modelGenerator,
trainData, trainLabel,
testData, testLabel)
L.append(curLoss)
allLosses.append(curLoss)
allConfigs.append(config)
if outputFile != '':
with open(outputFile, 'a') as myfile:
myfile.write(str(config) + '\t' + str(curLoss) + \
'\t' + str(r_i) + '\n')
configurations = top_k(configurations, L, np.floor(np.float(n_i) / nu))
# print('n_i: ' + str(n_i)) self.results.append(result)
# print('r_i: ' + str(r_i))
bestConfig = top_k(allConfigs, allLosses, 1) # select a number of best configurations for the next loop
return (bestConfig[0], allConfigs, allLosses) # filter out early stops, if any
indices = np.argsort(val_losses)
T = [T[i] for i in indices if not early_stops[i]]
T = T[0:int(n_configs / self.eta)]
return self.results

67
main.py
View File

@ -3,11 +3,12 @@ import argparse
from keras.utils import np_utils from keras.utils import np_utils
import dataset import dataset
import hyperband
import models import models
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
# parser.add_argument("--modes", action="store", dest="modes", nargs="+") parser.add_argument("--modes", action="store", dest="modes", nargs="+")
parser.add_argument("--train", action="store", dest="train_data", parser.add_argument("--train", action="store", dest="train_data",
default="data/full_dataset.csv.tar.bz2") default="data/full_dataset.csv.tar.bz2")
@ -24,9 +25,9 @@ parser.add_argument("--model", action="store", dest="model",
# parser.add_argument("--pred", action="store", dest="pred", # parser.add_argument("--pred", action="store", dest="pred",
# default="") # default="")
# #
# parser.add_argument("--type", action="store", dest="model_type", parser.add_argument("--type", action="store", dest="model_type",
# default="simple_conv") default="paul")
#
parser.add_argument("--batch", action="store", dest="batch_size", parser.add_argument("--batch", action="store", dest="batch_size",
default=64, type=int) default=64, type=int)
@ -79,13 +80,52 @@ args = parser.parse_args()
# session = tf.Session(config=config) # session = tf.Session(config=config)
def main_hyperband():
    """Run a Hyperband hyperparameter search on the training data.

    Loads the character dictionary and the user-flow data named by
    ``args.train_data``, builds the training set, and passes it together
    with the search space to ``hyperband.Hyperband``.

    Every entry of the search space maps a parameter name to the list of
    values it may take; single-element lists are effectively fixed.
    """
    char_dict = dataset.get_character_dict()
    user_flow_df = dataset.get_user_flow_data(args.train_data)

    params = {
        # static params
        # NOTE(review): batch_size, window_size and domain_length are
        # hard-coded here while the dataset below is built from
        # args.domain_length / args.window -- confirm the two agree.
        "type": ["paul"],
        "batch_size": [64],
        "vocab_size": [len(char_dict) + 1],
        "window_size": [10],
        "domain_length": [40],
        "flow_features": [3],
        "input_length": [40],
        # model params
        "embedding_size": [16, 32, 64, 128, 256, 512],
        "filter_embedding": [16, 32, 64, 128, 256, 512],
        "kernel_embedding": [1, 3, 5, 7, 9],
        "hidden_embedding": [16, 32, 64, 128, 256, 512],
        "dropout": [0.5],
        "domain_features": [16, 32, 64, 128, 256, 512],
        "filter_main": [16, 32, 64, 128, 256, 512],
        "kernels_main": [1, 3, 5, 7, 9],
        "dense_main": [16, 32, 64, 128, 256, 512],
    }

    # Print one sampled configuration; it is only displayed, not reused.
    param = hyperband.sample_params(params)
    print(param)

    print("create training dataset")
    domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows(
        user_flow_df, char_dict,
        max_len=args.domain_length,
        window_size=args.window)

    # One-hot encode both label vectors into two classes.
    client_tr = np_utils.to_categorical(client_tr, 2)
    server_tr = np_utils.to_categorical(server_tr, 2)

    hp = hyperband.Hyperband(params, [domain_tr, flow_tr], [client_tr, server_tr])
    # The value returned by run() is not used here.
    hp.run()
def main_train(): def main_train():
# parameter # parameter
cnnDropout = 0.5 cnnDropout = 0.5
cnnHiddenDims = 512 cnnHiddenDims = 512
kernel_size = 3 kernel_size = 3
filters = 128 filters = 128
network = models.pauls_networks network = models.pauls_networks if args.model_type == "paul" else models.renes_networks
char_dict = dataset.get_character_dict() char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data(args.train_data) user_flow_df = dataset.get_user_flow_data(args.train_data)
@ -94,6 +134,8 @@ def main_train():
domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows( domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows(
user_flow_df, char_dict, user_flow_df, char_dict,
max_len=args.domain_length, window_size=args.window) max_len=args.domain_length, window_size=args.window)
client_tr = np_utils.to_categorical(client_tr, 2)
server_tr = np_utils.to_categorical(server_tr, 2)
shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length, shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5) args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5)
@ -105,11 +147,9 @@ def main_train():
model.summary() model.summary()
model.compile(optimizer='adam', model.compile(optimizer='adam',
loss='binary_crossentropy', loss='categorical_crossentropy',
metrics=['accuracy']) metrics=['accuracy'])
client_tr = np_utils.to_categorical(client_tr, 2)
server_tr = np_utils.to_categorical(server_tr, 2)
model.fit([domain_tr, flow_tr], model.fit([domain_tr, flow_tr],
[client_tr, server_tr], [client_tr, server_tr],
batch_size=args.batch_size, batch_size=args.batch_size,
@ -117,6 +157,8 @@ def main_train():
shuffle=True, shuffle=True,
validation_split=0.2) validation_split=0.2)
model.save(args.model)
def main_test(): def main_test():
char_dict = dataset.get_character_dict() char_dict = dataset.get_character_dict()
@ -154,7 +196,16 @@ def main_score():
def main(): def main():
if "train" in args.modes:
main_train() main_train()
if "hyperband" in args.modes:
main_hyperband()
if "test" in args.modes:
main_test()
if "fancy" in args.modes:
main_visualization()
if "score" in args.modes:
main_score()
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -2,6 +2,20 @@ import keras
from keras.engine import Input, Model from keras.engine import Input, Model
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
# Hard-coded hyperparameter configuration.
# NOTE(review): no consumer of this dict is visible in this part of the
# file -- confirm where it is read before changing key names or values.
best_config = {
    'domain_features': 32,
    'drop_out': 0.5,
    'embedding_size': 64,
    'filter_main': 512,
    'flow_features': 3,
    'hidden_dims': 32,
    'filter_embedding': 32,
    'hidden_embedding': 32,
    'kernel_embedding': 8,
    'kernels_main': 8,
    'input_length': 40,
}
def get_embedding(vocab_size, embedding_size, input_length, def get_embedding(vocab_size, embedding_size, input_length,
filters, kernel_size, hidden_dims, drop_out=0.5): filters, kernel_size, hidden_dims, drop_out=0.5):