move vocab_size into implementation (not user-dependent)

René Knaebel 2017-07-30 13:47:11 +02:00
parent d97785f646
commit ebaeb6b96e
6 changed files with 82 additions and 154 deletions

dataset.py

@@ -18,6 +18,10 @@ def get_character_dict():
     return chars
 
 
+def get_vocab_size():
+    return len(chars) + 1
+
+
 def encode_char(c):
     if c in chars:
         return chars[c]
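The +1 makes room for one id that no known character produces, assuming index 0 is reserved for padding or unknown characters (the else branch of encode_char is not shown in this hunk). A minimal sketch of the intended contract, with a stand-in chars dict since the real one comes from get_character_dict():

    # stand-in alphabet; the real ids come from get_character_dict()
    chars = {c: i + 1 for i, c in enumerate("abcdefghijklmnopqrstuvwxyz0123456789-.")}

    def get_vocab_size():
        # +1 covers index 0, so an Embedding layer sized with this
        # value accepts every id encode_char can emit, plus padding
        return len(chars) + 1

    assert get_vocab_size() == max(chars.values()) + 1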

main.py

@@ -7,7 +7,6 @@ import pandas as pd
 import tensorflow as tf
 from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
 from keras.models import load_model
-from sklearn.utils import class_weight
 
 import arguments
 import dataset
@@ -16,7 +15,7 @@ import models
 # create logger
 import visualize
 from dataset import load_or_generate_h5data
-from utils import exists_or_make_path
+from utils import exists_or_make_path, get_custom_class_weights
 
 logger = logging.getLogger('logger')
 logger.setLevel(logging.DEBUG)
@@ -54,22 +53,39 @@ if args.gpu:
     config.gpu_options.allow_growth = True
     session = tf.Session(config=config)
 
+# default parameter
+PARAMS = {
+    "type": args.model_type,
+    "batch_size": 64,
+    "window_size": args.window,
+    "domain_length": args.domain_length,
+    "flow_features": 3,
+    #
+    'dropout': 0.5,
+    'domain_features': args.domain_embedding,
+    'embedding_size': args.embedding,
+    'filter_main': 64,
+    'flow_features': 3,
+    # 'dense_main': 512,
+    'dense_main': 64,
+    'filter_embedding': args.hidden_char_dims,
+    'hidden_embedding': args.domain_embedding,
+    'kernel_embedding': 3,
+    'kernels_main': 3,
+    'input_length': 40
+}
 
 
 def main_paul_best():
-    char_dict = dataset.get_character_dict()
     pauls_best_params = models.pauls_networks.best_config
-    pauls_best_params["vocab_size"] = len(char_dict) + 1
     main_train(pauls_best_params)
 
 
 def main_hyperband():
-    char_dict = dataset.get_character_dict()
     params = {
         # static params
         "type": ["paul"],
         "batch_size": [args.batch_size],
-        "vocab_size": [len(char_dict) + 1],
         "window_size": [10],
         "domain_length": [40],
         "flow_features": [3],
@@ -96,50 +112,16 @@ def main_hyperband():
     json.dump(results, open("hyperband.json"))
 
 
-def get_custom_class_weights(client_tr, server_tr):
-    client = client_tr.value if type(client_tr) != np.ndarray else client_tr
-    server = server_tr.value if type(server_tr) != np.ndarray else server_tr
-    client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
-    server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
-    return {
-        "client": client_class_weight,
-        "server": server_class_weight
-    }
-
-
-def main_train(param=None):
+def main_train(param=None, train_new_model=False):
     exists_or_make_path(args.model_path)
-    char_dict = dataset.get_character_dict()
     domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
                                                                                args.domain_length, args.window)
-    # parameter
-    p = {
-        "type": args.model_type,
-        "batch_size": 64,
-        "window_size": args.window,
-        "domain_length": args.domain_length,
-        "flow_features": 3,
-        "vocab_size": len(char_dict) + 1,
-        #
-        'dropout': 0.5,
-        'domain_features': args.domain_embedding,
-        'embedding_size': args.embedding,
-        'filter_main': 64,
-        'flow_features': 3,
-        # 'dense_main': 512,
-        'dense_main': 64,
-        'filter_embedding': args.hidden_char_dims,
-        'hidden_embedding': args.domain_embedding,
-        'kernel_embedding': 3,
-        'kernels_main': 3,
-        'input_length': 40
-    }
     if not param:
-        param = p
+        param = PARAMS
 
-    embedding, model, _ = models.get_models_by_params(param)
+    embedding, model, new_model = models.get_models_by_params(param)
     embedding.summary()
     model.summary()
     logger.info("define callbacks")
@@ -155,20 +137,26 @@ def main_train(param=None):
                                        verbose=False))
     logger.info("compile model")
     custom_metrics = models.get_metric_functions()
-    model.compile(optimizer='adam',
-                  loss='binary_crossentropy',
-                  metrics=['accuracy'] + custom_metrics)
 
     server_tr = np.max(server_windows_tr, axis=1)
 
     if args.class_weights:
         logger.info("class weights: compute custom weights")
-        custom_class_weights = get_custom_class_weights(client_tr, server_tr)
+        custom_class_weights = get_custom_class_weights(client_tr.value, server_tr)
         logger.info(custom_class_weights)
     else:
         logger.info("class weights: set default")
         custom_class_weights = None
     logger.info("start training")
+
+    if train_new_model:
+        server_tr = np.expand_dims(server_windows_tr, 2)
+        model = new_model
+
+    model.compile(optimizer='adam',
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'] + custom_metrics)
+
     model.fit([domain_tr, flow_tr],
               [client_tr, server_tr],
               batch_size=args.batch_size,
@@ -185,105 +173,29 @@ def main_test():
     domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
                                                                            args.domain_length, args.window)
     clf = load_model(args.clf_model, custom_objects=models.get_metrics())
-    # stats = clf.evaluate([domain_val, flow_val],
-    #                      [client_val, server_val],
-    #                      batch_size=args.batch_size)
     y_pred = clf.predict([domain_val, flow_val],
                          batch_size=args.batch_size,
                          verbose=1)
     np.save(args.future_prediction, y_pred)
 
-    char_dict = dataset.get_character_dict()
-    user_flow_df = dataset.get_user_flow_data(args.test_data)
-    domains = user_flow_df.domain.unique()[:-1]
-
-    def get_domain_features_reduced(d):
-        return dataset.get_domain_features(d[0], char_dict, args.domain_length)
-
-    domain_features = []
-    for ds in domains:
-        domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds)))
-
-    model = load_model(args.embedding_model)
-    domain_features = np.stack(domain_features).reshape((-1, 40))
-    pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1)
-
-    np.save("/tmp/rk/domains.npy", domains)
-    np.save("/tmp/rk/domain_features.npy", domain_features)
-    np.save("/tmp/rk/domain_embd.npy", pred)
+    # char_dict = dataset.get_character_dict()
+    # user_flow_df = dataset.get_user_flow_data(args.test_data)
+    # domains = user_flow_df.domain.unique()[:-1]
+    #
+    # def get_domain_features_reduced(d):
+    #     return dataset.get_domain_features(d[0], char_dict, args.domain_length)
+    #
+    # domain_features = []
+    # for ds in domains:
+    #     domain_features.append(np.apply_along_axis(get_domain_features_reduced, 2, np.atleast_3d(ds)))
+    #
+    # model = load_model(args.embedding_model)
+    # domain_features = np.stack(domain_features).reshape((-1, 40))
+    # pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1)
+    #
+    # np.save("/tmp/rk/domains.npy", domains)
+    # np.save("/tmp/rk/domain_features.npy", domain_features)
+    # np.save("/tmp/rk/domain_embd.npy", pred)
 
 
-def main_new_model():
-    exists_or_make_path(args.model_path)
-    char_dict = dataset.get_character_dict()
-    domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
-                                                                               args.domain_length, args.window)
-    # parameter
-    p = {
-        "type": args.model_type,
-        "batch_size": 64,
-        "window_size": args.window,
-        "domain_length": args.domain_length,
-        "flow_features": 3,
-        "vocab_size": len(char_dict) + 1,
-        #
-        'dropout': 0.5,
-        'domain_features': args.domain_embedding,
-        'embedding_size': args.embedding,
-        'filter_main': 64,
-        'flow_features': 3,
-        # 'dense_main': 512,
-        'dense_main': 64,
-        'filter_embedding': args.hidden_char_dims,
-        'hidden_embedding': args.domain_embedding,
-        'kernel_embedding': 3,
-        'kernels_main': 3,
-        'input_length': 40
-    }
-    embedding, _, model = models.get_models_by_params(p)
-    embedding.summary()
-    model.summary()
-    logger.info("define callbacks")
-    callbacks = []
-    callbacks.append(ModelCheckpoint(filepath=args.clf_model,
-                                     monitor='val_loss',
-                                     verbose=False,
-                                     save_best_only=True))
-    callbacks.append(CSVLogger(args.train_log))
-    if args.stop_early:
-        callbacks.append(EarlyStopping(monitor='val_loss',
-                                       patience=5,
-                                       verbose=False))
-    logger.info("compile model")
-    custom_metrics = models.get_metric_functions()
-    model.compile(optimizer='adam',
-                  loss='binary_crossentropy',
-                  metrics=['accuracy'] + custom_metrics)
-    server_tr = np.max(server_windows_tr, axis=1)
-    if args.class_weights:
-        logger.info("class weights: compute custom weights")
-        custom_class_weights = get_custom_class_weights(client_tr, server_tr)
-        logger.info(custom_class_weights)
-    else:
-        logger.info("class weights: set default")
-        custom_class_weights = None
-    logger.info("start training")
-    server_tr = np.expand_dims(server_windows_tr, 2)
-    model.fit([domain_tr, flow_tr],
-              [client_tr, server_tr],
-              batch_size=args.batch_size,
-              epochs=args.epochs,
-              callbacks=callbacks,
-              shuffle=True,
-              validation_split=0.2,
-              class_weight=custom_class_weights)
-    logger.info("save embedding")
-    embedding.save(args.embedding_model)
 
 
 def main_embedding():
@@ -360,7 +272,7 @@ def main():
     if "data" in args.modes:
         main_data()
     if "train_new" in args.modes:
-        main_new_model()
+        main_train(train_new_model=True)
 
 
 if __name__ == "__main__":
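The former main_new_model was a near-verbatim copy of main_train; after this commit both modes share one code path, differing only in which of the three models returned by get_models_by_params is trained and in the shape of the server target. Usage, as wired up in main():

    main_train()                       # "train" mode: defaults to PARAMS
    main_train(train_new_model=True)   # "train_new" mode: picks new_model, expands server labels

One nit the diff carries over: PARAMS spells flow_features twice ("flow_features": 3 and 'flow_features': 3); the later literal silently wins in a Python dict, which is harmless here only because both values agree.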

models/__init__.py

@@ -1,6 +1,5 @@
 import keras.backend as K
 
-import dataset
 from . import pauls_networks
 from . import renes_networks
 
@@ -9,7 +8,6 @@ def get_models_by_params(params: dict):
     # decomposing param section
     # mainly embedding model
     network_type = params.get("type")
-    vocab_size = len(dataset.get_character_dict()) + 1
     embedding_size = params.get("embedding_size")
     input_length = params.get("input_length")
     filter_embedding = params.get("filter_embedding")
@@ -26,8 +24,8 @@ def get_models_by_params(params: dict):
     dense_dim = params.get("dense_main")
     # create models
     networks = renes_networks if network_type == "rene" else pauls_networks
-    embedding_model = networks.get_embedding(vocab_size, embedding_size, input_length,
-                                             filter_embedding, kernel_embedding, hidden_embedding, drop_out=dropout)
+    embedding_model = networks.get_embedding(embedding_size, input_length, filter_embedding, kernel_embedding,
+                                             hidden_embedding, drop_out=dropout)
     predict_model = networks.get_model(dropout, flow_features, domain_features, window_size, domain_length,
                                        filter_main, kernel_main, dense_dim, embedding_model)

models/pauls_networks.py

@@ -2,6 +2,8 @@ import keras
 from keras.engine import Input, Model
 from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
 
+import dataset
+
 best_config = {
     "type": "paul",
     "batch_size": 64,
@@ -24,11 +26,10 @@ best_config = {
 }
 
 
-def get_embedding(vocab_size, embedding_size, input_length,
-                  filters, kernel_size, hidden_dims, drop_out=0.5):
+def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5):
     x = y = Input(shape=(input_length,))
-    y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
-    y = Conv1D(filters, kernel_size, activation='relu')(y)
+    y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
+    y = Conv1D(filter_size, kernel_size, activation='relu')(y)
     y = GlobalMaxPooling1D()(y)
     y = Dropout(drop_out)(y)
     y = Dense(hidden_dims)(y)
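For context, a self-contained sketch of the embedding submodel this file now builds, runnable against the Keras 2 API the repo uses; VOCAB_SIZE stands in for dataset.get_vocab_size(), and all hyperparameter values are illustrative:

    from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense
    from keras.models import Model

    VOCAB_SIZE = 40  # stand-in for dataset.get_vocab_size()

    def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5):
        x = y = Input(shape=(input_length,))
        y = Embedding(input_dim=VOCAB_SIZE, output_dim=embedding_size)(y)
        y = Conv1D(filter_size, kernel_size, activation='relu')(y)
        y = GlobalMaxPooling1D()(y)  # collapse character positions into one vector
        y = Dropout(drop_out)(y)
        y = Dense(hidden_dims)(y)
        return Model(x, y)

    embd = get_embedding(embedding_size=128, input_length=40, filter_size=64,
                         kernel_size=3, hidden_dims=128)
    embd.summary()  # (None, 40) character ids -> (None, 128) domain vector

Note the trade-off this refactor makes: dataset.get_vocab_size() is now evaluated at layer-construction time, coupling the network modules to the dataset module in exchange for a smaller user-facing parameter surface.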

models/renes_networks.py

@@ -3,11 +3,12 @@ from keras.engine import Input, Model
 from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, TimeDistributed, MaxPool1D, \
     GlobalAveragePooling1D
 
+import dataset
+
 
-def get_embedding(vocab_size, embedding_size, input_length,
-                  filter_size, kernel_size, hidden_dims, drop_out=0.5):
+def get_embedding(embedding_size, input_length, filter_size, kernel_size, hidden_dims, drop_out=0.5):
     x = y = Input(shape=(input_length,))
-    y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
+    y = Embedding(input_dim=dataset.get_vocab_size(), output_dim=embedding_size)(y)
     y = Conv1D(filter_size, kernel_size=5, activation='relu')(y)
     y = Conv1D(filter_size, kernel_size=3, activation='relu')(y)
     y = Conv1D(filter_size, kernel_size=3, activation='relu')(y)

utils.py

@@ -1,6 +1,18 @@
 import os
 
+import numpy as np
+from sklearn.utils import class_weight
+
 
 def exists_or_make_path(p):
     if not os.path.exists(p):
         os.makedirs(p)
+
+
+def get_custom_class_weights(client, server):
+    client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
+    server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
+    return {
+        "client": client_class_weight,
+        "server": server_class_weight
+    }
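The helper lands here unchanged except that it drops the .value unwrapping; callers now pass plain arrays (see the client_tr.value change in main.py). A toy run of sklearn's 'balanced' heuristic, weight = n_samples / (n_classes * count); the positional call matches sklearn releases contemporary with this commit, while newer versions require keyword arguments (classes=..., y=...):

    import numpy as np
    from sklearn.utils import class_weight

    client = np.array([0, 0, 0, 0, 1])  # 4 negatives, 1 positive
    weights = class_weight.compute_class_weight('balanced', np.unique(client), client)
    print(dict(zip(np.unique(client), weights)))  # {0: 0.625, 1: 2.5}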