add new network architecture - server label moves to the middle
This commit is contained in:
parent
8cd1023165
commit
820a5d1a4d
@ -120,7 +120,6 @@ def create_dataset_from_flows(user_flow_df, char_dict, max_len, window_size=10):
|
|||||||
server_tr = server_tr[idx]
|
server_tr = server_tr[idx]
|
||||||
|
|
||||||
client_tr = np_utils.to_categorical(client_tr, 2)
|
client_tr = np_utils.to_categorical(client_tr, 2)
|
||||||
server_tr = np_utils.to_categorical(server_tr, 2)
|
|
||||||
|
|
||||||
return domain_tr, flow_tr, client_tr, server_tr
|
return domain_tr, flow_tr, client_tr, server_tr
|
||||||
|
|
||||||
@ -166,7 +165,7 @@ def create_dataset_from_lists(chunks, vocab, max_len):
|
|||||||
logger.info(" select names")
|
logger.info(" select names")
|
||||||
names = np.unique(np.stack(map(lambda f: f.user_hash, chunks)))
|
names = np.unique(np.stack(map(lambda f: f.user_hash, chunks)))
|
||||||
logger.info(" select servers")
|
logger.info(" select servers")
|
||||||
servers = np.max(np.stack(map(lambda f: f.serverLabel, chunks)), axis=1)
|
servers = np.stack(map(lambda f: f.serverLabel, chunks))
|
||||||
logger.info(" select trusted hits")
|
logger.info(" select trusted hits")
|
||||||
trusted_hits = np.max(np.stack(map(lambda f: f.trustedHits, chunks)), axis=1)
|
trusted_hits = np.max(np.stack(map(lambda f: f.trustedHits, chunks)), axis=1)
|
||||||
|
|
||||||
|
127
main.py
127
main.py
@ -7,7 +7,7 @@ import pandas as pd
|
|||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
|
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
|
||||||
from keras.models import load_model
|
from keras.models import load_model
|
||||||
from sklearn.decomposition import PCA
|
from keras.utils import np_utils
|
||||||
from sklearn.utils import class_weight
|
from sklearn.utils import class_weight
|
||||||
|
|
||||||
import arguments
|
import arguments
|
||||||
@ -46,8 +46,6 @@ ch.setFormatter(formatter)
|
|||||||
# add ch to logger
|
# add ch to logger
|
||||||
logger.addHandler(ch)
|
logger.addHandler(ch)
|
||||||
|
|
||||||
print = logger.info
|
|
||||||
|
|
||||||
args = arguments.parse()
|
args = arguments.parse()
|
||||||
|
|
||||||
if args.gpu:
|
if args.gpu:
|
||||||
@ -104,8 +102,8 @@ def main_hyperband():
|
|||||||
|
|
||||||
|
|
||||||
def get_custom_class_weights(client_tr, server_tr):
|
def get_custom_class_weights(client_tr, server_tr):
|
||||||
client = client_tr.value.argmax(1)
|
client = client_tr.value.argmax(1) if type(client_tr) != np.ndarray else client_tr.argmax(1)
|
||||||
server = server_tr.value.argmax(1)
|
server = server_tr.value.argmax(1) if type(server_tr) != np.ndarray else server_tr.argmax(1)
|
||||||
client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
|
client_class_weight = class_weight.compute_class_weight('balanced', np.unique(client), client)
|
||||||
server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
|
server_class_weight = class_weight.compute_class_weight('balanced', np.unique(server), server)
|
||||||
return {
|
return {
|
||||||
@ -118,7 +116,7 @@ def main_train(param=None):
|
|||||||
exists_or_make_path(args.model_path)
|
exists_or_make_path(args.model_path)
|
||||||
|
|
||||||
char_dict = dataset.get_character_dict()
|
char_dict = dataset.get_character_dict()
|
||||||
domain_tr, flow_tr, client_tr, server_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
|
domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
|
||||||
args.domain_length, args.window)
|
args.domain_length, args.window)
|
||||||
|
|
||||||
# parameter
|
# parameter
|
||||||
@ -133,10 +131,10 @@ def main_train(param=None):
|
|||||||
'dropout': 0.5,
|
'dropout': 0.5,
|
||||||
'domain_features': args.domain_embedding,
|
'domain_features': args.domain_embedding,
|
||||||
'embedding_size': args.embedding,
|
'embedding_size': args.embedding,
|
||||||
'filter_main': 128,
|
'filter_main': 64,
|
||||||
'flow_features': 3,
|
'flow_features': 3,
|
||||||
# 'dense_main': 512,
|
# 'dense_main': 512,
|
||||||
'dense_main': 128,
|
'dense_main': 64,
|
||||||
'filter_embedding': args.hidden_char_dims,
|
'filter_embedding': args.hidden_char_dims,
|
||||||
'hidden_embedding': args.domain_embedding,
|
'hidden_embedding': args.domain_embedding,
|
||||||
'kernel_embedding': 3,
|
'kernel_embedding': 3,
|
||||||
@ -146,7 +144,7 @@ def main_train(param=None):
|
|||||||
if not param:
|
if not param:
|
||||||
param = p
|
param = p
|
||||||
|
|
||||||
embedding, model = models.get_models_by_params(param)
|
embedding, model, _ = models.get_models_by_params(param)
|
||||||
embedding.summary()
|
embedding.summary()
|
||||||
model.summary()
|
model.summary()
|
||||||
logger.info("define callbacks")
|
logger.info("define callbacks")
|
||||||
@ -166,6 +164,8 @@ def main_train(param=None):
|
|||||||
loss='categorical_crossentropy',
|
loss='categorical_crossentropy',
|
||||||
metrics=['accuracy'] + custom_metrics)
|
metrics=['accuracy'] + custom_metrics)
|
||||||
|
|
||||||
|
server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2)
|
||||||
|
|
||||||
if args.class_weights:
|
if args.class_weights:
|
||||||
logger.info("class weights: compute custom weights")
|
logger.info("class weights: compute custom weights")
|
||||||
custom_class_weights = get_custom_class_weights(client_tr, server_tr)
|
custom_class_weights = get_custom_class_weights(client_tr, server_tr)
|
||||||
@ -200,7 +200,7 @@ def main_test():
|
|||||||
|
|
||||||
char_dict = dataset.get_character_dict()
|
char_dict = dataset.get_character_dict()
|
||||||
user_flow_df = dataset.get_user_flow_data(args.test_data)
|
user_flow_df = dataset.get_user_flow_data(args.test_data)
|
||||||
domains = user_flow_df.domain.unique()
|
domains = user_flow_df.domain.unique()[:-1]
|
||||||
|
|
||||||
def get_domain_features_reduced(d):
|
def get_domain_features_reduced(d):
|
||||||
return dataset.get_domain_features(d[0], char_dict, args.domain_length)
|
return dataset.get_domain_features(d[0], char_dict, args.domain_length)
|
||||||
@ -211,13 +211,93 @@ def main_test():
|
|||||||
|
|
||||||
model = load_model(args.embedding_model)
|
model = load_model(args.embedding_model)
|
||||||
domain_features = np.stack(domain_features).reshape((-1, 40))
|
domain_features = np.stack(domain_features).reshape((-1, 40))
|
||||||
pred = model.predict(domains, batch_size=args.batch_size, verbose=1)
|
pred = model.predict(domain_features, batch_size=args.batch_size, verbose=1)
|
||||||
|
|
||||||
np.save("/tmp/rk/domains.npy", domains)
|
np.save("/tmp/rk/domains.npy", domains)
|
||||||
np.save("/tmp/rk/domain_features.npy", domain_features)
|
np.save("/tmp/rk/domain_features.npy", domain_features)
|
||||||
np.save("/tmp/rk/domain_embd.npy", pred)
|
np.save("/tmp/rk/domain_embd.npy", pred)
|
||||||
|
|
||||||
|
|
||||||
|
def main_new_model():
|
||||||
|
exists_or_make_path(args.model_path)
|
||||||
|
|
||||||
|
char_dict = dataset.get_character_dict()
|
||||||
|
domain_tr, flow_tr, client_tr, server_windows_tr = load_or_generate_h5data(args.train_h5data, args.train_data,
|
||||||
|
args.domain_length, args.window)
|
||||||
|
|
||||||
|
# parameter
|
||||||
|
p = {
|
||||||
|
"type": args.model_type,
|
||||||
|
"batch_size": 64,
|
||||||
|
"window_size": args.window,
|
||||||
|
"domain_length": args.domain_length,
|
||||||
|
"flow_features": 3,
|
||||||
|
"vocab_size": len(char_dict) + 1,
|
||||||
|
#
|
||||||
|
'dropout': 0.5,
|
||||||
|
'domain_features': args.domain_embedding,
|
||||||
|
'embedding_size': args.embedding,
|
||||||
|
'filter_main': 64,
|
||||||
|
'flow_features': 3,
|
||||||
|
# 'dense_main': 512,
|
||||||
|
'dense_main': 64,
|
||||||
|
'filter_embedding': args.hidden_char_dims,
|
||||||
|
'hidden_embedding': args.domain_embedding,
|
||||||
|
'kernel_embedding': 3,
|
||||||
|
'kernels_main': 3,
|
||||||
|
'input_length': 40
|
||||||
|
}
|
||||||
|
|
||||||
|
embedding, _, model = models.get_models_by_params(p)
|
||||||
|
embedding.summary()
|
||||||
|
model.summary()
|
||||||
|
logger.info("define callbacks")
|
||||||
|
callbacks = []
|
||||||
|
callbacks.append(ModelCheckpoint(filepath=args.clf_model,
|
||||||
|
monitor='val_loss',
|
||||||
|
verbose=False,
|
||||||
|
save_best_only=True))
|
||||||
|
callbacks.append(CSVLogger(args.train_log))
|
||||||
|
if args.stop_early:
|
||||||
|
callbacks.append(EarlyStopping(monitor='val_loss',
|
||||||
|
patience=5,
|
||||||
|
verbose=False))
|
||||||
|
logger.info("compile model")
|
||||||
|
custom_metrics = models.get_metric_functions()
|
||||||
|
model.compile(optimizer='adam',
|
||||||
|
loss='categorical_crossentropy',
|
||||||
|
metrics=['accuracy'] + custom_metrics)
|
||||||
|
|
||||||
|
server_tr = np_utils.to_categorical(np.max(server_windows_tr, axis=1), 2)
|
||||||
|
|
||||||
|
if args.class_weights:
|
||||||
|
logger.info("class weights: compute custom weights")
|
||||||
|
custom_class_weights = get_custom_class_weights(client_tr, server_tr)
|
||||||
|
logger.info(custom_class_weights)
|
||||||
|
else:
|
||||||
|
logger.info("class weights: set default")
|
||||||
|
custom_class_weights = None
|
||||||
|
logger.info("start training")
|
||||||
|
server_tr = np.stack(np_utils.to_categorical(s, 2) for s in server_windows_tr)
|
||||||
|
model.fit([domain_tr, flow_tr],
|
||||||
|
[client_tr, server_tr],
|
||||||
|
batch_size=args.batch_size,
|
||||||
|
epochs=args.epochs,
|
||||||
|
callbacks=callbacks,
|
||||||
|
shuffle=True,
|
||||||
|
validation_split=0.2,
|
||||||
|
class_weight=custom_class_weights)
|
||||||
|
logger.info("save embedding")
|
||||||
|
embedding.save(args.embedding_model)
|
||||||
|
|
||||||
|
|
||||||
|
def main_embedding():
|
||||||
|
model = load_model(args.embedding_model)
|
||||||
|
domain_encs, labels = dataset.load_or_generate_domains(args.train_data, args.domain_length)
|
||||||
|
domain_embedding = model.predict(domain_encs, batch_size=args.batch_size, verbose=1)
|
||||||
|
visualize.plot_embedding(domain_embedding, labels, path="results/pp3/embd.png")
|
||||||
|
|
||||||
|
|
||||||
def main_visualization():
|
def main_visualization():
|
||||||
domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
|
domain_val, flow_val, client_val, server_val = load_or_generate_h5data(args.test_h5data, args.test_data,
|
||||||
args.domain_length, args.window)
|
args.domain_length, args.window)
|
||||||
@ -248,27 +328,6 @@ def main_visualization():
|
|||||||
"{}/server_cov.png".format(args.model_path),
|
"{}/server_cov.png".format(args.model_path),
|
||||||
normalize=False, title="Server Confusion Matrix")
|
normalize=False, title="Server Confusion Matrix")
|
||||||
|
|
||||||
# embedding visi
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
model = load_model(args.embedding_model)
|
|
||||||
domains = np.reshape(domain_val, (domain_val.shape[0] * domain_val.shape[1], 40))
|
|
||||||
domain_embedding = model.predict(domains, batch_size=args.batch_size, verbose=1)
|
|
||||||
|
|
||||||
pca = PCA(n_components=2)
|
|
||||||
domain_reduced = pca.fit_transform(domain_embedding)
|
|
||||||
print(pca.explained_variance_ratio_)
|
|
||||||
|
|
||||||
clients = np.repeat(client_val, 10, axis=0)
|
|
||||||
clients = clients.argmax(1)
|
|
||||||
servers = np.repeat(server_val, 10, axis=0)
|
|
||||||
servers = servers.argmax(1)
|
|
||||||
|
|
||||||
plt.scatter(domain_reduced[:, 0], domain_reduced[:, 1], c=clients, cmap=plt.cm.bwr, s=2)
|
|
||||||
plt.show()
|
|
||||||
plt.scatter(domain_reduced[:, 0], domain_reduced[:, 1], c=servers, cmap=plt.cm.bwr, s=2)
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
|
|
||||||
def main_score():
|
def main_score():
|
||||||
# mask = dataset.load_mask_eval(args.data, args.test_image)
|
# mask = dataset.load_mask_eval(args.data, args.test_image)
|
||||||
@ -281,7 +340,7 @@ def main_data():
|
|||||||
char_dict = dataset.get_character_dict()
|
char_dict = dataset.get_character_dict()
|
||||||
user_flow_df = dataset.get_user_flow_data(args.train_data)
|
user_flow_df = dataset.get_user_flow_data(args.train_data)
|
||||||
logger.info("create training dataset")
|
logger.info("create training dataset")
|
||||||
domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows(user_flow_df, char_dict,
|
domain_tr, flow_tr, client_tr, server_tr, _ = dataset.create_dataset_from_flows(user_flow_df, char_dict,
|
||||||
max_len=args.domain_length,
|
max_len=args.domain_length,
|
||||||
window_size=args.window)
|
window_size=args.window)
|
||||||
print(f"domain shape {domain_tr.shape}")
|
print(f"domain shape {domain_tr.shape}")
|
||||||
@ -305,6 +364,8 @@ def main():
|
|||||||
main_paul_best()
|
main_paul_best()
|
||||||
if "data" in args.modes:
|
if "data" in args.modes:
|
||||||
main_data()
|
main_data()
|
||||||
|
if "train_new" in args.modes:
|
||||||
|
main_new_model()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -32,7 +32,10 @@ def get_models_by_params(params: dict):
|
|||||||
predict_model = networks.get_model(dropout, flow_features, domain_features, window_size, domain_length,
|
predict_model = networks.get_model(dropout, flow_features, domain_features, window_size, domain_length,
|
||||||
filter_main, kernel_main, dense_dim, embedding_model)
|
filter_main, kernel_main, dense_dim, embedding_model)
|
||||||
|
|
||||||
return embedding_model, predict_model
|
new_model = networks.get_new_model(dropout, flow_features, domain_features, window_size, domain_length,
|
||||||
|
filter_main, kernel_main, dense_dim, embedding_model)
|
||||||
|
|
||||||
|
return embedding_model, predict_model, new_model
|
||||||
|
|
||||||
|
|
||||||
def get_metrics():
|
def get_metrics():
|
||||||
|
@ -30,8 +30,8 @@ def get_embedding(vocab_size, embedding_size, input_length,
|
|||||||
y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
|
y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
|
||||||
y = Conv1D(filters, kernel_size, activation='relu')(y)
|
y = Conv1D(filters, kernel_size, activation='relu')(y)
|
||||||
y = GlobalMaxPooling1D()(y)
|
y = GlobalMaxPooling1D()(y)
|
||||||
y = Dense(hidden_dims)(y)
|
|
||||||
y = Dropout(drop_out)(y)
|
y = Dropout(drop_out)(y)
|
||||||
|
y = Dense(hidden_dims)(y)
|
||||||
y = Activation('relu')(y)
|
y = Activation('relu')(y)
|
||||||
return Model(x, y)
|
return Model(x, y)
|
||||||
|
|
||||||
@ -56,3 +56,27 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le
|
|||||||
y2 = Dense(2, activation='softmax', name="server")(y)
|
y2 = Dense(2, activation='softmax', name="server")(y)
|
||||||
|
|
||||||
return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
|
return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
|
||||||
|
|
||||||
|
|
||||||
|
def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
|
||||||
|
dense_dim, cnn):
|
||||||
|
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
|
||||||
|
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
|
||||||
|
encoded = TimeDistributed(cnn)(ipt_domains)
|
||||||
|
|
||||||
|
y2 = Dense(2, activation="softmax", name="server")(encoded)
|
||||||
|
merged = keras.layers.concatenate([encoded, ipt_flows, y2], -1)
|
||||||
|
|
||||||
|
y = Conv1D(cnn_dims,
|
||||||
|
kernel_size,
|
||||||
|
activation='relu',
|
||||||
|
input_shape=(window_size, domain_features + flow_features))(merged)
|
||||||
|
# remove temporal dimension by global max pooling
|
||||||
|
y = GlobalMaxPooling1D()(y)
|
||||||
|
y = Dropout(dropout)(y)
|
||||||
|
y = Dense(dense_dim, activation='relu')(y)
|
||||||
|
|
||||||
|
y1 = Dense(2, activation='softmax', name="client")(y)
|
||||||
|
model = Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
|
||||||
|
|
||||||
|
return model
|
||||||
|
@ -42,3 +42,27 @@ def get_model(cnnDropout, flow_features, domain_features, window_size, domain_le
|
|||||||
y2 = Dense(2, activation='softmax', name="server")(y)
|
y2 = Dense(2, activation='softmax', name="server")(y)
|
||||||
|
|
||||||
return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
|
return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
|
||||||
|
|
||||||
|
|
||||||
|
def get_new_model(dropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
|
||||||
|
dense_dim, cnn):
|
||||||
|
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
|
||||||
|
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
|
||||||
|
encoded = TimeDistributed(cnn)(ipt_domains)
|
||||||
|
|
||||||
|
y2 = Dense(2, activation="softmax", name="server")(encoded)
|
||||||
|
merged = keras.layers.concatenate([encoded, ipt_flows, y2], -1)
|
||||||
|
|
||||||
|
y = Conv1D(cnn_dims,
|
||||||
|
kernel_size,
|
||||||
|
activation='relu',
|
||||||
|
input_shape=(window_size, domain_features + flow_features))(merged)
|
||||||
|
# remove temporal dimension by global max pooling
|
||||||
|
y = GlobalMaxPooling1D()(y)
|
||||||
|
y = Dropout(dropout)(y)
|
||||||
|
y = Dense(dense_dim, activation='relu')(y)
|
||||||
|
|
||||||
|
y1 = Dense(2, activation='softmax', name="client")(y)
|
||||||
|
model = Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
|
||||||
|
|
||||||
|
return model
|
||||||
|
Loading…
x
Reference in New Issue
Block a user