added new networks for domain embedding and classification task

This commit is contained in:
René Knaebel 2017-07-05 17:37:08 +02:00
parent 59c1176e85
commit 3862dce975
4 changed files with 82 additions and 33 deletions

1
.gitignore vendored
View File

@ -100,3 +100,4 @@ ENV/
*.joblib
*.csv
*.csv.gz
*.csv.tar.*

43
main.py
View File

@ -37,9 +37,21 @@ parser.add_argument("--epochs", action="store", dest="epochs",
# parser.add_argument("--samples_val", action="store", dest="samples_val",
# default=10000, type=int)
#
# parser.add_argument("--area", action="store", dest="area_size",
# default=25, type=int)
#
# Character embedding width; main() passes it to the embedding network as
# embedding_size.
parser.add_argument(
    "--embd",
    dest="embedding",
    action="store",
    type=int,
    default=128,
)

# Number of filters for the character-level convolutions of the embedding
# network (its hidden_char_dims argument).
parser.add_argument(
    "--hidden_char_dims",
    dest="hidden_char_dims",
    action="store",
    type=int,
    default=256,
)

# Number of flows per window; main() uses it as window_size for both the
# dataset creation and the top-level network.
parser.add_argument(
    "--window",
    dest="window",
    action="store",
    type=int,
    default=10,
)

# Number of characters per domain; main() uses it as max_len for the dataset
# and as the input length of the embedding network.
parser.add_argument(
    "--domain_length",
    dest="domain_length",
    action="store",
    type=int,
    default=40,
)

# Width of the domain embedding produced by the embedding network (its
# hidden_dims argument); also passed on to the top-level network.
parser.add_argument(
    "--domain_embd",
    dest="domain_embedding",
    action="store",
    type=int,
    default=512,
)
# parser.add_argument("--queue", action="store", dest="queue_size",
# default=50, type=int)
#
@ -59,6 +71,7 @@ parser.add_argument("--epochs", action="store", dest="epochs",
args = parser.parse_args()
# config = tf.ConfigProto(log_device_placement=True)
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
# config.gpu_options.allow_growth = True
@ -67,24 +80,17 @@ args = parser.parse_args()
def main():
# parameter
innerCNNFilters = 512
innerCNNKernelSize = 2
cnnDropout = 0.5
cnnHiddenDims = 1024
domainFeatures = 512
flowFeatures = 3
numCiscoFeatures = 30
windowSize = 10
maxLen = 40
embeddingSize = 100
kernel_size = 2
kernel_size = 3
drop_out = 0.5
filters = 2
filters = 128
hidden_dims = 100
vocabSize = 40
threshold = 3
minFlowsPerUser = 10
numEpochs = 100
char_dict = dataset.get_character_dict()
user_flow_df = dataset.get_user_flow_data()
@ -92,7 +98,7 @@ def main():
print("create training dataset")
(X_tr, hits_tr, names_tr, server_tr, trusted_hits_tr) = dataset.create_dataset_from_flows(
user_flow_df, char_dict,
max_len=maxLen, window_size=windowSize)
max_len=args.domain_length, window_size=args.window)
# make client labels discrete with 4 different values
# TODO: use trusted_hits_tr for client classification too
client_labels = np.apply_along_axis(lambda x: dataset.discretize_label(x, 3), 0, np.atleast_2d(hits_tr))
@ -104,11 +110,14 @@ def main():
client_labels = client_labels[idx]
server_labels = server_tr[idx]
shared_cnn = models.get_shared_cnn(len(char_dict) + 1, embeddingSize, maxLen,
domainFeatures, kernel_size, domainFeatures, 0.5)
shared_cnn = models.get_embedding_network_rene(len(char_dict) + 1, args.embedding, args.domain_length,
args.hidden_char_dims, args.domain_embedding, 0.5)
shared_cnn.summary()
model = models.get_top_cnn(shared_cnn, flowFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size,
cnnHiddenDims, cnnDropout)
model = models.get_top_cnn_rene(cnnDropout, flowFeatures, args.domain_embedding,
args.window, args.domain_length, filters, kernel_size,
cnnHiddenDims, shared_cnn)
model.summary()
model.compile(optimizer='adam',
loss='binary_crossentropy',

View File

@ -1,10 +1,11 @@
import keras
from keras.engine import Input, Model
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed, MaxPool1D
def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_size,
hidden_dims, drop_out):
# designed by paul
def get_embedding_network_paul(vocab_size, embedding_size, input_length, filters, kernel_size,
hidden_dims, drop_out=0.5):
x = y = Input(shape=(input_length,))
y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(y)
y = Conv1D(filters, kernel_size, activation='relu')(y)
@ -15,26 +16,65 @@ def get_shared_cnn(vocab_size, embedding_size, input_length, filters, kernel_siz
return Model(x, y)
def get_embedding_network_rene(vocab_size, embedding_size, input_length,
                               hidden_char_dims, hidden_dims, drop_out=0.5):
    """Build the character-level CNN that embeds one domain sequence.

    Each character index is embedded, then passed through three 1D
    convolutions (kernel sizes 5, 3 and 3) with a stride-1 max pooling after
    the first two. Global max pooling removes the sequence axis and a dense
    layer followed by dropout and ReLU produces the final embedding.

    Args:
        vocab_size: Number of distinct character indices (embedding input_dim).
        embedding_size: Width of each character embedding vector.
        input_length: Number of character indices per input sequence.
        hidden_char_dims: Number of filters used by every convolution.
        hidden_dims: Units of the final dense layer, i.e. the output width.
        drop_out: Dropout rate applied to the dense output.

    Returns:
        A Keras ``Model`` mapping an ``(input_length,)`` input to a
        ``(hidden_dims,)`` output.
    """
    # NOTE(review): assuming Keras' default 'valid' padding, the conv/pool
    # stack shortens the sequence by 12 steps, so input_length should be at
    # least 13 - confirm against the installed Keras version.
    ipt = Input(shape=(input_length,))
    # mask_zero is intentionally not enabled on the embedding: Conv1D does not
    # support masking, and Keras 2.x raises a TypeError when a masked tensor is
    # fed into a layer without masking support.
    y = Embedding(input_dim=vocab_size, output_dim=embedding_size)(ipt)
    y = Conv1D(hidden_char_dims, kernel_size=5, activation='relu')(y)
    y = MaxPool1D(pool_size=3, strides=1)(y)
    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
    y = MaxPool1D(pool_size=3, strides=1)(y)
    y = Conv1D(hidden_char_dims, kernel_size=3, activation='relu')(y)
    # collapse the remaining sequence axis into one feature vector
    y = GlobalMaxPooling1D()(y)
    y = Dense(hidden_dims)(y)
    y = Dropout(drop_out)(y)
    y = Activation('relu')(y)
    return Model(ipt, y)
def get_full_model(vocabSize, embeddingSize, maxLen, domainFeatures, flowFeatures,
                   filters, h1, h2, dropout, dense):
    """Placeholder for a combined model builder; not implemented yet.

    All arguments are currently ignored and the function returns ``None``.
    """
    return None
def get_top_cnn(cnn, numFeatures, maxLen, windowSize, domainFeatures, filters, kernel_size, cnnHiddenDims, cnnDropout):
ipt_domains = Input(shape=(windowSize, maxLen), name="ipt_domains")
# designed by paul
def get_top_cnn(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
dense_dim,
cnn):
ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
encoded = TimeDistributed(cnn)(ipt_domains)
ipt_flows = Input(shape=(windowSize, numFeatures), name="ipt_flows")
ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
merged = keras.layers.concatenate([encoded, ipt_flows], -1)
# add second cnn
y = Conv1D(filters,
# CNN processing small slices of the flow window
# TODO: add more layers?
y = Conv1D(cnn_dims,
kernel_size,
activation='relu',
input_shape=(windowSize, domainFeatures + numFeatures))(merged)
# TODO: why global pooling? -> 3D to 2D
# we use max pooling:
input_shape=(window_size, domain_features + flow_features))(merged)
# remove temporal dimension by global max pooling
y = GlobalMaxPooling1D()(y)
y = Dropout(cnnDropout)(y)
y = Dense(cnnHiddenDims, activation='relu')(y)
y = Dense(dense_dim, activation='relu')(y)
y1 = Dense(2, activation='softmax', name="client")(y)
y2 = Dense(2, activation='softmax', name="server")(y)
return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))
def get_top_cnn_rene(cnnDropout, flow_features, domain_features, window_size, domain_length, cnn_dims, kernel_size,
                     dense_dim, cnn):
    """Build the two-headed window classifier on top of a domain embedding network.

    The given ``cnn`` is applied to every domain of a window, its output is
    concatenated with the per-flow features along the last axis, and the
    merged sequence is processed by a 1D convolution, global max pooling,
    dropout and a dense layer. Two softmax heads named "client" and "server"
    sit on top of the shared dense layer.

    Args:
        cnnDropout: Dropout rate applied after global max pooling.
        flow_features: Number of features per flow in the ``ipt_flows`` input.
        domain_features: Output width of ``cnn``; only used for the
            ``input_shape`` hint of the convolution.
        window_size: Number of flows per window.
        domain_length: Number of character indices per domain.
        cnn_dims: Number of filters of the window-level convolution.
        kernel_size: Kernel size of the window-level convolution.
        dense_dim: Units of the dense layer shared by both heads.
        cnn: Model applied to each domain via ``TimeDistributed``.

    Returns:
        A Keras ``Model`` with inputs ``[ipt_domains, ipt_flows]`` and outputs
        ``(client, server)``.
    """
    ipt_domains = Input(shape=(window_size, domain_length), name="ipt_domains")
    # run the shared domain network on every domain of the window
    encoded = TimeDistributed(cnn)(ipt_domains)
    ipt_flows = Input(shape=(window_size, flow_features), name="ipt_flows")
    merged = keras.layers.concatenate([encoded, ipt_flows], -1)
    # CNN processing small slices of the flow window
    # TODO: add more layers?
    y = Conv1D(filters=cnn_dims, kernel_size=kernel_size, activation='relu',
               input_shape=(window_size, domain_features + flow_features))(merged)
    # remove temporal dimension by global max pooling
    y = GlobalMaxPooling1D()(y)
    y = Dropout(cnnDropout)(y)
    y = Dense(dense_dim, activation='relu')(y)
    y1 = Dense(2, activation='softmax', name="client")(y)
    y2 = Dense(2, activation='softmax', name="server")(y)

    # The model must be returned: callers invoke summary()/compile() on the
    # result, which would fail on an implicit None.
    return Model(inputs=[ipt_domains, ipt_flows], outputs=(y1, y2))

View File

@ -4,7 +4,6 @@ import joblib
import pandas as pd
df = joblib.load("/mnt/projekte/pmlcluster/cisco/trainData/multipleTaskLearning/currentData.joblib")
df = df["data"]
df = pd.concat(df)
df = pd.concat(df["data"])
df.reset_index(inplace=True)
df.to_csv("/tmp/rk/full_dataset.csv.gz", compression="gzip")
df.to_csv("/tmp/rk/full_future_dataset.csv.gz", compression="gzip")