added params

2017-07-07 16:48:10 +02:00
parent 3c4be52bb6
commit be56112b33
4 changed files with 191 additions and 72 deletions
--- a/4
+++ b/4
@@ -1,3 +1,5 @@
 test:
-	python3 main.py --epochs 1 --batch 64 --train data/rk_data.csv.gz --test data/rk_data.csv.gz
+	python3 main.py --modes train --epochs 1 --batch 64 --train data/rk_data.csv.gz

+hyper:
+	python3 main.py --modes hyperband --epochs 1 --batch 64 --train data/rk_data.csv.gz
--- a/hyperband.py
+++ b/hyperband.py
@@ -1,76 +1,128 @@
 # -*- coding: utf-8 -*-
 # implementation of hyperband:
 # https://arxiv.org/pdf/1603.06560.pdf
+import random
+from math import log, ceil
+from random import random as rng
+from time import time, ctime
+
 import numpy as np

-
-def get_hyperparameter_configuration(configGenerator, n):
-    configurations = []
-    for i in np.arange(0, n, 1):
-        configurations.append(configGenerator())
-    return configurations
+import models


-def run_then_return_val_loss(config, r_i, modelGenerator, trainData, trainLabel,
-                             testData, testLabel):
-    # parameter
-    batch_size = 128
-    model = modelGenerator(config)
-    if model != None:
-        model.fit(x=trainData, y=trainLabel,
-                  epochs=int(r_i), shuffle=True, initial_epoch=0,
-                  batch_size=batch_size)
-        score = model.evaluate(testData, testLabel,
-                               batch_size=batch_size)
-        score = score[0]
-    else:
-        score = np.infty
-    return score
+def sample_params(param_distribution: dict):
+    """Draw one random configuration from a grid of candidate values.
+
+    Each key of ``param_distribution`` maps to a sequence of allowed values.
+    The returned dict has the same keys, each bound to a single value picked
+    with ``random.choice`` (keys are visited in the dict's iteration order).
+    """
+    return {
+        name: random.choice(candidates)
+        for name, candidates in param_distribution.items()
+    }


-def top_k(configurations, L, k):
-    outConfigs = []
-    sortIDs = np.argsort(np.array(L))
-    for i in np.arange(0, k, 1):
-        outConfigs.append(configurations[sortIDs[i]])
-    return outConfigs
+class Hyperband:
+    def __init__(self, param_distribution, X, y):
+        """Set up a Hyperband search over ``param_distribution``.
+
+        ``param_distribution`` is handed to ``sample_params`` every time
+        ``self.get_params()`` is called. ``X`` and ``y`` are stored unchanged
+        and later passed to ``model.fit`` by ``try_params``.
+
+        The budget is fixed here: ``max_iter`` = 81 and ``eta`` = 3, from which
+        ``s_max = int(log_eta(max_iter))`` and ``B = (s_max + 1) * max_iter``
+        are derived. ``results``, ``counter``, ``best_loss`` and
+        ``best_counter`` start empty / 0 / +inf / -1 and are updated by ``run``.
+        """
+        self.get_params = lambda: sample_params(param_distribution)

+        self.max_iter = 81  # maximum iterations per configuration
+        self.eta = 3  # defines configuration downsampling rate (default = 3)

-def hyperband(R, nu, modelGenerator,
-              configGenerator,
-              trainData, trainLabel,
-              testData, testLabel,
-              outputFile=''):
-    allLosses = []
-    allConfigs = []
-    # input
+        self.logeta = lambda x: log(x) / log(self.eta)
+        self.s_max = int(self.logeta(self.max_iter))
+        self.B = (self.s_max + 1) * self.max_iter

-    # initialization
-    s_max = np.floor(np.log(R) / np.log(nu))
-    B = (s_max + 1) * R
+        self.results = []  # list of dicts
+        self.counter = 0
+        self.best_loss = np.inf
+        self.best_counter = -1

-    for s in np.arange(s_max, -1, -1):
-        n = np.ceil(np.float(B) / np.float(R) * (np.float(np.power(nu, s)) / np.float(s + 1)))
-        r = np.float(R) * np.power(nu, -s)
-        configurations = get_hyperparameter_configuration(configGenerator, n)
-        for i in np.arange(0, s + 1, 1):
-            n_i = np.floor(np.float(n) * np.power(nu, -i))
-            r_i = np.float(r) * np.power(nu, i)
-            L = []
-            for config in configurations:
-                curLoss = run_then_return_val_loss(config, r_i, modelGenerator,
-                                                   trainData, trainLabel,
-                                                   testData, testLabel)
-                L.append(curLoss)
-                allLosses.append(curLoss)
-                allConfigs.append(config)
-                if outputFile != '':
-                    with open(outputFile, 'a') as myfile:
-                        myfile.write(str(config) + '\t' + str(curLoss) + \
-                                     '\t' + str(r_i) + '\n')
-            configurations = top_k(configurations, L, np.floor(np.float(n_i) / nu))
+        self.X = X
+        self.y = y

-            # print('n_i: ' + str(n_i))
-            # print('r_i: ' + str(r_i))
-    bestConfig = top_k(allConfigs, allLosses, 1)
-    return (bestConfig[0], allConfigs, allLosses)
+    def try_params(self, n_iterations, params):
+        """Build, train and score one configuration.
+
+        ``n_iterations`` is rounded to the nearest integer and used as the
+        number of training epochs. ``params`` is a sampled configuration; it is
+        forwarded to ``models.get_models_by_params`` and must contain
+        ``"batch_size"``.
+
+        Returns a dict with ``"loss"`` (the validation loss of the final
+        epoch, which is what ``run`` ranks configurations by) and
+        ``"train_loss"`` (the training loss of the final epoch, kept for
+        reference).
+        """
+        n_iterations = int(round(n_iterations))
+        # Only the second returned object is trained here; the first one is
+        # unused (presumably the shared embedding sub-model -- confirm in models).
+        _embedding, model = models.get_models_by_params(params)
+        model.compile(optimizer='adam',
+                      loss='categorical_crossentropy',
+                      metrics=['accuracy'])
+
+        history = model.fit(self.X,
+                            self.y,
+                            batch_size=params["batch_size"],
+                            epochs=n_iterations,
+                            shuffle=True,
+                            validation_split=0.2)
+
+        # Rank on the held-out 20 % split, not on the training loss: selecting
+        # configurations by training loss favours whatever overfits fastest.
+        return {"loss": history.history['val_loss'][-1],
+                "train_loss": history.history['loss'][-1]}
+
+    # can be called multiple times
+    def run(self, skip_last=0, dry_run=False):
+        """Execute every Hyperband bracket once and return ``self.results``.
+
+        For each ``s`` from ``s_max`` down to 0, ``n`` configurations are
+        sampled and put through up to ``s + 1`` rounds of successive halving;
+        ``skip_last`` drops that many final rounds from every bracket. After a
+        round, configurations flagged ``early_stop`` are discarded and the
+        best ``int(n_configs / eta)`` by loss go on to the next round.
+
+        With ``dry_run=True`` no model is trained; each evaluation is replaced
+        by random numbers. One result dict per evaluation is appended to
+        ``self.results``, so repeated calls keep accumulating.
+        """
+        for s in range(self.s_max, -1, -1):
+            # bracket size and the per-configuration budget of its first round
+            n = int(ceil(self.B / self.max_iter / (s + 1) * self.eta ** s))
+            r = self.max_iter * self.eta ** (-s)
+
+            candidates = [self.get_params() for _ in range(n)]
+
+            for halving_round in range((s + 1) - int(skip_last)):
+                n_configs = n * self.eta ** (-halving_round)
+                n_iterations = r * self.eta ** (halving_round)
+
+                print("\n*** {} configurations x {:.1f} iterations each".format(
+                    n_configs, n_iterations))
+
+                val_losses, early_stops = [], []
+
+                for config in candidates:
+                    self.counter += 1
+                    print("\n{} | {} | lowest loss so far: {:.4f} (run {})\n".format(
+                        self.counter, ctime(), self.best_loss, self.best_counter))
+
+                    started_at = time()
+                    if dry_run:
+                        result = {'loss': rng(), 'log_loss': rng(), 'auc': rng()}
+                    else:
+                        result = self.try_params(n_iterations, config)
+
+                    assert type(result) == dict
+                    assert 'loss' in result
+
+                    seconds = int(round(time() - started_at))
+                    print("\n{} seconds.".format(seconds))
+
+                    loss = result['loss']
+                    val_losses.append(loss)
+                    early_stops.append(result.get('early_stop', False))
+
+                    # running best is tracked for the progress line only
+                    if loss < self.best_loss:
+                        self.best_loss, self.best_counter = loss, self.counter
+
+                    result.update(counter=self.counter,
+                                  seconds=seconds,
+                                  params=config,
+                                  iterations=n_iterations)
+                    self.results.append(result)
+
+                # survivors: not early-stopped, ordered by ascending loss,
+                # truncated to the quota of the next round
+                ranking = np.argsort(val_losses)
+                survivors = [candidates[j] for j in ranking if not early_stops[j]]
+                candidates = survivors[:int(n_configs / self.eta)]
+
+        return self.results
--- a/main.py
+++ b/main.py
@@ -3,11 +3,12 @@ import argparse
 from keras.utils import np_utils

 import dataset
+import hyperband
 import models

 parser = argparse.ArgumentParser()

-# parser.add_argument("--modes", action="store", dest="modes", nargs="+")
+parser.add_argument("--modes", action="store", dest="modes", nargs="+")

 parser.add_argument("--train", action="store", dest="train_data",
                    default="data/full_dataset.csv.tar.bz2")
@@ -24,9 +25,9 @@ parser.add_argument("--model", action="store", dest="model",
 # parser.add_argument("--pred", action="store", dest="pred",
 #                     default="")
 #
-# parser.add_argument("--type", action="store", dest="model_type",
-#                     default="simple_conv")
-#
+parser.add_argument("--type", action="store", dest="model_type",
+                    default="paul")
+
 parser.add_argument("--batch", action="store", dest="batch_size",
                    default=64, type=int)

@@ -79,13 +80,52 @@ args = parser.parse_args()
 # session = tf.Session(config=config)


+def main_hyperband():
+    """Run a Hyperband search over network hyper-parameters on the training data.
+
+    Loads the character dictionary and the flows from ``args.train_data``,
+    builds the windowed training set, one-hot encodes both label vectors into
+    two classes and passes everything to ``hyperband.Hyperband``.
+
+    The static grid entries that describe the data (batch size, window size,
+    domain length) are taken from the command-line arguments, so the models
+    built by the search always match the dataset created below.
+    """
+    char_dict = dataset.get_character_dict()
+    user_flow_df = dataset.get_user_flow_data(args.train_data)
+
+    params = {
+        # static params: single-element lists, so sampling always yields them
+        "type": ["paul"],
+        "batch_size": [args.batch_size],
+        "vocab_size": [len(char_dict) + 1],
+        "window_size": [args.window],
+        "domain_length": [args.domain_length],
+        "flow_features": [3],
+        # main_train passes args.domain_length as get_embedding's input_length
+        "input_length": [args.domain_length],
+        # model params: the actual search space
+        "embedding_size": [16, 32, 64, 128, 256, 512],
+        "filter_embedding": [16, 32, 64, 128, 256, 512],
+        "kernel_embedding": [1, 3, 5, 7, 9],
+        "hidden_embedding": [16, 32, 64, 128, 256, 512],
+        "dropout": [0.5],
+        "domain_features": [16, 32, 64, 128, 256, 512],
+        "filter_main": [16, 32, 64, 128, 256, 512],
+        "kernels_main": [1, 3, 5, 7, 9],
+        "dense_main": [16, 32, 64, 128, 256, 512],
+    }
+    # Illustrative sample only; the search draws its own configurations.
+    param = hyperband.sample_params(params)
+    print(param)
+
+    print("create training dataset")
+    domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows(
+        user_flow_df, char_dict,
+        max_len=args.domain_length,
+        window_size=args.window)
+    client_tr = np_utils.to_categorical(client_tr, 2)
+    server_tr = np_utils.to_categorical(server_tr, 2)
+
+    hp = hyperband.Hyperband(params, [domain_tr, flow_tr], [client_tr, server_tr])
+    hp.run()
+
+
 def main_train():
    # parameter
    cnnDropout = 0.5
    cnnHiddenDims = 512
    kernel_size = 3
    filters = 128
-    network = models.pauls_networks
+    network = models.pauls_networks if args.model_type == "paul" else models.renes_networks

    char_dict = dataset.get_character_dict()
    user_flow_df = dataset.get_user_flow_data(args.train_data)
@@ -94,6 +134,8 @@ def main_train():
    domain_tr, flow_tr, client_tr, server_tr = dataset.create_dataset_from_flows(
        user_flow_df, char_dict,
        max_len=args.domain_length, window_size=args.window)
+    client_tr = np_utils.to_categorical(client_tr, 2)
+    server_tr = np_utils.to_categorical(server_tr, 2)

    shared_cnn = network.get_embedding(len(char_dict) + 1, args.embedding, args.domain_length,
                                       args.hidden_char_dims, kernel_size, args.domain_embedding, 0.5)
@@ -105,11 +147,9 @@ def main_train():
    model.summary()

    model.compile(optimizer='adam',
-                  loss='binary_crossentropy',
+                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

-    client_tr = np_utils.to_categorical(client_tr, 2)
-    server_tr = np_utils.to_categorical(server_tr, 2)
    model.fit([domain_tr, flow_tr],
              [client_tr, server_tr],
              batch_size=args.batch_size,
@@ -117,6 +157,8 @@ def main_train():
              shuffle=True,
              validation_split=0.2)

+    model.save(args.model)
+

 def main_test():
    char_dict = dataset.get_character_dict()
@@ -154,7 +196,16 @@ def main_score():


 def main():
-    main_train()
+    """Run every stage named in ``--modes``, in a fixed order.
+
+    Order: train, hyperband, test, fancy (visualization), score.
+    """
+    # ``--modes`` has no default, so argparse leaves it as None when the flag
+    # is omitted and ``"train" in None`` would raise TypeError. Fall back to
+    # training only, which is what this entry point did before modes existed.
+    modes = args.modes or ["train"]
+    if "train" in modes:
+        main_train()
+    if "hyperband" in modes:
+        main_hyperband()
+    if "test" in modes:
+        main_test()
+    if "fancy" in modes:
+        main_visualization()
+    if "score" in modes:
+        main_score()


 if __name__ == "__main__":
--- a/models/pauls_networks.py
+++ b/models/pauls_networks.py
@@ -2,6 +2,20 @@ import keras
 from keras.engine import Input, Model
 from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation, TimeDistributed

+# Fixed hyper-parameter set for this network family.
+# NOTE(review): presumably the best configuration found by a search run --
+# confirm where it is consumed. The kernel sizes (8) are not part of the grid
+# searched in main.py ([1, 3, 5, 7, 9]), and the key here is 'drop_out' while
+# main.py's grid uses 'dropout'.
+best_config = {
+    'domain_features': 32,
+    'drop_out': 0.5,
+    'embedding_size': 64,
+    'filter_main': 512,
+    'flow_features': 3,
+    'hidden_dims': 32,
+    'filter_embedding': 32,
+    'hidden_embedding': 32,
+    'kernel_embedding': 8,
+    'kernels_main': 8,
+    'input_length': 40
+}
+

 def get_embedding(vocab_size, embedding_size, input_length,
                  filters, kernel_size, hidden_dims, drop_out=0.5):