ma_cisco_malware/visualize.py

import os

import matplotlib.pyplot as plt
import numpy as np
from scipy import interpolate
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import (
    auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,
    roc_auc_score, roc_curve
)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


def _f_beta(precision, recall, beta):
    """Combine precision and recall into the F-beta score (0.0 when undefined)."""
    beta_sq = beta ** 2
    return _safe_ratio((1 + beta_sq) * (precision * recall), beta_sq * precision + recall)


def scores(y_true, results_dir="results/", threshold=0.5):
    """Print threshold-based metrics for every stored prediction file.

    Walks ``results_dir`` recursively. For each file whose name ends with
    ``npy`` and whose containing directory path ends with the character
    ``"1"``, the predictions are loaded, flattened and compared against
    ``y_true``. The directory path is printed, followed by precision, recall,
    accuracy, F1 and F0.5.

    Args:
        y_true: Array-like of 0/1 labels; it is flattened so it lines up
            element-wise with the flattened predictions.
        results_dir: Root directory to search. Defaults to ``"results/"``.
        threshold: Predictions ``>= threshold`` count as positive, predictions
            ``< threshold`` as negative. Defaults to ``0.5``.

    Returns:
        None. All results are written to stdout.

    A metric whose denominator is zero (e.g. precision when nothing is
    predicted positive) is printed as ``0.0`` rather than ``nan``.
    """
    # Flatten so that a column-vector label array does not broadcast against
    # the flat prediction vector.
    y_true = np.asarray(y_true).flatten()
    is_positive = y_true == 1
    is_negative = y_true == 0

    for path, _dirnames, fnames in os.walk(results_dir):
        if not path.endswith("1"):
            continue
        for fname in fnames:
            if not fname.endswith("npy"):
                continue
            y_pred = np.load(os.path.join(path, fname)).flatten()
            print(path)

            predicted_positive = y_pred >= threshold
            predicted_negative = y_pred < threshold
            tp = np.sum(np.logical_and(predicted_positive, is_positive))
            tn = np.sum(np.logical_and(predicted_negative, is_negative))
            fp = np.sum(np.logical_and(predicted_positive, is_negative))
            fn = np.sum(np.logical_and(predicted_negative, is_positive))

            precision = _safe_ratio(tp, tp + fp)
            recall = _safe_ratio(tp, tp + fn)
            accuracy = _safe_ratio(tp + tn, y_true.size)

            print("  precision:", precision)
            print("  recall:", recall)
            print("  accuracy:", accuracy)
            print("  f1 score:", _f_beta(precision, recall, 1))
            print("  f0.5 score:", _f_beta(precision, recall, 0.5))


def plot_clf():
    """Clear the contents of the current pyplot figure."""
    current_figure = plt.gcf()
    current_figure.clear()


def plot_save(path, dpi=300):
    """Resize the current figure, write it to ``path`` and close it.

    Args:
        path: Destination handed to ``Figure.savefig``.
        dpi: Resolution handed to ``Figure.savefig``.
    """
    width_in, height_in = 18.5, 10.5
    current_figure = plt.gcf()
    current_figure.set_size_inches(width_in, height_in)
    current_figure.savefig(path, dpi=dpi)
    plt.close()


def plot_legend():
    """Add a legend to the current axes."""
    current_axes = plt.gca()
    current_axes.legend()


def plot_precision_recall(y, y_pred, label=""):
    """Draw one precision-recall curve on the current axes.

    Both inputs are flattened. The curve comes from
    ``precision_recall_curve``; the legend entry is ``label`` followed by the
    F1 score of the rounded predictions.

    Args:
        y: Array of true labels.
        y_pred: Array of predicted scores, same number of elements as ``y``.
        label: Prefix of the legend entry.
    """
    y = y.flatten()
    y_pred = y_pred.flatten()
    precision, recall, _ = precision_recall_curve(y, y_pred)

    # ``beta`` is passed by keyword: recent scikit-learn releases no longer
    # accept it positionally, and older releases accept the keyword as well.
    score = fbeta_score(y, y_pred.round(), beta=1)

    plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])


def calc_pr_mean(y, y_preds):
    """Average several precision-recall curves on a common recall grid.

    For every prediction array the precision-recall curve is computed against
    ``y`` and linearly interpolated at 10000 evenly spaced recall values
    between 0 and 1. The F1 score of the rounded predictions is collected too.

    Args:
        y: Array of true labels (flattened internally).
        y_preds: Iterable of prediction arrays, each flattened internally.

    Returns:
        Tuple ``(mean_precision, std_precision, mean_f1)``: point-wise mean and
        standard deviation over the interpolated curves, and the mean F1 score.

    Raises:
        ValueError: If ``y_preds`` yields no prediction arrays.
    """
    y = y.flatten()
    x = np.linspace(0, 1, 10000)
    curves = []
    # Deliberately not called ``scores``: that name is a function in this module.
    f1_scores = []

    for y_pred in y_preds:
        y_pred = y_pred.flatten()
        precision, recall, _ = precision_recall_curve(y, y_pred)
        curves.append(interpolate.interp1d(recall, precision)(x))
        # ``beta`` by keyword: recent scikit-learn rejects a positional beta.
        f1_scores.append(fbeta_score(y, y_pred.round(), beta=1))

    if not curves:
        raise ValueError("y_preds must contain at least one prediction array")

    ys = np.vstack(curves)
    return ys.mean(axis=0), ys.std(axis=0), np.mean(f1_scores)


def plot_mean_curve(x, ys, std, score, label):
    """Plot a mean curve with a shaded band of one ``std`` on either side.

    Args:
        x: Values for the x axis.
        ys: Mean curve values at ``x``.
        std: Band half-width at ``x``; the band spans ``ys - std`` to ``ys + std``.
        score: Number shown after ``label`` in the legend entry.
        label: Prefix of the legend entry.
    """
    legend_text = f"{label} - {score:5.4}"
    plt.plot(x, ys, label=legend_text)

    band_low = ys - std
    band_high = ys + std
    plt.fill_between(x, band_low, band_high, alpha=0.1)

    unit_range = [0.0, 1.0]
    plt.ylim(list(unit_range))
    plt.xlim(list(unit_range))


def plot_pr_mean(y, y_preds, label=""):
    """Plot the averaged precision-recall curve of several predictions.

    Args:
        y: Array of true labels, forwarded to ``calc_pr_mean``.
        y_preds: Iterable of prediction arrays, forwarded to ``calc_pr_mean``.
        label: Prefix of the legend entry.
    """
    recall_grid = np.linspace(0, 1, 10000)
    mean_curve, spread, mean_score = calc_pr_mean(y, y_preds)
    plot_mean_curve(recall_grid, mean_curve, spread, mean_score, label)
    for set_axis_label, text in ((plt.xlabel, 'Recall'), (plt.ylabel, 'Precision')):
        set_axis_label(text)


def score_model(y, prediction):
    """Print a set of classification metrics for one prediction array.

    Prints the scikit-learn classification report, the area under the
    precision-recall curve, the ROC AUC, and the F1 and F0.5 scores. The
    report and the F-scores use the rounded predictions; the two area
    metrics use the raw scores.

    Args:
        y: Array of true labels (flattened internally).
        prediction: Array of predicted scores (flattened internally).
    """
    y = y.flatten()
    y_pred = prediction.flatten()
    y_label = y_pred.round()

    precision, recall, _ = precision_recall_curve(y, y_pred)

    print(classification_report(y, y_label))
    print("Area under PR curve", auc(recall, precision))
    print("roc auc score", roc_auc_score(y, y_pred))
    # ``beta`` by keyword: recent scikit-learn rejects a positional beta,
    # and older releases accept the keyword as well.
    print("F1 Score", fbeta_score(y, y_label, beta=1))
    print("F0.5 Score", fbeta_score(y, y_label, beta=0.5))


def plot_roc_curve(mask, prediction, label=""):
    """Draw one ROC curve on the current axes, with a log-scaled x axis.

    Args:
        mask: Array of true labels (flattened internally).
        prediction: Array of predicted scores (flattened internally).
        label: Prefix of the legend entry; the area under the curve follows it.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(mask.flatten(), prediction.flatten())
    area = auc(false_pos_rate, true_pos_rate)
    legend_text = f"{label} - {area:5.4}"

    plt.xscale('log')
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)
    plt.ylim([0.0, 1.0])
    # NOTE(review): a lower x-limit of 0.0 is requested on a log-scaled axis;
    # confirm the resulting range is the intended one.
    plt.xlim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')


def calc_roc_mean(y, y_preds):
    """Average several ROC curves on a common false-positive-rate grid.

    Each prediction array yields a ROC curve against ``y``. The curve is
    linearly interpolated at 10000 evenly spaced points between 0 and 1, and
    its area under the curve is recorded.

    Args:
        y: Array of true labels (flattened internally).
        y_preds: Iterable of prediction arrays, each flattened internally.

    Returns:
        Tuple ``(mean_tpr, std_tpr, mean_auc)``: point-wise mean and standard
        deviation over the interpolated curves, and the mean area.
    """
    truth = y.flatten()
    interpolators, areas = [], []

    for raw_prediction in y_preds:
        fpr, tpr, _ = roc_curve(truth, raw_prediction.flatten())
        interpolators.append(interpolate.interp1d(fpr, tpr))
        areas.append(auc(fpr, tpr))

    grid = np.linspace(0, 1, 10000)
    stacked = np.vstack([curve(grid) for curve in interpolators])
    return stacked.mean(axis=0), stacked.std(axis=0), np.mean(areas)


def plot_roc_mean(y, y_preds, label=""):
    """Plot the averaged ROC curve of several predictions on a log x axis.

    Args:
        y: Array of true labels, forwarded to ``calc_roc_mean``.
        y_preds: Iterable of prediction arrays, forwarded to ``calc_roc_mean``.
        label: Prefix of the legend entry.
    """
    fpr_grid = np.linspace(0, 1, 10000)
    mean_curve, spread, mean_auc = calc_roc_mean(y, y_preds)
    plt.xscale('log')
    plot_mean_curve(fpr_grid, mean_curve, spread, mean_auc, label)
    for set_axis_label, text in (
        (plt.xlabel, 'False Positive Rate'),
        (plt.ylabel, 'True Positive Rate'),
    ):
        set_axis_label(text)
    

def plot_confusion_matrix(y_true, y_pred, path,
                          normalize=False,
                          classes=("benign", "malicious"),
                          title='Confusion matrix',
                          cmap="Blues", dpi=600):
    """Print the confusion matrix and save it as an annotated image.

    Args:
        y_true: True labels, passed to ``confusion_matrix``.
        y_pred: Predicted labels, passed to ``confusion_matrix``.
        path: Destination of the saved figure.
        normalize: If true, every row of the matrix is divided by its row sum
            before printing and plotting.
        classes: Tick labels for both axes.
        title: Figure title.
        cmap: Colormap passed to ``imshow``.
        dpi: Resolution of the saved figure.
    """
    plt.clf()
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Normalized cells are fractions: show two decimals instead of the full
    # float repr. Raw counts stay integers.
    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], cell_format),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so that they are taken into account.
    plt.tight_layout()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_training_curve(logs, key, path, dpi=600):
    """Plot accuracy and F1 histories from a training log and save the figure.

    Args:
        logs: Mapping indexed by metric name; reads ``f"{key}acc"``,
            ``f"{key}f1_score"`` and ``f"val_{key}acc"``.
        key: Prefix inserted into the metric names.
        path: Destination of the saved figure.
        dpi: Resolution of the saved figure.
    """
    plt.clf()

    # (log entry, legend label) pairs; ``val_{key}f1_score`` is not plotted.
    series = (
        (f"{key}acc", "accuracy"),
        (f"{key}f1_score", "f1_score"),
        (f"val_{key}acc", "val_accuracy"),
    )
    for log_name, legend_name in series:
        plt.plot(logs[log_name], label=legend_name)

    plt.xlabel('epoch')
    plt.ylabel('percentage')
    plt.legend()
    plt.savefig(path, dpi=dpi)
    plt.close()


def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):
    """Project embeddings to two dimensions and save a coloured scatter plot.

    Args:
        domain_embedding: Matrix passed to the reducer's ``fit_transform``.
        labels: Label array. It is multiplied by ``(1, 2)`` and summed over
            axis 1 to get one integer colour code per row. Presumably two 0/1
            columns; verify against the caller.
        path: Destination of the saved figure.
        dpi: Resolution of the saved figure.
        method: ``"svd"`` (TruncatedSVD) or ``"tsne"`` (TSNE).

    Raises:
        ValueError: If ``method`` is neither ``"svd"`` nor ``"tsne"``.
    """
    if method == "svd":
        red = TruncatedSVD(n_components=2)
    elif method == "tsne":
        red = TSNE(n_components=2, verbose=2)
    else:
        # Without this branch an unknown method left ``red`` unbound.
        raise ValueError(
            f"unknown reduction method {method!r}; expected 'svd' or 'tsne'"
        )
    domain_reduced = red.fit_transform(domain_embedding)
    # Only report the explained variance when the fitted reducer has it;
    # TSNE does not, and the unconditional access failed after the fit.
    if hasattr(red, "explained_variance_ratio_"):
        print(red.explained_variance_ratio_)
    # use if draw subset of predictions
    # idx = np.random.choice(np.arange(len(domain_reduced)), 10000)
    plt.scatter(domain_reduced[:, 0],
                domain_reduced[:, 1],
                c=(labels * (1, 2)).sum(1).astype(int),
                cmap=plt.cm.plasma,
                s=3,
                alpha=0.2)
    plt.colorbar()
    plt.savefig(path, dpi=dpi)


def plot_model_as(model, path):
    """Render a Keras model graph to ``path``, with shapes and layer names.

    Args:
        model: Keras model passed to ``plot_model``.
        path: Output file passed as ``to_file``.
    """
    # Imported lazily so the rest of this module works without Keras.
    try:
        # Public location; the ``vis_utils`` submodule is absent from newer
        # Keras releases.
        from keras.utils import plot_model
    except ImportError:
        from keras.utils.vis_utils import plot_model
    plot_model(model, to_file=path, show_shapes=True, show_layer_names=True)
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`import os`

			`import matplotlib.pyplot as plt`
			`import numpy as np`
add first version of model averaging visualization 2017-09-26 19:25:37 +02:00			`from scipy import interpolate`
replace pca reduction by sklearn's truncated svd 2017-07-29 19:41:14 +02:00			`from sklearn.decomposition import TruncatedSVD`
add tsne (does not work with big data) fix model loading with custom selu function 2017-09-22 10:01:12 +02:00			`from sklearn.manifold import TSNE`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`from sklearn.metrics import (`
			`auc, classification_report, confusion_matrix, fbeta_score, precision_recall_curve,`
add regularization to small networks, fix model name in args, fix visualizations 2017-09-10 18:06:40 +02:00			`roc_auc_score, roc_curve`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`)`


load names with data for per-user evaluation 2017-09-02 16:02:48 +02:00			`def scores(y_true):`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`for (path, dirnames, fnames) in os.walk("results/"):`
			`for f in fnames:`
			`if path[-1] == "1" and f.endswith("npy"):`
			`y_pred = np.load(os.path.join(path, f)).flatten()`
			`print(path)`
			`tp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 1))`
			`tn = np.sum(np.logical_and(y_pred < 0.5, y_true == 0))`
			`fp = np.sum(np.logical_and(y_pred >= 0.5, y_true == 0))`
			`fn = np.sum(np.logical_and(y_pred < 0.5, y_true == 1))`
			`precision = tp / (tp + fp)`
			`recall = tp / (tp + fn)`
			`accuracy = (tp + tn) / len(y_true)`
			`f1_score = 2 * (precision * recall) / (precision + recall)`
			`f05_score = (1 + 0.5 ** 2) * (precision * recall) / (0.5 ** 2 * precision + recall)`
			`print(" precision:", precision)`
			`print(" recall:", recall)`
			`print(" accuracy:", accuracy)`
			`print(" f1 score:", f1_score)`
			`print(" f0.5 score:", f05_score)`


refactor visualization, change arguments for model type and its depth 2017-09-01 10:42:26 +02:00			`def plot_clf():`
			`plt.clf()`


fix missing parameters, add flat network structure, make larger graphics 2017-09-20 14:43:28 +02:00			`def plot_save(path, dpi=300):`
			`fig = plt.gcf()`
			`fig.set_size_inches(18.5, 10.5)`
			`fig.savefig(path, dpi=dpi)`
refactor visualization, change arguments for model type and its depth 2017-09-01 10:42:26 +02:00			`plt.close()`


			`def plot_legend():`
			`plt.legend()`


			`def plot_precision_recall(y, y_pred, label=""):`
			`y = y.flatten()`
			`y_pred = y_pred.flatten()`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`precision, recall, thresholds = precision_recall_curve(y, y_pred)`
load names with data for per-user evaluation 2017-09-02 16:02:48 +02:00			`# decreasing_max_precision = np.maximum.accumulate(precision)[::-1]`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00
			`# fig, ax = plt.subplots(1, 1)`
			`# ax.hold(True)`
add parser argument for naming in multi model modes, minor fixes, 2017-09-05 12:40:37 +02:00			`score = fbeta_score(y, y_pred.round(), 1)`
			`# prc_ap = average_precision_score(y, y_pred)`
add regularization to small networks, fix model name in args, fix visualizations 2017-09-10 18:06:40 +02:00			`plt.plot(recall, precision, '--', label=f"{label} - {score:5.4}")`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`# ax.step(recall[::-1], decreasing_max_precision, '-r')`
			`plt.xlabel('Recall')`
			`plt.ylabel('Precision')`
remove model selection based on validation loss 2017-09-16 15:25:34 +02:00			`plt.ylim([0.0, 1.0])`
			`plt.xlim([0.0, 1.0])`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00

add first version of model averaging visualization 2017-09-26 19:25:37 +02:00			`def calc_pr_mean(y, y_preds):`
			`appr = []`
			`scores = []`
			`y = y.flatten()`

			`for idx, y_pred in enumerate(y_preds):`
remove model selection based on validation loss 2017-09-16 15:25:34 +02:00			`y_pred = y_pred.flatten()`
			`precision, recall, thresholds = precision_recall_curve(y, y_pred)`
add first version of model averaging visualization 2017-09-26 19:25:37 +02:00			`appr.append(interpolate.interp1d(recall, precision))`
			`scores.append(fbeta_score(y, y_pred.round(), 1))`
			`x = np.linspace(0, 1, 10000)`
			`ys = np.vstack([f(x) for f in appr])`
			`ys_mean = ys.mean(axis=0)`
			`ys_std = ys.std(axis=0)`
			`scores_mean = np.mean(scores)`
			`return ys_mean, ys_std, scores_mean`


add retrain mode 2017-09-28 12:23:22 +02:00			`def plot_mean_curve(x, ys, std, score, label):`
			`plt.plot(x, ys, label=f"{label} - {score:5.4}")`
			`plt.fill_between(x, ys - std, ys + std, alpha=0.1)`
			`plt.ylim([0.0, 1.0])`
			`plt.xlim([0.0, 1.0])`


add first version of model averaging visualization 2017-09-26 19:25:37 +02:00			`def plot_pr_mean(y, y_preds, label=""):`
			`x = np.linspace(0, 1, 10000)`
			`ys_mean, ys_std, score = calc_pr_mean(y, y_preds)`
add retrain mode 2017-09-28 12:23:22 +02:00			`plot_mean_curve(x, ys_mean, ys_std, score, label)`
remove model selection based on validation loss 2017-09-16 15:25:34 +02:00			`plt.xlabel('Recall')`
			`plt.ylabel('Precision')`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00

			`def score_model(y, prediction):`
			`y = y.flatten()`
			`y_pred = prediction.flatten()`

			`precision, recall, thresholds = precision_recall_curve(y, y_pred)`

			`print(classification_report(y, y_pred.round()))`
			`print("Area under PR curve", auc(recall, precision))`
			`print("roc auc score", roc_auc_score(y, y_pred))`
			`print("F1 Score", fbeta_score(y, y_pred.round(), 1))`
			`print("F0.5 Score", fbeta_score(y, y_pred.round(), 0.5))`


refactor visualization, change arguments for model type and its depth 2017-09-01 10:42:26 +02:00			`def plot_roc_curve(mask, prediction, label=""):`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`y = mask.flatten()`
			`y_pred = prediction.flatten()`
			`fpr, tpr, thresholds = roc_curve(y, y_pred)`
			`roc_auc = auc(fpr, tpr)`
add parser argument for naming in multi model modes, minor fixes, 2017-09-05 12:40:37 +02:00			`plt.xscale('log')`
add regularization to small networks, fix model name in args, fix visualizations 2017-09-10 18:06:40 +02:00			`plt.plot(fpr, tpr, label=f"{label} - {roc_auc:5.4}")`
remove model selection based on validation loss 2017-09-16 15:25:34 +02:00			`plt.ylim([0.0, 1.0])`
			`plt.xlim([0.0, 1.0])`
			`plt.xlabel('False Positive Rate')`
			`plt.ylabel('True Positive Rate')`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00

add first version of model averaging visualization 2017-09-26 19:25:37 +02:00			`def calc_roc_mean(y, y_preds):`
			`appr = []`
			`aucs = []`
			`y = y.flatten()`

			`for idx, y_pred in enumerate(y_preds):`
			`y_pred = y_pred.flatten()`
			`fpr, tpr, thresholds = roc_curve(y, y_pred)`
			`appr.append(interpolate.interp1d(fpr, tpr))`
			`aucs.append(auc(fpr, tpr))`
			`x = np.linspace(0, 1, 10000)`
			`ys = np.vstack([f(x) for f in appr])`
			`ys_mean = ys.mean(axis=0)`
			`ys_std = ys.std(axis=0)`
			`auc_mean = np.mean(aucs)`
			`return ys_mean, ys_std, auc_mean`


			`def plot_roc_mean(y, y_preds, label=""):`
			`x = np.linspace(0, 1, 10000)`
add retrain mode 2017-09-28 12:23:22 +02:00			`ys_mean, ys_std, score = calc_roc_mean(y, y_preds)`
add first version of model averaging visualization 2017-09-26 19:25:37 +02:00			`plt.xscale('log')`
add retrain mode 2017-09-28 12:23:22 +02:00			`plot_mean_curve(x, ys_mean, ys_std, score, label)`
add first version of model averaging visualization 2017-09-26 19:25:37 +02:00			`plt.xlabel('False Positive Rate')`
			`plt.ylabel('True Positive Rate')`


add custom class weights based on sklearn balance 2017-07-14 15:57:52 +02:00			`def plot_confusion_matrix(y_true, y_pred, path,`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`normalize=False,`
remove regularizer for conv and domain 2017-09-10 23:40:14 +02:00			`classes=("benign", "malicious"),`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`title='Confusion matrix',`
add custom class weights based on sklearn balance 2017-07-14 15:57:52 +02:00			`cmap="Blues", dpi=600):`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`"""`
			`This function prints and plots the confusion matrix.`
			Normalization can be applied by setting `normalize=True`.
			`"""`
			`plt.clf()`
			`cm = confusion_matrix(y_true, y_pred)`

			`if normalize:`
			`cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]`
			`print("Normalized confusion matrix")`
			`else:`
			`print('Confusion matrix, without normalization')`
			`print(cm)`

fix covariance normalization; add run_model script for multi times training 2017-09-11 12:42:44 +02:00			`plt.imshow(cm, interpolation='nearest', cmap=cmap)`
			`plt.title(title)`
			`plt.colorbar()`
			`tick_marks = np.arange(len(classes))`
			`plt.xticks(tick_marks, classes, rotation=45)`
			`plt.yticks(tick_marks, classes)`

add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`thresh = cm.max() / 2.`
			`for i, j in ((i, j) for i in range(cm.shape[0]) for j in range(cm.shape[1])):`
			`plt.text(j, i, cm[i, j],`
			`horizontalalignment="center",`
			`color="white" if cm[i, j] > thresh else "black")`

			`plt.tight_layout()`
			`plt.ylabel('True label')`
			`plt.xlabel('Predicted label')`
add custom class weights based on sklearn balance 2017-07-14 15:57:52 +02:00			`plt.savefig(path, dpi=dpi)`
			`plt.close()`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00

			`def plot_training_curve(logs, key, path, dpi=600):`
			`plt.clf()`
fix lazy domain loading and generation process 2017-08-03 12:27:17 +02:00			`plt.plot(logs[f"{key}acc"], label="accuracy")`
			`plt.plot(logs[f"{key}f1_score"], label="f1_score")`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00
remove regularizer for conv and domain 2017-09-10 23:40:14 +02:00			`plt.plot(logs[f"val_{key}acc"], label="val_accuracy")`
			`# plt.plot(logs[f"val_{key}f1_score"], label="val_f1_score")`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00
			`plt.xlabel('epoch')`
			`plt.ylabel('percentage')`
			`plt.legend()`
			`plt.savefig(path, dpi=dpi)`
			`plt.close()`


add tsne (does not work with big data) fix model loading with custom selu function 2017-09-22 10:01:12 +02:00			`def plot_embedding(domain_embedding, labels, path, dpi=600, method="svd"):`
			`if method == "svd":`
			`red = TruncatedSVD(n_components=2)`
			`elif method == "tsne":`
			`red = TSNE(n_components=2, verbose=2)`
			`domain_reduced = red.fit_transform(domain_embedding)`
			`print(red.explained_variance_ratio_)`
add embedding visualization and domain encoding generator 2017-07-29 10:43:59 +02:00			`# use if draw subset of predictions`
			`# idx = np.random.choice(np.arange(len(domain_reduced)), 10000)`
			`plt.scatter(domain_reduced[:, 0],`
			`domain_reduced[:, 1],`
			`c=(labels * (1, 2)).sum(1).astype(int),`
			`cmap=plt.cm.plasma,`
add parser argument for naming in multi model modes, minor fixes, 2017-09-05 12:40:37 +02:00			`s=3,`
			`alpha=0.2)`
add embedding visualization and domain encoding generator 2017-07-29 10:43:59 +02:00			`plt.colorbar()`
			`plt.savefig(path, dpi=dpi)`


add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`def plot_model_as(model, path):`
refactor visualization, change arguments for model type and its depth 2017-09-01 10:42:26 +02:00			`from keras.utils.vis_utils import plot_model`
add visualization for training curves, pr, roc 2017-07-14 14:58:17 +02:00			`plot_model(model, to_file=path, show_shapes=True, show_layer_names=True)`