Commit 141718f8 authored by Ignacio Crespo

check models elk

parents
import socket
import json
import sys
import redis
import pandas as pd
import json
import collections
from collections import OrderedDict
import yaml
import moev.MoEv as MoEv
import datetime
import time
import pytz
# Address of the TCP endpoint that receives the enriched netflow record as JSON
# (presumably a Logstash/ELK TCP input - TODO confirm against the deployment).
HOST = '172.16.238.17'
PORT = 9563
def parse_timestamp(date):
    """Convert a UTC timestamp string into integer Unix epoch seconds.

    Args:
        date: Timestamp formatted as ``%Y-%m-%dT%H:%M:%S`` and expressed in
            UTC (the caller strips the fractional part beforehand).

    Returns:
        The number of seconds since the Unix epoch, as an ``int``.

    Raises:
        ValueError: If ``date`` does not match the expected format.

    The previous implementation converted the value to Europe/Madrid wall-clock
    time and passed it to ``time.mktime``, which interprets its argument in the
    *host's* local timezone. The result was therefore only correct when the
    machine itself ran in Europe/Madrid. ``calendar.timegm`` interprets the
    tuple as UTC, so the result no longer depends on the host configuration.
    """
    # Local import: calendar is not part of the file's top-level import block.
    import calendar

    print(date)
    parsed = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
    timestamp = calendar.timegm(parsed.timetuple())
    print(int(timestamp))
    return int(timestamp)
def check_attack(json_flow):
    """Feed one flow record to a freshly built MoEv manager and return its predictions.

    The manager is created with the "./moev/conf.yaml" configuration for the
    "netflow" data type; the record is loaded, the dataset is cleaned and the
    value returned by ``createModels()`` is handed back to the caller.
    The preprocessDataset() and reductFeatures() steps are not executed here.
    """
    manager = MoEv.MoEv("./moev/conf.yaml", 0.33, None, "Label", "netflow")
    configuration = manager.get_conf_file()
    manager.load_json(json_flow, configuration)
    manager.cleanDataset()
    return manager.createModels()
# Script body: read the newest flow from redis, rebuild it in the column layout
# fed to MoEv, attach the prediction and forward the enriched record over TCP.
r = redis.StrictRedis(host='172.16.238.11', port=6379, db=0)
print(r)

# Read (without removing) the last element of the "logstash" list.
netflow_redis = r.lindex("logstash", -1)
if netflow_redis is None:
    # lindex returns None when the list is empty or missing; json.loads(None)
    # would otherwise fail with an unrelated TypeError.
    sys.stderr.write("[ERROR] no netflow record found in redis list 'logstash'\n")
    sys.exit(3)
netflow = json.loads(netflow_redis)
print(netflow["netflow"]["in_bytes"])

# Column order matters, hence the OrderedDict.
netflow_moev = collections.OrderedDict()
netflow_moev['#:unix_secs'] = parse_timestamp(netflow["@timestamp"].split('.')[0])
# unix_nsecs, sysuptime, first and last are fixed literals, not taken from the record.
netflow_moev['unix_nsecs'] = 886670
netflow_moev['sysuptime'] = 69000
netflow_moev['exaddr'] = netflow["host"]["ip"]
netflow_moev['dpkts'] = netflow["netflow"]["in_pkts"]
netflow_moev['doctets'] = netflow["netflow"]["in_bytes"]
netflow_moev['first'] = 53440
netflow_moev['last'] = 53440
netflow_moev['engine_type'] = netflow["netflow"]["engine_type"]
netflow_moev['engine_id'] = netflow["netflow"]["engine_id"]
netflow_moev['srcaddr'] = netflow["source"]["ip"]
netflow_moev['dstaddr'] = netflow["destination"]["ip"]
netflow_moev['nexthop'] = netflow["flow"]["next_hop"]
netflow_moev['input'] = netflow["flow"]["input_snmp"]
# NOTE(review): the key is spelled 'ouput'; kept unchanged because the
# consumer may expect this exact column name - verify against the MoEv conf.
netflow_moev['ouput'] = netflow["flow"]["output_snmp"]
netflow_moev['srcport'] = netflow["source"]["port"]
netflow_moev['dstport'] = netflow["destination"]["port"]
netflow_moev['prot'] = netflow["network"]["iana_number"]
netflow_moev['tos'] = netflow["flow"]["tos"]

# Translate the list of TCP flag names into the value stored in tcp_flags.yml.
# The with-block closes the file; no explicit close() is needed.
# NOTE(review): yaml.safe_load would presumably suffice for a plain mapping - confirm.
with open("./tcp_flags.yml") as f:
    file_flags = yaml.load(f, Loader=yaml.FullLoader)
tcp_flags = file_flags[('-'.join(netflow["flow"]["tcp_flags"]))]
netflow_moev['tcp_flags'] = tcp_flags
netflow_moev['src_mask'] = netflow["flow"]["src_mask_len"]
netflow_moev['dst_mask'] = netflow["flow"]["dst_mask_len"]
netflow_moev['src_as'] = netflow["netflow"]["src_as"]
netflow_moev['dst_as'] = netflow["netflow"]["dst_as"]

# Round-trip through JSON so check_attack receives plain JSON types in order.
test = json.dumps(netflow_moev)
predictions = check_attack(json.loads(test, object_pairs_hook=OrderedDict))
netflow.update({"Check_Attack": predictions})
print(json.dumps(netflow))

try:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
except socket.error as msg:
    # strerror carries the same text msg[1] did, but also works when the
    # exception has a single argument or is not indexable.
    sys.stderr.write("[ERROR] %s\n" % (getattr(msg, "strerror", None) or msg))
    sys.exit(1)
try:
    sock.connect((HOST, PORT))
except socket.error as msg:
    sock.close()
    sys.stderr.write("[ERROR] %s\n" % (getattr(msg, "strerror", None) or msg))
    sys.exit(2)
try:
    # sendall keeps writing until the whole payload is out; send() may stop early.
    sock.sendall(json.dumps(netflow).encode("utf-8"))
finally:
    sock.close()
sys.exit(0)
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import numpy as np
import pandas as pd
import logging
# Configure the root logger: INFO and above go to the file MoEv.log.
logging.basicConfig(level=logging.INFO, filename='MoEv.log', format='%(asctime)s: %(levelname)s - %(message)s')
logger = logging.getLogger('MoEv')
# Build a console handler that also emits INFO and above.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
# The console uses the same format string as the log file.
formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
console.setFormatter(formatter)
# Attach the console handler to the 'MoEv' logger (not to the root logger).
logging.getLogger('MoEv').addHandler(console)
# NOTE(review): this rebinds `logger`, replacing the 'MoEv' logger obtained
# above with the module-named logger - confirm which one is intended.
logger = logging.getLogger(__name__)
class Cleaner:
    """Cleaning and sanity-check helpers operating on pandas DataFrames.

    Every method takes a DataFrame and returns a DataFrame. Progress messages
    are emitted through the module-level ``logging`` functions.
    """

    def __init__(self):
        pass

    def date_to_posix(self, value):
        """Parse the date string ``value`` and return it as POSIX seconds (float)."""
        # infer_datetime_format is no longer passed: it is deprecated and has
        # no effect on current pandas, and was only a speed hint before.
        return pd.to_datetime(value.strip()).timestamp()

    def ip_to_integer(self, ip):
        """Return the dotted-quad IPv4 string ``ip`` as a single integer."""
        # A list (not a bare map object) so the octets can be indexed on
        # Python 3 as well as Python 2.
        octets = [int(part) for part in ip.split('.')]
        return ((octets[0] * pow(256, 3)) + (octets[1] * pow(256, 2))
                + (octets[2] * 256) + octets[3])

    def parse_time(self, df, target):
        """Convert every non-float entry of the 'Timestamp' column to POSIX seconds.

        ``target`` is accepted for signature compatibility but is not used;
        the column name 'Timestamp' is fixed. ``df`` is modified and returned.
        """
        logging.info("Process start: parse date into integer (POSIX)")
        # One pass over the column instead of one whole-column replace per value.
        df['Timestamp'] = df['Timestamp'].map(
            lambda item: item if isinstance(item, float) else self.date_to_posix(item)
        )
        logging.info("Process finished: parse date into integer (POSIX)")
        return df

    def map_ips(self, df, target):
        """Convert every non-integer entry of column ``target`` from an IPv4 string to an integer.

        ``df`` is modified and returned.
        """
        logging.info("Process start: parse IP into integer")
        df[target] = df[target].map(
            lambda item: item if isinstance(item, (int, np.integer)) else self.ip_to_integer(item)
        )
        logging.info("Process finished: parse IP into integer")
        return df

    def fix_CIC_error(self, df, min_fields=82):
        """Return ``df`` without the rows that hold fewer than ``min_fields`` non-null cells.

        The previous version called ``df.drop`` without keeping its result, so
        no row was ever removed.
        """
        logging.info("Process start: Fix the CICFlowMeter error")
        field_counts = df.count(axis='columns')
        short_rows = field_counts[field_counts < min_fields].index
        df = df.drop(index=short_rows)
        logging.info("Process finished: Fix the CICFlowMeter error")
        return df

    def search_NaN(self, df):
        """Log the NaN count of every column that has any and return ``df`` with NaN replaced by 0."""
        logging.info("Searching for NaN values...")
        for col in df:
            nan_num = df[col].isnull().sum()
            if nan_num != 0:
                logging.info("Column %s have %i NaN values" % (col, nan_num))
        logging.info("Search completed")
        return df.fillna(0)

    def search_infinite(self, df, lista):
        """Replace +/-inf by the mean of the remaining values in each column not listed in ``lista``.

        Affected columns are converted to float64. ``df`` is modified and returned.
        """
        logging.info("Searching for infinite values...")
        for col in df:
            if col in lista:
                continue
            as_float = df[col].astype('float64')
            inf_mask = np.isinf(as_float)
            inf_num = inf_mask.sum()
            if inf_num != 0:
                # Series.mean() skips NaN, so only finite values contribute.
                mean = as_float[~inf_mask].mean()
                df[col] = as_float.replace([np.inf, -np.inf], mean)
                logging.info("Column %s have %i infinite values" % (col, inf_num))
        logging.info("Search completed")
        return df

    def search_negatives(self, df):
        """Log how many negative values each column holds; ``df`` is returned unchanged."""
        logging.info("Searching for negatives values...")
        for col in df:
            neg_num = (df[col] < 0).sum()
            if neg_num != 0:
                logging.info("Column %s have %i negative values" % (col, neg_num))
        logging.info("Search completed")
        return df

    def search_unimportant(self, df):
        """Report columns that look constant or all-zero; ``df`` is returned unchanged.

        The check compares the column sum with ``first value * row count``, so
        it is a heuristic rather than an exact test for constant columns.
        """
        logging.info("Searching for columns with no variance...")
        row_num = len(df)
        if row_num == 0:
            # No first row to compare against.
            logging.info("Search completed")
            return df
        for col in df:
            # Positional access: the index need not contain the label 0
            # (for instance after rows have been dropped).
            first_value = df[col].iloc[0]
            aux = first_value * row_num
            col_sum = df[col].astype(float).sum()
            if aux != 0:
                if float(col_sum) / float(aux) == 1:
                    logging.info("All values in column %s are the same: %f" % (col, float(first_value)))
                    # TODO: handle constant columns
            if col_sum == 0:
                print("All values in column %s are 0" % (col))
                # TODO: handle constant columns whose value is 0
        logging.info("Search completed")
        return df

    def duplicatedRows(self, df):
        """Return ``df`` without rows that duplicate an earlier row, logging each removed index."""
        logging.info("Removing duplicated Rows")
        duplicated_index = df[df.duplicated()].index
        df = df.drop(index=duplicated_index)
        for i in duplicated_index:
            logging.info("Row %i removed" % (i))
        return df

    def clean_bad_columns(self, df):
        """Return ``df`` without the 'Flow ID', 'Flow Byts/s' and 'Flow Pkts/s' columns."""
        logging.info("Process start: remove bad columns")
        return df.drop(columns=['Flow ID', 'Flow Byts/s', 'Flow Pkts/s'])

    def removeFeatures(self, df, list):
        """Return ``df`` without the columns named in ``list``."""
        logging.info("Removing features...")
        df = df.drop(columns=list)
        return df
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.decomposition import PCA
class FeatureReductor:
    """Feature reduction and selection helpers built on scikit-learn."""

    def __init__(self):
        pass

    def reductionPCA(self, X, k):
        """Project ``X`` onto ``k`` principal components, print the result and return it."""
        pca = PCA(n_components=k)
        Y = pca.fit_transform(X)
        print(Y)
        return Y

    def selectKBest(self, function, value, X, y):
        """Keep the ``value`` best features of ``X`` according to a scoring function.

        Args:
            function: Name of the scoring function, "f_regression" or "chi2".
            value: Number of features to keep (passed as ``k`` to SelectKBest).
            X: Feature matrix.
            y: Target values.

        Returns:
            A DataFrame holding the transformed ``X``.

        Raises:
            ValueError: If ``function`` is not one of the supported names.

        Previously the "chi2" branch only printed the transformed data and
        returned None, and an unknown name returned None without any notice.
        """
        if function == "f_regression":
            score_func = f_regression
        elif function == "chi2":
            score_func = chi2
        else:
            raise ValueError("Unsupported scoring function: %r" % (function,))
        transformed = SelectKBest(score_func, k=value).fit_transform(X, y)
        if function == "chi2":
            # Keep the console output this branch has always produced.
            print(transformed)
        return pd.DataFrame(transformed)

    def extraTreeClassifier(self, X, y):
        """Fit a 50-tree ExtraTreesClassifier and return ``X`` reduced by SelectFromModel.

        The reduced matrix is printed before it is returned.
        """
        clf = ExtraTreesClassifier(n_estimators=50)
        clf = clf.fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        newX = model.transform(X)
        print(newX)
        return newX
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import numpy as np
from skimage.feature import hog
from sklearn.base import BaseEstimator, TransformerMixin
class HogTransformer(BaseEstimator, TransformerMixin):
    """
    Expects an array of 2d arrays (1 channel images)
    Calculates hog features for each img
    """

    def __init__(self, y=None, orientations=9,
                 pixels_per_cell=(8, 8),
                 cells_per_block=(3, 3), block_norm='L2-Hys'):
        # Parameters are stored as given and forwarded to skimage's hog() in transform().
        self.y = y
        self.orientations = orientations
        self.pixels_per_cell = pixels_per_cell
        self.cells_per_block = cells_per_block
        self.block_norm = block_norm

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an array with the HOG descriptor of every image in ``X``.

        The former bare ``except:`` re-ran exactly the same expression, so it
        could only hide the first traceback; errors now propagate directly.
        """
        def local_hog(img):
            return hog(img,
                       orientations=self.orientations,
                       pixels_per_cell=self.pixels_per_cell,
                       cells_per_block=self.cells_per_block,
                       block_norm=self.block_norm)

        return np.array([local_hog(img) for img in X])
\ No newline at end of file
2020-12-09 14:22:57,138: INFO - Moev.py started
2020-12-09 14:22:58,618: INFO - File charged: conf.yaml
2020-12-09 14:22:58,656: INFO - Dataset loaded
2020-12-09 14:22:58,656: INFO - Cleaner object created. Make sure that function cleanDataset() is called in start script.
2020-12-09 14:22:58,657: INFO - Process start: parse IP into integer
2020-12-09 14:23:00,819: INFO - Process finished: parse IP into integer
2020-12-09 14:23:00,819: INFO - Source IPs parsed to integer
2020-12-09 14:23:00,819: INFO - Process start: parse IP into integer
This diff is collapsed.
File added
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import numpy as np
import matplotlib.pyplot as plt
import pprint
import RGB2GrayTransformer
import HogTransformer
class NpyProcessor:
    """Loads image data stored as .npy files and prepares it for model training."""

    def __init__(self):
        pass

    def npy_processor(self, conf):
        """Load, split and transform the image dataset referenced by ``conf``.

        Args:
            conf: Mapping whose ``conf["npy"]["data_raw"]`` and
                ``conf["npy"]["data_label"]`` entries are the paths passed to
                ``np.load`` for the images and their labels.

        Returns:
            ``(X_train_prepared, X_test_prepared, y_train, y_test)``. The data
            is split 80/20 with shuffling and ``random_state=42``; both parts
            go through grayscale conversion, HOG extraction and standard
            scaling, with the scaler fitted on the training part only.

        The shapes of the intermediate arrays are printed along the way.
        """
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler

        data_raw = np.load(conf["npy"]["data_raw"])
        data_label = np.load(conf["npy"]["data_label"])
        print(data_label.shape)
        print(data_raw.shape)

        X_train, X_test, y_train, y_test = train_test_split(
            data_raw,
            data_label,
            test_size=0.2,
            shuffle=True,
            random_state=42,
        )
        print(X_train.shape)
        print(y_train.shape)
        print(X_test.shape)
        print(y_test.shape)

        # create an instance of each transformer
        grayify = RGB2GrayTransformer.RGB2GrayTransformer()
        hogify = HogTransformer.HogTransformer(
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            orientations=9,
            block_norm='L2-Hys'
        )
        scalify = StandardScaler()

        # call fit_transform on each transform converting X_train step by step
        X_train_gray = grayify.fit_transform(X_train)
        X_train_hog = hogify.fit_transform(X_train_gray)
        X_train_prepared = scalify.fit_transform(X_train_hog)

        # the test part only goes through transform(), never fit()
        X_test_gray = grayify.transform(X_test)
        X_test_hog = hogify.transform(X_test_gray)
        X_test_prepared = scalify.transform(X_test_hog)

        # This print used to sit after the return statement and never ran.
        print(X_train_prepared.shape)
        return X_train_prepared, X_test_prepared, y_train, y_test
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import pandas as pd
from sklearn import preprocessing
class Preprocessor:
    """Dataset preparation helpers: label separation, variance filter and scaling."""

    def __init__(self):
        pass

    def adapt_dataset(self, df):
        """Split ``df`` into its feature names, its 'Label' column and the label-free frame.

        Returns a tuple ``(column names without 'Label', 'Label' column,
        DataFrame without 'Label')``.
        """
        labels = df['Label']
        features = df.drop('Label', axis=1)
        feature_names = df.keys().drop('Label')
        return feature_names, labels, features

    def variance(self, df, limit, list):
        """Drop each column whose variance equals ``limit``, skipping the columns named in ``list``.

        Each dropped column is reported on stdout. The reduced DataFrame is returned.
        """
        for column in df:
            excluded = any(column == name for name in list)
            if excluded:
                continue
            column_variance = df[column].var()
            if column_variance == limit:
                print("Columna: %s con varianza %f" % (column, column_variance))
                df = df.drop(column, axis=1)
        return df

    def normalize(self, df):
        """Return a new DataFrame holding the L2-normalized rows of ``df``."""
        normalizer = preprocessing.Normalizer(norm='l2', copy=True).fit(df)
        normalized = normalizer.fit_transform(df)
        return pd.DataFrame(normalized)

    def standardScaler(self, X):
        """Return ``X`` standardized with a freshly fitted StandardScaler."""
        return preprocessing.StandardScaler().fit_transform(X)
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import skimage
class RGB2GrayTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer that turns an array of RGB images into grayscale images
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to fit; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Apply skimage's rgb2gray to every image of X and return the results as one array."""
        gray_images = []
        for image in X:
            gray_images.append(skimage.color.rgb2gray(image))
        return np.array(gray_images)
\ No newline at end of file
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import pandas as pd
import MoEv
import argparse
import Preprocessor
import FeatureReductor
import NpyProcessor
# Command line: the kind of data to process is mandatory.
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--type", help="Type of data: cic, netflow or npy ")
args = parser.parse_args()

data_type = ""
# Validate the requested data type: CICFlowMeter ("cic"), Netflow ("netflow")
# or npy images ("npy"). Anything else prints the help text and stops.
if not args.type:
    print("It is necessary to indicate the type of data")
    parser.print_help()
    quit()
elif args.type in ("cic", "netflow", "npy"):
    data_type = args.type
else:
    print("Incorrect type of data")
    parser.print_help()
    quit()

# The MoEv object holds the methods to clean and analyze the data and to
# create and test the models.
npy_test = NpyProcessor.NpyProcessor()
moev_Manager = MoEv.MoEv("conf.yaml", 0.33, None, "Label", data_type)
conf_file = moev_Manager.get_conf_file()

if args.type != "npy":
    # Tabular data: load, clean, analyze and preprocess before modelling.
    # Feature reduction (reductFeatures) is not run.
    moev_Manager.load_dataset(conf_file)
    moev_Manager.cleanDataset()
    moev_Manager.analyzeDataset()
    moev_Manager.preprocessDataset()
    moev_Manager.createModels()
else:
    # Image data is processed first and then handed to MoEv already split.
    X_train, X_test, y_train, y_test = npy_test.npy_processor(conf_file)
    moev_Manager.load_data(X_train, X_test, y_train, y_test)
    moev_Manager.createModels()
#
#
# Copyright (c) 2020 Adrian Campazas Vega, Ignacio Samuel Crespo Martinez, Angel Manuel Guerrero Higueras.
#
# This file is part of MoEv
#
# MoEv is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MoEv is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import logging
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import itertools
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
def fit(classifier, X_train, y_train):
    """Train ``classifier`` in place on the given data; nothing is returned."""
    classifier.fit(X_train, y_train)
    return None
def predict(classifier, X_test):
    """Return whatever ``classifier.predict`` yields for ``X_test``."""
    predicted = classifier.predict(X_test)
    return predicted
def accuracyScore(y_test, predictions):
    """Return sklearn's accuracy_score of ``predictions`` against ``y_test``."""
    score = accuracy_score(y_test, predictions)
    return score
def learning_curves(X, y):
title = "Learning Curves (SGD)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = QuadraticDiscriminantAnalysis()
train_sizes=np.linspace(.1, 1.0, 5)
ylim=(0.7, 1.01)
n_jobs=4
plt.figure()
plt.title(title)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)