forked from malloc404/RPyCA
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
105 lines (90 loc) · 4.08 KB
/
main.py
File metadata and controls
105 lines (90 loc) · 4.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Python3 file
# Created by Marissa Bennett
import math, sys, csv, ast, time, re, warnings
import numpy as np
import pandas as pd
import rpyca as rp
from helperFiles.logger import *
from helperFiles.matrixOp import *
from helperFiles.fileHandler import *
from helperFiles.plotter import *
from helperFiles.models import *
from helperFiles.configParser import *
import cProfile
def _lambda_schedule(con):
    """Return the lambda values that the main loop iterates over.

    The choice is driven by ``con['Mode']``:

    * ``1`` -- a single run with ``con['LambdaStartValue']``.
    * ``2`` -- the start value repeated ten times (this is used for plotting).
    * anything else -- the default for finding a good lambda: whatever
      ``frange(start, end, incr)`` produces for the configured
      ``LambdaStartValue`` / ``LambdaEndValue`` / ``LambdaIncrValue``.
    """
    mode = con['Mode']
    start = con['LambdaStartValue']
    if mode == 1:
        return [start]
    if mode == 2:
        return [start] * 10
    return frange(start, con['LambdaEndValue'], con['LambdaIncrValue'])


def _load_dataset(fileName, labelsName, sample, seed, ratioTrain, ratioValid):
    """Run the preprocessing routine that matches the data set in *fileName*.

    The data set is recognised by a substring of the file name.  The result
    of the selected routine is returned unchanged; the caller unpacks it as
    ``([X1, X2, X3], ymat)``.

    Raises:
        ValueError: if *fileName* matches none of the known data sets.
            Previously this case fell through silently and surfaced later
            as an unbound-variable error far from the actual cause.
    """
    # TODO these should be changed into a function or something in the future
    # TODO the 'OneHot' and 'Skip' configuration entries and the
    #      preprocKaggle routine are not wired in yet.
    if "ISCX" in fileName:
        skip = ['FlowID', 'SourceIP', 'Timestamp', 'Label']
        return preproc(fileName, labelsName, sample, seed,
                       ratioTrain, ratioValid, skip=skip)
    if "LLS_DDOS" in fileName:
        skip = ['No.', 'Label']
        return preprocLLSDOS(fileName, labelsName, sample, seed,
                             ratioTrain, ratioValid, skip=skip)
    raise ValueError(
        "Unsupported data set %r: expected the file name to contain "
        "'ISCX' or 'LLS_DDOS'" % (fileName,)
    )


def main_func():
    """Run the configured RPCA + ML experiment from start to finish.

    Steps, in order:

    1. Ask for a configuration via ``setConfig`` and set up logging.
    2. Build the list of model codes and the lambda schedule.
    3. For every lambda: preprocess the data (only once when the mode is 0,
       on every iteration otherwise), run ``rp.rpca`` and evaluate each
       model with ``runModels``.
    4. Whenever ``runModels`` reports a good result, validate that lambda
       on the third data split via ``rp.project`` and collect the scores.
    5. Hand the collected scores to ``generateResults`` and log the elapsed
       wall-clock time.

    Returns:
        None.  Results are produced through ``generateResults``, the log
        and standard output.

    Raises:
        ValueError: if the configured CSV file is not a known data set.
    """
    start_time = time.time()

    # Ask for configuration to use
    configType, con = setConfig()

    # Set log for debugging or other purposes (can be overridden)
    setLog(con['LogFile'])
    logMsg(1, "CONFIGURATION USED: %s" % str(configType))

    # Set all other configuration variables
    fileName = con['CSVFile']
    logMsg(1, "File Name: %s" % str(fileName))
    # Keep only word characters of the configured label column name.
    labelsName = re.sub(r'[^\w]', '', con['Labels'])

    # Any value that compares equal to zero (0.0, False, ...) is normalised
    # to the int 0; every other value is passed through untouched.
    seed = 0 if con['RandomSeed'] == 0 else con['RandomSeed']
    sample = 0 if con['SampleSize'] == 0 else con['SampleSize']
    ratioTrain, ratioValid = con['RatioTrainData'], con['RatioValidData']

    # Set ML model to run
    if con['Models'] == "all":
        # NOTE these are not all the models in the model.py file
        toRun = ['rf', 'knn', 'logreg', 'svm', 'dtree', 'nb', 'kmeans', 'gb', 'pynn']
    else:
        toRun = [con['Models']]

    mode = con['Mode']

    # ensures preprocessing happens at least once
    # TODO look into if I could just randomize the random data again instead???
    pre = True
    Xlis, LSlis, XLSlis = [], [], []

    # main loop
    # TODO normalize each matrix with X1 things (see paper)
    for lam in _lambda_schedule(con):
        # Mode 0 reuses the first preprocessing result for every lambda;
        # all other modes preprocess again on each iteration.
        if mode != 0 or pre:
            # X1/X2/X3 are presumably the train/test/validation splits and
            # ymat their labels -- TODO confirm against the preproc helpers.
            [X1, X2, X3], ymat = _load_dataset(
                fileName, labelsName, sample, seed, ratioTrain, ratioValid)
            pre = False  # done preprocessing for mode 0 only!

        logMsg(1, "Lambda: %s" % (str(lam)))
        print("\n\nLAMBDA: ", lam)

        # runs RPCA
        [LS1, LS2], [XLS1, XLS2] = rp.rpca(X1, X2, lam)
        # XXX Future Work: see if lambda can be tuned outside of using ML models??

        # ML/AI loop
        for m in toRun:
            print("running ML")
            res, _ = runModels([X1, X2], [LS1, LS2], [XLS1, XLS2],
                               [ymat[0], ymat[1]], code=m)

            # Validates ONLY if a good f1 score occurred
            if not res:
                continue

            print("Validating...")
            logMsg(1, "Validating GOOD Lambda: %s" % (str(lam)))

            # The projection output is kept in its own names so that the
            # RPCA matrices (LS1/XLS1) used by the remaining models in this
            # loop are not overwritten by a model that happened to validate.
            [valLS1, LS3], [valXLS1, XLS3] = rp.project(X1, X3)
            _, dgood = runModels([X1, X3], [valLS1, LS3], [valXLS1, XLS3],
                                 [ymat[0], ymat[2]], code=m)

            # NOTE(review): scores are collected only for validated runs,
            # since that is the only path on which dgood exists -- confirm.
            Xlis.append(dgood[0])
            LSlis.append(dgood[1])
            XLSlis.append(dgood[2])

    # NOTE(review): reports once, after the sweep, using the last lambda that
    # was tried; an empty lambda schedule leaves `lam` unbound here -- confirm.
    generateResults(toRun[0], lam, Xlis, LSlis, XLSlis)

    logMsg(1, "Time to complete: %s" % str(time.time() - start_time))
# Script entry point: execute the whole pipeline under cProfile so that the
# profiler's per-function statistics are printed once the run has finished.
if __name__ == "__main__":
    cProfile.run("main_func()")