FeatureEnVi: Visual Analytics for Feature Engineering Using Stepwise Selection and Semi-Automatic Extraction Approaches
https://doi.org/10.1109/TVCG.2022.3141040
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
531 lines
14 KiB
531 lines
14 KiB
from flask import Flask, render_template, jsonify, request
|
|
from flask_pymongo import PyMongo
|
|
from flask_cors import CORS, cross_origin
|
|
|
|
import json
|
|
import copy
|
|
import warnings
|
|
import re
|
|
import random
|
|
import math
|
|
import pandas as pd
|
|
import numpy as np
|
|
import multiprocessing
|
|
|
|
from joblib import Memory
|
|
|
|
from sklearn.svm import SVC
|
|
from bayes_opt import BayesianOptimization
|
|
from sklearn.model_selection import cross_validate
|
|
from sklearn.model_selection import cross_val_predict
|
|
|
|
# Server wiring: the Flask application, its MongoDB handle, and the CORS
# policy for the client-facing routes.

# Flask application object that the route decorators in this file attach to.
app = Flask(__name__)

# PyMongo reads MONGO_URI from the app config (local server, database "mydb").
app.config.update(MONGO_URI="mongodb://localhost:27017/mydb")
mongo = PyMongo(app)

# Allow requests from any origin on every URL under /data/.
cors = CORS(
    app,
    resources={r"/data/*": {"origins": "*"}},
)
|
|
|
|
# NOTE(review): Flask-CORS's documented usage places @cross_origin *below*
# @app.route; in this order the route is registered before cross_origin wraps
# the function -- confirm whether the decorator is meant to take effect.
@cross_origin(origin='localhost', headers=['Content-Type', 'Authorization'])
@app.route('/data/Reset', methods=["GET", "POST"])
def reset():
    """Rebind the module-level session state to its default values.

    Nothing is read from the request; the function only assigns globals.
    Note that the cross-validation fold count (10) and the scoring table set
    here differ from the values assigned in ``retrieveFileName``.

    Returns:
        str: confirmation message for the client.
    """
    # --- interaction / selection state -------------------------------------
    global previousState, previousStateActive
    global filterActionFinal, keySpecInternal, dataSpacePointsIDs
    previousState, previousStateActive = [], []
    filterActionFinal = ''
    keySpecInternal = 1
    dataSpacePointsIDs = []

    # --- seed and counters --------------------------------------------------
    global RANDOM_SEED, keyData, KNNModelsCount, LRModelsCount
    RANDOM_SEED = 42
    keyData = 0
    KNNModelsCount = 0
    LRModelsCount = 100

    # --- working data and its stored copy -----------------------------------
    global XData, yData, XDataStored, yDataStored
    XData, yData = [], []
    XDataStored, yDataStored = [], []

    # --- model bookkeeping --------------------------------------------------
    global detailsParams, algorithmList, ClassifierIDsList
    global resultsList, RetrieveModelsList
    global allParametersPerformancePerModel, allParametersPerfCrossMutr
    global HistoryPreservation, all_classifiers
    global KNNModels, RFModels
    detailsParams, algorithmList = [], []
    ClassifierIDsList = ''
    resultsList, RetrieveModelsList = [], []
    allParametersPerformancePerModel, allParametersPerfCrossMutr = [], []
    HistoryPreservation, all_classifiers = [], []
    KNNModels, RFModels = [], []

    # --- evaluation settings ------------------------------------------------
    global crossValidation, scoring, loopFeatures
    crossValidation = 10
    # Each metric maps to the scorer of the same name.
    scoring = {
        metric: metric
        for metric in (
            'accuracy',
            'precision_micro',
            'precision_macro',
            'precision_weighted',
            'recall_micro',
            'recall_macro',
            'recall_weighted',
            'roc_auc_ovo_weighted',
        )
    }
    loopFeatures = 2

    # --- result containers and class names ----------------------------------
    global results, resultsMetrics, parametersSelData
    global target_names, target_namesLoc
    results, resultsMetrics, parametersSelData = [], [], []
    target_names, target_namesLoc = [], []

    return 'The reset was done!'
|
|
|
|
# retrieve data from client and select the correct data set
# NOTE(review): Flask-CORS's documented usage places @cross_origin *below*
# @app.route; in this order the route is registered before cross_origin wraps
# the function -- confirm whether the decorator is meant to take effect.
@cross_origin(origin='localhost', headers=['Content-Type', 'Authorization'])
@app.route('/data/ServerRequest', methods=["GET", "POST"])
def retrieveFileName():
    """Load the data set named in the request and start the analysis.

    The request body is decoded as UTF-8, single quotes are swapped for
    double quotes, and the result is parsed as JSON with a ``fileName`` key.
    All session globals are re-initialised, the matching MongoDB collection
    is read into ``DataResultsRaw`` (plus ``DataResultsRawTest`` for
    ``'StanceC'``), and ``dataSetSelection()`` is invoked. Any name other
    than ``'HeartC'``, ``'StanceC'`` or ``'DiabetesC'`` selects ``IrisC``.

    Returns:
        str: confirmation message for the client.
    """
    global DataRawLength, DataResultsRaw
    global DataResultsRawTest, DataRawLengthTest
    global keySpecInternal, filterActionFinal, dataSpacePointsIDs
    global RANDOM_SEED, keyData
    global XData, yData, XDataStored, yDataStored
    global previousState, previousStateActive
    global filterDataFinal, ClassifierIDsList, algorithmList, detailsParams
    global RetrieveModelsList, resultsList
    global allParametersPerformancePerModel, allParametersPerfCrossMutr
    global HistoryPreservation, all_classifiers
    global crossValidation, scoring, loopFeatures
    global KNNModels, SVCModels, GausNBModels, MLPModels, LRModels
    global LDAModels, QDAModels, RFModels, ExtraTModels, AdaBModels
    global GradBModels
    global results, resultsMetrics, parametersSelData
    global StanceTest, target_names, target_namesLoc

    def _ingest(cursor, sink):
        """Append each document to *sink* with a string _id and a running InstanceID."""
        for position, document in enumerate(cursor):
            document['_id'] = str(document['_id'])
            document['InstanceID'] = position
            sink.append(document)

    # The body is read before any state is touched.
    raw_request = request.get_data().decode('utf8').replace("'", '"')

    # --- interaction / selection state -------------------------------------
    keySpecInternal = 1
    filterActionFinal = ''
    dataSpacePointsIDs = []
    RANDOM_SEED = 42
    keyData = 0
    previousState, previousStateActive = [], []
    filterDataFinal = 'mean'

    # --- working data and its stored copy -----------------------------------
    XData, yData = [], []
    XDataStored, yDataStored = [], []

    # --- model bookkeeping --------------------------------------------------
    ClassifierIDsList = ''
    algorithmList, detailsParams = [], []
    RetrieveModelsList, resultsList = [], []
    allParametersPerformancePerModel, allParametersPerfCrossMutr = [], []
    HistoryPreservation, all_classifiers = [], []

    # --- evaluation settings ------------------------------------------------
    crossValidation = 5
    # Each metric maps to the scorer of the same name.
    scoring = {
        metric: metric
        for metric in (
            'accuracy',
            'precision_weighted',
            'recall_weighted',
            'f1_weighted',
            'roc_auc_ovo_weighted',
        )
    }
    loopFeatures = 2

    # --- per-algorithm model lists ------------------------------------------
    KNNModels, SVCModels, GausNBModels, MLPModels = [], [], [], []
    LRModels, LDAModels, QDAModels = [], [], []
    RFModels, ExtraTModels, AdaBModels, GradBModels = [], [], [], []

    # --- result containers and class names ----------------------------------
    results, resultsMetrics, parametersSelData = [], [], []
    StanceTest = False
    target_names, target_namesLoc = [], []
    DataRawLength = -1
    DataRawLengthTest = -1

    # --- pick the collection(s) ---------------------------------------------
    selection = json.loads(raw_request)
    dataset_name = selection['fileName']
    if dataset_name == 'HeartC':
        train_cursor = mongo.db.HeartC.find()
    elif dataset_name == 'StanceC':
        # Only this data set comes with a separate test collection.
        StanceTest = True
        train_cursor = mongo.db.StanceC.find()
        test_cursor = mongo.db.StanceCTest.find()
    elif dataset_name == 'DiabetesC':
        train_cursor = mongo.db.DiabetesC.find()
    else:
        train_cursor = mongo.db.IrisC.find()

    # --- materialise the documents ------------------------------------------
    DataResultsRaw = []
    _ingest(train_cursor, DataResultsRaw)
    DataRawLength = len(DataResultsRaw)

    DataResultsRawTest = []
    if StanceTest:
        _ingest(test_cursor, DataResultsRawTest)
        DataRawLengthTest = len(DataResultsRawTest)

    dataSetSelection()
    return 'Everything is okay'
|
|
|
|
# Retrieve data set from client
# NOTE(review): Flask-CORS's documented usage places @cross_origin *below*
# @app.route; in this order the route is registered before cross_origin wraps
# the function -- confirm whether the decorator is meant to take effect.
@cross_origin(origin='localhost', headers=['Content-Type', 'Authorization'])
@app.route('/data/SendtoSeverDataSet', methods=["GET", "POST"])
def sendToServerData():
    """Turn a client-uploaded data set into the global feature/label state.

    The request body is decoded as UTF-8, single quotes are swapped for
    double quotes, and the JSON value under ``uploadedData`` is used as a
    list of row dictionaries. The column whose name contains ``'*'`` is the
    target. Rows are sorted by target in descending order, and each run of
    equal target values receives the next integer class code.

    The uploaded rows are kept in a local variable only. ``target_names`` is
    appended to, not cleared, by this function.

    Returns:
        str: confirmation message for the client.
    """
    global AllTargets, target_names
    global XData, yData, XDataStored, yDataStored

    body = request.get_data().decode('utf8').replace("'", '"')
    uploaded_rows = json.loads(body)['uploadedData']

    # The last starred key encountered wins; ``target`` stays unbound when no
    # such key exists.
    for record in uploaded_rows:
        for column in record.keys():
            if '*' in column:
                target = column

    # Sort first, then copy: the copy inherits the sorted order.
    uploaded_rows.sort(key=lambda rec: rec[target], reverse=True)
    feature_rows = copy.deepcopy(uploaded_rows)
    for row in feature_rows:
        del row[target]

    AllTargets = [rec[target] for rec in uploaded_rows]

    # Run-length encode the sorted labels into consecutive integer codes and
    # remember the label that opens each run.
    label_codes = []
    code = 0
    run_label = None
    for position, label in enumerate(AllTargets):
        if position == 0:
            run_label = label
            target_names.append(label)
        if label == run_label:
            label_codes.append(code)
            continue
        code += 1
        target_names.append(label)
        label_codes.append(code)
        run_label = label

    feature_frame = pd.DataFrame.from_dict(feature_rows)
    XData, yData = feature_frame, label_codes

    # Keep a separate copy of the freshly loaded data.
    XDataStored = XData.copy()
    yDataStored = yData.copy()

    return 'Processed uploaded data set'
|
|
|
|
def dataSetSelection():
    """Build the feature frames and integer labels from the raw documents.

    Reads the global ``DataResultsRaw`` (and ``DataResultsRawTest`` when
    ``StanceTest`` is set), sorts both in place by the target column (the key
    containing ``'*'``) in descending order, removes ``_id``, ``InstanceID``
    and the target from a deep copy, and stores the results in ``XData`` /
    ``yData`` (and ``XDataTest`` / ``yDataTest``). Afterwards it silences
    warnings and calls ``executeModel()``.

    Returns:
        str: confirmation message.
    """
    global XDataTest, yDataTest, StanceTest
    global AllTargets, target_names
    global XData, yData, XDataStored, yDataStored

    def _encode_runs(sorted_labels, seen_names):
        """Give each run of equal consecutive labels the next integer code.

        The label opening each run is appended to *seen_names*.
        """
        codes = []
        code = 0
        run_label = None
        for position, label in enumerate(sorted_labels):
            if position == 0:
                run_label = label
                seen_names.append(label)
            if label == run_label:
                codes.append(code)
                continue
            code += 1
            seen_names.append(label)
            codes.append(code)
            run_label = label
        return codes

    XDataTest = pd.DataFrame()
    # Local list: this function has no ``global`` declaration for this name.
    target_namesLoc = []

    if StanceTest:
        # The last starred key encountered wins.
        for record in DataResultsRawTest:
            for column in record.keys():
                if '*' in column:
                    target = column

        # Sort first, then copy: the copy inherits the sorted order.
        DataResultsRawTest.sort(key=lambda rec: rec[target], reverse=True)
        held_out_rows = copy.deepcopy(DataResultsRawTest)
        for row in held_out_rows:
            for unwanted in ('_id', 'InstanceID', target):
                del row[unwanted]

        held_out_labels = [rec[target] for rec in DataResultsRawTest]
        held_out_codes = _encode_runs(held_out_labels, target_namesLoc)
        held_out_frame = pd.DataFrame.from_dict(held_out_rows)
        XDataTest, yDataTest = held_out_frame, held_out_codes

    # ``target`` is intentionally shared with the branch above: a value found
    # there is still bound here if this pass finds no starred key.
    for record in DataResultsRaw:
        for column in record.keys():
            if '*' in column:
                target = column

    DataResultsRaw.sort(key=lambda rec: rec[target], reverse=True)
    feature_rows = copy.deepcopy(DataResultsRaw)
    for row in feature_rows:
        for unwanted in ('_id', 'InstanceID', target):
            del row[unwanted]

    AllTargets = [rec[target] for rec in DataResultsRaw]
    label_codes = _encode_runs(AllTargets, target_names)
    feature_frame = pd.DataFrame.from_dict(feature_rows)
    XData, yData = feature_frame, label_codes

    # Keep a separate copy of the freshly loaded data.
    XDataStored = XData.copy()
    yDataStored = yData.copy()

    warnings.simplefilter('ignore')

    executeModel()

    return 'Everything is okay'
|
|
|
|
def create_global_function():
    """Bind the module-level name ``estimator`` to the SVC objective function.

    The objective is defined here rather than at import time; each call
    rebinds the global, replacing whatever ``estimator`` referred to before.
    """
    global estimator

    def estimator(C, gamma):
        """Return the mean cross-validated accuracy of an SVC with *C* and *gamma*.

        Reads the globals ``XData``, ``yData``, ``crossValidation`` and
        ``RANDOM_SEED`` at call time.
        """
        candidate = SVC(
            C=C,
            gamma=gamma,
            degree=1,
            random_state=RANDOM_SEED,
        )
        fold_scores = cross_validate(
            candidate,
            XData,
            yData,
            cv=crossValidation,
            scoring='accuracy',
        )
        # One accuracy value per fold; their mean is the objective value.
        return np.mean(fold_scores['test_score'])
|
|
|
|
# check this issue later because we are getting the same results
def executeModel():
    """Tune an SVC with Bayesian optimisation and cache its probabilities.

    Steps:
      1. ``create_global_function()`` binds the global ``estimator`` to the
         cross-validated accuracy objective.
      2. ``C`` and ``gamma`` are searched in [0.0001, 10000] with 5 initial
         points and 25 iterations.
      3. The global ``estimator`` is rebound to an SVC built from the best
         parameters and fitted on ``XData`` / ``yData``.
      4. Cross-validated class probabilities are stored in the global
         ``yPredictProb``.

    Both the optimizer and the final probability-enabled SVC are seeded with
    ``RANDOM_SEED`` so that repeated runs on the same data give the same
    parameters and probabilities.

    Returns:
        str: confirmation message.
    """
    global estimator, yPredictProb

    create_global_function()

    search_space = {"C": (0.0001, 10000), "gamma": (0.0001, 10000)}
    svc_bayesopt = BayesianOptimization(
        estimator,
        search_space,
        random_state=RANDOM_SEED,
    )
    svc_bayesopt.maximize(init_points=5, n_iter=25, acq='ucb')
    bestParams = svc_bayesopt.max['params']

    # From here on ``estimator`` is the tuned model, not the objective.
    # random_state matters for this model because probability=True makes SVC
    # shuffle the data internally for its probability estimates.
    estimator = SVC(
        C=bestParams.get('C'),
        gamma=bestParams.get('gamma'),
        probability=True,
        random_state=RANDOM_SEED,
    )
    estimator.fit(XData, yData)

    # cross_val_predict clones the estimator per fold, so these are
    # out-of-fold probabilities rather than those of the fit above.
    yPredictProb = cross_val_predict(
        estimator,
        XData,
        yData,
        cv=crossValidation,
        method='predict_proba',
    )

    return 'Everything Okay'
|
|
|
|
# NOTE(review): Flask-CORS's documented usage places @cross_origin *below*
# @app.route; in this order the route is registered before cross_origin wraps
# the function -- confirm whether the decorator is meant to take effect.
@cross_origin(origin='localhost', headers=['Content-Type', 'Authorization'])
@app.route('/data/thresholdDataSpace', methods=["GET", "POST"])
def Seperation():
    """Split instances into four groups and correlate features per group.

    The request body (UTF-8, single quotes swapped for double quotes, parsed
    as JSON) supplies ``PositiveValue`` and ``NegativeValue`` thresholds.
    For every instance, the cross-validated probability of its own label
    (from ``yPredictProb``, scaled to a percentage) decides its group:

      * group 1: above 50 and above ``PositiveValue``
      * group 2: above 50 and at most ``PositiveValue``
      * group 3: at most 50 and above ``NegativeValue``
      * group 4: everything else

    The global ``packCorr`` is rebuilt as the four per-group correlation
    matrices (JSON strings) followed by the list of feature names.

    Returns:
        str: confirmation message for the client.
    """
    global packCorr

    payload = json.loads(request.get_data().decode('utf8').replace("'", '"'))
    upper_cut = payload['PositiveValue']
    lower_cut = payload['NegativeValue']

    # Probability assigned to each instance's own class, as a percentage.
    own_class_pct = [
        probabilities[yData[position]] * 100
        for position, probabilities in enumerate(yPredictProb)
    ]

    # One list of row positions per group, in group order 1..4.
    groups = ([], [], [], [])
    for position, pct in enumerate(own_class_pct):
        above_half = pct > 50
        if above_half and pct > upper_cut:
            slot = 0
        elif above_half and pct <= upper_cut:
            slot = 1
        elif pct <= 50 and pct > lower_cut:
            slot = 2
        else:
            slot = 3
        groups[slot].append(position)

    subsets = [XData.iloc[members, :] for members in groups]

    # Rebind the shared result only after the row subsets were extracted.
    packCorr = []
    matrices = [subset.corr() for subset in subsets]
    packCorr.extend(matrix.to_json() for matrix in matrices)
    packCorr.append(XData.columns.values.tolist())

    return 'Everything Okay'
|
|
|
|
@app.route('/data/returnCorrelations', methods=["GET", "POST"])
def SendCorrel():
    """Send the current contents of the global ``packCorr`` as JSON.

    ``packCorr`` is filled by ``Seperation``; the payload is wrapped under
    the ``correlResul`` key.
    """
    return jsonify({'correlResul': packCorr})