FeatureEnVi: Visual Analytics for Feature Engineering Using Stepwise Selection and Semi-Automatic Extraction Approaches https://doi.org/10.1109/TVCG.2022.3141040
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
FeatureEnVi/run.py

713 lines
21 KiB

4 years ago
from flask import Flask, render_template, jsonify, request
from flask_pymongo import PyMongo
from flask_cors import CORS, cross_origin
import json
import copy
import warnings
import re
import random
import math
import pandas as pd
import numpy as np
import multiprocessing
from joblib import Memory
4 years ago
from sklearn.svm import SVC
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_validate
4 years ago
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OneHotEncoder
4 years ago
from sklearn.feature_selection import mutual_info_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
4 years ago
# Server wiring: the Flask application, its MongoDB handle, and the CORS policy
# that lets any origin call the /data/* routes.
app = Flask(__name__)
app.config.update(MONGO_URI="mongodb://localhost:27017/mydb")
mongo = PyMongo(app)
cors = CORS(app, resources={r"/data/*": {"origins": "*"}})
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/Reset', methods=["GET", "POST"])
def reset():
    """Put the module-level session state back to its default values.

    Returns:
        The confirmation string 'The reset was done!'.
    """
    # NOTE(review): decorators apply bottom-up, so app.route receives the
    # function before cross_origin wraps it -- confirm this order is intended.

    # Declared global as before; this handler never assigns either name.
    global DataRawLength, DataResultsRaw

    # --- interaction / selection state ---
    global previousState, previousStateActive, filterActionFinal
    global keySpecInternal, dataSpacePointsIDs, keyData
    previousState = []
    previousStateActive = []
    filterActionFinal = ''
    keySpecInternal = 1
    dataSpacePointsIDs = []
    keyData = 0

    # --- run configuration ---
    global RANDOM_SEED, crossValidation, loopFeatures, scoring
    RANDOM_SEED = 42
    crossValidation = 10
    loopFeatures = 2
    metricNames = (
        'accuracy',
        'precision_micro',
        'precision_macro',
        'precision_weighted',
        'recall_micro',
        'recall_macro',
        'recall_weighted',
        'roc_auc_ovo_weighted',
    )
    # every metric is registered under its own name
    scoring = {metric: metric for metric in metricNames}

    # --- data set ---
    global XData, yData, XDataStored, yDataStored
    global target_names, target_namesLoc
    XData = []
    yData = []
    XDataStored = []
    yDataStored = []
    target_names = []
    target_namesLoc = []

    # --- models and their bookkeeping ---
    global KNNModelsCount, LRModelsCount, KNNModels, RFModels
    global detailsParams, algorithmList, ClassifierIDsList, all_classifiers
    KNNModelsCount = 0
    LRModelsCount = 100
    KNNModels = []
    RFModels = []
    detailsParams = []
    algorithmList = []
    ClassifierIDsList = ''
    all_classifiers = []

    # --- results and history ---
    global resultsList, RetrieveModelsList, HistoryPreservation
    global allParametersPerformancePerModel, allParametersPerfCrossMutr
    global results, resultsMetrics, parametersSelData
    resultsList = []
    RetrieveModelsList = []
    HistoryPreservation = []
    allParametersPerformancePerModel = []
    allParametersPerfCrossMutr = []
    results = []
    resultsMetrics = []
    parametersSelData = []

    return 'The reset was done!'
# retrieve data from client and select the correct data set
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRequest', methods=["GET", "POST"])
def retrieveFileName():
    """Reset the session, load the requested MongoDB collection, start modelling.

    The request body carries a 'fileName' entry.  'HeartC', 'StanceC' and
    'DiabetesC' select the collection of the same name; anything else selects
    'IrisC'.  'StanceC' additionally loads 'StanceCTest'.  Every loaded
    document gets its '_id' stringified and an 'InstanceID' equal to its
    position in the cursor.  Finally ``dataSetSelection()`` is called.

    Returns:
        The string 'Everything is okay'.
    """
    global DataRawLength, DataResultsRaw, DataResultsRawTest, DataRawLengthTest
    global keySpecInternal, filterActionFinal, filterDataFinal, dataSpacePointsIDs
    global RANDOM_SEED, keyData, crossValidation, scoring, loopFeatures
    global XData, yData, XDataStored, yDataStored
    global previousState, previousStateActive
    global ClassifierIDsList, algorithmList, detailsParams, all_classifiers
    global RetrieveModelsList, resultsList, HistoryPreservation
    global allParametersPerformancePerModel, allParametersPerfCrossMutr
    global KNNModels, SVCModels, GausNBModels, MLPModels, LRModels, LDAModels
    global QDAModels, RFModels, ExtraTModels, AdaBModels, GradBModels
    global results, resultsMetrics, parametersSelData
    global StanceTest, target_names, target_namesLoc

    # The body is decoded first (single quotes turned into double quotes) and
    # only parsed after the state below has been reset, as before.
    fileName = request.get_data().decode('utf8').replace("'", '"')

    # interaction / selection state
    keySpecInternal = 1
    filterActionFinal = ''
    filterDataFinal = 'mean'
    dataSpacePointsIDs = []
    keyData = 0
    previousState = []
    previousStateActive = []

    # run configuration
    RANDOM_SEED = 42
    crossValidation = 5
    loopFeatures = 2
    scoring = {
        metric: metric
        for metric in (
            'accuracy',
            'precision_weighted',
            'recall_weighted',
            'f1_weighted',
            'roc_auc_ovo_weighted',
        )
    }

    # data set
    XData = []
    yData = []
    XDataStored = []
    yDataStored = []
    target_names = []
    target_namesLoc = []
    StanceTest = False
    DataRawLength = -1
    DataRawLengthTest = -1

    # models and their bookkeeping
    ClassifierIDsList = ''
    algorithmList = []
    detailsParams = []
    all_classifiers = []
    KNNModels = []
    SVCModels = []
    GausNBModels = []
    MLPModels = []
    LRModels = []
    LDAModels = []
    QDAModels = []
    RFModels = []
    ExtraTModels = []
    AdaBModels = []
    GradBModels = []

    # results and history
    RetrieveModelsList = []
    resultsList = []
    HistoryPreservation = []
    allParametersPerformancePerModel = []
    allParametersPerfCrossMutr = []
    results = []
    resultsMetrics = []
    parametersSelData = []

    data = json.loads(fileName)
    requested = data['fileName']
    if requested == 'StanceC':
        # the only data set that ships with a separate test collection
        StanceTest = True
        CollectionDB = mongo.db.StanceC.find()
        CollectionDBTest = mongo.db.StanceCTest.find()
    elif requested in ('HeartC', 'DiabetesC'):
        CollectionDB = getattr(mongo.db, requested).find()
    else:
        CollectionDB = mongo.db.IrisC.find()

    DataResultsRaw = []
    for position, document in enumerate(CollectionDB):
        document.update(_id=str(document['_id']), InstanceID=position)
        DataResultsRaw.append(document)
    DataRawLength = len(DataResultsRaw)

    DataResultsRawTest = []
    if StanceTest:
        for position, document in enumerate(CollectionDBTest):
            document.update(_id=str(document['_id']), InstanceID=position)
            DataResultsRawTest.append(document)
        DataRawLengthTest = len(DataResultsRawTest)

    dataSetSelection()
    return 'Everything is okay'
# Retrieve data set from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/SendtoSeverDataSet', methods=["GET", "POST"])
def sendToServerData():
    """Store a data set uploaded by the client as the active training data.

    The request body must contain an 'uploadedData' list of row dictionaries.
    The column whose name contains '*' is the target.  Rows are sorted by
    target in descending order, each run of equal target values receives the
    next integer class id, and the remaining columns become the feature frame.

    Writes the globals AllTargets, target_names, XData, yData, XDataStored
    and yDataStored.

    Returns:
        'Processed uploaded data set' on success, or an explanatory message
        with HTTP status 400 when the rows contain no target column.
    """
    global AllTargets, target_names
    global XData, yData, XDataStored, yDataStored

    rawBody = request.get_data().decode('utf8')
    try:
        # Well-formed JSON is parsed untouched so that apostrophes inside
        # values or column names survive.
        uploadedDataParsed = json.loads(rawBody)
    except ValueError:
        # Fallback kept for bodies that use single quotes as delimiters.
        uploadedDataParsed = json.loads(rawBody.replace("'", '"'))
    DataResultsRaw = uploadedDataParsed['uploadedData']

    # The last column name containing '*' wins, as before.
    target = None
    for dictionary in DataResultsRaw:
        for key in dictionary:
            if '*' in key:
                target = key
    if target is None and DataResultsRaw:
        # Previously this surfaced as a NameError from the sort below.
        return "The uploaded data set has no target column (its name must contain '*')", 400

    # One in-place sort; features and targets are both derived from the same
    # ordered rows, so they stay aligned without a deep copy and second sort.
    DataResultsRaw.sort(key=lambda x: x[target], reverse=True)
    DataResults = [
        {key: value for key, value in row.items() if key != target}
        for row in DataResultsRaw
    ]
    AllTargets = [row[target] for row in DataResultsRaw]

    # Start from a clean list so class names from an earlier data set do not
    # leak into this one (retrieveFileName resets it the same way).
    target_names = []
    AllTargetsFloatValues = []
    previous = None
    Class = 0
    for i, value in enumerate(AllTargets):
        if i == 0:
            previous = value
            target_names.append(value)
        if value == previous:
            AllTargetsFloatValues.append(Class)
        else:
            # a new run of target values starts: open the next class
            Class = Class + 1
            target_names.append(value)
            AllTargetsFloatValues.append(Class)
            previous = value

    XData, yData = pd.DataFrame.from_dict(DataResults), AllTargetsFloatValues
    XDataStored = XData.copy()
    yDataStored = yData.copy()
    return 'Processed uploaded data set'
def _encode_sorted_rows(raw_rows, class_names):
    """Sort ``raw_rows`` by target and split them into features and class ids.

    The target column is the one whose name contains '*' (the last such name
    found wins).  ``raw_rows`` is sorted in place, descending by target.
    Each run of equal target values gets the next integer class id, and the
    first value of every run is appended to ``class_names``.

    Args:
        raw_rows: list of row dictionaries; sorted in place.
        class_names: list that receives one entry per class, in class-id order.

    Returns:
        A tuple (features, class_ids, targets): a DataFrame without the
        '_id', 'InstanceID' and target columns, the integer class id of every
        row, and the sorted raw target values.

    Raises:
        ValueError: if the rows are non-empty and no column name contains '*'.
    """
    target = None
    for row in raw_rows:
        for key in row:
            if '*' in key:
                target = key
    if target is None and raw_rows:
        # Previously this surfaced as an unbound-name error from the sort.
        raise ValueError("no target column found: one column name must contain '*'")

    raw_rows.sort(key=lambda row: row[target], reverse=True)
    hidden = ('_id', 'InstanceID', target)
    feature_rows = [
        {key: value for key, value in row.items() if key not in hidden}
        for row in raw_rows
    ]
    targets = [row[target] for row in raw_rows]

    class_ids = []
    current = 0
    previous = None
    for position, value in enumerate(targets):
        if position == 0:
            previous = value
            class_names.append(value)
        if value == previous:
            class_ids.append(current)
        else:
            # a new run of target values starts: open the next class
            current = current + 1
            class_names.append(value)
            class_ids.append(current)
            previous = value

    return pd.DataFrame.from_dict(feature_rows), class_ids, targets


def dataSetSelection():
    """Turn the loaded raw rows into model inputs and start the model search.

    Reads the globals DataResultsRaw, target_names, StanceTest and (when
    StanceTest is set) DataResultsRawTest.  Writes XData, yData, XDataStored,
    yDataStored, AllTargets and XDataTest; yDataTest is written only when
    StanceTest is set.  Warnings are then silenced and ``executeModel()`` runs.

    Returns:
        The string 'Everything is okay'.
    """
    global XDataTest, yDataTest
    global AllTargets
    global XData, yData, XDataStored, yDataStored

    XDataTest = pd.DataFrame()
    if StanceTest:
        # NOTE(review): this list is local (never declared global here), so the
        # test-split class names are discarded -- confirm that is intended.
        target_namesLoc = []
        XDataTest, yDataTest, _ = _encode_sorted_rows(DataResultsRawTest, target_namesLoc)

    # target_names is extended in place; the caller resets it beforehand.
    XData, yData, AllTargets = _encode_sorted_rows(DataResultsRaw, target_names)
    XDataStored = XData.copy()
    yDataStored = yData.copy()

    warnings.simplefilter('ignore')
    executeModel()
    return 'Everything is okay'
4 years ago
4 years ago
def create_global_function():
    """Bind the optimisation objective to the module-level name ``estimator``."""
    global estimator

    def estimator(C, gamma):
        """Return the mean cross-validated accuracy of an SVC built from C and gamma."""
        candidate = SVC(C=C, gamma=gamma, degree=1, random_state=RANDOM_SEED)
        scores = cross_validate(
            candidate,
            XData,
            yData,
            cv=crossValidation,
            scoring='accuracy',
        )
        # the objective value is the average of the per-fold test scores
        return scores['test_score'].mean()
4 years ago
4 years ago
# check this issue later because we are getting the same results
def executeModel():
    """Tune an SVC with Bayesian optimisation and cache its class probabilities.

    ``create_global_function()`` first publishes the objective as the global
    ``estimator``.  After the search, ``estimator`` is rebound to an SVC that
    uses the best C and gamma and is fitted on XData/yData.  The global
    ``yPredictProb`` receives the cross-validated ``predict_proba`` output.

    Returns:
        The string 'Everything Okay'.
    """
    global estimator, yPredictProb

    create_global_function()

    params = {"C": (0.0001, 10000), "gamma": (0.0001, 10000)}
    # NOTE(review): the optimiser is not seeded although RANDOM_SEED exists --
    # confirm whether run-to-run variation is wanted.
    svc_bayesopt = BayesianOptimization(estimator, params)
    # NOTE(review): verify that the installed bayes_opt release still accepts
    # the `acq` keyword here.
    svc_bayesopt.maximize(init_points=5, n_iter=25, acq='ucb')
    bestParams = svc_bayesopt.max['params']

    estimator = SVC(C=bestParams.get('C'), gamma=bestParams.get('gamma'), probability=True, random_state=RANDOM_SEED)
    estimator.fit(XData, yData)
    # The former `yPredict = estimator.predict(XData)` was dropped: its result
    # was never read, so it only cost a full prediction pass.
    yPredictProb = cross_val_predict(estimator, XData, yData, cv=crossValidation, method='predict_proba')

    return 'Everything Okay'
def _quadrant_statistics(row_ids):
    """Compute the statistics reported for one quadrant of the data space.

    Args:
        row_ids: positional row indices into the global XData / yData.

    Returns:
        A dict with the keys
        'featureCorr' - absolute feature-to-feature correlation matrix,
        'classCorr'   - absolute correlation of every column with each one-hot
                        class column (empty frame when there are no rows),
        'targetCorr'  - last row of the absolute correlation matrix of the
                        features plus the class-id column,
        'classes'     - distinct class ids in order of first appearance,
        'vif'         - variance inflation factors (constant column included),
        'mi'          - mutual information per feature (empty list when the
                        quadrant holds fewer than three rows).
    """
    rows = XData.iloc[row_ids, :]
    feature_corr = rows.corr().abs()
    rows = rows.reset_index(drop=True)

    targets = [yData[i] for i in row_ids]
    target_arr = np.array(targets)
    classes = unique(targets)

    if len(target_arr) > 0:
        target_arr = target_arr.reshape(len(target_arr), 1)
        try:
            encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            # older scikit-learn releases only know the `sparse` flag
            encoder = OneHotEncoder(sparse=False)
        one_hot = pd.DataFrame(encoder.fit_transform(target_arr))
        class_corr = pd.concat([rows, one_hot], axis=1).corr().abs()
        # keep only the columns that belong to the one-hot class indicators
        class_corr = class_corr.iloc[:, -len(classes):]

        design = add_constant(rows.dropna())
        vif = pd.Series(
            [variance_inflation_factor(design.values, i) for i in range(design.shape[1])],
            index=design.columns,
        )

        if len(target_arr) > 2:
            # ravel(): hand the labels over as a flat vector, not a column
            mi = mutual_info_classif(rows, target_arr.ravel()).tolist()
        else:
            mi = []
    else:
        class_corr = pd.DataFrame()
        # explicit dtype avoids the empty-Series dtype deprecation warning
        vif = pd.Series(dtype='float64')
        mi = []

    with_target = pd.concat([rows, pd.DataFrame(target_arr)], axis=1)
    target_corr = with_target.corr().abs().tail(1)

    return {
        'featureCorr': feature_corr,
        'classCorr': class_corr,
        'targetCorr': target_corr,
        'classes': classes,
        'vif': vif,
        'mi': mi,
    }


@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/thresholdDataSpace', methods=["GET", "POST"])
def Seperation():
    """Split the instances into four quadrants and package their statistics.

    The request body supplies 'PositiveValue' and 'NegativeValue'.  For every
    instance the cross-validated probability of its true class (times 100)
    is compared with 50 and with those thresholds to choose a quadrant.

    The global ``packCorr`` is rebuilt in this order: feature names,
    JSON-encoded target names, then four entries (quadrants 1-4) each for the
    feature correlations, the class correlations, the target correlations,
    the distinct classes, the VIF values and the mutual information.

    Returns:
        The string 'Everything Okay'.
    """
    global packCorr

    thresholds = json.loads(request.get_data().decode('utf8').replace("'", '"'))
    thresholdsPos = thresholds['PositiveValue']
    thresholdsNeg = thresholds['NegativeValue']

    quadrant1, quadrant2, quadrant3, quadrant4 = [], [], [], []
    for index, probabilities in enumerate(yPredictProb):
        # probability (in percent) that the model assigns to the true class
        value = probabilities[yData[index]] * 100
        if (value > 50 and value > thresholdsPos):
            quadrant1.append(index)
        elif (value > 50 and value <= thresholdsPos):
            quadrant2.append(index)
        elif (value <= 50 and value > thresholdsNeg):
            quadrant3.append(index)
        else:
            quadrant4.append(index)

    stats = [
        _quadrant_statistics(row_ids)
        for row_ids in (quadrant1, quadrant2, quadrant3, quadrant4)
    ]

    packed = [
        list(XData.columns.values.tolist()),
        json.dumps(target_names),
    ]
    for field in ('featureCorr', 'classCorr', 'targetCorr'):
        packed.extend(entry[field].to_json() for entry in stats)
    packed.extend(json.dumps(entry['classes']) for entry in stats)
    packed.extend(entry['vif'].to_json() for entry in stats)
    packed.extend(json.dumps(entry['mi']) for entry in stats)
    packCorr = packed

    return 'Everything Okay'
@app.route('/data/returnCorrelations', methods=["GET", "POST"])
def SendCorrel():
    """Return the current contents of the global ``packCorr`` as a JSON response."""
    return jsonify({'correlResul': packCorr})
def unique(list1):
    """Return the distinct elements of ``list1`` in order of first appearance.

    Args:
        list1: any iterable.

    Returns:
        A new list holding each distinct element once.
    """
    # materialise once so the fallback below still sees every element
    items = list(list1)
    try:
        # dict keys keep insertion order, giving a single linear pass
        return list(dict.fromkeys(items))
    except TypeError:
        # unhashable elements (e.g. lists): fall back to the pairwise scan
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list