FeatureEnVi: Visual Analytics for Feature Engineering Using Stepwise Selection and Semi-Automatic Extraction Approaches https://doi.org/10.1109/TVCG.2022.3141040
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
FeatureEnVi/run.py

472 lines
12 KiB

4 years ago
from flask import Flask, render_template, jsonify, request
from flask_pymongo import PyMongo
from flask_cors import CORS, cross_origin
import json
import copy
import warnings
import re
import random
import math
import pandas as pd
import numpy as np
import multiprocessing
from joblib import Memory
4 years ago
from sklearn.svm import SVC
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_validate
4 years ago
from sklearn.model_selection import cross_val_predict
# this block of code is for the connection between the server, the database, and the client (plus routing)
# access MongoDB
app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb://localhost:27017/mydb"
mongo = PyMongo(app)
cors = CORS(app, resources={r"/data/*": {"origins": "*"}})
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/Reset', methods=["GET", "POST"])
def reset():
global DataRawLength
global DataResultsRaw
global previousState
previousState = []
global filterActionFinal
filterActionFinal = ''
global keySpecInternal
keySpecInternal = 1
global dataSpacePointsIDs
dataSpacePointsIDs = []
global previousStateActive
previousStateActive = []
global RANDOM_SEED
RANDOM_SEED = 42
global KNNModelsCount
global LRModelsCount
global keyData
keyData = 0
KNNModelsCount = 0
LRModelsCount = 100
global XData
XData = []
global yData
yData = []
global XDataStored
XDataStored = []
global yDataStored
yDataStored = []
global detailsParams
detailsParams = []
global algorithmList
algorithmList = []
global ClassifierIDsList
ClassifierIDsList = ''
# Initializing models
global resultsList
resultsList = []
global RetrieveModelsList
RetrieveModelsList = []
global allParametersPerformancePerModel
allParametersPerformancePerModel = []
global allParametersPerfCrossMutr
allParametersPerfCrossMutr = []
global HistoryPreservation
HistoryPreservation = []
global all_classifiers
all_classifiers = []
global crossValidation
4 years ago
crossValidation = 10
4 years ago
# models
global KNNModels
KNNModels = []
global RFModels
RFModels = []
global scoring
scoring = {'accuracy': 'accuracy', 'precision_micro': 'precision_micro', 'precision_macro': 'precision_macro', 'precision_weighted': 'precision_weighted', 'recall_micro': 'recall_micro', 'recall_macro': 'recall_macro', 'recall_weighted': 'recall_weighted', 'roc_auc_ovo_weighted': 'roc_auc_ovo_weighted'}
global loopFeatures
loopFeatures = 2
global results
results = []
global resultsMetrics
resultsMetrics = []
global parametersSelData
parametersSelData = []
global target_names
target_names = []
global target_namesLoc
target_namesLoc = []
return 'The reset was done!'
# retrieve data from client and select the correct data set
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRequest', methods=["GET", "POST"])
def retrieveFileName():
global DataRawLength
global DataResultsRaw
global DataResultsRawTest
global DataRawLengthTest
fileName = request.get_data().decode('utf8').replace("'", '"')
global keySpecInternal
keySpecInternal = 1
global filterActionFinal
filterActionFinal = ''
global dataSpacePointsIDs
dataSpacePointsIDs = []
global RANDOM_SEED
RANDOM_SEED = 42
global keyData
keyData = 0
global XData
XData = []
global previousState
previousState = []
global previousStateActive
previousStateActive = []
global yData
yData = []
global XDataStored
XDataStored = []
global yDataStored
yDataStored = []
global filterDataFinal
filterDataFinal = 'mean'
global ClassifierIDsList
ClassifierIDsList = ''
global algorithmList
algorithmList = []
global detailsParams
detailsParams = []
# Initializing models
global RetrieveModelsList
RetrieveModelsList = []
global resultsList
resultsList = []
global allParametersPerformancePerModel
allParametersPerformancePerModel = []
global allParametersPerfCrossMutr
allParametersPerfCrossMutr = []
global HistoryPreservation
HistoryPreservation = []
global all_classifiers
all_classifiers = []
global crossValidation
crossValidation = 5
global scoring
scoring = {'accuracy': 'accuracy', 'precision_weighted': 'precision_weighted', 'recall_weighted': 'recall_weighted', 'f1_weighted': 'f1_weighted', 'roc_auc_ovo_weighted': 'roc_auc_ovo_weighted'}
global loopFeatures
loopFeatures = 2
# models
global KNNModels
global SVCModels
global GausNBModels
global MLPModels
global LRModels
global LDAModels
global QDAModels
global RFModels
global ExtraTModels
global AdaBModels
global GradBModels
KNNModels = []
SVCModels = []
GausNBModels = []
MLPModels = []
LRModels = []
LDAModels = []
QDAModels = []
RFModels = []
ExtraTModels = []
AdaBModels = []
GradBModels = []
global results
results = []
global resultsMetrics
resultsMetrics = []
global parametersSelData
parametersSelData = []
global StanceTest
StanceTest = False
global target_names
target_names = []
global target_namesLoc
target_namesLoc = []
DataRawLength = -1
DataRawLengthTest = -1
data = json.loads(fileName)
if data['fileName'] == 'HeartC':
CollectionDB = mongo.db.HeartC.find()
elif data['fileName'] == 'StanceC':
StanceTest = True
CollectionDB = mongo.db.StanceC.find()
CollectionDBTest = mongo.db.StanceCTest.find()
elif data['fileName'] == 'DiabetesC':
CollectionDB = mongo.db.DiabetesC.find()
else:
CollectionDB = mongo.db.IrisC.find()
DataResultsRaw = []
for index, item in enumerate(CollectionDB):
item['_id'] = str(item['_id'])
item['InstanceID'] = index
DataResultsRaw.append(item)
DataRawLength = len(DataResultsRaw)
DataResultsRawTest = []
if (StanceTest):
for index, item in enumerate(CollectionDBTest):
item['_id'] = str(item['_id'])
item['InstanceID'] = index
DataResultsRawTest.append(item)
DataRawLengthTest = len(DataResultsRawTest)
dataSetSelection()
return 'Everything is okay'
# Retrieve data set from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/SendtoSeverDataSet', methods=["GET", "POST"])
def sendToServerData():
uploadedData = request.get_data().decode('utf8').replace("'", '"')
uploadedDataParsed = json.loads(uploadedData)
DataResultsRaw = uploadedDataParsed['uploadedData']
DataResults = copy.deepcopy(DataResultsRaw)
for dictionary in DataResultsRaw:
for key in dictionary.keys():
if (key.find('*') != -1):
target = key
continue
continue
DataResultsRaw.sort(key=lambda x: x[target], reverse=True)
DataResults.sort(key=lambda x: x[target], reverse=True)
for dictionary in DataResults:
del dictionary[target]
global AllTargets
global target_names
global target_namesLoc
AllTargets = [o[target] for o in DataResultsRaw]
AllTargetsFloatValues = []
previous = None
Class = 0
for i, value in enumerate(AllTargets):
if (i == 0):
previous = value
target_names.append(value)
if (value == previous):
AllTargetsFloatValues.append(Class)
else:
Class = Class + 1
target_names.append(value)
AllTargetsFloatValues.append(Class)
previous = value
ArrayDataResults = pd.DataFrame.from_dict(DataResults)
global XData, yData, RANDOM_SEED
XData, yData = ArrayDataResults, AllTargetsFloatValues
global XDataStored, yDataStored
XDataStored = XData.copy()
yDataStored = yData.copy()
return 'Processed uploaded data set'
def dataSetSelection():
global XDataTest, yDataTest
XDataTest = pd.DataFrame()
global StanceTest
global AllTargets
global target_names
target_namesLoc = []
if (StanceTest):
DataResultsTest = copy.deepcopy(DataResultsRawTest)
for dictionary in DataResultsRawTest:
for key in dictionary.keys():
if (key.find('*') != -1):
target = key
continue
continue
DataResultsRawTest.sort(key=lambda x: x[target], reverse=True)
DataResultsTest.sort(key=lambda x: x[target], reverse=True)
for dictionary in DataResultsTest:
del dictionary['_id']
del dictionary['InstanceID']
del dictionary[target]
AllTargetsTest = [o[target] for o in DataResultsRawTest]
AllTargetsFloatValuesTest = []
previous = None
Class = 0
for i, value in enumerate(AllTargetsTest):
if (i == 0):
previous = value
target_namesLoc.append(value)
if (value == previous):
AllTargetsFloatValuesTest.append(Class)
else:
Class = Class + 1
target_namesLoc.append(value)
AllTargetsFloatValuesTest.append(Class)
previous = value
ArrayDataResultsTest = pd.DataFrame.from_dict(DataResultsTest)
XDataTest, yDataTest = ArrayDataResultsTest, AllTargetsFloatValuesTest
DataResults = copy.deepcopy(DataResultsRaw)
for dictionary in DataResultsRaw:
for key in dictionary.keys():
if (key.find('*') != -1):
target = key
continue
continue
DataResultsRaw.sort(key=lambda x: x[target], reverse=True)
DataResults.sort(key=lambda x: x[target], reverse=True)
for dictionary in DataResults:
del dictionary['_id']
del dictionary['InstanceID']
del dictionary[target]
AllTargets = [o[target] for o in DataResultsRaw]
AllTargetsFloatValues = []
previous = None
Class = 0
for i, value in enumerate(AllTargets):
if (i == 0):
previous = value
target_names.append(value)
if (value == previous):
AllTargetsFloatValues.append(Class)
else:
Class = Class + 1
target_names.append(value)
AllTargetsFloatValues.append(Class)
previous = value
ArrayDataResults = pd.DataFrame.from_dict(DataResults)
global XData, yData, RANDOM_SEED
XData, yData = ArrayDataResults, AllTargetsFloatValues
global XDataStored, yDataStored
XDataStored = XData.copy()
yDataStored = yData.copy()
warnings.simplefilter('ignore')
4 years ago
executeModel()
return 'Everything is okay'
4 years ago
4 years ago
def create_global_function():
global estimator
def estimator(C, gamma):
# initialize model
model = SVC(C=C, gamma=gamma, degree=1, random_state=RANDOM_SEED)
# set in cross-validation
result = cross_validate(model, XData, yData, cv=crossValidation)
# result is mean of test_score
return np.mean(result['test_score'])
4 years ago
location = './cachedir'
memory = Memory(location, verbose=0)
@memory.cache
4 years ago
def executeModel():
4 years ago
4 years ago
create_global_function()
global estimator
4 years ago
4 years ago
params = {"C": (0.0001, 10000), "gamma": (0.0001, 10000)}
svc_bayesopt = BayesianOptimization(estimator, params)
svc_bayesopt.maximize(init_points=5, n_iter=20, acq='ucb')
bestParams = svc_bayesopt.max['params']
estimator = SVC(C=bestParams.get('C'), gamma=bestParams.get('gamma'), probability=True)
estimator.fit(XData, yData)
yPredict = estimator.predict(XData)
yPredictProb = cross_val_predict(estimator, XData, yData, cv=crossValidation, method='predict_proba')
4 years ago
4 years ago
return 'Everything Okay'