StackGenVis: Alignment of Data, Algorithms, and Models for Stacking Ensemble Learning Using Performance Metrics https://doi.org/10.1109/TVCG.2020.3030352
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
StackGenVis/run.py

1259 lines
46 KiB

from flask import Flask, render_template, jsonify, request
from flask_pymongo import PyMongo
from flask_cors import CORS, cross_origin
import json
import collections
import numpy as np
import re
from numpy import array
import pandas as pd
import warnings
import copy
from joblib import Memory
from itertools import chain
import ast
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn import model_selection
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from skdist.distribute.search import DistGridSearchCV
from pyspark.sql import SparkSession
from scipy.spatial import procrustes
# This block of code is for the connection between the server, the database, and the client (plus routing).
# Access MongoDB
app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb://localhost:27017/mydb"
mongo = PyMongo(app)
cors = CORS(app, resources={r"/data/*": {"origins": "*"}})
# Retrieve data from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/Reset', methods=["GET", "POST"])
def Reset():
global DataRawLength
global DataResultsRaw
global RANDOM_SEED
RANDOM_SEED = 42
global factors
factors = [1,1,1,1,1]
global XData
XData = []
global yData
yData = []
global detailsParams
detailsParams = []
global algorithmList
algorithmList = []
global ClassifierIDsList
ClassifierIDsList = ''
# Initializing models
global resultsList
resultsList = []
global RetrieveModelsList
RetrieveModelsList = []
global allParametersPerformancePerModel
allParametersPerformancePerModel = []
global all_classifiers
all_classifiers = []
global crossValidation
crossValidation = 3
global scoring
#scoring = {'accuracy': 'accuracy', 'f1_macro': 'f1_weighted', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'jaccard': 'jaccard_weighted', 'neg_log_loss': 'neg_log_loss', 'r2': 'r2', 'neg_mean_absolute_error': 'neg_mean_absolute_error', 'neg_mean_absolute_error': 'neg_mean_absolute_error'}
scoring = {'accuracy': 'accuracy', 'f1_macro': 'f1_weighted', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'jaccard': 'jaccard_weighted'}
global loopFeatures
loopFeatures = 2
global results
results = []
global resultsMetrics
resultsMetrics = []
global parametersSelData
parametersSelData = []
global target_names
target_names = []
return 'The reset was done!'
# Retrieve data from client and select the correct data set
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRequest', methods=["GET", "POST"])
def RetrieveFileName():
fileName = request.get_data().decode('utf8').replace("'", '"')
#global featureSelection
#featureSelection = request.get_data().decode('utf8').replace("'", '"')
#featureSelection = json.loads(featureSelection)
global DataRawLength
global DataResultsRaw
global RANDOM_SEED
RANDOM_SEED = 42
global XData
XData = []
global yData
yData = []
global ClassifierIDsList
ClassifierIDsList = ''
global algorithmList
algorithmList = []
global detailsParams
detailsParams = []
# Initializing models
global RetrieveModelsList
RetrieveModelsList = []
global resultsList
resultsList = []
global allParametersPerformancePerModel
allParametersPerformancePerModel = []
global all_classifiers
all_classifiers = []
global crossValidation
crossValidation = 3
global scoring
scoring = {'accuracy': 'accuracy', 'f1_macro': 'f1_weighted', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'jaccard': 'jaccard_weighted'}
#scoring = {'accuracy': 'accuracy', 'f1_macro': 'f1_weighted', 'precision': 'precision_weighted', 'recall': 'recall_weighted', 'jaccard': 'jaccard_weighted', 'neg_log_loss': 'neg_log_loss', 'r2': 'r2', 'neg_mean_absolute_error': 'neg_mean_absolute_error', 'neg_mean_absolute_error': 'neg_mean_absolute_error'}
global NumberofscoringMetrics
NumberofscoringMetrics = len(scoring)
global loopFeatures
loopFeatures = 2
global factors
factors = [1,1,1,1,1]
global results
results = []
global resultsMetrics
resultsMetrics = []
global parametersSelData
parametersSelData = []
global target_names
target_names = []
DataRawLength = -1
data = json.loads(fileName)
if data['fileName'] == 'BreastC':
CollectionDB = mongo.db.BreastC.find()
elif data['fileName'] == 'DiabetesC':
CollectionDB = mongo.db.DiabetesC.find()
else:
CollectionDB = mongo.db.IrisC.find()
DataResultsRaw = []
for index, item in enumerate(CollectionDB):
item['_id'] = str(item['_id'])
item['InstanceID'] = index
DataResultsRaw.append(item)
DataRawLength = len(DataResultsRaw)
DataSetSelection()
return 'Everything is okay'
# Sent data to client
@app.route('/data/ClientRequest', methods=["GET", "POST"])
def CollectionData():
json.dumps(DataResultsRaw)
response = {
'Collection': DataResultsRaw
}
return jsonify(response)
def DataSetSelection():
DataResults = copy.deepcopy(DataResultsRaw)
for dictionary in DataResultsRaw:
for key in dictionary.keys():
if (key.find('*') != -1):
target = key
continue
continue
DataResultsRaw.sort(key=lambda x: x[target], reverse=True)
DataResults.sort(key=lambda x: x[target], reverse=True)
for dictionary in DataResults:
del dictionary['_id']
del dictionary['InstanceID']
del dictionary[target]
AllTargets = [o[target] for o in DataResultsRaw]
AllTargetsFloatValues = []
previous = None
Class = 0
for i, value in enumerate(AllTargets):
if (i == 0):
previous = value
target_names.append(value)
if (value == previous):
AllTargetsFloatValues.append(Class)
else:
Class = Class + 1
target_names.append(value)
AllTargetsFloatValues.append(Class)
previous = value
ArrayDataResults = pd.DataFrame.from_dict(DataResults)
global XData, yData, RANDOM_SEED
XData, yData = ArrayDataResults, AllTargetsFloatValues
warnings.simplefilter('ignore')
return 'Everything is okay'
# Main function
if __name__ == '__main__':
app.run()
# Debugging and mirroring client
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def catch_all(path):
if app.debug:
return requests.get('http://localhost:8080/{}'.format(path)).text
return render_template("index.html")
# This block of code is for server computations
def column_index(df, query_cols):
cols = df.columns.values
sidx = np.argsort(cols)
return sidx[np.searchsorted(cols,query_cols,sorter=sidx)].tolist()
def class_feature_importance(X, Y, feature_importances):
N, M = X.shape
X = scale(X)
out = {}
for c in set(Y):
out[c] = dict(
zip(range(N), np.mean(X[Y==c, :], axis=0)*feature_importances)
)
return out
# Initialize every model for each algorithm
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRequestSelParameters', methods=["GET", "POST"])
def RetrieveModel():
# get the models from the frontend
RetrievedModel = request.get_data().decode('utf8').replace("'", '"')
RetrievedModel = json.loads(RetrievedModel)
global algorithms
algorithms = RetrievedModel['Algorithms']
# loop through the algorithms
global allParametersPerformancePerModel
for eachAlgor in algorithms:
if (eachAlgor) == 'KNN':
clf = KNeighborsClassifier()
params = {'n_neighbors': list(range(1, 25)), 'weights': ['uniform', 'distance'], 'algorithm': ['brute', 'kd_tree', 'ball_tree'], 'metric': ['chebyshev', 'manhattan', 'euclidean', 'minkowski']}
AlgorithmsIDsEnd = 0
else:
clf = RandomForestClassifier()
params = {'n_estimators': list(range(40, 120)), 'criterion': ['gini', 'entropy']}
AlgorithmsIDsEnd = 576
allParametersPerformancePerModel = GridSearchForModels(clf, params, eachAlgor, factors, AlgorithmsIDsEnd)
# call the function that sends the results to the frontend
SendEachClassifiersPerformanceToVisualize()
return 'Everything Okay'
location = './cachedir'
memory = Memory(location, verbose=0)
# calculating for all algorithms and models the performance and other results
@memory.cache
def GridSearchForModels(clf, params, eachAlgor, factors, AlgorithmsIDsEnd):
# instantiate spark session
spark = (
SparkSession
.builder
.getOrCreate()
)
sc = spark.sparkContext
# this is the grid we use to train the models
grid = DistGridSearchCV(
estimator=clf, param_grid=params,
sc=sc, cv=crossValidation, refit='accuracy', scoring=scoring,
verbose=0, n_jobs=-1)
# fit and extract the probabilities
grid.fit(XData, yData)
# process the results
cv_results = []
cv_results.append(grid.cv_results_)
df_cv_results = pd.DataFrame.from_dict(cv_results)
# number of models stored
number_of_models = len(df_cv_results.iloc[0][0])
# initialize results per row
df_cv_results_per_row = []
# loop through number of models
modelsIDs = []
for i in range(number_of_models):
modelsIDs.append(AlgorithmsIDsEnd+i)
# initialize results per item
df_cv_results_per_item = []
for column in df_cv_results.iloc[0]:
df_cv_results_per_item.append(column[i])
df_cv_results_per_row.append(df_cv_results_per_item)
# store the results into a pandas dataframe
df_cv_results_classifiers = pd.DataFrame(data = df_cv_results_per_row, columns= df_cv_results.columns)
# copy and filter in order to get only the metrics
metrics = df_cv_results_classifiers.copy()
metrics = metrics.filter(['mean_test_accuracy','mean_test_f1_macro','mean_test_precision','mean_test_recall','mean_test_jaccard'])
# control the factors
sumperModel = []
for index, row in metrics.iterrows():
rowSum = 0
lengthFactors = NumberofscoringMetrics
for loop,elements in enumerate(row):
lengthFactors = lengthFactors - 1 + factors[loop]
rowSum = elements*factors[loop] + rowSum
if lengthFactors is 0:
sumperModel = 0
else:
sumperModel.append(rowSum/lengthFactors)
# summarize all models metrics
summarizedMetrics = pd.DataFrame(sumperModel)
summarizedMetrics.rename(columns={0:'sum'})
# concat parameters and performance
parameters = pd.DataFrame(df_cv_results_classifiers['params'])
parametersPerformancePerModel = pd.concat([summarizedMetrics, parameters], axis=1)
parametersPerformancePerModel = parametersPerformancePerModel.to_json()
parametersLocal = json.loads(parametersPerformancePerModel)['params'].copy()
Models = []
for index, items in enumerate(parametersLocal):
Models.append(str(index))
parametersLocalNew = [ parametersLocal[your_key] for your_key in Models ]
permList = []
PerFeatureAccuracy = []
PerFeatureAccuracyAll = []
PerClassMetric = []
perModelProb = []
for eachModelParameters in parametersLocalNew:
clf.set_params(**eachModelParameters)
perm = PermutationImportance(clf, cv = None, refit = True, n_iter = 25).fit(XData, yData)
permList.append(perm.feature_importances_)
n_feats = XData.shape[1]
PerFeatureAccuracy = []
for i in range(n_feats):
scores = model_selection.cross_val_score(clf, XData.values[:, i].reshape(-1, 1), yData, cv=crossValidation)
PerFeatureAccuracy.append(scores.mean())
PerFeatureAccuracyAll.append(PerFeatureAccuracy)
clf.fit(XData, yData)
yPredict = clf.predict(XData)
# retrieve target names (class names)
PerClassMetric.append(classification_report(yData, yPredict, target_names=target_names, digits=2, output_dict=True))
yPredictProb = clf.predict_proba(XData)
perModelProb.append(yPredictProb.tolist())
perModelProbPandas = pd.DataFrame(perModelProb)
perModelProbPandas = perModelProbPandas.to_json()
PerClassMetricPandas = pd.DataFrame(PerClassMetric)
del PerClassMetricPandas['accuracy']
del PerClassMetricPandas['macro avg']
del PerClassMetricPandas['weighted avg']
PerClassMetricPandas = PerClassMetricPandas.to_json()
perm_imp_eli5PD = pd.DataFrame(permList)
perm_imp_eli5PD = perm_imp_eli5PD.to_json()
PerFeatureAccuracyPandas = pd.DataFrame(PerFeatureAccuracyAll)
PerFeatureAccuracyPandas = PerFeatureAccuracyPandas.to_json()
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(XData,yData)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(XData.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
featureScores = featureScores.to_json()
# gather the results and send them back
results.append(modelsIDs) # Position: 0 and so on
results.append(parametersPerformancePerModel) # Position: 1 and so on
results.append(PerClassMetricPandas) # Position: 2 and so on
results.append(PerFeatureAccuracyPandas) # Position: 3 and so on
results.append(perm_imp_eli5PD) # Position: 4 and so on
results.append(featureScores) # Position: 5 and so on
metrics = metrics.to_json()
results.append(metrics) # Position: 6 and so on
results.append(perModelProbPandas) # Position: 7 and so on
return results
# Sending each model's results to frontend
@app.route('/data/PerformanceForEachModel', methods=["GET", "POST"])
def SendEachClassifiersPerformanceToVisualize():
response = {
'PerformancePerModel': allParametersPerformancePerModel,
}
return jsonify(response)
def Remove(duplicate):
final_list = []
for num in duplicate:
if num not in final_list:
if (isinstance(num, float)):
if np.isnan(num):
pass
else:
final_list.append(int(num))
else:
final_list.append(num)
return final_list
# Retrieve data from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/SendBrushedParam', methods=["GET", "POST"])
def RetrieveModelsParam():
RetrieveModelsPar = request.get_data().decode('utf8').replace("'", '"')
RetrieveModelsPar = json.loads(RetrieveModelsPar)
counter1 = 0
counter2 = 0
global KNNModels
KNNModels = []
global RFModels
RFModels = []
global algorithmsList
algorithmsList = RetrieveModelsPar['algorithms']
for index, items in enumerate(algorithmsList):
if (items == 'KNN'):
counter1 = counter1 + 1
KNNModels.append(int(RetrieveModelsPar['models'][index]))
else:
counter2 = counter2 + 1
RFModels.append(int(RetrieveModelsPar['models'][index]))
return 'Everything Okay'
# Retrieve data from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/factors', methods=["GET", "POST"])
def RetrieveFactors():
Factors = request.get_data().decode('utf8').replace("'", '"')
FactorsInt = json.loads(Factors)
factors = FactorsInt['Factors']
global sumPerClassifierSel
global ModelSpaceMDSNew
global ModelSpaceTSNENew
global metricsPerModel
sumPerClassifierSel = []
sumPerClassifierSel = preProcsumPerMetric(factors)
ModelSpaceMDSNew = []
ModelSpaceTSNENew = []
loopThroughMetrics = PreprocessingMetrics()
metricsPerModel = preProcMetricsAllAndSel()
flagLocal = 0
countRemovals = 0
for l,el in enumerate(factors):
if el is 0:
loopThroughMetrics.drop(loopThroughMetrics.columns[[l-countRemovals]], axis=1, inplace=True)
countRemovals = countRemovals + 1
flagLocal = 1
if flagLocal is 1:
ModelSpaceMDSNew = FunMDS(loopThroughMetrics)
ModelSpaceTSNENew = FunTsne(loopThroughMetrics)
ModelSpaceTSNENew = ModelSpaceTSNENew.tolist()
return 'Everything Okay'
@app.route('/data/UpdateOverv', methods=["GET", "POST"])
def UpdateOverview():
ResultsUpdateOverview = []
ResultsUpdateOverview.append(sumPerClassifierSel)
ResultsUpdateOverview.append(ModelSpaceMDSNew)
ResultsUpdateOverview.append(ModelSpaceTSNENew)
ResultsUpdateOverview.append(metricsPerModel)
response = {
'Results': ResultsUpdateOverview
}
return jsonify(response)
def PreprocessingMetrics():
dicKNN = json.loads(allParametersPerformancePerModel[6])
dicRF = json.loads(allParametersPerformancePerModel[14])
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
df_concatMetrics = pd.concat([dfKNNFiltered, dfRFFiltered])
return df_concatMetrics
def PreprocessingPred():
dicKNN = json.loads(allParametersPerformancePerModel[7])
dicRF = json.loads(allParametersPerformancePerModel[15])
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
df_concatProbs = pd.concat([dfKNNFiltered, dfRFFiltered])
predictions = []
for column, content in df_concatProbs.items():
el = [sum(x)/len(x) for x in zip(*content)]
predictions.append(el)
return predictions
def PreprocessingPredUpdate(Models):
Models = json.loads(Models)
ModelsList= []
for loop in Models['ClassifiersList']:
temp = [int(s) for s in re.findall(r'\b\d+\b', loop)]
ModelsList.append(temp[0])
dicKNN = json.loads(allParametersPerformancePerModel[7])
dicRF = json.loads(allParametersPerformancePerModel[15])
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
df_concatProbs = pd.concat([dfKNNFiltered, dfRFFiltered])
listProbs = df_concatProbs.index.values.tolist()
deletedElements = 0
for index, element in enumerate(listProbs):
if element in ModelsList:
index = index - deletedElements
df_concatProbs = df_concatProbs.drop(df_concatProbs.index[index])
deletedElements = deletedElements + 1
df_concatProbsCleared = df_concatProbs
listIDsRemaining = df_concatProbsCleared.index.values.tolist()
predictionsAll = PreprocessingPred()
PredictionSpaceAll = FunTsne(predictionsAll)
predictionsSel = []
for column, content in df_concatProbsCleared.items():
el = [sum(x)/len(x) for x in zip(*content)]
predictionsSel.append(el)
PredictionSpaceSel = FunTsne(predictionsSel)
#ModelSpaceMDSNewComb = [list(a) for a in zip(PredictionSpaceAll[0], ModelSpaceMDS[1])]
#ModelSpaceMDSNewSel = FunMDS(df_concatMetrics)
#ModelSpaceMDSNewSelComb = [list(a) for a in zip(ModelSpaceMDSNewSel[0], ModelSpaceMDSNewSel[1])]
mtx2PredFinal = []
mtx1Pred, mtx2Pred, disparity2 = procrustes(PredictionSpaceAll, PredictionSpaceSel)
a1, b1 = zip(*mtx2Pred)
mtx2PredFinal.append(a1)
mtx2PredFinal.append(b1)
return [mtx2PredFinal,listIDsRemaining]
def PreprocessingParam():
dicKNN = json.loads(allParametersPerformancePerModel[1])
dicRF = json.loads(allParametersPerformancePerModel[9])
dicKNN = dicKNN['params']
dicRF = dicRF['params']
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN = dfKNN.T
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF = dfRF.T
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
df_params = pd.concat([dfKNNFiltered, dfRFFiltered])
return df_params
def PreprocessingParamSep():
dicKNN = json.loads(allParametersPerformancePerModel[1])
dicRF = json.loads(allParametersPerformancePerModel[9])
dicKNN = dicKNN['params']
dicRF = dicRF['params']
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN = dfKNN.T
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF = dfRF.T
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
return [dfKNNFiltered,dfRFFiltered]
def preProcessPerClassM():
dicKNN = json.loads(allParametersPerformancePerModel[2])
dicRF = json.loads(allParametersPerformancePerModel[10])
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
df_concatParams = pd.concat([dfKNNFiltered, dfRFFiltered])
return df_concatParams
def preProcessFeatAcc():
dicKNN = json.loads(allParametersPerformancePerModel[3])
dicRF = json.loads(allParametersPerformancePerModel[11])
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
df_featAcc = pd.concat([dfKNNFiltered, dfRFFiltered])
return df_featAcc
def preProcessPerm():
dicKNN = json.loads(allParametersPerformancePerModel[4])
dicRF = json.loads(allParametersPerformancePerModel[12])
dfKNN = pd.DataFrame.from_dict(dicKNN)
dfKNN.index = dfKNN.index.astype(int)
dfKNNFiltered = dfKNN.loc[KNNModels, :]
dfRF = pd.DataFrame.from_dict(dicRF)
dfRF.index = dfRF.index.astype(int) + 576
dfRFFiltered = dfRF.loc[RFModels, :]
df_perm = pd.concat([dfKNNFiltered, dfRFFiltered])
return df_perm
def preProcessFeatSc():
dicKNN = json.loads(allParametersPerformancePerModel[5])
dfKNN = pd.DataFrame.from_dict(dicKNN)
return dfKNN
def preProcsumPerMetric(factors):
sumPerClassifier = []
loopThroughMetrics = PreprocessingMetrics()
for row in loopThroughMetrics.iterrows():
rowSum = 0
lengthFactors = len(scoring)
name, values = row
for loop, elements in enumerate(values):
lengthFactors = lengthFactors - 1 + factors[loop]
rowSum = elements*factors[loop] + rowSum
if lengthFactors is 0:
sumPerClassifier = 0
else:
sumPerClassifier.append(rowSum/lengthFactors)
return sumPerClassifier
def preProcMetricsAllAndSel():
loopThroughMetrics = PreprocessingMetrics()
metricsPerModelColl = []
metricsPerModelColl.append(loopThroughMetrics['mean_test_accuracy'].sum()/loopThroughMetrics['mean_test_accuracy'].count())
metricsPerModelColl.append(loopThroughMetrics['mean_test_f1_macro'].sum()/loopThroughMetrics['mean_test_f1_macro'].count())
metricsPerModelColl.append(loopThroughMetrics['mean_test_precision'].sum()/loopThroughMetrics['mean_test_precision'].count())
metricsPerModelColl.append(loopThroughMetrics['mean_test_recall'].sum()/loopThroughMetrics['mean_test_recall'].count())
metricsPerModelColl.append(loopThroughMetrics['mean_test_jaccard'].sum()/loopThroughMetrics['mean_test_jaccard'].count())
for index, metric in enumerate(metricsPerModelColl):
metricsPerModelColl[index] = metric*factors[index]
return metricsPerModelColl
def preProceModels():
models = KNNModels + RFModels
return models
def FunMDS (data):
mds = MDS(n_components=2, random_state=RANDOM_SEED)
XTransformed = mds.fit_transform(data).T
XTransformed = XTransformed.tolist()
return XTransformed
def FunTsne (data):
tsne = TSNE(n_components=2).fit_transform(data)
tsne.shape
return tsne
def InitializeEnsemble():
XModels = PreprocessingMetrics()
DataSpace = FunTsne(XData)
DataSpaceList = DataSpace.tolist()
global ModelSpaceMDS
global ModelSpaceTSNE
ModelSpaceMDS = FunMDS(XModels)
ModelSpaceTSNE = FunTsne(XModels)
ModelSpaceTSNE = ModelSpaceTSNE.tolist()
PredictionProbSel = PreprocessingPred()
PredictionSpace = FunTsne(PredictionProbSel)
PredictionSpaceList = PredictionSpace.tolist()
ModelsIDs = preProceModels()
key = 0
EnsembleModel(ModelsIDs, key)
ReturnResults(ModelSpaceMDS,ModelSpaceTSNE,DataSpaceList,PredictionSpaceList)
def ReturnResults(ModelSpaceMDS,ModelSpaceTSNE,DataSpaceList,PredictionSpaceList):
global Results
Results = []
parametersGen = PreprocessingParam()
PerClassMetrics = preProcessPerClassM()
FeatureAccuracy = preProcessFeatAcc()
perm_imp_eli5PDCon = preProcessPerm()
featureScoresCon = preProcessFeatSc()
metricsPerModel = preProcMetricsAllAndSel()
sumPerClassifier = preProcsumPerMetric(factors)
ModelsIDs = preProceModels()
parametersGenPD = parametersGen.to_json(orient='records')
PerClassMetrics = PerClassMetrics.to_json(orient='records')
FeatureAccuracy = FeatureAccuracy.to_json(orient='records')
perm_imp_eli5PDCon = perm_imp_eli5PDCon.to_json(orient='records')
featureScoresCon = featureScoresCon.to_json(orient='records')
XDataJSONEntireSet = XData.to_json(orient='records')
XDataJSON = XData.columns.tolist()
Results.append(json.dumps(sumPerClassifier)) # Position: 0
Results.append(json.dumps(ModelSpaceMDS)) # Position: 1
Results.append(json.dumps(parametersGenPD)) # Position: 2
Results.append(PerClassMetrics) # Position: 3
Results.append(json.dumps(target_names)) # Position: 4
Results.append(FeatureAccuracy) # Position: 5
Results.append(json.dumps(XDataJSON)) # Position: 6
Results.append(json.dumps(DataSpaceList)) # Position: 7
Results.append(json.dumps(PredictionSpaceList)) # Position: 8
Results.append(json.dumps(metricsPerModel)) # Position: 9
Results.append(perm_imp_eli5PDCon) # Position: 10
Results.append(featureScoresCon) # Position: 11
Results.append(json.dumps(ModelSpaceTSNE)) # Position: 12
Results.append(json.dumps(ModelsIDs)) # Position: 13
Results.append(json.dumps(XDataJSONEntireSet)) # Position: 14
Results.append(json.dumps(yData)) # Position: 15
return Results
# Sending the overview classifiers' results to be visualized as a scatterplot
@app.route('/data/PlotClassifiers', methods=["GET", "POST"])
def SendToPlot():
while (len(DataResultsRaw) != DataRawLength):
pass
InitializeEnsemble()
response = {
'OverviewResults': Results
}
return jsonify(response)
# Retrieve data from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRemoveFromStack', methods=["GET", "POST"])
def RetrieveSelClassifiersIDandRemoveFromStack():
ClassifierIDsList = request.get_data().decode('utf8').replace("'", '"')
PredictionProbSelUpdate = PreprocessingPredUpdate(ClassifierIDsList)
global resultsUpdatePredictionSpace
resultsUpdatePredictionSpace = []
resultsUpdatePredictionSpace.append(json.dumps(PredictionProbSelUpdate[0])) # Position: 0
resultsUpdatePredictionSpace.append(json.dumps(PredictionProbSelUpdate[1]))
key = 3
EnsembleModel(ClassifierIDsList, key)
return 'Everything Okay'
# Sending the overview classifiers' results to be visualized as a scatterplot
@app.route('/data/UpdatePredictionsSpace', methods=["GET", "POST"])
def SendPredBacktobeUpdated():
response = {
'UpdatePredictions': resultsUpdatePredictionSpace
}
return jsonify(response)
# Retrieve data from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRequestSelPoin', methods=["GET", "POST"])
def RetrieveSelClassifiersID():
ClassifierIDsList = request.get_data().decode('utf8').replace("'", '"')
ComputeMetricsForSel(ClassifierIDsList)
key = 1
EnsembleModel(ClassifierIDsList, key)
return 'Everything Okay'
def ComputeMetricsForSel(Models):
Models = json.loads(Models)
MetricsAlltoSel = PreprocessingMetrics()
listofModels = []
for loop in Models['ClassifiersList']:
temp = [int(s) for s in re.findall(r'\b\d+\b', loop)]
listofModels.append(temp[0])
MetricsAlltoSel = MetricsAlltoSel.loc[listofModels,:]
global metricsPerModelCollSel
metricsPerModelCollSel = []
metricsPerModelCollSel.append(MetricsAlltoSel['mean_test_accuracy'].sum()/MetricsAlltoSel['mean_test_accuracy'].count())
metricsPerModelCollSel.append(MetricsAlltoSel['mean_test_f1_macro'].sum()/MetricsAlltoSel['mean_test_f1_macro'].count())
metricsPerModelCollSel.append(MetricsAlltoSel['mean_test_precision'].sum()/MetricsAlltoSel['mean_test_precision'].count())
metricsPerModelCollSel.append(MetricsAlltoSel['mean_test_recall'].sum()/MetricsAlltoSel['mean_test_recall'].count())
metricsPerModelCollSel.append(MetricsAlltoSel['mean_test_jaccard'].sum()/MetricsAlltoSel['mean_test_jaccard'].count())
for index, metric in enumerate(metricsPerModelCollSel):
metricsPerModelCollSel[index] = metric*factors[index]
return 'okay'
# Sending the overview classifiers' results to be visualized as a scatterplot
@app.route('/data/BarChartSelectedModels', methods=["GET", "POST"])
def SendToUpdateBarChart():
response = {
'SelectedMetricsForModels': metricsPerModelCollSel
}
return jsonify(response)
# Retrieve data from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRequestDataPoint', methods=["GET", "POST"])
def RetrieveSelDataPoints():
DataPointsSel = request.get_data().decode('utf8').replace("'", '"')
DataPointsSelClear = json.loads(DataPointsSel)
listofDataPoints = []
for loop in DataPointsSelClear['DataPointsSel']:
temp = [int(s) for s in re.findall(r'\b\d+\b', loop)]
listofDataPoints.append(temp[0])
paramsListSepPD = []
paramsListSepPD = PreprocessingParamSep()
paramsListSeptoDicKNN = paramsListSepPD[0].to_dict(orient='list')
paramsListSeptoDicRF = paramsListSepPD[1].to_dict(orient='list')
RetrieveParamsCleared = {}
RetrieveParamsClearedListKNN = []
for key, value in paramsListSeptoDicKNN.items():
withoutDuplicates = Remove(value)
RetrieveParamsCleared[key] = withoutDuplicates
RetrieveParamsClearedListKNN.append(RetrieveParamsCleared)
RetrieveParamsCleared = {}
RetrieveParamsClearedListRF = []
for key, value in paramsListSeptoDicRF.items():
withoutDuplicates = Remove(value)
RetrieveParamsCleared[key] = withoutDuplicates
RetrieveParamsClearedListRF.append(RetrieveParamsCleared)
if (len(paramsListSeptoDicKNN['n_neighbors']) is 0):
RetrieveParamsClearedListKNN = []
if (len(paramsListSeptoDicRF['n_estimators']) is 0):
RetrieveParamsClearedListRF = []
for eachAlgor in algorithms:
if (eachAlgor) == 'KNN':
clf = KNeighborsClassifier()
#params = {'n_neighbors': list(range(1, 25)), 'weights': ['uniform', 'distance'], 'algorithm': ['brute', 'kd_tree', 'ball_tree'], 'metric': ['chebyshev', 'manhattan', 'euclidean', 'minkowski']}
params = RetrieveParamsClearedListKNN
AlgorithmsIDsEnd = 0
else:
clf = RandomForestClassifier()
#params = {'n_estimators': list(range(40, 120)), 'criterion': ['gini', 'entropy']}
params = RetrieveParamsClearedListRF
AlgorithmsIDsEnd = 576
metricsSelList = GridSearchSel(clf, params, factors, AlgorithmsIDsEnd, listofDataPoints)
if (len(metricsSelList[0]) != 0 and len(metricsSelList[1]) != 0):
dicKNN = json.loads(metricsSelList[0])
dfKNN = pd.DataFrame.from_dict(dicKNN)
parametersSelDataPD = parametersSelData[0].apply(pd.Series)
set_diff_df = pd.concat([parametersSelDataPD, paramsListSepPD[0], paramsListSepPD[0]]).drop_duplicates(keep=False)
set_diff_df = set_diff_df.index.tolist()
if (len(set_diff_df) == 0):
dfKNNCleared = dfKNN
else:
dfKNNCleared = dfKNN.drop(dfKNN.index[set_diff_df])
dicRF = json.loads(metricsSelList[1])
dfRF = pd.DataFrame.from_dict(dicRF)
parametersSelDataPD = parametersSelData[1].apply(pd.Series)
set_diff_df = pd.concat([parametersSelDataPD, paramsListSepPD[1], paramsListSepPD[1]]).drop_duplicates(keep=False)
set_diff_df = set_diff_df.index.tolist()
if (len(set_diff_df) == 0):
dfRFCleared = dfRF
else:
dfRFCleared = dfRF.drop(dfRF.index[set_diff_df])
df_concatMetrics = pd.concat([dfKNNCleared, dfRFCleared])
else:
if (len(metricsSelList[0]) != 0):
dicKNN = json.loads(metricsSelList[0])
dfKNN = pd.DataFrame.from_dict(dicKNN)
parametersSelDataPD = parametersSelData[0].apply(pd.Series)
set_diff_df = pd.concat([parametersSelDataPD, paramsListSepPD[0], paramsListSepPD[0]]).drop_duplicates(keep=False)
set_diff_df = set_diff_df.index.tolist()
if (len(set_diff_df) == 0):
dfKNNCleared = dfKNN
else:
dfKNNCleared = dfKNN.drop(dfKNN.index[set_diff_df])
df_concatMetrics = dfKNNCleared
else:
dicRF = json.loads(metricsSelList[1])
dfRF = pd.DataFrame.from_dict(dicRF)
parametersSelDataPD = parametersSelData[1].apply(pd.Series)
set_diff_df = pd.concat([parametersSelDataPD, paramsListSepPD[1], paramsListSepPD[1]]).drop_duplicates(keep=False)
set_diff_df = set_diff_df.index.tolist()
if (len(set_diff_df) == 0):
dfRFCleared = dfRF
else:
dfRFCleared = dfRF.drop(dfRF.index[set_diff_df])
df_concatMetrics = dfRFCleared
global sumPerClassifierSelUpdate
sumPerClassifierSelUpdate = []
sumPerClassifierSelUpdate = preProcsumPerMetricAccordingtoData(factors, df_concatMetrics)
ModelSpaceMDSNewComb = [list(a) for a in zip(ModelSpaceMDS[0], ModelSpaceMDS[1])]
ModelSpaceMDSNewSel = FunMDS(df_concatMetrics)
ModelSpaceMDSNewSelComb = [list(a) for a in zip(ModelSpaceMDSNewSel[0], ModelSpaceMDSNewSel[1])]
global mt2xFinal
mt2xFinal = []
mtx1, mtx2, disparity = procrustes(ModelSpaceMDSNewComb, ModelSpaceMDSNewSelComb)
a, b = zip(*mtx2)
mt2xFinal.append(a)
mt2xFinal.append(b)
return 'Everything Okay'
def GridSearchSel(clf, params, factors, AlgorithmsIDsEnd, DataPointsSel):
if (len(params) == 0):
resultsMetrics.append([]) # Position: 0 and so on
parametersSelData.append([])
else:
# instantiate spark session
spark = (
SparkSession
.builder
.getOrCreate()
)
sc = spark.sparkContext
XDatasubset = XData.loc[DataPointsSel,:]
yDataSubset = [yData[i] for i in DataPointsSel]
# this is the grid we use to train the models
grid = DistGridSearchCV(
estimator=clf, param_grid=params,
sc=sc, cv=crossValidation, refit='accuracy', scoring=scoring,
verbose=0, n_jobs=-1)
# fit and extract the probabilities
grid.fit(XDatasubset, yDataSubset)
# process the results
cv_results = []
cv_results.append(grid.cv_results_)
df_cv_results = pd.DataFrame.from_dict(cv_results)
# number of models stored
number_of_models = len(df_cv_results.iloc[0][0])
# initialize results per row
df_cv_results_per_row = []
# loop through number of models
modelsIDs = []
for i in range(number_of_models):
modelsIDs.append(AlgorithmsIDsEnd+i)
# initialize results per item
df_cv_results_per_item = []
for column in df_cv_results.iloc[0]:
df_cv_results_per_item.append(column[i])
df_cv_results_per_row.append(df_cv_results_per_item)
# store the results into a pandas dataframe
df_cv_results_classifiers = pd.DataFrame(data = df_cv_results_per_row, columns= df_cv_results.columns)
parametersSelData.append(df_cv_results_classifiers['params'])
# copy and filter in order to get only the metrics
metrics = df_cv_results_classifiers.copy()
metrics = metrics.filter(['mean_test_accuracy','mean_test_f1_macro','mean_test_precision','mean_test_recall','mean_test_jaccard'])
metrics = metrics.to_json()
resultsMetrics.append(metrics) # Position: 0 and so on
return resultsMetrics
def preProcsumPerMetricAccordingtoData(factors, loopThroughMetrics):
sumPerClassifier = []
for row in loopThroughMetrics.iterrows():
rowSum = 0
lengthFactors = len(scoring)
name, values = row
for loop, elements in enumerate(values):
lengthFactors = lengthFactors - 1 + factors[loop]
rowSum = elements*factors[loop] + rowSum
if lengthFactors is 0:
sumPerClassifier = 0
else:
sumPerClassifier.append(rowSum/lengthFactors)
return sumPerClassifier
# Sending the overview classifiers' results to be visualized as a scatterplot
@app.route('/data/ServerSentDataPointsModel', methods=["GET", "POST"])
def SendDataPointsModels():
ResultsUpdate = []
global sumPerClassifierSelUpdate
sumPerClassifierSelUpdateJSON = json.dumps(sumPerClassifierSelUpdate)
ResultsUpdate.append(sumPerClassifierSelUpdateJSON)
global mt2xFinal
mt2xFinalJSON = json.dumps(mt2xFinal)
ResultsUpdate.append(mt2xFinalJSON)
response = {
'DataPointsModels': ResultsUpdate
}
return jsonify(response)
# Retrieve data from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/FeaturesSelection', methods=["GET", "POST"])
def FeatureSelPerModel():
global featureSelection
featureSelection = request.get_data().decode('utf8').replace("'", '"')
featureSelection = json.loads(featureSelection)
key = 2
ModelsIDs = preProceModels()
EnsembleModel(ModelsIDs, key)
return 'Everything Okay'
def EnsembleModel(Models, keyRetrieved):
global scores
scores = []
global all_classifiersSelection
all_classifiersSelection = []
lr = LogisticRegression()
if (keyRetrieved == 0):
global all_classifiers
all_classifiers = []
columnsInit = []
columnsInit = [XData.columns.get_loc(c) for c in XData.columns if c in XData]
temp = json.loads(allParametersPerformancePerModel[1])
dfParamKNN = pd.DataFrame.from_dict(temp)
dfParamKNNFilt = dfParamKNN.iloc[:,1]
for eachelem in KNNModels:
arg = dfParamKNNFilt[eachelem]
all_classifiers.append(make_pipeline(ColumnSelector(cols=columnsInit), KNeighborsClassifier().set_params(**arg)))
temp = json.loads(allParametersPerformancePerModel[9])
dfParamRF = pd.DataFrame.from_dict(temp)
dfParamRFFilt = dfParamRF.iloc[:,1]
for eachelem in RFModels:
arg = dfParamRFFilt[eachelem-576]
all_classifiers.append(make_pipeline(ColumnSelector(cols=columnsInit), RandomForestClassifier().set_params(**arg)))
global sclfStack
sclfStack = 0
global sclf
sclf = 0
sclf = StackingCVClassifier(classifiers=all_classifiers,
use_probas=True,
meta_classifier=lr,
random_state=RANDOM_SEED,
n_jobs = -1)
sclfStack = sclf
elif (keyRetrieved == 1):
Models = json.loads(Models)
ModelsAll = preProceModels()
for index, modHere in enumerate(ModelsAll):
flag = 0
for loop in Models['ClassifiersList']:
temp = [int(s) for s in re.findall(r'\b\d+\b', loop)]
if (int(temp[0]) == int(modHere)):
flag = 1
if (flag is 1):
all_classifiersSelection.append(all_classifiers[index])
sclf = StackingCVClassifier(classifiers=all_classifiersSelection,
use_probas=True,
meta_classifier=lr,
random_state=RANDOM_SEED,
n_jobs = -1)
elif (keyRetrieved == 2):
# fix this part!
if (len(all_classifiersSelection) == 0):
all_classifiers = []
columnsInit = []
temp = json.loads(allParametersPerformancePerModel[1])
dfParamKNN = pd.DataFrame.from_dict(temp)
dfParamKNNFilt = dfParamKNN.iloc[:,1]
print(featureSelection)
flag = 0
for index, eachelem in enumerate(KNNModels):
arg = dfParamKNNFilt[eachelem]
all_classifiers.append(make_pipeline(ColumnSelector(cols=featureSelection['featureSelection'][index]), KNeighborsClassifier().set_params(**arg)))
store = index
flag = 1
temp = json.loads(allParametersPerformancePerModel[9])
dfParamRF = pd.DataFrame.from_dict(temp)
dfParamRFFilt = dfParamRF.iloc[:,1]
if (flag == 0):
store = 0
else:
store = store + 1
for index, eachelem in enumerate(RFModels):
arg = dfParamRFFilt[eachelem-576]
print(index)
print(featureSelection['featureSelection'][index+store])
all_classifiers.append(make_pipeline(ColumnSelector(cols=featureSelection['featureSelection'][index+store]), RandomForestClassifier().set_params(**arg)))
sclf = StackingCVClassifier(classifiers=all_classifiers,
use_probas=True,
meta_classifier=lr,
random_state=RANDOM_SEED,
n_jobs = -1)
else:
Models = json.loads(Models)
ModelsAll = preProceModels()
for index, modHere in enumerate(ModelsAll):
flag = 0
for loop in Models['ClassifiersList']:
temp = [int(s) for s in re.findall(r'\b\d+\b', loop)]
if (int(temp[0]) == int(modHere)):
flag = 1
if (flag is 0):
all_classifiersSelection.append(all_classifiers[index])
sclfStack = StackingCVClassifier(classifiers=all_classifiersSelection,
use_probas=True,
meta_classifier=lr,
random_state=RANDOM_SEED,
n_jobs = -1)
#else:
# for index, eachelem in enumerate(algorithmsWithoutDuplicates):
# if (eachelem == 'KNN'):
# for j, each in enumerate(resultsList[index][1]):
# all_classifiersSelection.append(make_pipeline(ColumnSelector(cols=columnsReduce[j]), KNeighborsClassifier().set_params(**each)))
# del columnsReduce[0:len(resultsList[index][1])]
# else:
# for j, each in enumerate(resultsList[index][1]):
# all_classifiersSelection.append(make_pipeline(ColumnSelector(cols=columnsReduce[j]), RandomForestClassifier().set_params(**each)))
# del columnsReduce[0:len(resultsList[index][1])]
# sclf = StackingCVClassifier(classifiers=all_classifiersSelection,
# use_probas=True,
# meta_classifier=lr,
# random_state=RANDOM_SEED,
# n_jobs = -1)
temp = model_selection.cross_val_score(sclf, XData, yData, cv=crossValidation, scoring='accuracy', n_jobs=-1)
scores.append(temp.mean())
scores.append(temp.std())
temp = model_selection.cross_val_score(sclf, XData, yData, cv=crossValidation, scoring='precision_weighted', n_jobs=-1)
scores.append(temp.mean())
scores.append(temp.std())
temp = model_selection.cross_val_score(sclf, XData, yData, cv=crossValidation, scoring='recall_weighted', n_jobs=-1)
scores.append(temp.mean())
scores.append(temp.std())
temp = model_selection.cross_val_score(sclfStack, XData, yData, cv=crossValidation, scoring='accuracy', n_jobs=-1)
scores.append(temp.mean())
scores.append(temp.std())
temp = model_selection.cross_val_score(sclfStack, XData, yData, cv=crossValidation, scoring='precision_weighted', n_jobs=-1)
scores.append(temp.mean())
scores.append(temp.std())
temp = model_selection.cross_val_score(sclfStack, XData, yData, cv=crossValidation, scoring='recall_weighted', n_jobs=-1)
scores.append(temp.mean())
scores.append(temp.std())
return 'Okay'
# Sending the final results to be visualized as a line plot
@app.route('/data/SendFinalResultsBacktoVisualize', methods=["GET", "POST"])
def SendToPlotFinalResults():
response = {
'FinalResults': scores
}
return jsonify(response)