FeatureEnVi: Visual Analytics for Feature Engineering Using Stepwise Selection and Semi-Automatic Extraction Approaches https://doi.org/10.1109/TVCG.2022.3141040
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
FeatureEnVi/run.py

2232 lines
89 KiB

4 years ago
from flask import Flask, render_template, jsonify, request
from flask_pymongo import PyMongo
from flask_cors import CORS, cross_origin
import json
import copy
import warnings
import re
import random
import math
3 years ago
import pandas as pd
pd.set_option('use_inf_as_na', True)
4 years ago
import numpy as np
import multiprocessing
from joblib import Memory
3 years ago
from xgboost import XGBClassifier
4 years ago
from sklearn import model_selection
4 years ago
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_validate
4 years ago
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OneHotEncoder
4 years ago
from sklearn.metrics import classification_report
4 years ago
from sklearn.feature_selection import mutual_info_classif
4 years ago
from sklearn.feature_selection import SelectKBest
3 years ago
from sklearn.feature_selection import f_classif
3 years ago
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
4 years ago
import eli5
from eli5.sklearn import PermutationImportance
4 years ago
from joblib import Parallel, delayed
import multiprocessing
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
4 years ago
# this block of code is for the connection between the server, the database, and the client (plus routing)
# access MongoDB
app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb://localhost:27017/mydb"
mongo = PyMongo(app)
cors = CORS(app, resources={r"/data/*": {"origins": "*"}})
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/Reset', methods=["GET", "POST"])
def reset():
    """Restore the module-level session state to its default values.

    Every global listed below is rebound to a fresh default (empty list,
    empty string, flag or constant).  ``DataRawLength`` and
    ``DataResultsRaw`` are declared global but deliberately left untouched,
    exactly as before.

    Returns:
        str: the confirmation message sent back to the client.
    """
    global DataRawLength, DataResultsRaw
    global previousState, StanceTest, filterActionFinal, keySpecInternal
    global RANDOM_SEED, keyData, keepOriginalFeatures
    global XData, yData, XDataNoRemoval, XDataNoRemovalOrig
    global XDataStored, yDataStored, finalResultsData
    global detailsParams, algorithmList, ClassifierIDsList, RetrieveModelsList
    global allParametersPerfCrossMutr, all_classifiers, crossValidation
    global resultsMetrics, parametersSelData, target_names, keyFirstTime
    global target_namesLoc, featureCompareData, columnsKeep, columnsNewGen
    global columnsNames, fileName, listofTransformations

    # NOTE: the previous version ended this assignment with a stray
    # line-continuation backslash; it has been removed.
    previousState = []

    StanceTest = False
    filterActionFinal = ''
    keySpecInternal = 1
    RANDOM_SEED = 42
    keyData = 0

    keepOriginalFeatures = []

    # Each container gets its own fresh list so that no two globals alias.
    XData = []
    yData = []
    XDataNoRemoval = []
    XDataNoRemovalOrig = []
    XDataStored = []
    yDataStored = []
    finalResultsData = []

    detailsParams = []
    algorithmList = []
    ClassifierIDsList = ''
    RetrieveModelsList = []
    allParametersPerfCrossMutr = []
    all_classifiers = []

    # Passed as ``cv`` to the scikit-learn cross-validation helpers
    # (alternatives that were tried before: 5 and 3).
    crossValidation = 8

    resultsMetrics = []
    parametersSelData = []
    target_names = []

    keyFirstTime = True

    target_namesLoc = []
    featureCompareData = []
    columnsKeep = []
    columnsNewGen = []
    columnsNames = []
    fileName = []

    # Suffixes of the supported per-feature transformations.
    listofTransformations = ["r","b","zs","mms","l2","l1p","l10","e2","em1","p2","p3","p4"]

    return 'The reset was done!'
# retrieve data from client and select the correct data set
def _rows_with_instance_ids(cursor):
    """Materialise a Mongo cursor into a list of row dicts.

    Each row's ``_id`` is converted to ``str`` and an ``InstanceID`` equal to
    the row's position in the cursor is added.
    """
    rows = []
    for index, item in enumerate(cursor):
        item['_id'] = str(item['_id'])
        item['InstanceID'] = index
        rows.append(item)
    return rows


@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/ServerRequest', methods=["GET", "POST"])
def retrieveFileName():
    """Select a data set by name, load it from MongoDB and start the pipeline.

    The raw request body (single quotes replaced by double quotes) is stored
    in the global ``fileName`` and parsed as JSON; its ``'fileName'`` entry
    chooses the Mongo collection.  Unknown names fall back to ``IrisC``.
    For ``biodegC`` the additional ``biodegCTest`` and ``biodegCExt``
    collections are loaded as well and ``StanceTest`` is set.  All other
    session globals are reset to their defaults before loading, and
    ``dataSetSelection()`` is called at the end.

    Returns:
        str: the confirmation message sent back to the client.
    """
    global DataRawLength, DataResultsRaw
    global DataResultsRawTest, DataRawLengthTest
    global DataResultsRawExternal, DataRawLengthExternal
    global fileName, keySpecInternal, filterActionFinal, dataSpacePointsIDs
    global RANDOM_SEED, keyData, keepOriginalFeatures
    global XData, XDataNoRemoval, XDataNoRemovalOrig, previousState, yData
    global XDataStored, yDataStored, finalResultsData
    global ClassifierIDsList, algorithmList, detailsParams
    global RetrieveModelsList, resultsList, allParametersPerfCrossMutr
    global HistoryPreservation, all_classifiers, crossValidation
    global parametersSelData, StanceTest, target_names, keyFirstTime
    global target_namesLoc, featureCompareData, columnsKeep, columnsNewGen
    global columnsNames, listofTransformations

    fileName = request.get_data().decode('utf8').replace("'", '"')

    # ---- reset the session state -------------------------------------
    keySpecInternal = 1
    filterActionFinal = ''
    dataSpacePointsIDs = []
    RANDOM_SEED = 42
    keyData = 0

    keepOriginalFeatures = []

    XData = []
    XDataNoRemoval = []
    XDataNoRemovalOrig = []
    previousState = []
    yData = []
    XDataStored = []
    yDataStored = []
    finalResultsData = []

    ClassifierIDsList = ''
    algorithmList = []
    detailsParams = []

    # Initializing models
    RetrieveModelsList = []
    resultsList = []
    allParametersPerfCrossMutr = []
    HistoryPreservation = []
    all_classifiers = []

    # Passed as ``cv`` to the scikit-learn cross-validation helpers
    # (alternatives that were tried before: 5 and 3).
    crossValidation = 8

    parametersSelData = []
    StanceTest = False
    target_names = []
    keyFirstTime = True
    target_namesLoc = []
    featureCompareData = []
    columnsKeep = []
    columnsNewGen = []
    columnsNames = []
    listofTransformations = ["r","b","zs","mms","l2","l1p","l10","e2","em1","p2","p3","p4"]

    # -1 marks "nothing loaded"; the external length used to be left
    # undefined for data sets without an external split.
    DataRawLength = -1
    DataRawLengthTest = -1
    DataRawLengthExternal = -1

    # ---- pick the collection -----------------------------------------
    data = json.loads(fileName)
    requested = data['fileName']

    # requested name -> (Mongo collection name, predefined class labels).
    # BreastC (and the IrisC fallback) have no predefined labels here.
    known_datasets = {
        'HeartC': ('HeartC', ['Healthy', 'Diseased']),
        'biodegC': ('biodegC', ['Non-biodegr.', 'Biodegr.']),
        'BreastC': ('breastC', []),
        'DiabetesC': ('diabetesC', ['Negative', 'Positive']),
        'MaterialC': ('MaterialC', ['Cylinder', 'Disk', 'Flatellipsold', 'Longellipsold', 'Sphere']),
        'ContraceptiveC': ('ContraceptiveC', ['No-use', 'Long-term', 'Short-term']),
        'VehicleC': ('VehicleC', ['Van', 'Car', 'Bus']),
        'WineC': ('WineC', ['Fine', 'Superior', 'Inferior']),
    }
    collection_name, class_labels = 'IrisC', []
    if isinstance(requested, str) and requested in known_datasets:
        collection_name, class_labels = known_datasets[requested]

    CollectionDB = getattr(mongo.db, collection_name).find()
    target_names.extend(class_labels)

    if requested == 'biodegC':
        # Only this data set ships separate test and external collections.
        StanceTest = True
        CollectionDBTest = mongo.db.biodegCTest.find()
        CollectionDBExternal = mongo.db.biodegCExt.find()

    # ---- load the rows -----------------------------------------------
    DataResultsRaw = _rows_with_instance_ids(CollectionDB)
    DataRawLength = len(DataResultsRaw)

    DataResultsRawTest = []
    DataResultsRawExternal = []
    if StanceTest:
        DataResultsRawTest = _rows_with_instance_ids(CollectionDBTest)
        DataRawLengthTest = len(DataResultsRawTest)
        DataResultsRawExternal = _rows_with_instance_ids(CollectionDBExternal)
        DataRawLengthExternal = len(DataResultsRawExternal)

    dataSetSelection()
    return 'Everything is okay'
# Retrieve data set from client
@cross_origin(origin='localhost',headers=['Content-Type','Authorization'])
@app.route('/data/SendtoSeverDataSet', methods=["GET", "POST"])
def sendToServerData():
    """Receive a data set uploaded by the client and install it as the working data.

    The request body (single quotes replaced by double quotes) is parsed as
    JSON and its ``'uploadedData'`` entry is used as the list of row dicts.
    The target column is the key whose name contains ``'*'``.  Rows are
    sorted by target in descending order, the target is encoded as
    consecutive integer class ids, and the remaining columns become
    ``XData``.  Copies of ``XData``/``yData`` are stored in the related
    snapshot globals.

    Returns:
        str: the confirmation message sent back to the client.

    Raises:
        ValueError: if rows were uploaded but no key contains ``'*'``.
    """
    global AllTargets, target_names, target_namesLoc
    global fileName
    global XData, yData, RANDOM_SEED
    global XDataStored, yDataStored
    global XDataStoredOriginal
    global finalResultsData
    global XDataNoRemoval, XDataNoRemovalOrig

    uploadedData = request.get_data().decode('utf8').replace("'", '"')
    uploadedDataParsed = json.loads(uploadedData)
    # Local name: there is no ``global`` declaration for it in this function,
    # so the module-level ``DataResultsRaw`` is not modified here.
    DataResultsRaw = uploadedDataParsed['uploadedData']

    # The last key containing '*' (scanning every row) is the target column.
    target = None
    for dictionary in DataResultsRaw:
        for key in dictionary:
            if '*' in key:
                target = key
    if target is None and DataResultsRaw:
        raise ValueError("uploaded data set has no target column (a key containing '*')")

    # Sort once, then copy: the feature rows end up in the same order as the
    # raw rows, as they did when both lists were sorted separately.
    DataResultsRaw.sort(key=lambda x: x[target], reverse=True)
    DataResults = copy.deepcopy(DataResultsRaw)
    for dictionary in DataResults:
        del dictionary[target]

    AllTargets = [o[target] for o in DataResultsRaw]
    AllTargetsFloatValues = []

    data = json.loads(fileName)
    # Class names are only collected from the data for these two data sets.
    collectNames = (data['fileName'] == 'IrisC' or data['fileName'] == 'BreastC')

    # Because the rows are sorted by target, every change of value starts a
    # new class id.
    previous = None
    Class = 0
    for i, value in enumerate(AllTargets):
        if i == 0:
            previous = value
            if collectNames:
                target_names.append(value)
        if value == previous:
            AllTargetsFloatValues.append(Class)
        else:
            Class = Class + 1
            if collectNames:
                target_names.append(value)
            AllTargetsFloatValues.append(Class)
            previous = value

    ArrayDataResults = pd.DataFrame.from_dict(DataResults)

    XData, yData = ArrayDataResults, AllTargetsFloatValues

    # Independent snapshots of the freshly loaded data.
    XDataStored = XData.copy()
    yDataStored = yData.copy()
    XDataStoredOriginal = XData.copy()
    finalResultsData = XData.copy()
    XDataNoRemoval = XData.copy()
    XDataNoRemovalOrig = XData.copy()

    return 'Processed uploaded data set'
def _find_target_column(rows):
    """Return the last key containing '*' found while scanning all rows, or None."""
    target = None
    for row in rows:
        for key in row:
            if '*' in key:
                target = key
    return target


def _encode_sorted_targets(values):
    """Encode target values (already sorted) as consecutive integer class ids.

    Returns ``(codes, names)``: ``codes`` holds one class id per value and
    ``names`` the value that opened each class, in order of appearance.
    """
    codes = []
    names = []
    previous = None
    class_id = 0
    for position, value in enumerate(values):
        if position == 0:
            previous = value
            names.append(value)
        if value == previous:
            codes.append(class_id)
        else:
            class_id += 1
            names.append(value)
            codes.append(class_id)
            previous = value
    return codes, names


def _prepare_rows(raw_rows):
    """Split raw Mongo rows into a feature frame and encoded targets.

    ``raw_rows`` is sorted in place by the target column (descending).  The
    returned frame is built from a deep copy of the rows with ``_id``,
    ``InstanceID`` and the target column removed.

    Returns ``(frame, targets, codes, names)``.

    Raises:
        ValueError: if ``raw_rows`` is non-empty but no key contains '*'.
    """
    target = _find_target_column(raw_rows)
    if target is None and raw_rows:
        raise ValueError("data set has no target column (a key containing '*')")

    raw_rows.sort(key=lambda row: row[target], reverse=True)
    feature_rows = copy.deepcopy(raw_rows)
    for row in feature_rows:
        del row['_id']
        del row['InstanceID']
        del row[target]

    targets = [row[target] for row in raw_rows]
    codes, names = _encode_sorted_targets(targets)
    return pd.DataFrame.from_dict(feature_rows), targets, codes, names


def dataSetSelection():
    """Turn the raw rows loaded from MongoDB into the working feature/target globals.

    Reads the module-level ``DataResultsRaw`` (and, when ``StanceTest`` is
    set, ``DataResultsRawTest`` / ``DataResultsRawExternal``), sorts them in
    place by target, and fills ``XData``/``yData`` plus the test and external
    counterparts.  Feature columns are renamed to ``F1..Fn``; a labelled copy
    with the original names is kept in ``keepOriginalFeatures`` and its
    column list in ``OrignList``.  Finally ``executeModel([], 0, '')`` is
    called.

    Returns:
        str: a confirmation message.

    Raises:
        ValueError: if a non-empty row list has no key containing '*'.
    """
    global XDataTest, yDataTest
    global XDataExternal, yDataExternal
    global StanceTest
    global AllTargets
    global target_names
    global fileName
    global XData, yData, RANDOM_SEED
    global keepOriginalFeatures
    global OrignList
    global XDataStored, yDataStored
    global XDataStoredOriginal
    global finalResultsData
    global XDataNoRemoval, XDataNoRemovalOrig

    XDataTest = pd.DataFrame()
    XDataExternal = pd.DataFrame()

    # NOTE(review): this is a local list (there is no ``global`` declaration
    # for it here), so the module-level ``target_namesLoc`` is not updated.
    target_namesLoc = []

    if StanceTest:
        XDataTest, _, yDataTest, namesTest = _prepare_rows(DataResultsRawTest)
        target_namesLoc.extend(namesTest)

        XDataExternal, _, yDataExternal, namesExternal = _prepare_rows(DataResultsRawExternal)
        target_namesLoc.extend(namesExternal)

    ArrayDataResults, AllTargets, AllTargetsFloatValues, classNames = _prepare_rows(DataResultsRaw)

    data = json.loads(fileName)
    # Only these two data sets take their class names from the data itself.
    if data['fileName'] == 'IrisC' or data['fileName'] == 'BreastC':
        target_names.extend(classNames)

    XData, yData = ArrayDataResults, AllTargetsFloatValues

    # Labelled copy: "<original name> F<k>".  For biodegC the '-' and '_'
    # characters are stripped from the original names first.
    keepOriginalFeatures = XData.copy()
    if data['fileName'] == 'biodegC':
        baseNames = [col.replace("-", "_").replace("_", "") for col in keepOriginalFeatures.columns]
    else:
        baseNames = list(keepOriginalFeatures.columns)
    keepOriginalFeatures.columns = [str(col) + ' F' + str(idx + 1) for idx, col in enumerate(baseNames)]
    # ``columnsNewGen`` is not assigned here: without a ``global`` declaration
    # such an assignment would only bind an unused local.
    OrignList = keepOriginalFeatures.columns.values.tolist()

    # Working frames use the short positional names F1..Fn.
    XData.columns = ['F' + str(idx + 1) for idx, col in enumerate(XData.columns)]
    XDataTest.columns = ['F' + str(idx + 1) for idx, col in enumerate(XDataTest.columns)]
    XDataExternal.columns = ['F' + str(idx + 1) for idx, col in enumerate(XDataExternal.columns)]

    # Independent snapshots of the freshly loaded data.
    XDataStored = XData.copy()
    yDataStored = yData.copy()
    XDataStoredOriginal = XData.copy()
    finalResultsData = XData.copy()
    XDataNoRemoval = XData.copy()
    XDataNoRemovalOrig = XData.copy()

    warnings.simplefilter('ignore')

    executeModel([], 0, '')

    return 'Everything is okay'
4 years ago
4 years ago
def create_global_function():
    """Define the global ``estimator`` objective used for hyper-parameter search.

    ``estimator(n_estimators, eta, max_depth, subsample, colsample_bytree)``
    builds an ``XGBClassifier`` with those settings and returns the mean
    cross-validated accuracy on the current global ``XData``/``yData``.

    Results are cached on disk under ``./cachedir`` with joblib.  The cache
    key is derived from the arguments of the cached function only, so the
    data, the ``cv`` setting and the seed are passed to it explicitly.
    Previously they were read from globals inside the cached function, which
    made the cache return scores computed on a different data set.
    """
    global estimator

    location = './cachedir'
    memory = Memory(location, verbose=0)

    # calculating for all algorithms and models the performance and other results
    @memory.cache
    def cachedAccuracy(n_estimators, eta, max_depth, subsample, colsample_bytree, features, labels, folds, seed):
        # initialize model
        print('loopingQSAR')
        model = XGBClassifier(n_estimators=n_estimators, eta=eta, max_depth=max_depth, subsample=subsample, colsample_bytree=colsample_bytree, n_jobs=-1, random_state=seed, silent=True, verbosity = 0, use_label_encoder=False)
        # set in cross-validation
        result = cross_validate(model, features, labels, cv=folds, scoring='accuracy')
        # result is mean of test_score
        return np.mean(result['test_score'])

    def estimator(n_estimators, eta, max_depth, subsample, colsample_bytree):
        # The optimiser proposes floats; these two settings must be integers.
        return cachedAccuracy(
            int(n_estimators),
            eta,
            int(max_depth),
            subsample,
            colsample_bytree,
            XData,
            yData,
            crossValidation,
            RANDOM_SEED,
        )
4 years ago
4 years ago
# check this issue later because we are not getting the same results
4 years ago
def executeModel(exeCall, flagEx, nodeTransfName):
4 years ago
3 years ago
global XDataTest, yDataTest
global XDataExternal, yDataExternal
4 years ago
global keyFirstTime
4 years ago
global estimator
4 years ago
global yPredictProb
4 years ago
global scores
4 years ago
global featureImportanceData
global XData
global XDataStored
4 years ago
global previousState
4 years ago
global columnsNewGen
4 years ago
global columnsNames
global listofTransformations
4 years ago
global XDataStoredOriginal
3 years ago
global finalResultsData
3 years ago
global OrignList
3 years ago
global tracker
3 years ago
global XDataNoRemoval
global XDataNoRemovalOrig
4 years ago
columnsNames = []
4 years ago
scores = []
4 years ago
if (len(exeCall) == 0):
if (flagEx == 3):
XDataStored = XData.copy()
3 years ago
XDataNoRemovalOrig = XDataNoRemoval.copy()
3 years ago
OrignList = columnsNewGen
3 years ago
elif (flagEx == 2):
XData = XDataStored.copy()
XDataStoredOriginal = XDataStored.copy()
XDataNoRemoval = XDataNoRemovalOrig.copy()
3 years ago
columnsNewGen = OrignList
4 years ago
else:
XData = XDataStored.copy()
3 years ago
XDataNoRemoval = XDataNoRemovalOrig.copy()
3 years ago
XDataStoredOriginal = XDataStored.copy()
4 years ago
else:
4 years ago
if (flagEx == 4):
XDataStored = XData.copy()
3 years ago
XDataNoRemovalOrig = XDataNoRemoval.copy()
#XDataStoredOriginal = XDataStored.copy()
3 years ago
elif (flagEx == 2):
3 years ago
XData = XDataStored.copy()
XDataStoredOriginal = XDataStored.copy()
XDataNoRemoval = XDataNoRemovalOrig.copy()
3 years ago
columnsNewGen = OrignList
4 years ago
else:
XData = XDataStored.copy()
3 years ago
#XDataNoRemoval = XDataNoRemovalOrig.copy()
3 years ago
XDataStoredOriginal = XDataStored.copy()
3 years ago
3 years ago
# Bayesian Optimization CHANGE INIT_POINTS!
4 years ago
if (keyFirstTime):
create_global_function()
3 years ago
params = {"n_estimators": (5, 200), "eta": (0.05, 0.3), "max_depth": (6,12), "subsample": (0.8,1), "colsample_bytree": (0.8,1)}
3 years ago
bayesopt = BayesianOptimization(estimator, params, random_state=RANDOM_SEED)
3 years ago
bayesopt.maximize(init_points=20, n_iter=5, acq='ucb') # 20 and 5
3 years ago
bestParams = bayesopt.max['params']
3 years ago
estimator = XGBClassifier(n_estimators=int(bestParams.get('n_estimators')), eta=bestParams.get('eta'), max_depth=int(bestParams.get('max_depth')), subsample=bestParams.get('subsample'), colsample_bytree=bestParams.get('colsample_bytree'), probability=True, random_state=RANDOM_SEED, silent=True, verbosity = 0, use_label_encoder=False)
3 years ago
columnsNewGen = OrignList
4 years ago
if (len(exeCall) != 0):
4 years ago
if (flagEx == 1):
3 years ago
currentColumnsDeleted = []
for uniqueValue in exeCall:
currentColumnsDeleted.append(tracker[uniqueValue])
for column in XData.columns:
if (column in currentColumnsDeleted):
XData = XData.drop(column, axis=1)
XDataStoredOriginal = XDataStoredOriginal.drop(column, axis=1)
4 years ago
elif (flagEx == 2):
4 years ago
columnsKeepNew = []
columns = XDataGen.columns.values.tolist()
for indx, col in enumerate(columns):
if indx in exeCall:
columnsKeepNew.append(col)
4 years ago
columnsNewGen.append(col)
4 years ago
XDataTemp = XDataGen[columnsKeepNew]
XData[columnsKeepNew] = XDataTemp.values
4 years ago
XDataStoredOriginal[columnsKeepNew] = XDataTemp.values
3 years ago
XDataNoRemoval[columnsKeepNew] = XDataTemp.values
4 years ago
elif (flagEx == 4):
splittedCol = nodeTransfName.split('_')
3 years ago
for col in XDataNoRemoval.columns:
3 years ago
splitCol = col.split('_')
if ((splittedCol[0] in splitCol[0])):
newSplitted = re.sub("[^0-9]", "", splittedCol[0])
newCol = re.sub("[^0-9]", "", splitCol[0])
if (newSplitted == newCol):
storeRenamedColumn = col
3 years ago
XData.rename(columns={ storeRenamedColumn: nodeTransfName }, inplace = True)
XDataNoRemoval.rename(columns={ storeRenamedColumn: nodeTransfName }, inplace = True)
4 years ago
currentColumn = columnsNewGen[exeCall[0]]
subString = currentColumn[currentColumn.find("(")+1:currentColumn.find(")")]
replacement = currentColumn.replace(subString, nodeTransfName)
3 years ago
for ind, column in enumerate(columnsNewGen):
3 years ago
splitCol = column.split('_')
if ((splittedCol[0] in splitCol[0])):
newSplitted = re.sub("[^0-9]", "", splittedCol[0])
newCol = re.sub("[^0-9]", "", splitCol[0])
if (newSplitted == newCol):
columnsNewGen[ind] = columnsNewGen[ind].replace(storeRenamedColumn, nodeTransfName)
4 years ago
if (len(splittedCol) == 1):
XData[nodeTransfName] = XDataStoredOriginal[nodeTransfName]
3 years ago
XDataNoRemoval[nodeTransfName] = XDataStoredOriginal[nodeTransfName]
4 years ago
else:
if (splittedCol[1] == 'r'):
XData[nodeTransfName] = XData[nodeTransfName].round()
3 years ago
elif (splittedCol[1] == 'b'):
number_of_bins = np.histogram_bin_edges(XData[nodeTransfName], bins='auto')
emptyLabels = []
for index, number in enumerate(number_of_bins):
if (index == 0):
pass
else:
emptyLabels.append(index)
XData[nodeTransfName] = pd.cut(XData[nodeTransfName], bins=number_of_bins, labels=emptyLabels, include_lowest=True, right=True)
XData[nodeTransfName] = pd.to_numeric(XData[nodeTransfName], downcast='signed')
elif (splittedCol[1] == 'zs'):
XData[nodeTransfName] = (XData[nodeTransfName]-XData[nodeTransfName].mean())/XData[nodeTransfName].std()
elif (splittedCol[1] == 'mms'):
XData[nodeTransfName] = (XData[nodeTransfName]-XData[nodeTransfName].min())/(XData[nodeTransfName].max()-XData[nodeTransfName].min())
elif (splittedCol[1] == 'l2'):
3 years ago
dfTemp = []
3 years ago
dfTemp = np.log2(XData[nodeTransfName])
3 years ago
dfTemp = dfTemp.replace([np.inf, -np.inf], np.nan)
dfTemp = dfTemp.fillna(0)
3 years ago
XData[nodeTransfName] = dfTemp
3 years ago
elif (splittedCol[1] == 'l1p'):
3 years ago
dfTemp = []
dfTemp = np.log1p(XData[nodeTransfName])
dfTemp = dfTemp.replace([np.inf, -np.inf], np.nan)
dfTemp = dfTemp.fillna(0)
XData[nodeTransfName] = dfTemp
3 years ago
elif (splittedCol[1] == 'l10'):
3 years ago
dfTemp = []
3 years ago
dfTemp = np.log10(XData[nodeTransfName])
3 years ago
dfTemp = dfTemp.replace([np.inf, -np.inf], np.nan)
dfTemp = dfTemp.fillna(0)
3 years ago
XData[nodeTransfName] = dfTemp
3 years ago
elif (splittedCol[1] == 'e2'):
3 years ago
dfTemp = []
dfTemp = np.exp2(XData[nodeTransfName])
3 years ago
dfTemp = dfTemp.replace([np.inf, -np.inf], np.nan)
dfTemp = dfTemp.fillna(0)
3 years ago
XData[nodeTransfName] = dfTemp
3 years ago
elif (splittedCol[1] == 'em1'):
3 years ago
dfTemp = []
dfTemp = np.expm1(XData[nodeTransfName])
3 years ago
dfTemp = dfTemp.replace([np.inf, -np.inf], np.nan)
dfTemp = dfTemp.fillna(0)
3 years ago
XData[nodeTransfName] = dfTemp
3 years ago
elif (splittedCol[1] == 'p2'):
XData[nodeTransfName] = np.power(XData[nodeTransfName], 2)
elif (splittedCol[1] == 'p3'):
XData[nodeTransfName] = np.power(XData[nodeTransfName], 3)
else:
XData[nodeTransfName] = np.power(XData[nodeTransfName], 4)
3 years ago
XDataNoRemoval[nodeTransfName] = XData[nodeTransfName]
XDataStored = XData.copy()
XDataNoRemovalOrig = XDataNoRemoval.copy()
3 years ago
4 years ago
columnsNamesLoc = XData.columns.values.tolist()
for col in columnsNamesLoc:
splittedCol = col.split('_')
if (len(splittedCol) == 1):
for tran in listofTransformations:
columnsNames.append(splittedCol[0]+'_'+tran)
else:
for tran in listofTransformations:
if (splittedCol[1] == tran):
columnsNames.append(splittedCol[0])
else:
columnsNames.append(splittedCol[0]+'_'+tran)
3 years ago
3 years ago
featureImportanceData = estimatorFeatureSelection(XDataNoRemoval, estimator)
3 years ago
tracker = []
for value in columnsNewGen:
value = value.split(' ')
if (len(value) > 1):
tracker.append(value[1])
else:
tracker.append(value[0])
3 years ago
4 years ago
estimator.fit(XData, yData)
yPredict = estimator.predict(XData)
yPredictProb = cross_val_predict(estimator, XData, yData, cv=crossValidation, method='predict_proba')
3 years ago
4 years ago
num_cores = multiprocessing.cpu_count()
3 years ago
inputsSc = ['accuracy','precision_weighted','recall_weighted']
4 years ago
flat_results = Parallel(n_jobs=num_cores)(delayed(solve)(estimator,XData,yData,crossValidation,item,index) for index, item in enumerate(inputsSc))
4 years ago
scoresAct = [item for sublist in flat_results for item in sublist]
3 years ago
print(scoresAct)
# if (StanceTest):
# y_pred = estimator.predict(XDataTest)
# print('Test data set')
# print(classification_report(yDataTest, y_pred))
# y_pred = estimator.predict(XDataExternal)
# print('External data set')
# print(classification_report(yDataExternal, y_pred))
4 years ago
howMany = 0
if (keyFirstTime):