parent e4b65e4459
commit 96bc447d49
@@ -1 +0,0 @@
{"duration": 31.525413036346436, "input_args": {"keyRetrieved": "0"}}
Binary file not shown.
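Taken together, the entries above are one joblib cache record for a single memoized call: a metadata.json line holding the call's duration and input arguments, plus the pickled return value (the binary file not shown). The next hunk is the cached copy of the function source itself. Below is a minimal sketch of how joblib's Memory produces this layout; the cache directory name is illustrative, and the file names output.pkl and func_code.py come from joblib's documented cache layout rather than from this diff.

from joblib import Memory

# the cache directory name is illustrative
memory = Memory('cachedir', verbose=0)

@memory.cache
def slow_square(x):
    # stands in for the expensive model training cached in this commit
    return x * x

slow_square(4)  # runs, then writes func_code.py, metadata.json, output.pkl
slow_square(4)  # same argument: the result is loaded from the cache instead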
@@ -1,85 +0,0 @@
# first line: 703
@memory.cache
def EnsembleModel(keyRetrieved):
    # Relies on module-level state: XData, yData, KNNModels, RFModels,
    # allParametersPerformancePerModel, ClassifierIDsList, columns,
    # algorithmsWithoutDuplicates, resultsList, RANDOM_SEED, crossValidation.

    scoresLocal = []
    all_classifiersSelection = []

    if (keyRetrieved == 0):
        all_classifiers = []
        # indices of all columns in XData (the original `if c in XData` test is always true)
        columnsInit = [XData.columns.get_loc(c) for c in XData.columns]

        # rebuild every tuned KNN pipeline from the stored hyperparameters
        temp = json.loads(allParametersPerformancePerModel[1])
        dfParamKNN = pd.DataFrame.from_dict(temp)
        dfParamKNNFilt = dfParamKNN.iloc[:, 1]
        for eachelem in KNNModels:
            arg = dfParamKNNFilt[eachelem]
            all_classifiers.append(make_pipeline(ColumnSelector(cols=columnsInit), KNeighborsClassifier().set_params(**arg)))

        # rebuild every tuned random forest pipeline; RF model IDs are offset by 576
        temp = json.loads(allParametersPerformancePerModel[9])
        dfParamRF = pd.DataFrame.from_dict(temp)
        dfParamRFFilt = dfParamRF.iloc[:, 1]
        for eachelem in RFModels:
            arg = dfParamRFFilt[eachelem - 576]
            all_classifiers.append(make_pipeline(ColumnSelector(cols=columnsInit), RandomForestClassifier().set_params(**arg)))

        lr = LogisticRegression()
        sclf = StackingCVClassifier(classifiers=all_classifiers,
                                    use_probas=True,
                                    meta_classifier=lr,
                                    random_state=RANDOM_SEED,
                                    n_jobs=-1)
    elif (keyRetrieved == 1):
        # Bind the parsed selection to a new name: assigning back to
        # ClassifierIDsList (as the original did) makes it a local for the whole
        # function and raises UnboundLocalError on the json.loads read.
        classifierIDs = json.loads(ClassifierIDsList)
        for loop in classifierIDs['ClassifiersList']:
            temp = [int(s) for s in re.findall(r'\b\d+\b', loop)]
            # all_classifiers is expected to survive from an earlier
            # keyRetrieved == 0 call (module-level state)
            all_classifiersSelection.append(all_classifiers[temp[0]])

        lr = LogisticRegression()
        sclf = StackingCVClassifier(classifiers=all_classifiersSelection,
                                    use_probas=True,
                                    meta_classifier=lr,
                                    random_state=RANDOM_SEED,
                                    n_jobs=-1)
    else:
        columnsReduce = columns.copy()
        lr = LogisticRegression()
        if (len(all_classifiersSelection) == 0):
            all_classifiers = []
            for index, eachelem in enumerate(algorithmsWithoutDuplicates):
                if (eachelem == 'KNN'):
                    for j, each in enumerate(resultsList[index][1]):
                        all_classifiers.append(make_pipeline(ColumnSelector(cols=columnsReduce[j]), KNeighborsClassifier().set_params(**each)))
                    del columnsReduce[0:len(resultsList[index][1])]
                else:
                    for j, each in enumerate(resultsList[index][1]):
                        all_classifiers.append(make_pipeline(ColumnSelector(cols=columnsReduce[j]), RandomForestClassifier().set_params(**each)))
                    del columnsReduce[0:len(resultsList[index][1])]
            sclf = StackingCVClassifier(classifiers=all_classifiers,
                                        use_probas=True,
                                        meta_classifier=lr,
                                        random_state=RANDOM_SEED,
                                        n_jobs=-1)
        else:
            for index, eachelem in enumerate(algorithmsWithoutDuplicates):
                if (eachelem == 'KNN'):
                    for j, each in enumerate(resultsList[index][1]):
                        all_classifiersSelection.append(make_pipeline(ColumnSelector(cols=columnsReduce[j]), KNeighborsClassifier().set_params(**each)))
                    del columnsReduce[0:len(resultsList[index][1])]
                else:
                    for j, each in enumerate(resultsList[index][1]):
                        all_classifiersSelection.append(make_pipeline(ColumnSelector(cols=columnsReduce[j]), RandomForestClassifier().set_params(**each)))
                    del columnsReduce[0:len(resultsList[index][1])]
            sclf = StackingCVClassifier(classifiers=all_classifiersSelection,
                                        use_probas=True,
                                        meta_classifier=lr,
                                        random_state=RANDOM_SEED,
                                        n_jobs=-1)

    # cross-validated accuracy of the stacked ensemble
    for clf, label in zip([sclf], ['StackingClassifier']):
        scoresLocal = model_selection.cross_val_score(clf, XData, yData, cv=crossValidation, scoring='accuracy')
    return scoresLocal
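The pattern deleted above, wrapping each tuned base learner in a ColumnSelector pipeline and stacking them under a logistic regression meta-learner, can be reproduced in isolation. Here is a minimal sketch with mlxtend and scikit-learn, using synthetic data and hypothetical column subsets in place of the app's XData and stored hyperparameters:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from mlxtend.classifier import StackingCVClassifier

X, y = make_classification(n_samples=200, n_features=8, random_state=42)

# each base learner sees its own (here hypothetical) column subset
base_learners = [
    make_pipeline(ColumnSelector(cols=(0, 1, 2, 3)),
                  KNeighborsClassifier(n_neighbors=5)),
    make_pipeline(ColumnSelector(cols=(4, 5, 6, 7)),
                  RandomForestClassifier(n_estimators=50, random_state=42)),
]

sclf = StackingCVClassifier(classifiers=base_learners,
                            use_probas=True,
                            meta_classifier=LogisticRegression(),
                            random_state=42)

print(cross_val_score(sclf, X, y, cv=5, scoring='accuracy').mean())

Setting use_probas=True feeds the meta-learner class probabilities rather than hard votes, so the logistic regression can weight the base models smoothly.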
@@ -1 +0,0 @@
59af2a38e533b8b9d34d25752ed21bc4d85f031d
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
d72d1ed1dc740749fdcd32387880cd1448ad700a
@@ -1 +0,0 @@
47757dc7fd25e513f97c1aa5bcc6145288a37848
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
0a4287fd308b5165ac624e57aa8252732c3a3c57
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -1 +0,0 @@
a12679dd9e7d55d04f94e4e01a7f665e805485c2
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1 +0,0 @@
1b7338506297c2c62db74cfbb6db51b002880a46
@@ -1 +0,0 @@
0e5bfe037f9f60b45885bf572d29034cefec462a
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
@@ -1,203 +0,0 @@
# first line: 654
@memory.cache
def GridSearchForModels(XData, yData, clf, params, eachAlgor, AlgorithmsIDsEnd, toggle):
    # Relies on module-level state: crossValidation, scoring, target_names, and
    # the enclosing module's imports (pandas as pd, numpy as np, json,
    # multiprocessing, SparkSession, DistGridSearchCV, PermutationImportance,
    # sklearn and imblearn metric functions).

    print('loop here')  # debug trace left in the original
    # instantiate a Spark session and grab its context for sk-dist
    spark = (
        SparkSession
        .builder
        .getOrCreate()
    )
    sc = spark.sparkContext

    # this is the grid we use to train the models (distributed over Spark)
    grid = DistGridSearchCV(
        estimator=clf, param_grid=params,
        sc=sc, cv=crossValidation, refit='accuracy', scoring=scoring,
        verbose=0, n_jobs=-1)

    # fit and extract the probabilities
    grid.fit(XData, yData)

    # process the results
    cv_results = []
    cv_results.append(grid.cv_results_)
    df_cv_results = pd.DataFrame.from_dict(cv_results)

    # number of models stored
    number_of_models = len(df_cv_results.iloc[0][0])

    # initialize results per row
    df_cv_results_per_row = []

    # loop through number of models
    modelsIDs = []
    for i in range(number_of_models):
        modelsIDs.append(AlgorithmsIDsEnd + i)
        # initialize results per item
        df_cv_results_per_item = []
        for column in df_cv_results.iloc[0]:
            df_cv_results_per_item.append(column[i])
        df_cv_results_per_row.append(df_cv_results_per_item)

    # store the results into a pandas dataframe
    df_cv_results_classifiers = pd.DataFrame(data=df_cv_results_per_row, columns=df_cv_results.columns)

    # copy and filter in order to get only the metrics
    metrics = df_cv_results_classifiers.copy()
    metrics = metrics.filter(['mean_test_accuracy', 'mean_test_precision_micro', 'mean_test_precision_macro', 'mean_test_precision_weighted', 'mean_test_recall_micro', 'mean_test_recall_macro', 'mean_test_recall_weighted', 'mean_test_roc_auc_ovo_weighted'])

    # keep the hyperparameters of every trained model
    parametersPerformancePerModel = pd.DataFrame(df_cv_results_classifiers['params'])
    parametersPerformancePerModel = parametersPerformancePerModel.to_json()

    parametersLocal = json.loads(parametersPerformancePerModel)['params'].copy()
    Models = []
    for index, items in enumerate(parametersLocal):
        Models.append(str(index))

    parametersLocalNew = [parametersLocal[key] for key in Models]

    permList = []
    PerFeatureAccuracy = []
    PerFeatureAccuracyAll = []
    PerClassMetric = []
    perModelProb = []

    perModelPrediction = []
    resultsMicro = []
    resultsMacro = []
    resultsWeighted = []
    resultsCorrCoef = []
    resultsMicroBeta5 = []
    resultsMacroBeta5 = []
    resultsWeightedBeta5 = []
    resultsMicroBeta1 = []
    resultsMacroBeta1 = []
    resultsWeightedBeta1 = []
    resultsMicroBeta2 = []
    resultsMacroBeta2 = []
    resultsWeightedBeta2 = []
    resultsLogLoss = []
    resultsLogLossFinal = []

    # the extra metric columns below are inserted after the 8 grid-search metrics
    loop = 8

    # influence calculation for all the instances
    # (inputs and num_cores are used only by the commented-out computation)
    inputs = range(len(XData))
    num_cores = multiprocessing.cpu_count()

    #impDataInst = Parallel(n_jobs=num_cores)(delayed(processInput)(i,XData,yData,crossValidation,clf) for i in inputs)

    for eachModelParameters in parametersLocalNew:
        clf.set_params(**eachModelParameters)
        if (toggle == 1):
            # permutation importance plus single-feature CV accuracy per feature
            perm = PermutationImportance(clf, cv=None, refit=True, n_iter=25).fit(XData, yData)
            permList.append(perm.feature_importances_)
            n_feats = XData.shape[1]
            PerFeatureAccuracy = []
            for i in range(n_feats):
                scores = model_selection.cross_val_score(clf, XData.values[:, i].reshape(-1, 1), yData, cv=crossValidation)
                PerFeatureAccuracy.append(scores.mean())
            PerFeatureAccuracyAll.append(PerFeatureAccuracy)
        else:
            permList.append(0)
            PerFeatureAccuracyAll.append(0)

        clf.fit(XData, yData)
        yPredict = clf.predict(XData)
        yPredict = np.nan_to_num(yPredict)
        perModelPrediction.append(yPredict)
        # retrieve target names (class names)
        PerClassMetric.append(classification_report(yData, yPredict, target_names=target_names, digits=2, output_dict=True))
        yPredictProb = clf.predict_proba(XData)
        yPredictProb = np.nan_to_num(yPredictProb)
        perModelProb.append(yPredictProb.tolist())

        resultsMicro.append(geometric_mean_score(yData, yPredict, average='micro'))
        resultsMacro.append(geometric_mean_score(yData, yPredict, average='macro'))
        resultsWeighted.append(geometric_mean_score(yData, yPredict, average='weighted'))

        resultsCorrCoef.append(matthews_corrcoef(yData, yPredict))

        resultsMicroBeta5.append(fbeta_score(yData, yPredict, average='micro', beta=0.5))
        resultsMacroBeta5.append(fbeta_score(yData, yPredict, average='macro', beta=0.5))
        resultsWeightedBeta5.append(fbeta_score(yData, yPredict, average='weighted', beta=0.5))

        resultsMicroBeta1.append(fbeta_score(yData, yPredict, average='micro', beta=1))
        resultsMacroBeta1.append(fbeta_score(yData, yPredict, average='macro', beta=1))
        resultsWeightedBeta1.append(fbeta_score(yData, yPredict, average='weighted', beta=1))

        resultsMicroBeta2.append(fbeta_score(yData, yPredict, average='micro', beta=2))
        resultsMacroBeta2.append(fbeta_score(yData, yPredict, average='macro', beta=2))
        resultsWeightedBeta2.append(fbeta_score(yData, yPredict, average='weighted', beta=2))

        resultsLogLoss.append(log_loss(yData, yPredictProb, normalize=True))

    # min-max normalize the log loss across models
    maxLog = max(resultsLogLoss)
    minLog = min(resultsLogLoss)
    for each in resultsLogLoss:
        resultsLogLossFinal.append((each - minLog) / (maxLog - minLog))

    metrics.insert(loop, 'geometric_mean_score_micro', resultsMicro)
    metrics.insert(loop + 1, 'geometric_mean_score_macro', resultsMacro)
    metrics.insert(loop + 2, 'geometric_mean_score_weighted', resultsWeighted)

    metrics.insert(loop + 3, 'matthews_corrcoef', resultsCorrCoef)

    metrics.insert(loop + 4, 'f5_micro', resultsMicroBeta5)
    metrics.insert(loop + 5, 'f5_macro', resultsMacroBeta5)
    metrics.insert(loop + 6, 'f5_weighted', resultsWeightedBeta5)

    metrics.insert(loop + 7, 'f1_micro', resultsMicroBeta1)
    metrics.insert(loop + 8, 'f1_macro', resultsMacroBeta1)
    metrics.insert(loop + 9, 'f1_weighted', resultsWeightedBeta1)

    metrics.insert(loop + 10, 'f2_micro', resultsMicroBeta2)
    metrics.insert(loop + 11, 'f2_macro', resultsMacroBeta2)
    metrics.insert(loop + 12, 'f2_weighted', resultsWeightedBeta2)

    metrics.insert(loop + 13, 'log_loss', resultsLogLossFinal)

    perModelPredPandas = pd.DataFrame(perModelPrediction)
    perModelPredPandas = perModelPredPandas.to_json()

    perModelProbPandas = pd.DataFrame(perModelProb)
    perModelProbPandas = perModelProbPandas.to_json()

    PerClassMetricPandas = pd.DataFrame(PerClassMetric)
    del PerClassMetricPandas['accuracy']
    del PerClassMetricPandas['macro avg']
    del PerClassMetricPandas['weighted avg']
    PerClassMetricPandas = PerClassMetricPandas.to_json()

    perm_imp_eli5PD = pd.DataFrame(permList)
    perm_imp_eli5PD = perm_imp_eli5PD.to_json()

    PerFeatureAccuracyPandas = pd.DataFrame(PerFeatureAccuracyAll)
    PerFeatureAccuracyPandas = PerFeatureAccuracyPandas.to_json()

    # univariate chi-squared feature scores
    bestfeatures = SelectKBest(score_func=chi2, k='all')
    fit = bestfeatures.fit(XData, yData)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(XData.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    featureScores = featureScores.to_json()

    # gather the results and send them back
    # NOTE: results is not initialized anywhere in the cached copy; it is
    # initialized here so the function is self-contained (the original may
    # have relied on module-level state)
    results = []
    results.append(modelsIDs)                      # Position: 0 and so on
    results.append(parametersPerformancePerModel)  # Position: 1 and so on
    results.append(PerClassMetricPandas)           # Position: 2 and so on
    results.append(PerFeatureAccuracyPandas)       # Position: 3 and so on
    results.append(perm_imp_eli5PD)                # Position: 4 and so on
    results.append(featureScores)                  # Position: 5 and so on

    metrics = metrics.to_json()
    results.append(metrics)                        # Position: 6 and so on
    results.append(perModelProbPandas)             # Position: 7 and so on
    # perModelPredPandas is already a JSON string; the extra json.dumps
    # double-encodes it (kept as in the original)
    results.append(json.dumps(perModelPredPandas))  # Position: 8 and so on

    return results
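GridSearchForModels keeps scikit-learn's grid-search API but hands the candidate fits to Spark executors through sk-dist's DistGridSearchCV. A minimal sketch of that step on its own, assuming a local Spark session and a small illustrative parameter grid:

from pyspark.sql import SparkSession
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from skdist.distribute.search import DistGridSearchCV

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

X, y = load_iris(return_X_y=True)
params = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}

# same interface as sklearn's GridSearchCV, but each candidate fit runs on Spark
grid = DistGridSearchCV(estimator=KNeighborsClassifier(), param_grid=params,
                        sc=sc, cv=5, scoring='accuracy')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)

spark.stop()

Only the search itself is distributed; the refit best estimator comes back as an ordinary scikit-learn object, which is why the deleted function can score each tuned model afterwards with plain sklearn utilities.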