Former-commit-id: b9fe60bb04
master
Angelos Chatzimparmpas 4 years ago
parent e4b65e4459
commit 96bc447d49
  1. 1
      cachedir/joblib/run/EnsembleModel/caf5346693cf07e3226b8e0c61576f07/metadata.json
  2. BIN
      cachedir/joblib/run/EnsembleModel/caf5346693cf07e3226b8e0c61576f07/output.pkl
  3. 85
      cachedir/joblib/run/EnsembleModel/func_code.py
  4. 1
      cachedir/joblib/run/GridSearchForModels/0b4e69a8baed05c0418e2d3292e0e404/output.pkl.REMOVED.git-id
  5. 1
      cachedir/joblib/run/GridSearchForModels/0c1b5af6557b3f6f8a244aa338a86356/metadata.json
  6. 1
      cachedir/joblib/run/GridSearchForModels/1734b8b679adb5bc8c999add2f949aff/metadata.json
  7. 1
      cachedir/joblib/run/GridSearchForModels/19f4367df2ce205b69217d4c836e2469/output.pkl.REMOVED.git-id
  8. 1
      cachedir/joblib/run/GridSearchForModels/1c0053a45d18d97819db917c49597997/output.pkl.REMOVED.git-id
  9. 1
      cachedir/joblib/run/GridSearchForModels/2731bb6398df6c2462ae5f92224ff00e/metadata.json
  10. 1
      cachedir/joblib/run/GridSearchForModels/2d731f5bcad3a373e7a82382ff88a7b5/output.pkl.REMOVED.git-id
  11. 1
      cachedir/joblib/run/GridSearchForModels/372d26f34cadbd83816f5c5666b62318/metadata.json
  12. BIN
      cachedir/joblib/run/GridSearchForModels/497edee69e4dc4c4fbcd813ae239cfe7/output.pkl
  13. 1
      cachedir/joblib/run/GridSearchForModels/4a66eb2045f9c657924ec340394e66e6/output.pkl.REMOVED.git-id
  14. 1
      cachedir/joblib/run/GridSearchForModels/7980170f30554d441d65041c96d2144d/metadata.json
  15. BIN
      cachedir/joblib/run/GridSearchForModels/799cda33dfa4a66ead8a40ee190b2b8b/output.pkl
  16. 1
      cachedir/joblib/run/GridSearchForModels/8083a40c1ab53f75e5ae6169a5f649a1/metadata.json
  17. 1
      cachedir/joblib/run/GridSearchForModels/8471fda224bc6acf2166930f71f674b7/metadata.json
  18. 1
      cachedir/joblib/run/GridSearchForModels/8a2c7b54003e9b135d2b921d578d3b55/metadata.json
  19. 1
      cachedir/joblib/run/GridSearchForModels/b014832ebe493a5bda77ebe0f6f9f9e4/metadata.json
  20. 1
      cachedir/joblib/run/GridSearchForModels/c34a92873db0bb52bd6dadf8395eeeaf/output.pkl.REMOVED.git-id
  21. 1
      cachedir/joblib/run/GridSearchForModels/ce594f4bb102c09bf180541c45a7c090/output.pkl.REMOVED.git-id
  22. 1
      cachedir/joblib/run/GridSearchForModels/eb3fdc194d813342281a8ba6a7fc8e44/metadata.json
  23. 1
      cachedir/joblib/run/GridSearchForModels/ed02020d7d44cd73e5d2225764de3a22/metadata.json
  24. BIN
      cachedir/joblib/run/GridSearchForModels/f45cfc9f0c1141406e895c118a75c8c3/output.pkl
  25. BIN
      cachedir/joblib/run/GridSearchForModels/fd4c7819372feea60946c1bbcdb13983/output.pkl
  26. 203
      cachedir/joblib/run/GridSearchForModels/func_code.py

@ -1 +0,0 @@
{"duration": 31.525413036346436, "input_args": {"keyRetrieved": "0"}}

@ -1,85 +0,0 @@
# first line: 703
@memory.cache
def EnsembleModel(keyRetrieved):
    """Cross-validate a stacking ensemble and return its accuracy scores.

    A list of base pipelines (column selector + KNN or random-forest
    estimator) is assembled according to ``keyRetrieved`` and wrapped in a
    ``StackingCVClassifier`` whose meta-classifier is a logistic regression.
    The ensemble is then scored with ``cross_val_score`` on the module-level
    ``XData`` / ``yData`` using ``crossValidation`` folds.

    Args:
        keyRetrieved: Selects how the base classifiers are obtained.
            ``0`` builds one pipeline per id in ``KNNModels`` and
            ``RFModels`` over all columns of ``XData``; ``1`` picks existing
            pipelines out of the module-level ``all_classifiers`` by the ids
            listed in ``ClassifierIDsList``; any other value rebuilds the
            pipelines from ``resultsList`` with per-model column subsets
            taken from ``columns``.

    Returns:
        The per-fold accuracy scores returned by
        ``model_selection.cross_val_score``.
    """
    if keyRetrieved == 0:
        # Positional indices of every column in XData.
        columnsInit = [XData.columns.get_loc(c) for c in XData.columns if c in XData]

        # NOTE(review): positions 1 and 9 of allParametersPerformancePerModel
        # are presumably the KNN and RF parameter tables -- confirm.
        knn_table = pd.DataFrame.from_dict(json.loads(allParametersPerformancePerModel[1]))
        knn_params = knn_table.iloc[:, 1]
        classifiers = [
            make_pipeline(ColumnSelector(cols=columnsInit),
                          KNeighborsClassifier().set_params(**knn_params[model_id]))
            for model_id in KNNModels
        ]

        rf_table = pd.DataFrame.from_dict(json.loads(allParametersPerformancePerModel[9]))
        rf_params = rf_table.iloc[:, 1]
        # NOTE(review): 576 looks like the id offset of the first RF model
        # (i.e. the number of KNN models) -- confirm against the caller.
        classifiers.extend(
            make_pipeline(ColumnSelector(cols=columnsInit),
                          RandomForestClassifier().set_params(**rf_params[model_id - 576]))
            for model_id in RFModels
        )
    elif keyRetrieved == 1:
        # Parse into a *new* local name.  Re-assigning ``ClassifierIDsList``
        # (or assigning ``all_classifiers`` anywhere in this function) makes
        # the name function-local, and reading it here would then raise
        # UnboundLocalError before the module-level value is ever consulted.
        selection = json.loads(ClassifierIDsList)
        classifiers = []
        for entry in selection['ClassifiersList']:
            ids = [int(s) for s in re.findall(r'\b\d+\b', entry)]
            # NOTE(review): this relies on a module-level ``all_classifiers``
            # populated outside this function; the other branches only build
            # local lists and do not publish them.
            classifiers.append(all_classifiers[ids[0]])
    else:
        # Work on a copy: consumed column subsets are deleted as we go.
        columnsReduce = columns.copy()
        classifiers = []
        for index, algorithm in enumerate(algorithmsWithoutDuplicates):
            if algorithm == 'KNN':
                estimator_cls = KNeighborsClassifier
            else:
                estimator_cls = RandomForestClassifier
            param_sets = resultsList[index][1]
            for j, model_params in enumerate(param_sets):
                classifiers.append(
                    make_pipeline(ColumnSelector(cols=columnsReduce[j]),
                                  estimator_cls().set_params(**model_params)))
            # Drop the column subsets used by this algorithm so that index j
            # restarts at the next algorithm's first subset.
            # NOTE(review): placed after the inner loop -- confirm against
            # the original indentation.
            del columnsReduce[0:len(param_sets)]
        # The former "selection already filled" alternative was unreachable:
        # the selection list was a fresh, empty local at this point.

    sclf = StackingCVClassifier(classifiers=classifiers,
                                use_probas=True,
                                meta_classifier=LogisticRegression(),
                                random_state=RANDOM_SEED,
                                n_jobs=-1)
    return model_selection.cross_val_score(clf, XData, yData, cv=crossValidation, scoring='accuracy') if False else \
        model_selection.cross_val_score(sclf, XData, yData, cv=crossValidation, scoring='accuracy')

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -1,203 +0,0 @@
# first line: 654
@memory.cache
def GridSearchForModels(XData, yData, clf, params, eachAlgor, AlgorithmsIDsEnd, toggle):
    """Grid-search ``clf`` over ``params`` and collect per-candidate results.

    The search runs through ``DistGridSearchCV`` on a Spark context.  Every
    parameter combination is afterwards re-fitted on the full data to obtain
    predictions, class probabilities and a set of additional metrics, which
    are serialised to JSON and appended to the module-level ``results`` list.

    Args:
        XData: Feature table; used as a pandas DataFrame (``.columns``,
            ``.values``, ``.shape``).
        yData: Target labels aligned with ``XData``.
        clf: Estimator to tune.  It is mutated: ``set_params`` and ``fit``
            are called on it for every candidate.
        params: Parameter grid passed to ``DistGridSearchCV``.
        eachAlgor: Not used inside this function.
        AlgorithmsIDsEnd: Id assigned to the first candidate; candidate ``i``
            gets ``AlgorithmsIDsEnd + i``.
        toggle: When equal to ``1`` permutation importances and per-feature
            cross-validated accuracies are computed; otherwise ``0`` is
            stored as a placeholder for both.

    Returns:
        The module-level ``results`` list after nine entries were appended
        (ids, params, per-class metrics, per-feature accuracy, permutation
        importances, chi2 feature scores, metrics, probabilities,
        predictions).  NOTE(review): ``results`` is not created here, so
        repeated calls accumulate into whatever list exists outside.
    """
    print('loop here')

    # instantiate spark session
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    # this is the grid we use to train the models
    grid = DistGridSearchCV(
        estimator=clf, param_grid=params,
        sc=sc, cv=crossValidation, refit='accuracy', scoring=scoring,
        verbose=0, n_jobs=-1)
    grid.fit(XData, yData)

    # One-row frame: each cell holds the per-candidate array of one result key.
    df_cv_results = pd.DataFrame.from_dict([grid.cv_results_])
    first_row = df_cv_results.iloc[0]
    # Explicit positional access; an integer key on a string-labelled Series
    # is deprecated in pandas.
    number_of_models = len(first_row.iloc[0])

    modelsIDs = [AlgorithmsIDsEnd + i for i in range(number_of_models)]

    # Transpose the per-key arrays into one row per candidate.
    rows = [[column[i] for column in first_row] for i in range(number_of_models)]
    df_cv_results_classifiers = pd.DataFrame(data=rows, columns=df_cv_results.columns)

    # keep only the cross-validated metric columns
    metrics = df_cv_results_classifiers.filter(['mean_test_accuracy','mean_test_precision_micro','mean_test_precision_macro','mean_test_precision_weighted','mean_test_recall_micro','mean_test_recall_macro','mean_test_recall_weighted','mean_test_roc_auc_ovo_weighted'])

    # Round-trip the parameters through JSON: the JSON string is returned and
    # the decoded dicts (keyed "0", "1", ...) drive the re-fitting loop.
    parametersPerformancePerModel = pd.DataFrame(df_cv_results_classifiers['params']).to_json()
    parametersLocal = json.loads(parametersPerformancePerModel)['params']
    parametersLocalNew = [parametersLocal[str(i)] for i in range(len(parametersLocal))]

    averages = ('micro', 'macro', 'weighted')
    # Column prefix -> beta; note that the "f5" columns hold F-beta with beta=0.5.
    fbeta_variants = (('f5', 0.5), ('f1', 1), ('f2', 2))

    # Extra metric columns, in the order they are added to ``metrics``.
    extra_metrics = {'geometric_mean_score_' + average: [] for average in averages}
    extra_metrics['matthews_corrcoef'] = []
    for prefix, _ in fbeta_variants:
        for average in averages:
            extra_metrics[prefix + '_' + average] = []

    permList = []
    PerFeatureAccuracyAll = []
    PerClassMetric = []
    perModelProb = []
    perModelPrediction = []
    resultsLogLoss = []

    for eachModelParameters in parametersLocalNew:
        clf.set_params(**eachModelParameters)

        if toggle == 1:
            perm = PermutationImportance(clf, cv = None, refit = True, n_iter = 25).fit(XData, yData)
            permList.append(perm.feature_importances_)
            # cross-validated accuracy using each feature on its own
            PerFeatureAccuracy = [
                model_selection.cross_val_score(
                    clf, XData.values[:, i].reshape(-1, 1), yData, cv=crossValidation).mean()
                for i in range(XData.shape[1])
            ]
            PerFeatureAccuracyAll.append(PerFeatureAccuracy)
        else:
            permList.append(0)
            PerFeatureAccuracyAll.append(0)

        # Re-fit on the full data; every metric below is computed on the
        # same data the model was trained on.
        clf.fit(XData, yData)
        yPredict = np.nan_to_num(clf.predict(XData))
        perModelPrediction.append(yPredict)

        # target_names (class names) comes from module scope
        PerClassMetric.append(classification_report(yData, yPredict, target_names=target_names, digits=2, output_dict=True))

        yPredictProb = np.nan_to_num(clf.predict_proba(XData))
        perModelProb.append(yPredictProb.tolist())

        for average in averages:
            extra_metrics['geometric_mean_score_' + average].append(
                geometric_mean_score(yData, yPredict, average=average))
        extra_metrics['matthews_corrcoef'].append(matthews_corrcoef(yData, yPredict))
        for prefix, beta in fbeta_variants:
            for average in averages:
                extra_metrics[prefix + '_' + average].append(
                    fbeta_score(yData, yPredict, average=average, beta=beta))

        resultsLogLoss.append(log_loss(yData, yPredictProb, normalize=True))

    # Min-max scale the log losses to [0, 1].  With a single candidate (or
    # all candidates tied) the range is zero; map everything to 0.0 instead
    # of dividing by zero.
    if resultsLogLoss:
        minLog = min(resultsLogLoss)
        span = max(resultsLogLoss) - minLog
        if span == 0:
            resultsLogLossFinal = [0.0 for _ in resultsLogLoss]
        else:
            resultsLogLossFinal = [(each - minLog) / span for each in resultsLogLoss]
    else:
        resultsLogLossFinal = []

    # Append after the cross-validated metrics.  Inserting at the current
    # width equals the former fixed positions 8..21 when all eight filtered
    # columns exist, and no longer fails when some are absent.
    for name, values in extra_metrics.items():
        metrics.insert(len(metrics.columns), name, values)
    metrics.insert(len(metrics.columns), 'log_loss', resultsLogLossFinal)

    perModelPredPandas = pd.DataFrame(perModelPrediction).to_json()
    perModelProbPandas = pd.DataFrame(perModelProb).to_json()

    # keep only the per-class entries of the classification reports
    PerClassMetricPandas = pd.DataFrame(PerClassMetric)
    del PerClassMetricPandas['accuracy']
    del PerClassMetricPandas['macro avg']
    del PerClassMetricPandas['weighted avg']
    PerClassMetricPandas = PerClassMetricPandas.to_json()

    perm_imp_eli5PD = pd.DataFrame(permList).to_json()
    PerFeatureAccuracyPandas = pd.DataFrame(PerFeatureAccuracyAll).to_json()

    # univariate chi2 score of every feature
    fit = SelectKBest(score_func=chi2, k='all').fit(XData, yData)
    featureScores = pd.concat([pd.DataFrame(XData.columns), pd.DataFrame(fit.scores_)], axis=1)
    featureScores.columns = ['Specs','Score']  # naming the dataframe columns
    featureScores = featureScores.to_json()

    # gather the results and send them back
    results.append(modelsIDs) # Position: 0 and so on
    results.append(parametersPerformancePerModel) # Position: 1 and so on
    results.append(PerClassMetricPandas) # Position: 2 and so on
    results.append(PerFeatureAccuracyPandas) # Position: 3 and so on
    results.append(perm_imp_eli5PD) # Position: 4 and so on
    results.append(featureScores) # Position: 5 and so on
    results.append(metrics.to_json()) # Position: 6 and so on
    results.append(perModelProbPandas) # Position: 7 and so on
    # The predictions are already a JSON string; json.dumps encodes them a
    # second time, so consumers have to decode twice.  Kept for compatibility.
    results.append(json.dumps(perModelPredPandas)) # Position: 8 and so on

    return results
Loading…
Cancel
Save