StackGenVis: Alignment of Data, Algorithms, and Models for Stacking Ensemble Learning Using Performance Metrics
https://doi.org/10.1109/TVCG.2020.3030352
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
96 lines
3.3 KiB
96 lines
3.3 KiB
# first line: 714
|
|
def GridSearchForModels(clf, params, eachAlgor, factors):
    """Grid-search ``clf`` over ``params`` and record per-model results.

    Runs a ``DistGridSearchCV`` on the module-level ``XData`` / ``yData``,
    flattens ``cv_results_`` into one row per evaluated model, collapses the
    mean test metrics of every model into a single weighted score, and
    stores the outcome in module-level containers.

    Args:
        clf: Estimator handed to ``DistGridSearchCV`` as ``estimator``.
        params: Parameter grid handed to ``DistGridSearchCV`` as
            ``param_grid``.
        eachAlgor: Not used inside this function; kept so existing callers
            keep working.
        factors: Indexable collection of numeric weights. ``factors[i]``
            weights the i-th metric column that survives the filter on
            accuracy, f1_macro, precision, recall and jaccard (in that
            order).

    Returns:
        The literal string ``'Everything is okay'``.

    Side effects:
        * Appends ``grid.predict_proba(XData)`` to the module-level
          ``yPredictProb``.
        * Appends two JSON strings to ``allParametersPerformancePerModel``:
          first the weighted score joined with each model's ``params``, then
          the per-class classification report.
        * Prints the per-class report DataFrame to stdout.
    """
    # Module-level state consulted below (none of it is rebound here).
    global scoring
    global NumberofscoringMetrics
    global crossValidation
    global target_names
    global allParametersPerformancePerModel

    # instantiate spark session
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    # this is the grid we use to train the models
    grid = DistGridSearchCV(
        estimator=clf, param_grid=params,
        sc=sc, cv=crossValidation, refit='accuracy', scoring=scoring,
        verbose=0, n_jobs=-1)

    # fit, then predict on the same data that was used for fitting
    grid.fit(XData, yData)
    yPredict = grid.predict(XData)

    # Transpose cv_results_ (column name -> one value per model) into one
    # row per model.
    # NOTE(review): assumes cv_results_ is a dict of equally long sequences,
    # as in scikit-learn's GridSearchCV -- confirm for DistGridSearchCV.
    cv_results = grid.cv_results_
    column_names = list(cv_results.keys())
    number_of_models = len(cv_results[column_names[0]])
    df_cv_results_per_row = [
        [cv_results[name][i] for name in column_names]
        for i in range(number_of_models)
    ]

    # store the results into a pandas dataframe
    df_cv_results_classifiers = pd.DataFrame(
        data=df_cv_results_per_row, columns=column_names)

    # keep only the mean test metrics
    metrics = df_cv_results_classifiers.filter([
        'mean_test_accuracy',
        'mean_test_f1_macro',
        'mean_test_precision',
        'mean_test_recall',
        'mean_test_jaccard',
    ])

    # control the factors: weighted mean of the metrics for every model
    sumperModel = []
    # iterrows() yields (index, row) pairs; unpack so the inner loop walks
    # the metric values rather than the (index, Series) tuple itself.
    for _, row in metrics.iterrows():
        rowSum = 0
        lengthFactors = NumberofscoringMetrics
        for loop, elements in enumerate(row):
            # Each metric swaps its implicit weight of 1 for factors[loop].
            lengthFactors = lengthFactors - 1 + factors[loop]
            rowSum = elements * factors[loop] + rowSum
        if lengthFactors == 0:
            # Total weight is zero: score the model 0 instead of dividing by
            # zero, and keep one entry per model so rows stay aligned with
            # the parameters joined in below.
            sumperModel.append(0)
        else:
            sumperModel.append(rowSum / lengthFactors)

    # summarize all models metrics
    summarizedMetrics = pd.DataFrame(sumperModel)
    # NOTE(review): the score column keeps its default label 0. An earlier
    # un-assigned ``rename(columns={0: 'sum'})`` call had no effect and was
    # dropped; actually renaming would change the JSON key consumers see.

    yPredictProb.append(grid.predict_proba(XData))

    # per-class metrics, labelled with the class names in target_names
    PerClassMetricPandas = pd.DataFrame([
        classification_report(
            yData, yPredict, target_names=target_names,
            digits=2, output_dict=True)
    ])
    print(PerClassMetricPandas)
    # Drop the aggregate entries so only the per-class columns remain.
    PerClassMetricPandas = PerClassMetricPandas.drop(
        columns=['accuracy', 'macro avg', 'weighted avg'])
    PerClassMetricPandas = PerClassMetricPandas.to_json()

    # concat parameters and performance
    parameters = pd.DataFrame(df_cv_results_classifiers['params'])
    parametersPerformancePerModel = pd.concat(
        [summarizedMetrics, parameters], axis=1)
    parametersPerformancePerModel = parametersPerformancePerModel.to_json()

    # hand both JSON payloads back through the module-level list
    allParametersPerformancePerModel.append(parametersPerformancePerModel)
    allParametersPerformancePerModel.append(PerClassMetricPandas)

    return 'Everything is okay'
|
|
|