# first line: 371 @memory.cache def GridSearchForModels(clf, params, eachAlgor, factors, AlgorithmsIDsEnd): # instantiate spark session spark = ( SparkSession .builder .getOrCreate() ) sc = spark.sparkContext # this is the grid we use to train the models grid = DistGridSearchCV( estimator=clf, param_grid=params, sc=sc, cv=crossValidation, refit='accuracy', scoring=scoring, verbose=0, n_jobs=-1) # fit and extract the probabilities grid.fit(XData, yData) # process the results cv_results = [] cv_results.append(grid.cv_results_) df_cv_results = pd.DataFrame.from_dict(cv_results) # number of models stored number_of_models = len(df_cv_results.iloc[0][0]) # initialize results per row df_cv_results_per_row = [] # loop through number of models modelsIDs = [] for i in range(number_of_models): modelsIDs.append(AlgorithmsIDsEnd+i) # initialize results per item df_cv_results_per_item = [] for column in df_cv_results.iloc[0]: df_cv_results_per_item.append(column[i]) df_cv_results_per_row.append(df_cv_results_per_item) # store the results into a pandas dataframe df_cv_results_classifiers = pd.DataFrame(data = df_cv_results_per_row, columns= df_cv_results.columns) # copy and filter in order to get only the metrics metrics = df_cv_results_classifiers.copy() metrics = metrics.filter(['mean_test_accuracy','mean_test_f1_macro','mean_test_precision','mean_test_recall','mean_test_jaccard']) # control the factors sumperModel = [] for index, row in metrics.iterrows(): rowSum = 0 lengthFactors = NumberofscoringMetrics for loop,elements in enumerate(row): lengthFactors = lengthFactors - 1 + factors[loop] rowSum = elements*factors[loop] + rowSum if lengthFactors is 0: sumperModel = 0 else: sumperModel.append(rowSum/lengthFactors) # summarize all models metrics summarizedMetrics = pd.DataFrame(sumperModel) summarizedMetrics.rename(columns={0:'sum'}) # concat parameters and performance parameters = pd.DataFrame(df_cv_results_classifiers['params']) parametersPerformancePerModel = pd.concat([summarizedMetrics, parameters], axis=1) parametersPerformancePerModel = parametersPerformancePerModel.to_json() parametersLocal = json.loads(parametersPerformancePerModel)['params'].copy() Models = [] for index, items in enumerate(parametersLocal): Models.append(str(index)) parametersLocalNew = [ parametersLocal[your_key] for your_key in Models ] permList = [] PerFeatureAccuracy = [] PerFeatureAccuracyAll = [] PerClassMetric = [] perModelProb = [] for eachModelParameters in parametersLocalNew: clf.set_params(**eachModelParameters) perm = PermutationImportance(clf, cv = None, refit = True, n_iter = 25).fit(XData, yData) permList.append(perm.feature_importances_) n_feats = XData.shape[1] PerFeatureAccuracy = [] for i in range(n_feats): scores = model_selection.cross_val_score(clf, XData.values[:, i].reshape(-1, 1), yData, cv=crossValidation) PerFeatureAccuracy.append(scores.mean()) PerFeatureAccuracyAll.append(PerFeatureAccuracy) clf.fit(XData, yData) yPredict = clf.predict(XData) # retrieve target names (class names) PerClassMetric.append(classification_report(yData, yPredict, target_names=target_names, digits=2, output_dict=True)) yPredictProb = clf.predict_proba(XData) perModelProb.append(yPredictProb.tolist()) perModelProbPandas = pd.DataFrame(perModelProb) perModelProbPandas = perModelProbPandas.to_json() PerClassMetricPandas = pd.DataFrame(PerClassMetric) del PerClassMetricPandas['accuracy'] del PerClassMetricPandas['macro avg'] del PerClassMetricPandas['weighted avg'] PerClassMetricPandas = PerClassMetricPandas.to_json() perm_imp_eli5PD = pd.DataFrame(permList) perm_imp_eli5PD = perm_imp_eli5PD.to_json() PerFeatureAccuracyPandas = pd.DataFrame(PerFeatureAccuracyAll) PerFeatureAccuracyPandas = PerFeatureAccuracyPandas.to_json() bestfeatures = SelectKBest(score_func=chi2, k='all') fit = bestfeatures.fit(XData,yData) dfscores = pd.DataFrame(fit.scores_) dfcolumns = pd.DataFrame(XData.columns) featureScores = pd.concat([dfcolumns,dfscores],axis=1) featureScores.columns = ['Specs','Score'] #naming the dataframe columns featureScores = featureScores.to_json() # gather the results and send them back results.append(modelsIDs) # Position: 0 and so on results.append(parametersPerformancePerModel) # Position: 1 and so on results.append(PerClassMetricPandas) # Position: 2 and so on results.append(PerFeatureAccuracyPandas) # Position: 3 and so on results.append(perm_imp_eli5PD) # Position: 4 and so on results.append(featureScores) # Position: 5 and so on metrics = metrics.to_json() results.append(metrics) # Position: 6 and so on results.append(perModelProbPandas) # Position: 7 and so on return results