t-viSNE/tsneGrid.py

#!flask/bin/python

import sys
import os

from flask import Flask, request, Response, jsonify
from flask_cors import CORS
from multiprocessing import Pool
from scipy.spatial import procrustes
from scipy.spatial import distance
from sklearn_extra.cluster import KMedoids
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy import spatial
from scipy import stats

import numpy as np
import pandas as pd
import random, json
import bhtsne

app = Flask(__name__)
CORS(app)

# NOTE: Only works with labeled data
def neighborhood_hit(X, y, k):

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)
    neighbors = knn.kneighbors(X, return_distance=False)
    y = np.array(y)
    neigh = y[neighbors]
    tile =  np.tile(y.reshape((-1, 1)), k)
    equals = (neigh == tile)
    returnthis = np.mean(np.mean(equals).astype('uint8'), axis=1)
    return returnthis

def trustworthiness(D_high, D_low, k):
    n = D_high.shape[0]

    nn_orig = D_high.argsort()
    nn_proj = D_low.argsort()

    knn_orig = nn_orig[:, :k + 1][:, 1:]
    knn_proj = nn_proj[:, :k + 1][:, 1:]

    sum_i = 0

    for i in range(n):
        U = np.setdiff1d(knn_proj[i], knn_orig[i])

        sum_j = 0
        for j in range(U.shape[0]):
            sum_j += np.where(nn_orig[i] == U[j])[0] - k

        sum_i += sum_j

    return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())

def continuity(D_high, D_low, k):
    n = D_high.shape[0]

    nn_orig = D_high.argsort()
    nn_proj = D_low.argsort()

    knn_orig = nn_orig[:, :k + 1][:, 1:]
    knn_proj = nn_proj[:, :k + 1][:, 1:]

    sum_i = 0

    for i in range(n):
        V = np.setdiff1d(knn_proj[i], knn_orig[i])

        sum_j = 0
        for j in range(V.shape[0]):
            sum_j += np.where(nn_proj[i] == V[j])[0] - k

        sum_i += sum_j

    return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())

def normalized_stress(D_high, D_low):
    return np.sum((D_high - D_low)**2) / np.sum(D_high**2) / 100

def shepard_diagram_correlation(D_high, D_low):
    if len(D_high.shape) > 1:
        D_high = spatial.distance.squareform(D_high)
    if len(D_low.shape) > 1:
        D_low = spatial.distance.squareform(D_low)
    return stats.spearmanr(D_high, D_low)[0]

def preprocess(data):
    dataPandas = pd.DataFrame(data)
    dataPandas.dropna()
    for column in dataPandas:
        if ('*' in column):
            gatherLabels = dataPandas[column]
            del dataPandas[column]
    length = len(dataPandas.columns)
    dataNP = dataPandas.to_numpy()
    return dataNP, length, gatherLabels

def multi_run_wrapper(args):
    embedding_array = bhtsne.run_bh_tsne(*args)
    return embedding_array


def procrustesFun(projections):
    similarityList = []
    for proj1 in projections:
        disparityList = []
        for proj2 in projections:
            mtx1, mtx2, disparity = procrustes(proj1, proj2)
            if np.array_equal(proj1, proj2):
                disparityList.append(0)
            else:
                disparityList.append(1/disparity)
        similarityList.append(disparityList)
    clusterIndex = Clustering(similarityList)

    return clusterIndex

def Clustering(similarity):
    similarityNP = np.array(similarity)
    n_clusters = 36
    kmedoids = KMedoids(n_clusters=n_clusters, random_state=0, metric='precomputed').fit(similarityNP)
    global dataProc
    clusterIndex = []
    for c in range(n_clusters):
        cluster_indices = np.argwhere(kmedoids.labels_ == c).reshape(-1,)
        D_c = similarityNP[cluster_indices][:, cluster_indices]
        center = np.argmin(np.sum(D_c, axis=0))
        clusterIndex.append(cluster_indices[center])

    return clusterIndex


@app.route('/receiver', methods = ['POST'])
def calculateGrid():
    data = request.get_data().decode('utf8').replace("'", '"')
    data = json.loads(data)
    global dataProc
    dataProc, length, labels = preprocess(data)

    D_highSpace = distance.squareform(distance.pdist(dataProc))

    DEFAULT_NO_DIMS = 2
    INITIAL_DIMENSIONS = 50
    DEFAULT_PERPLEXITY = 50
    DEFAULT_THETA = 0.5
    EMPTY_SEED = -1
    VERBOSE = True
    DEFAULT_USE_PCA = False
    perplexity = [25,30] # 10 perplexity
    learning_rate = [10,20,30,40,50,60] # 15 learning rate
    n_iter = [200,250,300,350] # 7 iterations

    global overalProjectionsNumber
    overalProjectionsNumber = 0
    overalProjectionsNumber = len(perplexity)*len(learning_rate)*len(n_iter)

    global projectionsAll

    pool = Pool()
    listofParamsPlusData = []
    listofParamsAll= []
    for k in n_iter:
        for j in learning_rate:
            for i in perplexity:
                listofParamsPlusData.append((dataProc,DEFAULT_NO_DIMS,length,i,j,EMPTY_SEED,VERBOSE,DEFAULT_USE_PCA,k))
                listofParamsAll.append((i,j,k))
    projectionsAll = pool.map(multi_run_wrapper, listofParamsPlusData)
    pool.close()
    pool.join()

    global SelectedListofParams
    SelectedListofParams = []
    global SelectedProjectionsReturn
    SelectedProjectionsReturn = []
    clusterIndex = procrustesFun(projectionsAll)

    metricNeigh = []
    metricTrust = []
    metricCont = []
    metricStress = []
    metricShepCorr = []
    convertLabels = []
    for index, label in enumerate(labels):
        if (label == 0):
            convertLabels.append(0)
        elif (label == 1):
            convertLabels.append(1)
        elif (label == 'Benign'):
            convertLabels.append(0)
        elif (label == 'Malignant'):
            convertLabels.append(1)
        elif (label == 'Iris-setosa'):
            convertLabels.append(0)
        elif (label == 'Iris-versicolor'):
            convertLabels.append(1)
        elif (label == 'Iris-virginica'):
                convertLabels.append(2)
        else:
            pass

    for index in clusterIndex:
        SelectedProjectionsReturn.append(projectionsAll[index].tolist())
        SelectedListofParams.append(listofParamsAll[index])

        D_lowSpace = distance.squareform(distance.pdist(projectionsAll[index]))

        k = listofParamsAll[index][0] # k = perplexity

        #resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, k)
        resultNeigh = trustworthiness(D_highSpace, D_lowSpace, k)
        resultTrust = trustworthiness(D_highSpace, D_lowSpace, k)
        resultContinuity = continuity(D_highSpace, D_lowSpace, k)
        resultStress = normalized_stress(D_highSpace, D_lowSpace)
        resultShep = shepard_diagram_correlation(D_highSpace, D_lowSpace)

        metricNeigh.append(resultNeigh)
        metricTrust.append(resultTrust)
        metricCont.append(resultContinuity)
        metricStress.append(resultStress)
        metricShepCorr.append(resultShep)

    max_value_neigh = max(metricNeigh)
    min_value_neigh = min(metricNeigh)

    max_value_trust = max(metricTrust)
    min_value_trust = min(metricTrust)

    max_value_cont = max(metricCont)
    min_value_cont = min(metricCont)

    max_value_stress = max(metricStress)
    min_value_stress = min(metricStress)

    max_value_shep = max(metricShepCorr)
    min_value_shep = min(metricShepCorr)

    global metricsMatrixEntire
    metricsMatrixEntire = []

    for index, data in enumerate(metricTrust):
        #valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
        valueNeigh = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
        valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
        valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
        valueStress = (metricStress[index] - min_value_stress) / (max_value_stress - min_value_stress)
        valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)
        metricsMatrixEntire.append([valueNeigh,valueTrust,valueCont,valueStress,valueShep])

    sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
    sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
    sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
    sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
    sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)

    global metricsMatrix
    metricsMatrix = []

    metricsMatrix.append(sortNeigh)
    metricsMatrix.append(sortTrust)
    metricsMatrix.append(sortCont)
    metricsMatrix.append(sortStress)
    metricsMatrix.append(sortShepCorr)

    return 'OK'

@app.route('/sender')
def background_process():
    global SelectedProjectionsReturn
    global projectionsAll
    global overalProjectionsNumber
    global metricsMatrix
    global metricsMatrixEntire

    while (len(projectionsAll) != overalProjectionsNumber):
        pass
    return jsonify({ 'projections': SelectedProjectionsReturn, 'parameters': SelectedListofParams, 'metrics': metricsMatrix, 'metricsEntire': metricsMatrixEntire })

if __name__ == '__main__':
    app.run("0.0.0.0", "5000")