t-viSNE: Interactive Assessment and Interpretation of t-SNE Projections https://doi.org/10.1109/TVCG.2020.2986996
t-viSNE/tsneGrid.py

609 lines
18 KiB

#!flask/bin/python
import sys
import os
from flask import Flask, request, Response, jsonify
from flask_cors import CORS
from multiprocessing import Pool
from scipy.spatial import procrustes
from scipy.spatial import distance
from sklearn_extra.cluster import KMedoids
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy import spatial
from scipy import stats
from joblib import Memory
import numpy as np
import time
import pandas as pd
import random, json
import bhtsne
app = Flask(__name__)
CORS(app)
@app.route('/resetAll', methods = ['POST'])
def Reset():
global dataProc
dataProc = []
global D_highSpace
D_highSpace = []
global overalProjectionsNumber
overalProjectionsNumber = []
global projectionsAll
projectionsAll = []
global betas
betas = []
global cpp
cpp = []
global cpi
cpi = []
global SelectedListofParams
SelectedListofParams = []
global SelectedProjectionsReturn
SelectedProjectionsReturn = []
global SelectedProjectionsBeta
SelectedProjectionsBeta = []
global SelectedProjectionsCPP
SelectedProjectionsCPP = []
global SelectedProjectionsCPI
SelectedProjectionsCPI = []
global clusterIndex
clusterIndex = []
global convertLabels
convertLabels = []
global D_lowSpaceList
D_lowSpaceList = []
global KeepKs
KeepKs = []
global metricsMatrixEntire
metricsMatrixEntire = []
global metricsMatrix
metricsMatrix = []
global metricsMatrixSel
metricsMatrixSel = []
global metricsMatrixEntireSel
metricsMatrixEntireSel = []
return 'Reset'
location = './cachedir'
memory = Memory(location, verbose=0)
# NOTE: Only works with labeled data
def neighborhood_hit(X, y, k, selected=None):
# Add 1 to k because the nearest neighbor is always the point itself
k += 1
y = np.array(y)
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X, y)
if selected:
X = X[selected, :]
neighbors = knn.kneighbors(X, return_distance=False)
score = np.mean((y[neighbors] == np.tile(y[selected].reshape((-1, 1)), k)).astype('uint8'))
return score
neighborhood_hit = memory.cache(neighborhood_hit)
location = './cachedir'
memory = Memory(location, verbose=0)
def trustworthiness(D_high, D_low, k):
n = D_high.shape[0]
nn_orig = D_high.argsort()
nn_proj = D_low.argsort()
knn_orig = nn_orig[:, :k + 1][:, 1:]
knn_proj = nn_proj[:, :k + 1][:, 1:]
sum_i = 0
for i in range(n):
U = np.setdiff1d(knn_proj[i], knn_orig[i])
sum_j = 0
for j in range(U.shape[0]):
sum_j += np.where(nn_orig[i] == U[j])[0] - k
sum_i += sum_j
return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())
trustworthiness = memory.cache(trustworthiness)
location = './cachedir'
memory = Memory(location, verbose=0)
def continuity(D_high, D_low, k):
n = D_high.shape[0]
nn_orig = D_high.argsort()
nn_proj = D_low.argsort()
knn_orig = nn_orig[:, :k + 1][:, 1:]
knn_proj = nn_proj[:, :k + 1][:, 1:]
sum_i = 0
for i in range(n):
V = np.setdiff1d(knn_proj[i], knn_orig[i])
sum_j = 0
for j in range(V.shape[0]):
sum_j += np.where(nn_proj[i] == V[j])[0] - k
sum_i += sum_j
return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())
continuity = memory.cache(continuity)
location = './cachedir'
memory = Memory(location, verbose=0)
def normalized_stress(D_high, D_low):
return (-1) * (np.sum((D_high - D_low)**2) / np.sum(D_high**2) / 100)
normalized_stress = memory.cache(normalized_stress)
location = './cachedir'
memory = Memory(location, verbose=0)
def shepard_diagram_correlation(D_high, D_low):
if len(D_high.shape) > 1:
D_high = spatial.distance.squareform(D_high)
if len(D_low.shape) > 1:
D_low = spatial.distance.squareform(D_low)
return stats.spearmanr(D_high, D_low)[0]
shepard_diagram_correlation = memory.cache(shepard_diagram_correlation)
location = './cachedir'
memory = Memory(location, verbose=0)
def preprocess(data):
dataPandas = pd.DataFrame(data)
dataPandas.dropna()
for column in dataPandas:
if ('*' in column):
gatherLabels = dataPandas[column]
del dataPandas[column]
length = len(dataPandas.columns)
dataNP = dataPandas.to_numpy()
return dataNP, length, gatherLabels
preprocess = memory.cache(preprocess)
def multi_run_wrapper(args):
projectionsAllLoc, betasL, cppL, cpiL = bhtsne.run_bh_tsne(*args)
return projectionsAllLoc, betasL, cppL, cpiL
location = './cachedir'
memory = Memory(location, verbose=0)
def procrustesFun(projections):
similarityList = []
for proj1 in projections:
disparityList = []
for proj2 in projections:
mtx1, mtx2, disparity = procrustes(proj1, proj2)
if np.array_equal(proj1, proj2):
disparityList.append(0)
else:
disparityList.append(1/disparity)
similarityList.append(disparityList)
clusterIndex = Clustering(similarityList)
return clusterIndex
procrustesFun = memory.cache(procrustesFun)
location = './cachedir'
memory = Memory(location, verbose=0)
def Clustering(similarity):
similarityNP = np.array(similarity)
n_clusters = 25 # change that to send less diverse projections
kmedoids = KMedoids(n_clusters=n_clusters, random_state=0, metric='precomputed').fit(similarityNP)
global dataProc
clusterIndex = []
for c in range(n_clusters):
cluster_indices = np.argwhere(kmedoids.labels_ == c).reshape(-1,)
D_c = similarityNP[cluster_indices][:, cluster_indices]
center = np.argmin(np.sum(D_c, axis=0))
clusterIndex.append(cluster_indices[center])
return clusterIndex
Clustering = memory.cache(Clustering)
location = './cachedir'
memory = Memory(location, verbose=0)
def wrapGetResults(listofParamsPlusData):
pool = Pool()
return zip(*pool.map(multi_run_wrapper, listofParamsPlusData))
wrapGetResults = memory.cache(wrapGetResults)
@app.route('/receiver', methods = ['POST'])
def calculateGrid():
data = request.get_data().decode('utf8').replace("'", '"')
data = json.loads(data)
global dataProc
dataProc, length, labels = preprocess(data)
global D_highSpace
D_highSpace = distance.squareform(distance.pdist(dataProc))
DEFAULT_NO_DIMS = 2
VERBOSE = False
DEFAULT_USE_PCA = True
randseed=1137
# all other data sets
perplexity = [5,10,15,20,25,30,35,40,45,50] # 10 perplexity
# iris data set
if (labels[0] == 'Iris-setosa'):
perplexity = [5,10,15,20,25,28,32,35,40,45] # 10 perplexity
# breast cancer data set
if (labels[0] == 'Benign'):
perplexity =[30,35,40,45,50,55,60,65,70,75] # 10 perplexity
# diabetes data set
if (labels[0] == 1):
perplexity = [10,15,20,25,30,35,40,45,50,55] # 10 perplexity
learning_rate = [1,10,20,30,40,50,60,70,80,90] # 10 learning rate
n_iter = [200,250,350,400,450] # 5 iterations
global overalProjectionsNumber
overalProjectionsNumber = 0
overalProjectionsNumber = len(perplexity)*len(learning_rate)*len(n_iter)
global projectionsAll
listofParamsPlusData = []
listofParamsAll= []
for k in n_iter:
for j in learning_rate:
for i in perplexity:
listofParamsPlusData.append((dataProc,DEFAULT_NO_DIMS,i,j,randseed,VERBOSE,length,DEFAULT_USE_PCA,k,True,True,True))
listofParamsAll.append((i,j,k))
projectionsAll, betas, cpp, cpi = wrapGetResults(listofParamsPlusData)
global SelectedListofParams
SelectedListofParams = []
global SelectedProjectionsReturn
SelectedProjectionsReturn = []
global SelectedProjectionsBeta
SelectedProjectionsBeta = []
global SelectedProjectionsCPP
SelectedProjectionsCPP = []
global SelectedProjectionsCPI
SelectedProjectionsCPI = []
global clusterIndex
clusterIndex = procrustesFun(projectionsAll)
metricNeigh = []
metricTrust = []
metricCont = []
metricStress = []
metricShepCorr = []
metricsAverage = []
global convertLabels
convertLabels = []
for index, label in enumerate(labels):
if (label == 0):
convertLabels.append(0)
elif (label == 1):
convertLabels.append(1)
elif (label == 'Benign'):
convertLabels.append(0)
elif (label == 'Malignant'):
convertLabels.append(1)
elif (label == 'Iris-setosa'):
convertLabels.append(0)
elif (label == 'Iris-versicolor'):
convertLabels.append(1)
elif (label == 'Iris-virginica'):
convertLabels.append(2)
else:
pass
global D_lowSpaceList
D_lowSpaceList = []
global KeepKs
KeepKs = []
for index in clusterIndex:
SelectedProjectionsReturn.append(projectionsAll[index].tolist())
SelectedListofParams.append(listofParamsAll[index])
SelectedProjectionsBeta.append(betas[index].tolist())
SelectedProjectionsCPP.append(cpp[index].tolist())
SelectedProjectionsCPI.append(cpi[index].tolist())
D_lowSpace = distance.squareform(distance.pdist(projectionsAll[index]))
D_lowSpaceList.append(D_lowSpace)
k = listofParamsAll[index][0] # k = perplexity
KeepKs.append(k)
resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, k)
resultTrust = trustworthiness(D_highSpace, D_lowSpace, k)
resultContinuity = continuity(D_highSpace, D_lowSpace, k)
resultStress = normalized_stress(D_highSpace, D_lowSpace)
resultShep = shepard_diagram_correlation(D_highSpace, D_lowSpace)
metricNeigh.append(resultNeigh)
metricTrust.append(resultTrust)
metricCont.append(resultContinuity)
metricStress.append(resultStress)
metricShepCorr.append(resultShep)
max_value_neigh = max(metricNeigh)
min_value_neigh = min(metricNeigh)
max_value_trust = max(metricTrust)
min_value_trust = min(metricTrust)
max_value_cont = max(metricCont)
min_value_cont = min(metricCont)
max_value_stress = max(metricStress)
min_value_stress = min(metricStress)
max_value_shep = max(metricShepCorr)
min_value_shep = min(metricShepCorr)
global metricsMatrixEntire
metricsMatrixEntire = []
for index, data in enumerate(metricTrust):
valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
valueStress = 1 - ((metricStress[index]*(-1) - max_value_stress*(-1)) / (min_value_stress*(-1) - max_value_stress*(-1))) # we need the opposite
valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)
average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
metricsAverage.append(average)
metricsMatrixEntire.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])
sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)
sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)
global metricsMatrix
metricsMatrix = []
metricsMatrix.append(sortMetricsAverage)
metricsMatrix.append(sortNeigh)
metricsMatrix.append(sortTrust)
metricsMatrix.append(sortCont)
metricsMatrix.append(sortStress)
metricsMatrix.append(sortShepCorr)
return 'OK'
@app.route('/sender')
def background_process():
global SelectedProjectionsReturn
global projectionsAll
global overalProjectionsNumber
global metricsMatrix
global metricsMatrixEntire
global SelectedProjectionsBeta
global SelectedProjectionsCPP
global SelectedProjectionsCPI
while (len(projectionsAll) != overalProjectionsNumber):
pass
return jsonify({ 'projections': SelectedProjectionsReturn, 'parameters': SelectedListofParams, 'metrics': metricsMatrix, 'metricsEntire': metricsMatrixEntire, 'betas': SelectedProjectionsBeta, 'cpp': SelectedProjectionsCPP, 'cpi': SelectedProjectionsCPI})
@app.route('/receiverOptimizer', methods = ['POST'])
def OptimizeSelection():
dataReceived= request.get_data().decode('utf8').replace("'", '"')
dataReceived = json.loads(dataReceived)
dataSelected = []
for data in dataReceived:
if data != None:
dataSelected.append(data)
metricNeigh = []
metricTrust = []
metricCont = []
metricStress = []
metricShepCorr = []
metricsAverage = []
for index, loop in enumerate(clusterIndex):
resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, KeepKs[index], dataSelected)
resultTrust = trustworthiness(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
resultContinuity = continuity(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
resultStress = normalized_stress(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :])
resultShep = shepard_diagram_correlation(D_highSpace[dataSelected][:, dataSelected], D_lowSpaceList[index][dataSelected][:, dataSelected])
metricNeigh.append(resultNeigh)
metricTrust.append(resultTrust)
metricCont.append(resultContinuity)
metricStress.append(resultStress)
metricShepCorr.append(resultShep)
max_value_neigh = max(metricNeigh)
min_value_neigh = min(metricNeigh)
max_value_trust = max(metricTrust)
min_value_trust = min(metricTrust)
max_value_cont = max(metricCont)
min_value_cont = min(metricCont)
max_value_stress = max(metricStress)
min_value_stress = min(metricStress)
max_value_shep = max(metricShepCorr)
min_value_shep = min(metricShepCorr)
global metricsMatrixEntireSel
metricsMatrixEntireSel = []
for index, data in enumerate(metricTrust):
valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
valueStress = 1 - ((metricStress[index]*(-1) - max_value_stress*(-1)) / (min_value_stress*(-1) - max_value_stress*(-1))) # we need the opposite
valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)
average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
metricsAverage.append(average)
metricsMatrixEntireSel.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])
sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)
sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)
global metricsMatrixSel
metricsMatrixSel = []
metricsMatrixSel.append(sortMetricsAverage)
metricsMatrixSel.append(sortNeigh)
metricsMatrixSel.append(sortTrust)
metricsMatrixSel.append(sortCont)
metricsMatrixSel.append(sortStress)
metricsMatrixSel.append(sortShepCorr)
return 'OK'
@app.route('/senderOptimizer')
def SendOptimizedProjections():
global metricsMatrixSel
global metricsMatrixEntireSel
return jsonify({'metrics': metricsMatrixSel, 'metricsEntire': metricsMatrixEntireSel })
@app.route('/receiverSingle', methods = ['POST'])
def singleParameters():
data = request.get_data().decode('utf8').replace("'", '"')
data = json.loads(data)
global dataProc
dataProc, length, labels = preprocess(data[3])
DEFAULT_NO_DIMS = 2
VERBOSE = False
DEFAULT_USE_PCA = True
randseed=1137
perplexity = int(data[0])
learning_rate = int(data[1])
n_iter = int(data[2])
global projectionsAll
listofParamsPlusData = []
listofParamsAll= []
listofParamsPlusData.append((dataProc,DEFAULT_NO_DIMS,perplexity,learning_rate,randseed,VERBOSE,length,DEFAULT_USE_PCA,n_iter,True,True,True))
listofParamsAll.append((perplexity,learning_rate,n_iter))
projectionsAll, betas, cpp, cpi = wrapGetResults(listofParamsPlusData)
global SelectedProjectionsReturn
SelectedProjectionsReturn = []
global SelectedProjectionsBeta
SelectedProjectionsBeta = []
global SelectedProjectionsCPP
SelectedProjectionsCPP = []
global SelectedProjectionsCPI
SelectedProjectionsCPI = []
SelectedProjectionsReturn.append(projectionsAll[0].tolist())
SelectedProjectionsBeta.append(betas[0].tolist())
SelectedProjectionsCPP.append(cpp[0].tolist())
SelectedProjectionsCPI.append(cpi[0].tolist())
return 'OK'
@app.route('/senderSingle')
def sendSingle():
global projectionsAll
global SelectedProjectionsReturn
global SelectedProjectionsBeta
global SelectedProjectionsCPP
global SelectedProjectionsCPI
while (len(projectionsAll) != 1):
pass
return jsonify({ 'projections': SelectedProjectionsReturn, 'betas': SelectedProjectionsBeta, 'cpp': SelectedProjectionsCPP, 'cpi': SelectedProjectionsCPI})
if __name__ == '__main__':
app.run("0.0.0.0", "5000")