t-viSNE: Interactive Assessment and Interpretation of t-SNE Projections https://doi.org/10.1109/TVCG.2020.2986996
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
t-viSNE/tsneGrid.py

467 lines
15 KiB

#!flask/bin/python
import sys
import os
from flask import Flask, request, Response, jsonify
from flask_cors import CORS
from multiprocessing import Pool
from scipy.spatial import procrustes
from scipy.spatial import distance
from sklearn_extra.cluster import KMedoids
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy import spatial
from scipy import stats
from joblib import Memory
import numpy as np
import pandas as pd
import random, json
import bhtsne
app = Flask(__name__)
CORS(app)
@app.route('/resetAll', methods = ['POST'])
def Reset():
global dataProc
dataProc = []
global D_highSpace
D_highSpace = []
global overalProjectionsNumber
overalProjectionsNumber = []
global projectionsAll
projectionsAll = []
global SelectedListofParams
SelectedListofParams = []
global SelectedProjectionsReturn
SelectedProjectionsReturn = []
global clusterIndex
clusterIndex = []
global convertLabels
convertLabels = []
global D_lowSpaceList
D_lowSpaceList = []
global KeepKs
KeepKs = []
global metricsMatrixEntire
metricsMatrixEntire = []
global metricsMatrix
metricsMatrix = []
global metricsMatrixSel
metricsMatrixSel = []
global metricsMatrixEntireSel
metricsMatrixEntireSel = []
return 'Reset'
# NOTE: Only works with labeled data
def neighborhood_hit(X, y, k, selected=None):
# Add 1 to k because the nearest neighbor is always the point itself
k += 1
y = np.array(y)
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X, y)
if selected:
X = X[selected, :]
neighbors = knn.kneighbors(X, return_distance=False)
score = np.mean((y[neighbors] == np.tile(y[selected].reshape((-1, 1)), k)).astype('uint8'))
return score
def trustworthiness(D_high, D_low, k):
n = D_high.shape[0]
nn_orig = D_high.argsort()
nn_proj = D_low.argsort()
knn_orig = nn_orig[:, :k + 1][:, 1:]
knn_proj = nn_proj[:, :k + 1][:, 1:]
sum_i = 0
for i in range(n):
U = np.setdiff1d(knn_proj[i], knn_orig[i])
sum_j = 0
for j in range(U.shape[0]):
sum_j += np.where(nn_orig[i] == U[j])[0] - k
sum_i += sum_j
return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())
def continuity(D_high, D_low, k):
n = D_high.shape[0]
nn_orig = D_high.argsort()
nn_proj = D_low.argsort()
knn_orig = nn_orig[:, :k + 1][:, 1:]
knn_proj = nn_proj[:, :k + 1][:, 1:]
sum_i = 0
for i in range(n):
V = np.setdiff1d(knn_proj[i], knn_orig[i])
sum_j = 0
for j in range(V.shape[0]):
sum_j += np.where(nn_proj[i] == V[j])[0] - k
sum_i += sum_j
return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())
def normalized_stress(D_high, D_low):
return (-1) * (np.sum((D_high - D_low)**2) / np.sum(D_high**2) / 100)
def shepard_diagram_correlation(D_high, D_low):
if len(D_high.shape) > 1:
D_high = spatial.distance.squareform(D_high)
if len(D_low.shape) > 1:
D_low = spatial.distance.squareform(D_low)
return stats.spearmanr(D_high, D_low)[0]
def preprocess(data):
dataPandas = pd.DataFrame(data)
dataPandas.dropna()
for column in dataPandas:
if ('*' in column):
gatherLabels = dataPandas[column]
del dataPandas[column]
length = len(dataPandas.columns)
dataNP = dataPandas.to_numpy()
return dataNP, length, gatherLabels
def multi_run_wrapper(args):
embedding_array = bhtsne.run_bh_tsne(*args)
return embedding_array
def procrustesFun(projections):
similarityList = []
for proj1 in projections:
disparityList = []
for proj2 in projections:
mtx1, mtx2, disparity = procrustes(proj1, proj2)
if np.array_equal(proj1, proj2):
disparityList.append(0)
else:
disparityList.append(1/disparity)
similarityList.append(disparityList)
clusterIndex = Clustering(similarityList)
return clusterIndex
def Clustering(similarity):
similarityNP = np.array(similarity)
n_clusters = 25 # change that to send less diverse projections
kmedoids = KMedoids(n_clusters=n_clusters, random_state=0, metric='precomputed').fit(similarityNP)
global dataProc
clusterIndex = []
for c in range(n_clusters):
cluster_indices = np.argwhere(kmedoids.labels_ == c).reshape(-1,)
D_c = similarityNP[cluster_indices][:, cluster_indices]
center = np.argmin(np.sum(D_c, axis=0))
clusterIndex.append(cluster_indices[center])
return clusterIndex
location = './cachedir'
memory = Memory(location, verbose=0)
def wrapGetResults(listofParamsPlusData):
pool = Pool()
return pool.map(multi_run_wrapper, listofParamsPlusData)
wrapGetResults = memory.cache(wrapGetResults)
@app.route('/receiver', methods = ['POST'])
def calculateGrid():
data = request.get_data().decode('utf8').replace("'", '"')
data = json.loads(data)
global dataProc
dataProc, length, labels = preprocess(data)
global D_highSpace
D_highSpace = distance.squareform(distance.pdist(dataProc))
DEFAULT_NO_DIMS = 2
INITIAL_DIMENSIONS = 50
DEFAULT_PERPLEXITY = 50
DEFAULT_THETA = 0.5
EMPTY_SEED = -1
VERBOSE = True
DEFAULT_USE_PCA = False
# all other data sets
perplexity = [5,10,15,20,25,30,35,40,45,50] # 10 perplexity
# iris data set
if (labels[0] == 'Iris-setosa'):
perplexity = [5,10,15,20,25,28,32,35,40,45] # 10 perplexity
# breast cancer data set
if (labels[0] == 'Benign'):
perplexity =[30,35,40,45,50,55,60,65,70,75] # 10 perplexity
# diabetes data set
if (labels[0] == 1):
perplexity = [10,15,20,25,30,35,40,45,50,55] # 10 perplexity
learning_rate = [10,20,30,40,50,60,70,80,90,100] # 10 learning rate
n_iter = [100,150,200,250,350] # 5 iterations
global overalProjectionsNumber
overalProjectionsNumber = 0
overalProjectionsNumber = len(perplexity)*len(learning_rate)*len(n_iter)
global projectionsAll
listofParamsPlusData = []
listofParamsAll= []
for k in n_iter:
for j in learning_rate:
for i in perplexity:
listofParamsPlusData.append((dataProc,DEFAULT_NO_DIMS,length,i,j,EMPTY_SEED,VERBOSE,DEFAULT_USE_PCA,k))
listofParamsAll.append((i,j,k))
projectionsAll = wrapGetResults(listofParamsPlusData)
global SelectedListofParams
SelectedListofParams = []
global SelectedProjectionsReturn
SelectedProjectionsReturn = []
global clusterIndex
clusterIndex = procrustesFun(projectionsAll)
metricNeigh = []
metricTrust = []
metricCont = []
metricStress = []
metricShepCorr = []
metricsAverage = []
global convertLabels
convertLabels = []
for index, label in enumerate(labels):
if (label == 0):
convertLabels.append(0)
elif (label == 1):
convertLabels.append(1)
elif (label == 'Benign'):
convertLabels.append(0)
elif (label == 'Malignant'):
convertLabels.append(1)
elif (label == 'Iris-setosa'):
convertLabels.append(0)
elif (label == 'Iris-versicolor'):
convertLabels.append(1)
elif (label == 'Iris-virginica'):
convertLabels.append(2)
else:
pass
global D_lowSpaceList
D_lowSpaceList = []
global KeepKs
KeepKs = []
for index in clusterIndex:
SelectedProjectionsReturn.append(projectionsAll[index].tolist())
SelectedListofParams.append(listofParamsAll[index])
D_lowSpace = distance.squareform(distance.pdist(projectionsAll[index]))
D_lowSpaceList.append(D_lowSpace)
k = listofParamsAll[index][0] # k = perplexity
KeepKs.append(k)
resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, k)
resultTrust = trustworthiness(D_highSpace, D_lowSpace, k)
resultContinuity = continuity(D_highSpace, D_lowSpace, k)
resultStress = normalized_stress(D_highSpace, D_lowSpace)
resultShep = shepard_diagram_correlation(D_highSpace, D_lowSpace)
metricNeigh.append(resultNeigh)
metricTrust.append(resultTrust)
metricCont.append(resultContinuity)
metricStress.append(resultStress)
metricShepCorr.append(resultShep)
max_value_neigh = max(metricNeigh)
min_value_neigh = min(metricNeigh)
max_value_trust = max(metricTrust)
min_value_trust = min(metricTrust)
max_value_cont = max(metricCont)
min_value_cont = min(metricCont)
max_value_stress = max(metricStress)
min_value_stress = min(metricStress)
max_value_shep = max(metricShepCorr)
min_value_shep = min(metricShepCorr)
global metricsMatrixEntire
metricsMatrixEntire = []
for index, data in enumerate(metricTrust):
valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
valueStress = 1 - ((metricStress[index]*(-1) - max_value_stress*(-1)) / (min_value_stress*(-1) - max_value_stress*(-1))) # we need the opposite
valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)
average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
metricsAverage.append(average)
metricsMatrixEntire.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])
sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)
sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)
global metricsMatrix
metricsMatrix = []
metricsMatrix.append(sortMetricsAverage)
metricsMatrix.append(sortNeigh)
metricsMatrix.append(sortTrust)
metricsMatrix.append(sortCont)
metricsMatrix.append(sortStress)
metricsMatrix.append(sortShepCorr)
return 'OK'
@app.route('/sender')
def background_process():
global SelectedProjectionsReturn
global projectionsAll
global overalProjectionsNumber
global metricsMatrix
global metricsMatrixEntire
while (len(projectionsAll) != overalProjectionsNumber):
pass
return jsonify({ 'projections': SelectedProjectionsReturn, 'parameters': SelectedListofParams, 'metrics': metricsMatrix, 'metricsEntire': metricsMatrixEntire })
@app.route('/receiverOptimizer', methods = ['POST'])
def OptimizeSelection():
dataReceived= request.get_data().decode('utf8').replace("'", '"')
dataReceived = json.loads(dataReceived)
dataSelected = []
for data in dataReceived:
if data != None:
dataSelected.append(data)
metricNeigh = []
metricTrust = []
metricCont = []
metricStress = []
metricShepCorr = []
metricsAverage = []
for index, loop in enumerate(clusterIndex):
resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, KeepKs[index], dataSelected)
resultTrust = trustworthiness(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
resultContinuity = continuity(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
resultStress = normalized_stress(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :])
resultShep = shepard_diagram_correlation(D_highSpace[dataSelected][:, dataSelected], D_lowSpaceList[index][dataSelected][:, dataSelected])
metricNeigh.append(resultNeigh)
metricTrust.append(resultTrust)
metricCont.append(resultContinuity)
metricStress.append(resultStress)
metricShepCorr.append(resultShep)
max_value_neigh = max(metricNeigh)
min_value_neigh = min(metricNeigh)
max_value_trust = max(metricTrust)
min_value_trust = min(metricTrust)
max_value_cont = max(metricCont)
min_value_cont = min(metricCont)
max_value_stress = max(metricStress)
min_value_stress = min(metricStress)
max_value_shep = max(metricShepCorr)
min_value_shep = min(metricShepCorr)
global metricsMatrixEntireSel
metricsMatrixEntireSel = []
for index, data in enumerate(metricTrust):
valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
valueStress = 1 - ((metricStress[index]*(-1) - max_value_stress*(-1)) / (min_value_stress*(-1) - max_value_stress*(-1))) # we need the opposite
valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)
average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
metricsAverage.append(average)
metricsMatrixEntireSel.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])
sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)
sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)
global metricsMatrixSel
metricsMatrixSel = []
metricsMatrixSel.append(sortMetricsAverage)
metricsMatrixSel.append(sortNeigh)
metricsMatrixSel.append(sortTrust)
metricsMatrixSel.append(sortCont)
metricsMatrixSel.append(sortStress)
metricsMatrixSel.append(sortShepCorr)
return 'OK'
@app.route('/senderOptimizer')
def SendOptimizedProjections():
global metricsMatrixSel
global metricsMatrixEntireSel
return jsonify({'metrics': metricsMatrixSel, 'metricsEntire': metricsMatrixEntireSel })
if __name__ == '__main__':
app.run("0.0.0.0", "5000")