t-viSNE: Interactive Assessment and Interpretation of t-SNE Projections
https://doi.org/10.1109/TVCG.2020.2986996
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
467 lines
15 KiB
467 lines
15 KiB
#!flask/bin/python
|
|
|
|
import sys
|
|
import os
|
|
|
|
from flask import Flask, request, Response, jsonify
|
|
from flask_cors import CORS
|
|
from multiprocessing import Pool
|
|
from scipy.spatial import procrustes
|
|
from scipy.spatial import distance
|
|
from sklearn_extra.cluster import KMedoids
|
|
from sklearn import metrics
|
|
from sklearn.decomposition import PCA
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.model_selection import GridSearchCV, train_test_split
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
from scipy import spatial
|
|
from scipy import stats
|
|
from joblib import Memory
|
|
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import random, json
|
|
import bhtsne
|
|
|
|
app = Flask(__name__)
|
|
CORS(app)
|
|
|
|
@app.route('/resetAll', methods = ['POST'])
|
|
def Reset():
|
|
|
|
global dataProc
|
|
dataProc = []
|
|
|
|
global D_highSpace
|
|
D_highSpace = []
|
|
|
|
global overalProjectionsNumber
|
|
overalProjectionsNumber = []
|
|
|
|
global projectionsAll
|
|
projectionsAll = []
|
|
|
|
global SelectedListofParams
|
|
SelectedListofParams = []
|
|
|
|
global SelectedProjectionsReturn
|
|
SelectedProjectionsReturn = []
|
|
|
|
global clusterIndex
|
|
clusterIndex = []
|
|
|
|
global convertLabels
|
|
convertLabels = []
|
|
|
|
global D_lowSpaceList
|
|
D_lowSpaceList = []
|
|
|
|
global KeepKs
|
|
KeepKs = []
|
|
|
|
global metricsMatrixEntire
|
|
metricsMatrixEntire = []
|
|
|
|
global metricsMatrix
|
|
metricsMatrix = []
|
|
|
|
global metricsMatrixSel
|
|
metricsMatrixSel = []
|
|
|
|
global metricsMatrixEntireSel
|
|
metricsMatrixEntireSel = []
|
|
|
|
return 'Reset'
|
|
|
|
# NOTE: Only works with labeled data
|
|
def neighborhood_hit(X, y, k, selected=None):
|
|
# Add 1 to k because the nearest neighbor is always the point itself
|
|
k += 1
|
|
|
|
y = np.array(y)
|
|
|
|
knn = KNeighborsClassifier(n_neighbors=k)
|
|
knn.fit(X, y)
|
|
|
|
if selected:
|
|
X = X[selected, :]
|
|
|
|
neighbors = knn.kneighbors(X, return_distance=False)
|
|
|
|
score = np.mean((y[neighbors] == np.tile(y[selected].reshape((-1, 1)), k)).astype('uint8'))
|
|
|
|
return score
|
|
|
|
def trustworthiness(D_high, D_low, k):
|
|
n = D_high.shape[0]
|
|
|
|
nn_orig = D_high.argsort()
|
|
nn_proj = D_low.argsort()
|
|
|
|
knn_orig = nn_orig[:, :k + 1][:, 1:]
|
|
knn_proj = nn_proj[:, :k + 1][:, 1:]
|
|
|
|
sum_i = 0
|
|
|
|
for i in range(n):
|
|
U = np.setdiff1d(knn_proj[i], knn_orig[i])
|
|
|
|
sum_j = 0
|
|
for j in range(U.shape[0]):
|
|
sum_j += np.where(nn_orig[i] == U[j])[0] - k
|
|
|
|
sum_i += sum_j
|
|
|
|
return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())
|
|
|
|
def continuity(D_high, D_low, k):
|
|
n = D_high.shape[0]
|
|
|
|
nn_orig = D_high.argsort()
|
|
nn_proj = D_low.argsort()
|
|
|
|
knn_orig = nn_orig[:, :k + 1][:, 1:]
|
|
knn_proj = nn_proj[:, :k + 1][:, 1:]
|
|
|
|
sum_i = 0
|
|
|
|
for i in range(n):
|
|
V = np.setdiff1d(knn_proj[i], knn_orig[i])
|
|
|
|
sum_j = 0
|
|
for j in range(V.shape[0]):
|
|
sum_j += np.where(nn_proj[i] == V[j])[0] - k
|
|
|
|
sum_i += sum_j
|
|
|
|
return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())
|
|
|
|
def normalized_stress(D_high, D_low):
|
|
return (-1) * (np.sum((D_high - D_low)**2) / np.sum(D_high**2) / 100)
|
|
|
|
def shepard_diagram_correlation(D_high, D_low):
|
|
if len(D_high.shape) > 1:
|
|
D_high = spatial.distance.squareform(D_high)
|
|
if len(D_low.shape) > 1:
|
|
D_low = spatial.distance.squareform(D_low)
|
|
|
|
return stats.spearmanr(D_high, D_low)[0]
|
|
|
|
def preprocess(data):
|
|
dataPandas = pd.DataFrame(data)
|
|
dataPandas.dropna()
|
|
for column in dataPandas:
|
|
if ('*' in column):
|
|
gatherLabels = dataPandas[column]
|
|
del dataPandas[column]
|
|
length = len(dataPandas.columns)
|
|
dataNP = dataPandas.to_numpy()
|
|
return dataNP, length, gatherLabels
|
|
|
|
def multi_run_wrapper(args):
|
|
embedding_array = bhtsne.run_bh_tsne(*args)
|
|
return embedding_array
|
|
|
|
def procrustesFun(projections):
|
|
similarityList = []
|
|
for proj1 in projections:
|
|
disparityList = []
|
|
for proj2 in projections:
|
|
mtx1, mtx2, disparity = procrustes(proj1, proj2)
|
|
if np.array_equal(proj1, proj2):
|
|
disparityList.append(0)
|
|
else:
|
|
disparityList.append(1/disparity)
|
|
similarityList.append(disparityList)
|
|
clusterIndex = Clustering(similarityList)
|
|
|
|
return clusterIndex
|
|
|
|
def Clustering(similarity):
|
|
similarityNP = np.array(similarity)
|
|
n_clusters = 25 # change that to send less diverse projections
|
|
kmedoids = KMedoids(n_clusters=n_clusters, random_state=0, metric='precomputed').fit(similarityNP)
|
|
global dataProc
|
|
clusterIndex = []
|
|
for c in range(n_clusters):
|
|
cluster_indices = np.argwhere(kmedoids.labels_ == c).reshape(-1,)
|
|
D_c = similarityNP[cluster_indices][:, cluster_indices]
|
|
center = np.argmin(np.sum(D_c, axis=0))
|
|
clusterIndex.append(cluster_indices[center])
|
|
|
|
return clusterIndex
|
|
|
|
location = './cachedir'
|
|
memory = Memory(location, verbose=0)
|
|
|
|
def wrapGetResults(listofParamsPlusData):
|
|
pool = Pool()
|
|
|
|
return pool.map(multi_run_wrapper, listofParamsPlusData)
|
|
|
|
wrapGetResults = memory.cache(wrapGetResults)
|
|
|
|
@app.route('/receiver', methods = ['POST'])
|
|
def calculateGrid():
|
|
data = request.get_data().decode('utf8').replace("'", '"')
|
|
data = json.loads(data)
|
|
global dataProc
|
|
dataProc, length, labels = preprocess(data)
|
|
|
|
global D_highSpace
|
|
D_highSpace = distance.squareform(distance.pdist(dataProc))
|
|
|
|
DEFAULT_NO_DIMS = 2
|
|
INITIAL_DIMENSIONS = 50
|
|
DEFAULT_PERPLEXITY = 50
|
|
DEFAULT_THETA = 0.5
|
|
EMPTY_SEED = -1
|
|
VERBOSE = True
|
|
DEFAULT_USE_PCA = False
|
|
|
|
# all other data sets
|
|
perplexity = [5,10,15,20,25,30,35,40,45,50] # 10 perplexity
|
|
|
|
# iris data set
|
|
if (labels[0] == 'Iris-setosa'):
|
|
perplexity = [5,10,15,20,25,28,32,35,40,45] # 10 perplexity
|
|
|
|
# breast cancer data set
|
|
if (labels[0] == 'Benign'):
|
|
perplexity =[30,35,40,45,50,55,60,65,70,75] # 10 perplexity
|
|
|
|
# diabetes data set
|
|
if (labels[0] == 1):
|
|
perplexity = [10,15,20,25,30,35,40,45,50,55] # 10 perplexity
|
|
|
|
learning_rate = [10,20,30,40,50,60,70,80,90,100] # 10 learning rate
|
|
n_iter = [100,150,200,250,350] # 5 iterations
|
|
|
|
global overalProjectionsNumber
|
|
overalProjectionsNumber = 0
|
|
overalProjectionsNumber = len(perplexity)*len(learning_rate)*len(n_iter)
|
|
|
|
global projectionsAll
|
|
|
|
listofParamsPlusData = []
|
|
listofParamsAll= []
|
|
for k in n_iter:
|
|
for j in learning_rate:
|
|
for i in perplexity:
|
|
listofParamsPlusData.append((dataProc,DEFAULT_NO_DIMS,length,i,j,EMPTY_SEED,VERBOSE,DEFAULT_USE_PCA,k))
|
|
listofParamsAll.append((i,j,k))
|
|
|
|
projectionsAll = wrapGetResults(listofParamsPlusData)
|
|
|
|
global SelectedListofParams
|
|
SelectedListofParams = []
|
|
global SelectedProjectionsReturn
|
|
SelectedProjectionsReturn = []
|
|
|
|
global clusterIndex
|
|
clusterIndex = procrustesFun(projectionsAll)
|
|
|
|
metricNeigh = []
|
|
metricTrust = []
|
|
metricCont = []
|
|
metricStress = []
|
|
metricShepCorr = []
|
|
metricsAverage = []
|
|
|
|
global convertLabels
|
|
convertLabels = []
|
|
for index, label in enumerate(labels):
|
|
if (label == 0):
|
|
convertLabels.append(0)
|
|
elif (label == 1):
|
|
convertLabels.append(1)
|
|
elif (label == 'Benign'):
|
|
convertLabels.append(0)
|
|
elif (label == 'Malignant'):
|
|
convertLabels.append(1)
|
|
elif (label == 'Iris-setosa'):
|
|
convertLabels.append(0)
|
|
elif (label == 'Iris-versicolor'):
|
|
convertLabels.append(1)
|
|
elif (label == 'Iris-virginica'):
|
|
convertLabels.append(2)
|
|
else:
|
|
pass
|
|
|
|
global D_lowSpaceList
|
|
D_lowSpaceList = []
|
|
|
|
global KeepKs
|
|
KeepKs = []
|
|
|
|
for index in clusterIndex:
|
|
SelectedProjectionsReturn.append(projectionsAll[index].tolist())
|
|
SelectedListofParams.append(listofParamsAll[index])
|
|
|
|
D_lowSpace = distance.squareform(distance.pdist(projectionsAll[index]))
|
|
D_lowSpaceList.append(D_lowSpace)
|
|
|
|
k = listofParamsAll[index][0] # k = perplexity
|
|
KeepKs.append(k)
|
|
|
|
resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, k)
|
|
resultTrust = trustworthiness(D_highSpace, D_lowSpace, k)
|
|
resultContinuity = continuity(D_highSpace, D_lowSpace, k)
|
|
resultStress = normalized_stress(D_highSpace, D_lowSpace)
|
|
resultShep = shepard_diagram_correlation(D_highSpace, D_lowSpace)
|
|
|
|
metricNeigh.append(resultNeigh)
|
|
metricTrust.append(resultTrust)
|
|
metricCont.append(resultContinuity)
|
|
metricStress.append(resultStress)
|
|
metricShepCorr.append(resultShep)
|
|
|
|
max_value_neigh = max(metricNeigh)
|
|
min_value_neigh = min(metricNeigh)
|
|
|
|
max_value_trust = max(metricTrust)
|
|
min_value_trust = min(metricTrust)
|
|
|
|
max_value_cont = max(metricCont)
|
|
min_value_cont = min(metricCont)
|
|
|
|
max_value_stress = max(metricStress)
|
|
min_value_stress = min(metricStress)
|
|
|
|
max_value_shep = max(metricShepCorr)
|
|
min_value_shep = min(metricShepCorr)
|
|
|
|
global metricsMatrixEntire
|
|
metricsMatrixEntire = []
|
|
|
|
for index, data in enumerate(metricTrust):
|
|
valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
|
|
valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
|
|
valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
|
|
valueStress = 1 - ((metricStress[index]*(-1) - max_value_stress*(-1)) / (min_value_stress*(-1) - max_value_stress*(-1))) # we need the opposite
|
|
valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)
|
|
average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
|
|
|
|
metricsAverage.append(average)
|
|
metricsMatrixEntire.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])
|
|
|
|
sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)
|
|
sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
|
|
sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
|
|
sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
|
|
sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
|
|
sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)
|
|
|
|
global metricsMatrix
|
|
metricsMatrix = []
|
|
|
|
metricsMatrix.append(sortMetricsAverage)
|
|
metricsMatrix.append(sortNeigh)
|
|
metricsMatrix.append(sortTrust)
|
|
metricsMatrix.append(sortCont)
|
|
metricsMatrix.append(sortStress)
|
|
metricsMatrix.append(sortShepCorr)
|
|
|
|
return 'OK'
|
|
|
|
@app.route('/sender')
|
|
def background_process():
|
|
global SelectedProjectionsReturn
|
|
global projectionsAll
|
|
global overalProjectionsNumber
|
|
global metricsMatrix
|
|
global metricsMatrixEntire
|
|
|
|
while (len(projectionsAll) != overalProjectionsNumber):
|
|
pass
|
|
return jsonify({ 'projections': SelectedProjectionsReturn, 'parameters': SelectedListofParams, 'metrics': metricsMatrix, 'metricsEntire': metricsMatrixEntire })
|
|
|
|
@app.route('/receiverOptimizer', methods = ['POST'])
|
|
def OptimizeSelection():
|
|
dataReceived= request.get_data().decode('utf8').replace("'", '"')
|
|
dataReceived = json.loads(dataReceived)
|
|
dataSelected = []
|
|
for data in dataReceived:
|
|
if data != None:
|
|
dataSelected.append(data)
|
|
|
|
metricNeigh = []
|
|
metricTrust = []
|
|
metricCont = []
|
|
metricStress = []
|
|
metricShepCorr = []
|
|
metricsAverage = []
|
|
|
|
for index, loop in enumerate(clusterIndex):
|
|
resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, KeepKs[index], dataSelected)
|
|
resultTrust = trustworthiness(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
|
|
resultContinuity = continuity(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
|
|
resultStress = normalized_stress(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :])
|
|
resultShep = shepard_diagram_correlation(D_highSpace[dataSelected][:, dataSelected], D_lowSpaceList[index][dataSelected][:, dataSelected])
|
|
|
|
|
|
metricNeigh.append(resultNeigh)
|
|
metricTrust.append(resultTrust)
|
|
metricCont.append(resultContinuity)
|
|
metricStress.append(resultStress)
|
|
metricShepCorr.append(resultShep)
|
|
|
|
max_value_neigh = max(metricNeigh)
|
|
min_value_neigh = min(metricNeigh)
|
|
|
|
max_value_trust = max(metricTrust)
|
|
min_value_trust = min(metricTrust)
|
|
|
|
max_value_cont = max(metricCont)
|
|
min_value_cont = min(metricCont)
|
|
|
|
max_value_stress = max(metricStress)
|
|
min_value_stress = min(metricStress)
|
|
|
|
max_value_shep = max(metricShepCorr)
|
|
min_value_shep = min(metricShepCorr)
|
|
|
|
global metricsMatrixEntireSel
|
|
metricsMatrixEntireSel = []
|
|
|
|
for index, data in enumerate(metricTrust):
|
|
valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
|
|
valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
|
|
valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
|
|
valueStress = 1 - ((metricStress[index]*(-1) - max_value_stress*(-1)) / (min_value_stress*(-1) - max_value_stress*(-1))) # we need the opposite
|
|
valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)
|
|
average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
|
|
|
|
metricsAverage.append(average)
|
|
metricsMatrixEntireSel.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])
|
|
|
|
sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)
|
|
sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
|
|
sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
|
|
sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
|
|
sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
|
|
sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)
|
|
|
|
global metricsMatrixSel
|
|
metricsMatrixSel = []
|
|
|
|
metricsMatrixSel.append(sortMetricsAverage)
|
|
metricsMatrixSel.append(sortNeigh)
|
|
metricsMatrixSel.append(sortTrust)
|
|
metricsMatrixSel.append(sortCont)
|
|
metricsMatrixSel.append(sortStress)
|
|
metricsMatrixSel.append(sortShepCorr)
|
|
|
|
return 'OK'
|
|
|
|
@app.route('/senderOptimizer')
|
|
def SendOptimizedProjections():
|
|
global metricsMatrixSel
|
|
global metricsMatrixEntireSel
|
|
|
|
return jsonify({'metrics': metricsMatrixSel, 'metricsEntire': metricsMatrixEntireSel })
|
|
|
|
if __name__ == '__main__':
|
|
app.run("0.0.0.0", "5000")
|
|
|
|
|