t-viSNE: Interactive Assessment and Interpretation of t-SNE Projections https://doi.org/10.1109/TVCG.2020.2986996
t-viSNE/tsneGrid.py

#!flask/bin/python
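"""Flask back-end of the t-viSNE grid-search view.

It exposes five endpoints: /resetAll clears the module-level state, /receiver
accepts a labeled data set and computes a grid of Barnes-Hut t-SNE embeddings,
/sender returns the 25 representative projections together with their
parameters and quality-metric rankings, and /receiverOptimizer plus
/senderOptimizer re-evaluate the metrics for a user-selected subset of points.
"""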
import sys
import os
from flask import Flask, request, Response, jsonify
from flask_cors import CORS
from multiprocessing import Pool
from scipy.spatial import procrustes
from scipy.spatial import distance
from sklearn_extra.cluster import KMedoids
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy import spatial
from scipy import stats
from joblib import Memory
import numpy as np
import pandas as pd
import random, json
import bhtsne
app = Flask(__name__)
CORS(app)
@app.route('/resetAll', methods=['POST'])
def Reset():
    # Clear all module-level state so a new data set can be processed from scratch.
    global dataProc
    dataProc = []
    global D_highSpace
    D_highSpace = []
    global overalProjectionsNumber
    overalProjectionsNumber = []
    global projectionsAll
    projectionsAll = []
    global SelectedListofParams
    SelectedListofParams = []
    global SelectedProjectionsReturn
    SelectedProjectionsReturn = []
    global clusterIndex
    clusterIndex = []
    global convertLabels
    convertLabels = []
    global D_lowSpaceList
    D_lowSpaceList = []
    global KeepKs
    KeepKs = []
    global metricsMatrixEntire
    metricsMatrixEntire = []
    global metricsMatrix
    metricsMatrix = []
    global metricsMatrixSel
    metricsMatrixSel = []
    global metricsMatrixEntireSel
    metricsMatrixEntireSel = []
    return 'Reset'
# NOTE: Only works with labeled data
def neighborhood_hit(X, y, k, selected=None):
    # Fraction of each point's k nearest neighbors in the given embedding that
    # share its class label, optionally restricted to a selected subset of points.
    # Add 1 to k because the nearest neighbor is always the point itself
    k += 1
    y = np.array(y)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)

    if selected:
        X = X[selected, :]
        query_labels = y[selected]
    else:
        query_labels = y
    neighbors = knn.kneighbors(X, return_distance=False)
    score = np.mean((y[neighbors] == np.tile(query_labels.reshape((-1, 1)), k)).astype('uint8'))
    return score
def trustworthiness(D_high, D_low, k):
    # Penalizes points that enter a k-neighborhood in the projection without
    # being neighbors in the high-dimensional space.
    n = D_high.shape[0]

    nn_orig = D_high.argsort()
    nn_proj = D_low.argsort()

    knn_orig = nn_orig[:, :k + 1][:, 1:]
    knn_proj = nn_proj[:, :k + 1][:, 1:]

    sum_i = 0
    for i in range(n):
        U = np.setdiff1d(knn_proj[i], knn_orig[i])
        sum_j = 0
        for j in range(U.shape[0]):
            sum_j += np.where(nn_orig[i] == U[j])[0] - k
        sum_i += sum_j

    return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())


def continuity(D_high, D_low, k):
    # Penalizes points that are k-neighbors in the high-dimensional space but
    # drop out of the k-neighborhood in the projection.
    n = D_high.shape[0]

    nn_orig = D_high.argsort()
    nn_proj = D_low.argsort()

    knn_orig = nn_orig[:, :k + 1][:, 1:]
    knn_proj = nn_proj[:, :k + 1][:, 1:]

    sum_i = 0
    for i in range(n):
        # Original-space neighbors that are missing from the projected neighborhood.
        V = np.setdiff1d(knn_orig[i], knn_proj[i])
        sum_j = 0
        for j in range(V.shape[0]):
            sum_j += np.where(nn_proj[i] == V[j])[0] - k
        sum_i += sum_j

    return float((1 - (2 / (n * k * (2 * n - 3 * k - 1)) * sum_i)).squeeze())
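# For reference, both functions follow the usual rank-based definitions
# (Venna & Kaski):
#   trustworthiness(k) = 1 - 2/(n*k*(2n - 3k - 1)) * sum_i sum_{j in U_k(i)} (rank_high(i, j) - k)
#   continuity(k)      = 1 - 2/(n*k*(2n - 3k - 1)) * sum_i sum_{j in V_k(i)} (rank_low(i, j) - k)
# where U_k(i) are points in i's projected k-neighborhood but not its original one,
# and V_k(i) are points in i's original k-neighborhood but not its projected one.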
def normalized_stress(D_high, D_low):
    # Negated (and scaled) normalized stress; the sign is compensated for again
    # when the metric is min-max normalized in calculateGrid.
    return (-1) * (np.sum((D_high - D_low)**2) / np.sum(D_high**2) / 100)


def shepard_diagram_correlation(D_high, D_low):
    # Spearman rank correlation between the pairwise distances in the original
    # space and in the projection (the basis of a Shepard diagram).
    if len(D_high.shape) > 1:
        D_high = spatial.distance.squareform(D_high)
    if len(D_low.shape) > 1:
        D_low = spatial.distance.squareform(D_low)

    return stats.spearmanr(D_high, D_low)[0]
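# Rough sanity-check sketch for the metric functions above (not part of the
# service; the array shapes and k value are arbitrary illustration choices):
#
#   X_high = np.random.rand(100, 10)
#   X_low = np.random.rand(100, 2)
#   labels = np.random.randint(0, 3, size=100)
#   D_hi = distance.squareform(distance.pdist(X_high))
#   D_lo = distance.squareform(distance.pdist(X_low))
#   print(neighborhood_hit(X_low, labels, 7),
#         trustworthiness(D_hi, D_lo, 7),
#         continuity(D_hi, D_lo, 7),
#         normalized_stress(D_hi, D_lo),
#         shepard_diagram_correlation(D_hi, D_lo))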
def preprocess(data):
    # Turn the posted JSON records into a DataFrame, drop rows with missing
    # values, and split off the class column (its name is marked with a '*').
    dataPandas = pd.DataFrame(data)
    dataPandas = dataPandas.dropna().reset_index(drop=True)
    gatherLabels = None
    for column in dataPandas:
        if ('*' in column):
            gatherLabels = dataPandas[column]
            del dataPandas[column]
    length = len(dataPandas.columns)
    dataNP = dataPandas.to_numpy()
    return dataNP, length, gatherLabels


def multi_run_wrapper(args):
    # Unpack one parameter tuple and run a single Barnes-Hut t-SNE embedding.
    embedding_array = bhtsne.run_bh_tsne(*args)
    return embedding_array


def procrustesFun(projections):
    # Compare every pair of projections with Procrustes analysis: identical
    # pairs get 0, all other pairs get the inverse disparity. The resulting
    # matrix is then clustered to pick representative projections.
    similarityList = []
    for proj1 in projections:
        disparityList = []
        for proj2 in projections:
            mtx1, mtx2, disparity = procrustes(proj1, proj2)
            if np.array_equal(proj1, proj2):
                disparityList.append(0)
            else:
                disparityList.append(1/disparity)
        similarityList.append(disparityList)
    clusterIndex = Clustering(similarityList)
    return clusterIndex
def Clustering(similarity):
    similarityNP = np.array(similarity)

    n_clusters = 25 # change that to send less diverse projections

    # k-medoids over the precomputed pairwise Procrustes matrix.
    kmedoids = KMedoids(n_clusters=n_clusters, random_state=0, metric='precomputed').fit(similarityNP)
    global dataProc
    clusterIndex = []
    for c in range(n_clusters):
        # Re-derive each cluster's medoid: the member with the smallest total
        # distance to the rest of its cluster.
        cluster_indices = np.argwhere(kmedoids.labels_ == c).reshape(-1,)
        D_c = similarityNP[cluster_indices][:, cluster_indices]
        center = np.argmin(np.sum(D_c, axis=0))
        clusterIndex.append(cluster_indices[center])
    return clusterIndex
location = './cachedir'
memory = Memory(location, verbose=0)


def wrapGetResults(listofParamsPlusData):
    # Run every t-SNE configuration in a separate worker process.
    pool = Pool()
    return pool.map(multi_run_wrapper, listofParamsPlusData)


# Cache the full grid of embeddings on disk, so recomputation is avoided when
# the same data set and parameter grid are requested again.
wrapGetResults = memory.cache(wrapGetResults)
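# /receiver drives the whole pipeline: it parses the posted data set, runs a
# grid of Barnes-Hut t-SNE configurations in parallel, picks 25 representative
# projections via pairwise Procrustes comparison and k-medoids clustering, and
# scores each representative with five quality metrics; /sender then returns
# the selected projections, their parameters, and the metric rankings.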
@app.route('/receiver', methods=['POST'])
def calculateGrid():
    data = request.get_data().decode('utf8').replace("'", '"')
    data = json.loads(data)
    global dataProc
    dataProc, length, labels = preprocess(data)
    global D_highSpace

    # Pairwise distance matrix in the original, high-dimensional space.
    D_highSpace = distance.squareform(distance.pdist(dataProc))

    DEFAULT_NO_DIMS = 2
    INITIAL_DIMENSIONS = 50
    DEFAULT_PERPLEXITY = 50
    DEFAULT_THETA = 0.5
    EMPTY_SEED = -1
    VERBOSE = True
    DEFAULT_USE_PCA = False

    # Parameter grid: the perplexity range is adapted to the data set at hand,
    # recognized here by its first class label.
    # all other data sets
    perplexity = [5,10,15,20,25,30,35,40,45,50] # 10 perplexity
    # iris data set
    if (labels[0] == 'Iris-setosa'):
        perplexity = [5,10,15,20,25,28,32,35,40,45] # 10 perplexity
    # breast cancer data set
    if (labels[0] == 'Benign'):
        perplexity = [30,35,40,45,50,55,60,65,70,75] # 10 perplexity
    # diabetes data set
    if (labels[0] == 1):
        perplexity = [10,15,20,25,30,35,40,45,50,55] # 10 perplexity
    learning_rate = [10,20,30,40,50,60,70,80,90,100] # 10 learning rate
    n_iter = [100,150,200,250,350] # 5 iterations
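    # 10 perplexities x 10 learning rates x 5 iteration counts = 500 t-SNE runs in total.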
    global overalProjectionsNumber
    overalProjectionsNumber = 0
    overalProjectionsNumber = len(perplexity)*len(learning_rate)*len(n_iter)
    global projectionsAll
    listofParamsPlusData = []
    listofParamsAll = []
    for k in n_iter:
        for j in learning_rate:
            for i in perplexity:
                listofParamsPlusData.append((dataProc,DEFAULT_NO_DIMS,length,i,j,EMPTY_SEED,VERBOSE,DEFAULT_USE_PCA,k))
                listofParamsAll.append((i,j,k))

    # Compute (or load from the joblib cache) all embeddings in parallel.
    projectionsAll = wrapGetResults(listofParamsPlusData)

    global SelectedListofParams
    SelectedListofParams = []
    global SelectedProjectionsReturn
    SelectedProjectionsReturn = []
    global clusterIndex

    # Pick the 25 representative projections via Procrustes + k-medoids.
    clusterIndex = procrustesFun(projectionsAll)

    metricNeigh = []
    metricTrust = []
    metricCont = []
    metricStress = []
    metricShepCorr = []
    metricsAverage = []
    global convertLabels
    convertLabels = []
    # Map the known class labels of the three demo data sets (diabetes, breast
    # cancer, iris) to integer codes for the neighborhood-hit metric.
    for index, label in enumerate(labels):
        if (label == 0):
            convertLabels.append(0)
        elif (label == 1):
            convertLabels.append(1)
        elif (label == 'Benign'):
            convertLabels.append(0)
        elif (label == 'Malignant'):
            convertLabels.append(1)
        elif (label == 'Iris-setosa'):
            convertLabels.append(0)
        elif (label == 'Iris-versicolor'):
            convertLabels.append(1)
        elif (label == 'Iris-virginica'):
            convertLabels.append(2)
        else:
            pass
    global D_lowSpaceList
    D_lowSpaceList = []
    global KeepKs
    KeepKs = []
    # For every representative projection: store it, compute its low-dimensional
    # distance matrix, and evaluate the five quality metrics (k = perplexity).
    for index in clusterIndex:
        SelectedProjectionsReturn.append(projectionsAll[index].tolist())
        SelectedListofParams.append(listofParamsAll[index])
        D_lowSpace = distance.squareform(distance.pdist(projectionsAll[index]))
        D_lowSpaceList.append(D_lowSpace)

        k = listofParamsAll[index][0] # k = perplexity
        KeepKs.append(k)

        resultNeigh = neighborhood_hit(np.array(projectionsAll[index]), convertLabels, k)
        resultTrust = trustworthiness(D_highSpace, D_lowSpace, k)
        resultContinuity = continuity(D_highSpace, D_lowSpace, k)
        resultStress = normalized_stress(D_highSpace, D_lowSpace)
        resultShep = shepard_diagram_correlation(D_highSpace, D_lowSpace)

        metricNeigh.append(resultNeigh)
        metricTrust.append(resultTrust)
        metricCont.append(resultContinuity)
        metricStress.append(resultStress)
        metricShepCorr.append(resultShep)

    max_value_neigh = max(metricNeigh)
    min_value_neigh = min(metricNeigh)
    max_value_trust = max(metricTrust)
    min_value_trust = min(metricTrust)
    max_value_cont = max(metricCont)
    min_value_cont = min(metricCont)
    max_value_stress = max(metricStress)
    min_value_stress = min(metricStress)
    max_value_shep = max(metricShepCorr)
    min_value_shep = min(metricShepCorr)

    # Min-max normalize each metric across the selected projections and compute
    # their unweighted average per projection.
    global metricsMatrixEntire
    metricsMatrixEntire = []
    for index, data in enumerate(metricTrust):
        valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
        valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
        valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
        valueStress = 1 - ((metricStress[index]*(-1) - max_value_stress*(-1)) / (min_value_stress*(-1) - max_value_stress*(-1))) # we need the opposite
        valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)

        average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
        metricsAverage.append(average)
        metricsMatrixEntire.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])

    # Rank the projections (indices sorted best-first) by the averaged score and
    # by each individual metric.
    sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)

    sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
    sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
    sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
    sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=True)
    sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)

    global metricsMatrix
    metricsMatrix = []
    metricsMatrix.append(sortMetricsAverage)
    metricsMatrix.append(sortNeigh)
    metricsMatrix.append(sortTrust)
    metricsMatrix.append(sortCont)
    metricsMatrix.append(sortStress)
    metricsMatrix.append(sortShepCorr)
    return 'OK'
@app.route('/sender')
def background_process():
    global SelectedProjectionsReturn
    global projectionsAll
    global overalProjectionsNumber
    global metricsMatrix
    global metricsMatrixEntire
    # Busy-wait until the whole t-SNE grid has been computed before answering.
    while (len(projectionsAll) != overalProjectionsNumber):
        pass
    return jsonify({ 'projections': SelectedProjectionsReturn, 'parameters': SelectedListofParams, 'metrics': metricsMatrix, 'metricsEntire': metricsMatrixEntire })
@app.route('/receiverOptimizer', methods=['POST'])
def OptimizeSelection():
    # Recompute the quality metrics for a user-selected subset of points only.
    dataReceived = request.get_data().decode('utf8').replace("'", '"')
    dataReceived = json.loads(dataReceived)
    dataSelected = []
    for data in dataReceived:
        if data != None:
            dataSelected.append(data)

    metricNeigh = []
    metricTrust = []
    metricCont = []
    metricStress = []
    metricShepCorr = []
    metricsAverage = []
    # index = position among the 25 representatives, loop = index into the full grid.
    for index, loop in enumerate(clusterIndex):
        # Restrict the distance matrices to the selected rows (and, for the
        # Shepard correlation, the selected columns as well).
        resultNeigh = neighborhood_hit(np.array(projectionsAll[loop]), convertLabels, KeepKs[index], dataSelected)
        resultTrust = trustworthiness(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
        resultContinuity = continuity(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :], KeepKs[index])
        resultStress = normalized_stress(D_highSpace[dataSelected, :], D_lowSpaceList[index][dataSelected, :])
        resultShep = shepard_diagram_correlation(D_highSpace[dataSelected][:, dataSelected], D_lowSpaceList[index][dataSelected][:, dataSelected])

        resultAverage = (resultNeigh + resultTrust + resultContinuity + resultStress + resultShep) / 5
        print(resultAverage)
        metricsAverage.append(resultAverage)

        metricNeigh.append(resultNeigh)
        metricTrust.append(resultTrust)
        metricCont.append(resultContinuity)
        metricStress.append(resultStress)
        metricShepCorr.append(resultShep)

    max_value_neigh = max(metricNeigh)
    min_value_neigh = min(metricNeigh)
    max_value_trust = max(metricTrust)
    min_value_trust = min(metricTrust)
    max_value_cont = max(metricCont)
    min_value_cont = min(metricCont)
    max_value_stress = max(metricStress)
    min_value_stress = min(metricStress)
    max_value_shep = max(metricShepCorr)
    min_value_shep = min(metricShepCorr)
    # Min-max normalize the metrics over the selected subset and rank them, as
    # done for the full data set in calculateGrid.
    global metricsMatrixEntireSel
    metricsMatrixEntireSel = []
    for index, data in enumerate(metricTrust):
        valueNeigh = (metricNeigh[index] - min_value_neigh) / (max_value_neigh - min_value_neigh)
        valueTrust = (metricTrust[index] - min_value_trust) / (max_value_trust - min_value_trust)
        valueCont = (metricCont[index] - min_value_cont) / (max_value_cont - min_value_cont)
        valueStress = (metricStress[index] - min_value_stress) / (max_value_stress - min_value_stress)
        valueShep = (metricShepCorr[index] - min_value_shep) / (max_value_shep - min_value_shep)

        average = (valueNeigh + valueTrust + valueCont + valueStress + valueShep) / 5
        metricsMatrixEntireSel.append([average,valueNeigh,valueTrust,valueCont,valueStress,valueShep])

    sortMetricsAverage = sorted(range(len(metricsAverage)), key=lambda k: metricsAverage[k], reverse=True)
    print(sortMetricsAverage)
    print(metricsAverage)
    sortNeigh = sorted(range(len(metricNeigh)), key=lambda k: metricNeigh[k], reverse=True)
    sortTrust = sorted(range(len(metricTrust)), key=lambda k: metricTrust[k], reverse=True)
    sortCont = sorted(range(len(metricCont)), key=lambda k: metricCont[k], reverse=True)
    # Note: stress is sorted ascending here, unlike in calculateGrid.
    sortStress = sorted(range(len(metricStress)), key=lambda k: metricStress[k], reverse=False)
    sortShepCorr = sorted(range(len(metricShepCorr)), key=lambda k: metricShepCorr[k], reverse=True)

    global metricsMatrixSel
    metricsMatrixSel = []
    metricsMatrixSel.append(sortMetricsAverage)
    metricsMatrixSel.append(sortNeigh)
    metricsMatrixSel.append(sortTrust)
    metricsMatrixSel.append(sortCont)
    metricsMatrixSel.append(sortStress)
    metricsMatrixSel.append(sortShepCorr)
    return 'OK'
@app.route('/senderOptimizer')
def SendOptimizedProjections():
    global metricsMatrixSel
    global metricsMatrixEntireSel
    return jsonify({'metrics': metricsMatrixSel, 'metricsEntire': metricsMatrixEntireSel })
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5000)
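# Minimal client sketch (not part of the service). It assumes the data set is
# posted as a JSON list of records whose class column name contains a '*', and
# that the labels match one of the data sets handled in calculateGrid (e.g. Iris);
# the record fields below are illustrative only.
#
#   import requests
#   records = [{'sepal length': 5.1, 'sepal width': 3.5,
#               'petal length': 1.4, 'petal width': 0.2,
#               'class*': 'Iris-setosa'}, ...]
#   requests.post('http://localhost:5000/receiver', data=json.dumps(records))
#   result = requests.get('http://localhost:5000/sender').json()
#   # result['projections'], result['parameters'], result['metrics'], result['metricsEntire']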