Python实现TSNE

来源：互联网发布：网络推广培训学校编辑：程序博客网时间：2024/06/06 20:57

TSNE的实现总体上并不复杂，麻烦的是其超高的浮点运算和大型矩阵的操控，在上一篇Largevis的算法中，TangJian大神很明显用的是MATLAB，我这里贴出Python版本的代码，和大家一起学习。

代码分为几个模块

1、计算高维空间分布P

2、计算低维空间分布Q

3、计算梯度

4、计算损失函数KL散度

5、主函数，进行迭代

1、计算高维空间分布P

def cal_matrix_P(X, neighbors):
    """Build the symmetric high-dimensional affinity matrix P.

    For each sample, the squared distances to every other sample are turned
    into a conditional distribution by ``cal_p``, whose target entropy is
    ``log(neighbors)``.  The row distributions are then symmetrised as
    ``(P + P.T) / (2 * n)`` and floored at 1e-100 so no entry is exactly zero.

    Args:
        X: 2-D array with one sample per row.
        neighbors: target perplexity (effective neighbour count); only its
            logarithm is used.

    Returns:
        (n, n) array of joint affinities, where n is the number of rows of X.
    """
    entropy = numpy.log(neighbors)
    n1, _ = X.shape
    D = numpy.square(metrics.pairwise_distances(X))
    D_sort = numpy.argsort(D, axis=1)
    P = numpy.zeros((n1, n1))
    for i in range(n1):
        order = D_sort[i]
        # Drop the sample itself by index rather than assuming it sorts
        # first: with duplicate rows another zero-distance point can land in
        # position 0, which would give P[i, i] mass and lose a real neighbour.
        others = order[order != i]
        P[i, others] = cal_p(D[i, others], entropy=entropy)
    P = (P + numpy.transpose(P)) / (2 * n1)
    return numpy.maximum(P, 1e-100)

neighbors为邻域点个数（即困惑度），P是逐行计算的，最后再与其转置取平均，使其成为对称矩阵。每一行的计算都需要用二分法找到一个合适的beta，使得这一行分布的熵尽量接近log(neighbors)（误差不超过1e-4）。我这里偷了个懒，没有真正去找邻域点，而是对距离进行了排序；原本的想法是选取排序靠前的k个作为邻域点，但上面的代码排序后只排除了样本自身，其余所有点都参与了计算。在解决大规模问题时，有兴趣的可以自行改进。

def cal_p(D, entropy, K=50):
    """Return the conditional distribution for one row of squared distances.

    Searches for the precision ``beta`` at which the distribution
    proportional to ``exp(-D * beta)`` has entropy ``entropy`` (as measured
    by ``cal_entropy``).  ``beta`` is doubled or halved until the target is
    bracketed and bisected afterwards.  The search stops once the entropy is
    within 1e-4 of the target or after at most ``K + 1`` refinement steps.

    Args:
        D: 1-D array of squared distances from one sample to the others.
        entropy: target entropy in nats.
        K: refinement budget; the loop runs while ``k <= K``.

    Returns:
        Array with the same shape as ``D`` holding the normalised weights.
    """
    beta = 1.0
    betamin = -numpy.inf
    betamax = numpy.inf
    error = cal_entropy(D, beta) - entropy
    k = 0
    while numpy.abs(error) > 1e-4 and k <= K:
        if error > 0:
            # Entropy too high: the distribution is too flat, so sharpen it.
            betamin = beta
            beta = beta * 2 if betamax == numpy.inf else (beta + betamax) / 2
        else:
            # Entropy too low: the distribution is too peaked, so flatten it.
            betamax = beta
            beta = beta / 2 if betamin == -numpy.inf else (beta + betamin) / 2
        error = cal_entropy(D, beta) - entropy
        k += 1
    P = numpy.exp(-D * beta)
    # Same floor as cal_entropy: if every weight underflowed to zero, return
    # zeros instead of dividing by zero and propagating NaN.
    return P / numpy.maximum(numpy.sum(P), 1e-200)

def cal_entropy(D, beta):
    """Return the entropy (in nats) of the distribution p ∝ exp(-D * beta).

    With ``w = exp(-D * beta)`` and ``p = w / sum(w)``, the entropy
    ``-sum(p * log(p))`` simplifies to
    ``log(sum(w)) + beta * sum(D * w) / sum(w)``, which is what is evaluated.

    Args:
        D: array of squared distances.
        beta: precision of the kernel.

    Returns:
        Scalar entropy.  ``sum(w)`` is floored at 1e-200, so the result stays
        finite even when every weight underflows to zero.
    """
    # NOTE: the author left a disabled alternative kernel on the unsquared
    # distance, exp(-sqrt(D) * beta); it is not used.
    P = numpy.exp(-D * beta)
    # numpy.sum reduces over every element in compiled code; the builtin sum
    # iterates in Python and would reduce only the first axis.
    sumP = numpy.maximum(numpy.sum(P), 1e-200)
    return numpy.log(sumP) + beta * numpy.sum(D * P) / sumP

2、计算低维空间分布Q

这里修改掉几个注释就能在TSNE和Largevis之间转换。

def cal_matrix_Q(Y):
    """Build the low-dimensional affinity matrix Q from the embedding Y.

    Applies the kernel ``1 / (1 + d^2)`` to the squared pairwise distances
    of the rows of ``Y``, normalises by the off-diagonal mass, zeroes the
    diagonal and finally floors every entry at 1e-100 (so the diagonal ends
    up at 1e-100 rather than exactly 0).

    Args:
        Y: 2-D array, one embedded point per row.

    Returns:
        (n, n) array of low-dimensional affinities.
    """
    count, _ = Y.shape
    sq_dist = numpy.square(metrics.pairwise_distances(Y))
    # Alternative kernels the author left disabled (the article says swapping
    # them in switches between t-SNE and LargeVis):
    #   1/(1+exp(D)), 1/(1+D**2), 1/(1+2*D), 1/(1+0.5*D)
    kernel = 1 / (1 + sq_dist)
    # Subtracting `count` removes the diagonal, where the kernel equals 1.
    Q = kernel / (numpy.sum(kernel) - count)
    diag = range(count)
    off_diag_mass = numpy.sum(Q) - numpy.sum(Q[diag, diag])
    Q = Q / off_diag_mass
    Q[diag, diag] = 0
    return numpy.maximum(Q, 1e-100)

3、计算梯度

def cal_gradients(P, Q, Y):
    """Return the gradient of the KL cost with respect to the embedding Y.

    Row i of the result is
    ``4 * sum_j (P[i, j] - Q[i, j]) * (Y[i] - Y[j]) / (1 + |Y[i] - Y[j]|^2)``.

    Args:
        P: (n, n) high-dimensional affinities.
        Q: (n, n) low-dimensional affinities.
        Y: (n, d) current embedding.

    Returns:
        (n, d) array of gradients, one row per embedded point.
    """
    n1, n2 = Y.shape
    DC = numpy.zeros((n1, n2))
    # One row at a time keeps the working set at (n, d) instead of (n, n, d).
    for i in range(n1):
        diff = Y[i, :] - Y
        weight = 1.0 / (1.0 + numpy.sum(diff ** 2, axis=1))
        coeff = (P[i, :] - Q[i, :]) * weight
        # Broadcasting over the column axis replaces the explicit numpy.tile.
        DC[i, :] = 4 * numpy.sum(coeff[:, None] * diff, axis=0)
    return DC

4、计算损失函数KL散度

def cal_loss(P, Q):
    """Return the KL divergence ``sum(P * log(P / Q))`` over all entries.

    Args:
        P: array of target affinities.
        Q: array of model affinities, broadcast-compatible with ``P``.

    Returns:
        Scalar divergence.
    """
    log_ratio = numpy.log(P / Q)
    divergence = numpy.sum(P * log_ratio)
    return divergence

5、主函数过程

迭代采用的是探测步长的进退法，就是给定一个步长，如果误差下降了，而且下降速度比上一步快了，我就增加一点步长，如果下降了，但是没有比上一步快，我就继续保持这个，如果误差上升了，我就减小步长，退回上一步。

def tsne(X, n=2, neighbors=30, max_iter=200):
    """Embed the rows of X into ``n`` dimensions with t-SNE.

    The embedding starts from small Gaussian noise.  The first two
    iterations are plain gradient steps; from the third on, a trial step
    with momentum is accepted only if it does not raise the KL loss,
    otherwise the step size is shrunk and the step retried on the next
    iteration.

    Side effects:
        * Opens the shelve file ``tsne.dat`` in the working directory and
          stores, under the key ``'data'``, the list of embeddings recorded
          at the start of each iteration.
        * Prints a progress line on every 10th iteration that was not a
          rejected step.

    Args:
        X: 2-D array with one sample per row.
        n: dimensionality of the embedding.
        neighbors: perplexity passed on to ``cal_matrix_P``.
        max_iter: maximum number of iterations (rejected steps count too).

    Returns:
        (n_samples, n) array with the final embedding.
    """
    tsne_dat = shelve.open('tsne.dat')
    try:
        data = []
        n1, _ = X.shape
        P = cal_matrix_P(X, neighbors)
        Y = numpy.random.randn(n1, n) * 1e-4
        Q = cal_matrix_Q(Y)
        DY = cal_gradients(P, Q, Y)
        A = 200.0  # step size, adapted during the run
        B = 0.1    # weight of the momentum term (previous displacement)
        for i in range(max_iter):
            data.append(Y)
            if i < 2:
                # Warm-up: plain gradient steps.  Q, the gradient and the
                # loss are refreshed after every step so that error1/error2
                # really belong to Y1/Y2; previously both were the loss of
                # the initial embedding and the gradient was reused.
                Y = Y - A * DY
                Q = cal_matrix_Q(Y)
                DY = cal_gradients(P, Q, Y)
                loss = cal_loss(P, Q)
                if i == 0:
                    Y1, error1 = Y, loss
                else:
                    Y2, error2 = Y, loss
            else:
                YY = Y - A * DY + B * (Y2 - Y1)
                QQ = cal_matrix_Q(YY)
                error = cal_loss(P, QQ)
                if error > error2:
                    # Trial step made things worse: shrink the step and
                    # retry from the same point on the next iteration.
                    A = A * 0.7
                    continue
                elif (error - error2) > (error2 - error1):
                    # NOTE(review): the accompanying article says the step
                    # grows when the loss falls FASTER than on the previous
                    # step, but this condition holds when the decrease is
                    # smaller than the previous one.  Left as written;
                    # confirm the intended direction.
                    A = A * 1.2
                Y = YY
                error1 = error2
                error2 = error
                Q = QQ
                DY = cal_gradients(P, Q, Y)
                Y1 = Y2
                Y2 = Y
                loss = error
            if loss < 1e-3:
                # Converged: leave the loop so the trajectory is still
                # saved and the shelf closed (this path used to return
                # directly, skipping both).
                break
            if (i + 1) % 10 == 0:
                print('%s iterations the error is %s, A is %s' % (str(i + 1), str(round(loss, 2)), str(round(A, 3))))
        tsne_dat['data'] = data
    finally:
        tsne_dat.close()
    return Y

测试（将其与Sklearn包中的TSNE比较，CUSTOM代表自定义算法）

鸢尾花：150个样本，4个特征，3类

手写数字：1797个样本，64个特征（8*8），10类

def test_iris():
    """Compare the custom ``tsne`` with sklearn's TSNE on the iris data.

    Runs both embeddings, prints the wall-clock time of each and shows them
    side by side (custom on the left, sklearn on the right).
    """
    data = datasets.load_iris()
    X = data.data

    def _plot_classes(points):
        # Plot the three 50-sample blocks with one marker style each.
        # Assumes the samples are ordered by class in blocks of 50 --
        # TODO confirm against the dataset.
        for start, fmt in ((0, 'ro'), (50, 'gx'), (100, 'b*')):
            block = points[start:start + 50]
            pyplot.plot(block[:, 0], block[:, 1], fmt, markersize=30)

    t1 = time.time()
    Y = tsne(X, n=2, max_iter=300, neighbors=20)
    t2 = time.time()
    print("Custom TSNE cost time: %s" % str(round(t2 - t1, 2)))
    pyplot.figure()
    pyplot.subplot(1, 2, 1)
    _plot_classes(Y)
    pyplot.title('CUSTOM')
    pyplot.subplot(1, 2, 2)
    t1 = time.time()
    Y1 = manifold.TSNE(2).fit_transform(data.data)
    t2 = time.time()
    print("Sklearn TSNE cost time: %s" % str(round(t2 - t1, 2)))
    _plot_classes(Y1)
    pyplot.title('SKLEARN')
    pyplot.show()

显示迭代过程：

10 iterations the error is 0.78, A is 288.0
30 iterations the error is 0.55, A is 174.828
50 iterations the error is 0.5, A is 181.934
60 iterations the error is 0.49, A is 184.857
70 iterations the error is 0.48, A is 187.826
80 iterations the error is 0.47, A is 190.844
90 iterations the error is 0.47, A is 193.909
100 iterations the error is 0.46, A is 165.5
120 iterations the error is 0.46, A is 172.227
140 iterations the error is 0.45, A is 149.356
150 iterations the error is 0.45, A is 218.528
160 iterations the error is 0.45, A is 155.427
170 iterations the error is 0.45, A is 157.924
180 iterations the error is 0.45, A is 160.46
190 iterations the error is 0.44, A is 136.952
200 iterations the error is 0.44, A is 166.982
210 iterations the error is 0.44, A is 142.518
220 iterations the error is 0.44, A is 208.523
230 iterations the error is 0.44, A is 177.973
240 iterations the error is 0.44, A is 180.832
260 iterations the error is 0.44, A is 186.689
270 iterations the error is 0.44, A is 225.819
280 iterations the error is 0.44, A is 191.205
290 iterations the error is 0.44, A is 194.277
300 iterations the error is 0.44, A is 197.398
Custom TSNE cost time: 8.49
Sklearn TSNE cost time: 10.86

def test_digits():
    """Compare the custom ``tsne`` with sklearn's TSNE on the digits data.

    Runs both embeddings, prints the wall-clock time of each and shows two
    scatter plots coloured by digit, with the elapsed time in each title.
    """
    data = datasets.load_digits()
    X = data.data
    target = data.target

    def _scatter_by_digit(points):
        # Draw one scatter series per digit and pad the view by 5 units.
        # `color` is not defined in this function; presumably a module-level
        # sequence with at least 10 entries -- confirm.
        for i in range(10):
            mask = target == i
            pyplot.scatter(points[mask, 0], points[mask, 1], c=color[i])
        low = numpy.min(points) - 5
        high = numpy.max(points) + 5
        pyplot.xlim(low, high)
        # The second call used to be xlim again, so the y-axis was never set.
        pyplot.ylim(low, high)

    t1 = time.time()
    Y = tsne(X, n=2, max_iter=100, neighbors=50)
    t2 = time.time()
    t = t2 - t1
    print("Custom TSNE cost time: %s" % str(round(t, 2)))
    pyplot.figure()
    pyplot.subplot(1, 2, 1)
    _scatter_by_digit(Y)
    pyplot.title('CUSTOM: %ss' % str(round(t, 2)))
    pyplot.subplot(1, 2, 2)
    t1 = time.time()
    Y1 = manifold.TSNE(2).fit_transform(data.data)
    t2 = time.time()
    t = t2 - t1
    print("Sklearn TSNE cost time: %s" % str(round(t, 2)))
    _scatter_by_digit(Y1)
    pyplot.title('SKLEARN: %ss' % str(round(t, 2)))
    pyplot.show()

10 iterations the error is 3.49, A is 240.0
20 iterations the error is 3.48, A is 240.0
30 iterations the error is 2.71, A is 288.0
40 iterations the error is 1.81, A is 859.963
50 iterations the error is 1.3, A is 2139.864
70 iterations the error is 1.08, A is 1558.787
80 iterations the error is 1.03, A is 1108.679
90 iterations the error is 0.99, A is 1622.145
100 iterations the error is 0.96, A is 1153.742
Custom TSNE cost time: 271.09
Sklearn TSNE cost time: 82.11

0 0