为了方便说明,先生成一个‘HELLO’形状的数据点
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()


def make_hello(N=1000, rseed=42):
    """Return N 2-D points sampled inside the glyphs of the word 'HELLO'.

    Renders the text to a PNG, then rejection-samples uniform random
    points, keeping only those that land on ink (non-white) pixels.
    Points are returned sorted by x so a sequential colormap traces
    the word from left to right.
    """
    # Draw the word "HELLO" and save it as a PNG image.
    fig, ax = plt.subplots(figsize=(4, 1))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
    ax.axis('off')
    ax.text(0.5, 0.4, 'HELLO', va='center', ha='center',
            weight='bold', size=85)
    fig.savefig('hello.png')
    plt.close(fig)

    # Reopen the PNG and rejection-sample random points from it.
    from matplotlib.image import imread
    data = imread('hello.png')[::-1, :, 0].T  # flip + transpose -> (x, y)
    rng = np.random.RandomState(rseed)
    X = rng.rand(4 * N, 2)  # oversample so enough points survive the mask
    i, j = (X * data.shape).astype(int).T
    mask = (data[i, j] < 1)  # keep only points on non-white (ink) pixels
    X = X[mask]
    X[:, 0] *= (data.shape[0] / data.shape[1])  # restore the aspect ratio
    X = X[:N]
    return X[np.argsort(X[:, 0])]  # sort by x for a smooth color gradient


X = make_hello(1000)
colorize = dict(c=X[:, 0], cmap=plt.cm.get_cmap('rainbow', 5))
# BUG FIX: the keyword dict must be unpacked with **; passed positionally
# it becomes the marker-size argument `s` and raises a TypeError.
plt.scatter(X[:, 0], X[:, 1], **colorize)
plt.axis('equal')

这里x和y坐标不是对数据的最佳描述,因为如果对图像进行旋转,虽然x和y的值都改变了,但是图形的形状还是清晰可辨。
def rotate(X, angle):
    """Rotate the 2-D point set X by `angle` degrees (clockwise).

    Parameters
    ----------
    X : ndarray of shape (n, 2)
    angle : rotation angle in degrees

    Returns
    -------
    ndarray of shape (n, 2) with the rotated coordinates.
    """
    # Convert degrees to radians.
    theta = np.deg2rad(angle)
    R = [[np.cos(theta), np.sin(theta)],
         [-np.sin(theta), np.cos(theta)]]
    return np.dot(X, R)


X2 = rotate(X, 20) + 5
# BUG FIX: unpack the style dict with ** (it was passed positionally,
# where it would be interpreted as the marker-size argument).
plt.scatter(X2[:, 0], X2[:, 1], **colorize)
plt.axis('equal')

其实这里真正的基础特征是图中每个点与其他点之间的距离,常用关系矩阵表示:N个点对应一个N×N的矩阵
# Compute the pairwise Euclidean distance matrix between all N points.
# BUG FIX: the REPL output "(1000, 1000)" was fused into the code line,
# making it a syntax error; show the shape explicitly instead.
from sklearn.metrics import pairwise_distances

D = pairwise_distances(X)
print(D.shape)  # -> (1000, 1000): one row/column per data point
将以上矩阵用图像表示出来,颜色越浅距离越近
# Render the distance matrix as an image; lighter cells mean
# smaller distances between the corresponding pair of points.
plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
plt.colorbar()

MDS可以从距离矩阵还原出原来的二维坐标来表示数据。注意处理距离矩阵时要把参数dissimilarity设为precomputed
# MDS recovers a 2-D coordinate representation from the distance matrix.
# dissimilarity='precomputed' tells MDS the input is already a distance
# matrix, not raw coordinates.
from sklearn.manifold import MDS

model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
# BUG FIX: unpack the style dict with **.
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal')

MDS用于流形学习
构造一个三维空间的‘HELLO’字样
def random_projection(X, dimension=3, rseed=42):
    """Embed 2-D data X into `dimension`-D space via a random rotation.

    BUG FIX: the original line contained the `def` header twice, which
    would nest a duplicate definition inside the function and break it.

    Parameters
    ----------
    X : ndarray of shape (n, d); requires dimension >= d
    dimension : target dimensionality of the embedding
    rseed : seed for the random basis, for reproducibility
    """
    assert dimension >= X.shape[1]
    rng = np.random.RandomState(rseed)
    C = rng.randn(dimension, dimension)
    # Eigenvectors of the symmetric matrix C @ C.T form a random
    # orthonormal basis; project X onto its first d rows.
    e, V = np.linalg.eigh(np.dot(C, C.T))
    return np.dot(X, V[:X.shape[1]])


X3 = random_projection(X, 3)

from mpl_toolkits import mplot3d
ax = plt.axes(projection='3d')
# BUG FIX: unpack the style dict with **.
ax.scatter3D(X3[:, 0], X3[:, 1], X3[:, 2], **colorize)
ax.view_init(azim=70, elev=50)

现在用MDS评估这个三维数据,它会计算距离矩阵,并得出距离矩阵的最优二维映射
# Run MDS on the 3-D data: it computes the pairwise distance matrix
# internally and finds the optimal 2-D embedding, recovering "HELLO".
model = MDS(n_components=2, random_state=1)
out3 = model.fit_transform(X3)
# BUG FIX: unpack the style dict with **.
plt.scatter(out3[:, 0], out3[:, 1], **colorize)
plt.axis('equal')

局部线性嵌入(LLE)
def make_hello_s_curve(X):
    """Bend the flat 2-D 'HELLO' points onto a 3-D S-shaped surface.

    Parameters
    ----------
    X : ndarray of shape (n, 2)

    Returns
    -------
    ndarray of shape (n, 3): the points embedded on the S-curve.
    """
    t = (X[:, 0] - 2) * 0.75 * np.pi  # map x onto the curve parameter
    x = np.sin(t)
    y = X[:, 1]                        # vertical coordinate is unchanged
    z = np.sign(t) * (np.cos(t) - 1)   # depth follows the S-shape
    return np.vstack((x, y, z)).T


XS = make_hello_s_curve(X)

from mpl_toolkits import mplot3d
ax = plt.axes(projection='3d')
# BUG FIX: unpack the style dict with **.
ax.scatter3D(XS[:, 0], XS[:, 1], XS[:, 2], **colorize)

用MDS将数据降维到2维看下效果
# Try MDS on the curved data: it fails to unroll the S-curve because it
# tries to preserve ALL pairwise distances, which the nonlinear embedding
# has distorted.
from sklearn.manifold import MDS

model = MDS(n_components=2, random_state=2)
outs = model.fit_transform(XS)
# BUG FIX: unpack the style dict with **.
plt.scatter(outs[:, 0], outs[:, 1], **colorize)
plt.axis('equal')

MDS无法还原HELLO字样了
此时局部线性嵌入LLE可以发挥威力了:MDS试图保留数据中每对数据点之间的距离,而LLE只保留每个点与其最近N个邻居点之间的距离


图中每条线表示保留的距离。可以想象,当用LLE把图形展开时,由于只保留了最近n个点的距离——比如在一个字母上取一个点,它最近的n个点也基本在这个字母上——这样就能较好地还原形状了。
# LLE preserves only distances among each point's nearest neighbors, so
# it can "unroll" the S-curve and recover the original HELLO shape.
from sklearn.manifold import LocallyLinearEmbedding

model = LocallyLinearEmbedding(n_neighbors=100, n_components=2,
                               method='modified', eigen_solver='dense')
out = model.fit_transform(XS)
# BUG FIX: unpack the style dict with **.
plt.scatter(out[:, 0], out[:, 1], **colorize)

流形学习与PCA比较
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/200062.html原文链接:https://javaforall.net
