Python - A demo of K-Means clustering on the handwritten digits data
================
# http://wiki.swarma.net/index.php?title=K-means%E8%81%9A%E7%B1%BB&variant=zh-hant
# https://blog.csdn.net/GoodShot/article/details/61208708
# https://blog.csdn.net/u013346007/article/details/54581253
#A demo of K-Means clustering on the handwritten digits data
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
np.random.seed(42)
digits = load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape #1797 組數據 每個數據 64個特徵
n_digits = len(np.unique(digits.target)) # 10
labels = digits.target # 結果
sample_size = 300
print("n_digits: %d, \t n_samples %d, \t n_features %d"
% (n_digits, n_samples, n_features))
'''
Shorthand full name
homo homogeneity score
compl completeness score
v-meas V measure
ARI adjusted Rand index
AMI adjusted mutual information
silhouette silhouette coefficient
'''
print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
def bench_k_means(estimator, name, data):
t0 = time()
estimator.fit(data)
print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
% (name, (time() - t0), estimator.inertia_,
metrics.homogeneity_score(labels, estimator.labels_),
metrics.completeness_score(labels, estimator.labels_),
metrics.v_measure_score(labels, estimator.labels_),
metrics.adjusted_rand_score(labels, estimator.labels_),
metrics.adjusted_mutual_info_score(labels, estimator.labels_),
metrics.silhouette_score(data, estimator.labels_,
metric='euclidean',
sample_size=sample_size)))
# def bench_k_means(estimator, name, data):
# K-means聚類的結果會隨着制定類別數和初始點的選擇有所不同。我們這裡總是聚成十類,因為手寫數字一共有十種。
# 至於初始點的選取我們定義三種,k-means++,隨機,和PCA降到十個維度後的中心點。
bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
name="k-means++", data=data)
#kmeans++算法的主要工作体现在种子点的选择上,基本原则是使得各个种子点之间的距离尽可能的大,
#但是又得排除噪声的影响。
bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
name="random", data=data)
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
# n_digits = 10
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
name="PCA-based",
data=data)
print(82 * '_')
# #############################################################################
# Visualize the results on PCA-reduced data
#我們還可以將原始數據用PCA降到兩維,這樣所有的數據就可以在二維空間中畫出來。
#然後我們再用降維後的只有兩個特徵的數據進行Kmeans分析,聚成十類,
#然後再畫在Voronoi diagram圖的背景上。效果如下所示:
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02 # point in the mesh [x_min, x_max]x[y_min, y_max].
# Plot the decision boundary. For that, we will assign a color to each
x_min = reduced_data[:, 0].min() - 1
x_max = reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
# meshgrid函数用两个坐标轴上的点在平面上画格。
# https://blog.csdn.net/u013346007/article/details/54581253
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]) #将多维数组降位一维
'''
np.r_是按列连接两个矩阵,就是把两矩阵上下相加,要求列数相等,类似于pandas中的concat()。
np.c_是按行连接两个矩阵,就是把两矩阵左右相加,要求行数相等,类似于pandas中的merge()。
'''
# Put the result into a color plot
Z = Z.reshape(xx.shape) #(927,949)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
extent=(xx.min(), xx.max(), yy.min(), yy.max()),
cmap=plt.cm.Paired,
aspect='auto', origin='lower')
##############################################
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
##############################################
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
marker='x', s=169, linewidths=3,
color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
==============================================