聚类算法 —— K-means
算法原理
- 在给定的数据集上随机选定 K 个种子点
- 对数据集中每个样本与选定的种子点之间的距离进行计算
- 将距离最近的点归纳到对应的种子点所代表的聚类中
- 针对 k 个聚类重新计算每个类别的质点作为新的种子点
- 重复上述的距离计算,并更新种子点,直至不再改变
算法优劣
- k-means 算法需要具备一定的先验知识,选择聚类的数量决定最终的结果
- 聚类的结果还受到随机点选择的影响
- 无监督训练,对异常值较为敏感
- 对样本分布有一定的要求,样本不能分布太过离散,对非凸形状的分类效果不好
代码实现
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def calcDis(dataSet, centroids, k):
    """Compute Euclidean distances from every sample to every centroid.

    Parameters
    ----------
    dataSet : sequence of length-d points (list of lists or array-like).
    centroids : sequence of k length-d centroid points.
    k : int, number of centroids (kept for interface compatibility;
        the broadcast below derives it from ``centroids`` directly).

    Returns
    -------
    np.ndarray of shape (n_samples, k) where entry [i, j] is the
    Euclidean distance from sample i to centroid j.
    """
    # Broadcast (n, 1, d) against (1, k, d) instead of looping over
    # samples with np.tile — one vectorized pass does all n*k distances.
    samples = np.asarray(dataSet, dtype=float)[:, np.newaxis, :]
    centers = np.asarray(centroids, dtype=float)[np.newaxis, :, :]
    return np.sqrt(np.sum((samples - centers) ** 2, axis=2))
def classify(dataSet, centroids, k):
    """Perform one K-means update step.

    Assigns every sample to its nearest centroid, then recomputes each
    cluster's centroid as the mean of its members.

    Returns
    -------
    (changed, newCentroids) : ``changed`` is the element-wise difference
    ``newCentroids - centroids`` (all zeros once the algorithm has
    converged); ``newCentroids`` is an array of the recomputed centroids,
    ordered by cluster index.

    NOTE(review): assumes every centroid retains at least one sample;
    an empty cluster would shrink the groupby result — confirm for
    general inputs.
    """
    distances = calcDis(dataSet, centroids, k)
    nearest = np.argmin(distances, axis=1)
    # Group samples by their nearest-centroid index and average each group.
    updated = pd.DataFrame(dataSet).groupby(nearest).mean().values
    delta = updated - centroids
    return delta, updated
def kmeans(dataSet, k):
    """Cluster ``dataSet`` into ``k`` groups with Lloyd's K-means.

    Seeds with ``k`` distinct random samples, then repeats the
    assign/recompute step until the centroids stop moving.

    Returns
    -------
    (centroids, cluster) : ``centroids`` is a lexicographically sorted
    list of the k final centroid points; ``cluster`` is a list of k
    lists, where ``cluster[j]`` holds the samples nearest centroid j.
    """
    seeds = random.sample(dataSet, k)
    changed, newCentroids = classify(dataSet, seeds, k)
    # Iterate until an update step leaves every centroid in place.
    while np.any(changed != 0):
        changed, newCentroids = classify(dataSet, newCentroids, k)

    # Sort for a deterministic ordering regardless of the random seeds.
    centroids = sorted(newCentroids.tolist())

    # Final assignment pass against the sorted centroids.
    labels = np.argmin(calcDis(dataSet, centroids, k), axis=1)
    cluster = [[] for _ in range(k)]
    for idx, label in enumerate(labels):
        cluster[label].append(dataSet[idx])
    return centroids, cluster
def createDataSet():
    """Return a tiny 2-D toy dataset: two visually separable groups."""
    lower_left = [[1, 1], [1, 2], [2, 1]]
    upper_right = [[6, 4], [6, 3], [5, 4]]
    return lower_left + upper_right
if __name__ == '__main__':
    # Demo: cluster the toy dataset into 2 groups and visualize the result.
    dataset = createDataSet()
    centroids, cluster = kmeans(dataset, 2)
    print('质心为:%s' % centroids)
    print('集群为:%s' % cluster)
    # Raw samples as green circles, learned centroids as red crosses.
    for point in dataset:
        plt.scatter(point[0], point[1], marker='o', color='green', s=40, label='原始点')
    for center in centroids:
        plt.scatter(center[0], center[1], marker='x', color='red', s=50, label='质心')
    # BUG FIX: the original wrote `plt.show` without parentheses — the
    # method object was never called, so no figure window ever appeared.
    plt.show()