【模式识别】实验三:K均值算法和模糊C均值算法
【摘要】
本文采用了sonar和Iris数据集,完整的程序代码、实验报告PDF和数据集可以通过下面的链接下载。
Link:https://download.csdn.net/download/qq1198768105/71411278
实验报告图片版
程序代码
以Iris数据集为例:
k-means
import numpy as np
import matplotlib.pyplot as plt
import random
# Load the Iris data set from disk.
def load_dataset():
    """Read ./iris.txt and return (features, numeric labels).

    Returns a (150, 4) float array of measurements and a length-150
    array `t` with species encoded as 1/2/3.
    """
    data = np.genfromtxt('./iris.txt', delimiter=',', usecols=(0, 1, 2, 3))
    target = np.genfromtxt('./iris.txt', delimiter=',', usecols=(4), dtype=str)
    t = np.zeros(len(target))
    # Map each species name to its numeric code.
    for code, name in enumerate(['setosa', 'versicolor', 'virginica'], start=1):
        t[target == name] = code
    return data, t
# Randomly initialize k cluster centers drawn from the samples themselves.
def randChosenCent(data, k):
    """Return a list of k distinct rows of `data`, chosen uniformly at random."""
    # random.sample guarantees k distinct indices in [0, m).
    chosen = random.sample(range(data.shape[0]), k)
    return [data[idx] for idx in chosen]
def osdistance(vecA, vecB):
    """Euclidean distance between two 1-D vectors."""
    diff = vecA - vecB
    return np.sqrt(sum(diff ** 2))
def kMeans(data, k):
    """Cluster `data` into k groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    data : (m, dim) float array of samples.
    k : number of clusters.

    Returns
    -------
    cluster : (m, 2) array; column 0 holds the assigned cluster index,
        column 1 the distance of the sample to its centroid.
    iterTime : number of assignment passes performed.
    centroids : (k, dim) array of the final cluster centers.

    Fixes vs. the original: the per-cluster accumulator is sized from
    data.shape[1] instead of being hard-coded to 4 features, and an
    empty cluster keeps its previous centroid instead of dividing by
    zero.  Centroid init and the distance are inlined so the routine is
    self-contained.
    """
    m, dim = data.shape
    # Per sample: [cluster index, distance to that cluster's centroid].
    cluster = np.zeros((m, 2))
    # Initialize centroids from k distinct randomly chosen samples.
    centroids = np.array([data[j] for j in random.sample(range(m), k)], dtype=float)
    clusterChanged = True  # becomes False once the centroids stop moving
    iterTime = 0
    while clusterChanged:
        # Assignment step: each sample joins its nearest centroid.
        for i in range(m):
            minDist = float('inf')
            minIndex = -1
            for j in range(k):
                distJI = np.sqrt(np.sum((centroids[j] - data[i]) ** 2))
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            cluster[i, 0] = minIndex
            cluster[i, 1] = minDist
        iterTime += 1
        # Update step: move each centroid to the mean of its members.
        centroids_pre = centroids.copy()
        for cent in range(k):
            members = data[cluster[:, 0] == cent]
            if len(members) > 0:  # an empty cluster keeps its old centroid
                centroids[cent, :] = members.mean(axis=0)
        # Converged when no centroid moved in this pass.
        if (centroids_pre == centroids).all():
            clusterChanged = False
    return cluster, iterTime, centroids
# Compute clustering purity against the true labels.
def cal_accuracy(k):
    """Purity of the current partition.

    Relies on the module-level globals `cluster` (assignments), `t`
    (true labels) and `data` (samples) set by the main script.
    """
    correct = 0
    for ci in range(k):
        # True labels of every sample assigned to cluster ci.
        members = [t[j] for j in range(len(cluster)) if cluster[j][0] == ci]
        # The majority label is taken as the cluster's label.
        majority = max(members, key=members.count)
        correct += members.count(majority)
    return correct / len(data)
def draw(data, t):
    """Scatter-plot the first two features, colored by true class, and save it."""
    groups = [(1, 'r', '类别一'), (2, 'g', '类别二'), (3, 'blue', '类别三')]
    plt.figure(1)
    for label, color, name in groups:
        pts = data[t == label]
        plt.scatter(pts[:, 0], pts[:, 1], c=color, marker='o', label=name)
    plt.xlabel('花瓣长度')
    plt.ylabel('花瓣宽度')
    plt.title('花瓣长度和花瓣宽度特征之间的散点图(真实数据)')
    plt.legend(loc=2)  # legend in the upper-left corner
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese glyphs
    plt.savefig('./iris_kmeans(yuanshi)')
    plt.show()
def draw_pre(cluster, data, centroids):
    """Scatter-plot the predicted clusters plus their centroids, and save it."""
    # Split the samples by predicted cluster index (0, 1, 2).
    buckets = [[], [], []]
    for i in range(len(cluster)):
        idx = int(cluster[i][0])
        buckets[idx].append(data[i])
    arrays = [np.array(b) for b in buckets]
    colors = ['r', 'g', 'b']
    names = ['类别一', '类别二', '类别三']
    plt.figure(2)
    for pts, color, name in zip(arrays, colors, names):
        plt.scatter(pts[:, 0], pts[:, 1], c=color, marker='o', label=name)
    # Mark the cluster centers with black crosses.
    plt.scatter(centroids[:, 0], centroids[:, 1], c='black', marker='x')
    plt.xlabel('花瓣长度')
    plt.ylabel('花瓣宽度')
    plt.title('花瓣长度和花瓣宽度特征之间的散点图(预测数据)')
    plt.legend(loc=2)  # legend in the upper-left corner
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese glyphs
    plt.savefig('./iris_kmeans(yuce)')
    plt.show()
if __name__ == '__main__':
    data, t = load_dataset()
    k = 3
    cluster, iterTime, centroids = kMeans(data, k)
    # Plot the ground-truth and predicted partitions side by side.
    draw(data, t)
    draw_pre(cluster, data, centroids)
    # Average iteration count and purity over 10 random restarts.
    iter_counts = []
    purities = []
    for _ in range(10):
        cluster, iterTime, centroids = kMeans(data, k)
        iter_counts.append(iterTime)
        purities.append(cal_accuracy(k))
    print("平均迭代次数为:", "{}".format(sum(iter_counts) / 10))
    print("平均分类纯度为:", "{:.2%}".format(sum(purities) / 10))
FCM
import numpy as np
# Load the Iris data set from disk.
def load_dataset():
    """Read ./iris.txt and return (features, numeric labels).

    Features are the first four columns; the species column is mapped
    to the codes 1 (setosa), 2 (versicolor), 3 (virginica).
    """
    data = np.genfromtxt('./iris.txt', delimiter=',', usecols=(0, 1, 2, 3))
    target = np.genfromtxt('./iris.txt', delimiter=',', usecols=(4), dtype=str)
    t = np.zeros(len(target))
    for code, name in enumerate(['setosa', 'versicolor', 'virginica'], start=1):
        t[target == name] = code
    return data, t
def osdistance(vecA, vecB):
    """Euclidean distance between two 1-D vectors."""
    diff = vecA - vecB
    return np.sqrt(sum(diff ** 2))
# Initialize the fuzzy membership matrix U.
def initmatU(m, c):
    """Return an (m, c) random membership matrix whose rows each sum to 1.

    Entries are drawn uniformly from (0, 1) and then each sample's
    memberships are normalized so its degrees over all c clusters total 1.
    """
    mat_u = np.random.uniform(0, 1, (m, c))
    for row in range(m):
        mat_u[row, :] = mat_u[row, :] / mat_u[row, :].sum()
    return mat_u
def FCMtrain(data, c, alpha, theta):
    """Fuzzy C-means clustering.

    Parameters
    ----------
    data : (m, dim) float array of samples.
    c : number of clusters.
    alpha : fuzzifier exponent (must be > 1).
    theta : convergence threshold on the change of the cost function.

    Returns
    -------
    c_list : (c, dim) array of cluster centers.
    pred : list of predicted cluster indices (argmax membership per sample).
    iterTime : number of completed membership-update iterations.

    Fixes vs. the original: distances are floored at a tiny epsilon so a
    sample that coincides with a center no longer divides by zero, the
    local that shadowed the global label array `t` is renamed, and the
    membership init is inlined so the routine is self-contained.
    """
    m, dim = data.shape
    eps = 1e-10  # guards the membership update against zero distances
    # Random membership matrix, each row normalized to sum to 1.
    mat_u = np.random.uniform(0, 1, (m, c))
    mat_u /= mat_u.sum(axis=1, keepdims=True)
    c_list = np.zeros([c, dim])
    iterTime = 0
    last_cost = 0
    while True:
        # Centers: weighted mean of the samples, weights = membership ** alpha.
        for j in range(c):
            w = mat_u[:, j] ** alpha
            c_list[j, :] = (w[:, None] * data).sum(axis=0) / w.sum()
        # Cost J = sum_ij u_ij^alpha * ||x_i - c_j||^2.
        cost = 0.0
        for j in range(c):
            for i in range(m):
                d2 = np.sum((data[i, :] - c_list[j, :]) ** 2)
                cost += mat_u[i, j] ** alpha * d2
        # Stop once the cost change drops below the threshold.
        if abs(last_cost - cost) < theta:
            break
        last_cost = cost
        # Membership update: u_ij = 1 / sum_k (d_ij / d_ik)^(2/(alpha-1)).
        for i in range(m):
            dists = np.sqrt(np.sum((c_list - data[i, :]) ** 2, axis=1))
            dists = np.maximum(dists, eps)  # avoid division by zero
            for j in range(c):
                mat_u[i, j] = 1 / ((dists[j] / dists) ** (2 / (alpha - 1))).sum()
        # Renormalize each row so memberships sum to exactly 1.
        mat_u /= mat_u.sum(axis=1, keepdims=True)
        iterTime += 1
    # Predicted class = index of the largest membership per sample.
    pred = [int(np.argmax(mat_u[i, :])) for i in range(m)]
    return c_list, pred, iterTime
# Compute clustering purity against the true labels.
def cal_accuracy(c, pred):
    """Purity of the predicted partition `pred` over c clusters.

    Relies on the module-level globals `t` (true labels) and `data`
    (samples) set by the main script.
    """
    correct = 0
    for ci in range(c):
        # True labels of every sample predicted to be in cluster ci.
        members = [t[j] for j in range(len(pred)) if pred[j] == ci]
        # The majority label is taken as the cluster's label.
        majority = max(members, key=members.count)
        correct += members.count(majority)
    return correct / len(data)
if __name__ == '__main__':
    data, t = load_dataset()
    c = 3
    alpha = 6
    theta = 0.001
    c_list, pred, iterTime = FCMtrain(data, c, alpha, theta)
    # Average iteration count and purity over 10 random restarts.
    iter_counts = []
    purities = []
    for _ in range(10):
        c_list, pred, iterTime = FCMtrain(data, c, alpha, theta)
        iter_counts.append(iterTime)
        purities.append(cal_accuracy(c, pred))
    print("平均迭代次数为:", "{}".format(sum(iter_counts) / 10))
    print("平均分类纯度为:", "{:.2%}".format(sum(purities) / 10))
文章来源: zstar.blog.csdn.net,作者:zstar-_,版权归原作者所有,如需转载,请联系作者。
原文链接:zstar.blog.csdn.net/article/details/122158578
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)