第二节2:OPTICS算法Python实现和效果展示
Posted 快乐江湖
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了第二节2:OPTICS算法Python实现和效果展示相关的知识,希望对你有一定的参考价值。
文章目录
三:Python实现
import operator
import numpy as np
import random
# 更新
def update(data_set, p_neighbor, eps, min_pts, visited, unvisited, seeds, i, core_distance, reachability_distance):
    """Relax the reachability distances of the unvisited neighbours of point ``i``.

    For every index in ``p_neighbor`` that is still contained in
    ``unvisited``, the candidate reachability distance is the larger of
    ``core_distance[i]`` and the Euclidean distance between that neighbour
    and point ``i``.  The candidate is stored in both
    ``reachability_distance`` and ``seeds`` when the neighbour has no
    reachability distance yet (NaN) or when the candidate is smaller than the
    stored one.

    ``eps``, ``min_pts`` and ``visited`` are accepted for signature
    compatibility but are not read by this function.

    Returns:
        The same ``seeds`` mapping that was passed in (mutated in place).
    """
    for neighbour in p_neighbor:
        # Points that were already processed keep their final reachability.
        if neighbour not in unvisited:
            continue

        gap = np.sqrt(np.sum(np.power(data_set[neighbour] - data_set[i], 2)))
        candidate = max(core_distance[i], gap)
        current = reachability_distance[neighbour]

        # First time this neighbour is reached, or a shorter route was found.
        if np.isnan(current) or candidate < current:
            reachability_distance[neighbour] = candidate
            seeds[neighbour] = candidate
    return seeds
def optics(data_set, eps, min_pts):
    """Compute the OPTICS processing order of the rows of ``data_set``.

    Starting from a randomly chosen unvisited point, the point's
    ``eps``-neighbourhood (Euclidean distance, the point itself excluded) is
    collected.  If it holds at least ``min_pts`` points, the point gets a core
    distance and the module-level ``update`` function relaxes the reachability
    distances of its unvisited neighbours into ``seeds``.  The seed with the
    smallest reachability distance is processed next; when ``seeds`` runs
    empty a new random unvisited point is drawn.

    Args:
        data_set: 2-D array-like with one sample per row.
        eps: Neighbourhood radius (``np.inf`` is allowed).
        min_pts: Minimum number of neighbours for a point to be a core point.

    Returns:
        A tuple ``(order, core_distance, reachability_distance)``: ``order``
        lists the sample indices in processing order; the other two are lists
        indexed by sample, holding ``np.nan`` where the value is undefined.
    """
    data_set = np.asarray(data_set)
    example_nums = np.shape(data_set)[0]

    order = []
    core_distance = [np.nan] * example_nums
    reachability_distance = [np.nan] * example_nums

    # The list keeps the order needed by random.choice; the set mirrors it so
    # the membership tests performed by update() are O(1) instead of O(n).
    unvisited = list(range(example_nums))
    unvisited_lookup = set(unvisited)
    visited = []

    while unvisited:
        seeds = {}
        point = random.choice(unvisited)

        while True:
            unvisited.remove(point)
            unvisited_lookup.discard(point)
            visited.append(point)
            order.append(point)

            # Distances from `point` to every sample, computed in one pass.
            distances = np.sqrt(np.sum(np.power(data_set - data_set[point], 2), axis=1))
            in_range = distances <= eps
            in_range[point] = False  # a point is not its own neighbour
            neighbors = np.flatnonzero(in_range).tolist()

            if len(neighbors) >= min_pts:
                if min_pts >= 2:
                    # (min_pts - 1)-th nearest *other* point, i.e. the
                    # min_pts-th nearest when the point itself is counted.
                    core_distance[point] = np.sort(distances[in_range])[min_pts - 2]
                else:
                    # Guard against a negative index, which would wrap around
                    # and pick the farthest neighbour instead.
                    core_distance[point] = 0.0
                seeds = update(data_set, neighbors, eps, min_pts, visited, unvisited_lookup,
                               seeds, point, core_distance, reachability_distance)

            if not seeds:
                break
            # Smallest reachability first; ties resolve to the earliest
            # inserted seed, exactly as a stable sort would.
            point = min(seeds, key=seeds.get)
            del seeds[point]

    return order, core_distance, reachability_distance
# 簇抽取过程
def cluster_extraction(example_nums, eps, order, core_distance, reachability_distance):
    """Derive flat cluster labels from an OPTICS ordering.

    The points are walked in ``order``.  A point whose reachability distance
    is not within ``eps`` (including an undefined, NaN, reachability) is not
    density-reachable from the points before it: it opens a new cluster when
    its core distance is below ``eps`` and is marked as noise otherwise.  Any
    other point joins the cluster that is currently open.

    Args:
        example_nums: Total number of samples; length of the returned list.
        eps: Extraction radius.
        order: Sample indices in OPTICS processing order.
        core_distance: Core distance per sample (NaN when undefined).
        reachability_distance: Reachability distance per sample (NaN when
            undefined).

    Returns:
        A list of labels indexed by sample: clusters are numbered 0, 1, 2, ...
        in order of first appearance and noise is ``-1``.  Samples that do not
        occur in ``order`` keep the initial value ``0``.
    """
    cluster = [0 for _ in range(example_nums)]
    # No cluster is open yet; the first cluster that is opened becomes 0.
    k = -1
    for each_order in order:
        reach = reachability_distance[each_order]
        # Written as `not (reach <= eps)` rather than `reach > eps` on
        # purpose: every comparison with NaN is False, so this form treats an
        # undefined reachability as "not reachable" instead of silently
        # attaching the point to the previous cluster.
        if not (reach <= eps):
            if core_distance[each_order] < eps:
                k += 1
                cluster[each_order] = k
            else:
                cluster[each_order] = -1
        else:
            cluster[each_order] = k
    return cluster
四:效果展示
- OPTICS算法对`eps`和`min_pts`不敏感,所以`eps`一般设置为`np.inf`,`min_pts`一般设置为10(稠密数据要在此基础上增加,稀疏数据则要减小)
(1)人造数据集
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import OPTICS
# Load the two-column sample set; the CSV has no header row.
raw_data = pd.read_csv('./dataset/438-3.csv', header=None)
raw_data.columns = ['X', 'Y']
x_axis = 'X'
y_axis = 'Y'
examples_num = raw_data.shape[0]
train_data = raw_data[[x_axis, y_axis]].values.reshape(examples_num, 2)

# Parameters: unbounded neighbourhood for the ordering, fixed radius for
# the cluster extraction step.
eps = np.inf
min_pts = 4
extraction_eps = 1.3

order, core_distance, reachability_distance = OPTICS.optics(train_data, eps, min_pts)
cluster = OPTICS.cluster_extraction(
    np.shape(train_data)[0], extraction_eps, order, core_distance, reachability_distance
)

order = np.array(order)
core_distance = np.array(core_distance)
reachability_distance = np.array(reachability_distance)
# Reachability distances arranged in processing order.
data = reachability_distance[order]

plt.figure(figsize=(12, 5), dpi=80)

# Left panel: reachability values in processing order, with the extraction
# radius drawn as a horizontal line.
plt.subplot(1, 2, 1)
plt.plot(range(len(data)), data)
plt.plot([0, len(data)], [extraction_eps, extraction_eps])
plt.title("decision graph")

# Right panel: the samples coloured by extracted label.
plt.subplot(1, 2, 2)
cluster_labels = np.asarray(cluster)
# label value -> (colour, legend entry), drawn in this order
palette = {
    0: ('g', 'class1'),
    1: ('r', 'class2'),
    2: ('blue', 'class3'),
    -1: ('black', 'noise'),
}
for label_value, (colour, legend_name) in palette.items():
    members = train_data[cluster_labels == label_value]
    plt.scatter(members[:, 0], members[:, 1], c=colour, label=legend_name)
plt.title('OPTICS')
plt.legend()
plt.show()
(2)Square数据集
(3)Aggregation
(4)Gaussian
(5)Jain数据集
(6)Lineblobs
(7)Melon数据集
(8)Spiral数据集
(9)Threecircles数据集
(10)788Points数据集
以上是关于第二节2:OPTICS算法Python实现和效果展示的主要内容,如果未能解决你的问题,请参考以下文章
第二节2:K-Means算法及其Python实现(算法实现结果展示)