第二节2:OPTICS算法Python实现和效果展示

Posted 快乐江湖

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了第二节2:OPTICS算法Python实现和效果展示相关的知识,希望对你有一定的参考价值。

文章目录

三:Python实现

import operator

import numpy as np
import random


# 更新
def update(data_set, p_neighbor, eps, min_pts, visited, unvisited, seeds, i, core_distance, reachability_distance):
    """Relax the reachability distances of the unvisited neighbours of point ``i``.

    For every index in ``p_neighbor`` that is still in ``unvisited``, the
    candidate reachability distance from ``i`` is the larger of ``i``'s core
    distance and the Euclidean distance between the two points.  The candidate
    is stored when the neighbour has no reachability distance yet (NaN) or
    when it is smaller than the stored one.

    Args:
        data_set: indexable collection of points; rows are subtracted from
            each other, so they must support array arithmetic.
        p_neighbor: indices of the points in the neighbourhood of ``i``.
        eps, min_pts, visited: accepted but not read by this function.
        unvisited: indices that have not been processed yet.
        seeds: dict mapping point index -> reachability distance; modified
            in place.
        i: index of the point whose neighbours are being relaxed.
        core_distance: per-point core distances; only entry ``i`` is read.
        reachability_distance: per-point reachability distances; modified
            in place.

    Returns:
        The same ``seeds`` dict that was passed in.
    """
    for candidate in p_neighbor:
        # Points that were already processed keep their final value.
        if candidate not in unvisited:
            continue

        gap = np.sqrt(np.sum(np.power(data_set[candidate] - data_set[i], 2)))
        proposed = max(core_distance[i], gap)
        current = reachability_distance[candidate]

        # Either no value has been recorded yet, or a shorter one was found.
        if np.isnan(current) or proposed < current:
            reachability_distance[candidate] = proposed
            seeds[candidate] = proposed

    return seeds


def _eps_neighborhood(data_set, center, eps):
    """Find the points lying within ``eps`` of ``data_set[center]``.

    Returns a tuple ``(neighbors, distances)``: ``neighbors`` is the list of
    row indices (ascending, ``center`` itself excluded) whose Euclidean
    distance to the center is at most ``eps``; ``distances`` holds those same
    distances sorted in ascending order.
    """
    offsets = data_set - data_set[center]
    # One row per point; every remaining axis contributes to the distance.
    squared = np.power(offsets, 2).reshape(offsets.shape[0], -1)
    distances = np.sqrt(np.sum(squared, axis=1))
    within = distances <= eps
    within[center] = False
    return np.flatnonzero(within).tolist(), np.sort(distances[within])


def optics(data_set, eps, min_pts):
    """Compute the OPTICS cluster ordering of ``data_set``.

    A point is treated as a core point when at least ``min_pts`` *other*
    points lie within ``eps`` of it.  Its core distance is then the distance
    to its ``(min_pts - 1)``-th nearest other point, i.e. the ``min_pts``-th
    nearest when the point itself is counted (0 for ``min_pts == 1``).
    NOTE(review): the core test does not count the point itself while the
    core distance does; this mirrors the previous behaviour -- confirm it is
    the intended convention.

    Args:
        data_set: array of points, one point per row.
        eps: generating radius; ``np.inf`` makes every other point a neighbour.
        min_pts: neighbourhood size threshold, must be >= 1.

    Returns:
        ``(order, core_distance, reachability_distance)``: ``order`` is the
        list of point indices in processing order; the other two are lists
        indexed by point, holding NaN where the value is undefined.

    Raises:
        ValueError: if ``min_pts`` is smaller than 1.
    """
    if min_pts < 1:
        raise ValueError("min_pts must be at least 1")

    data_set = np.asarray(data_set)
    # Number of samples
    example_nums = np.shape(data_set)[0]
    # Processing order of the points
    order = []
    core_distance = [np.nan] * example_nums
    reachability_distance = [np.nan] * example_nums
    # Kept as a list because random.choice needs a sequence.
    unvisited = list(range(example_nums))
    visited = []

    def process(point, seeds):
        """Mark ``point`` as processed and, if it is core, relax its neighbours."""
        unvisited.remove(point)
        visited.append(point)
        order.append(point)

        neighbors, distances = _eps_neighborhood(data_set, point, eps)
        if len(neighbors) >= min_pts:  # core point
            # For min_pts == 1 the point itself is the nearest member, and
            # index min_pts - 2 would wrap around to the farthest neighbour.
            core_distance[point] = distances[min_pts - 2] if min_pts > 1 else 0.0
            seeds = update(data_set, neighbors, eps, min_pts, visited, unvisited,
                           seeds, point, core_distance, reachability_distance)
        return seeds

    # Repeatedly start from a random point that has not been processed yet.
    while unvisited:
        start = random.choice(unvisited)
        seeds = process(start, dict())

        # Expand by always taking the seed with the smallest reachability
        # distance; ties resolve to the earliest inserted seed.
        while seeds:
            nearest = min(seeds.items(), key=operator.itemgetter(1))[0]
            del seeds[nearest]
            seeds = process(nearest, seeds)

    return order, core_distance, reachability_distance

# 簇抽取过程
def cluster_extraction(example_nums, eps, order, core_distance, reachability_distance):
    """Extract flat cluster labels from an OPTICS ordering.

    The points are walked in ``order``.  A point whose reachability distance
    is not within ``eps`` (larger than ``eps`` or undefined/NaN) is not
    density-reachable from the points before it: it opens a new cluster when
    its core distance is below ``eps`` and is marked as noise otherwise.
    Any other point joins the most recently opened cluster.

    Args:
        example_nums: total number of points; length of the returned list.
        eps: extraction radius.
        order: point indices in OPTICS processing order.
        core_distance: per-point core distances (NaN when undefined).
        reachability_distance: per-point reachability distances (NaN when
            undefined).

    Returns:
        A list of labels indexed by point: clusters are numbered 0, 1, 2, ...
        in the order they are opened and noise is -1.  Entries for points
        that do not occur in ``order`` keep the initial value 0.
    """
    cluster = [0 for _ in range(example_nums)]
    # Id of the most recently opened cluster; the first cluster gets id 0.
    k = -1

    for each_order in order:
        # "not (r <= eps)" instead of "r > eps" so that an undefined (NaN)
        # reachability counts as unreachable rather than silently joining
        # the current cluster.
        if not reachability_distance[each_order] <= eps:
            # Core at this radius: the point starts a new cluster.
            if core_distance[each_order] < eps:
                k += 1
                cluster[each_order] = k
            # Otherwise it is noise (a NaN core distance also lands here).
            else:
                cluster[each_order] = -1
        # Density-reachable: assign to the cluster opened before it.
        else:
            cluster[each_order] = k

    return cluster





四:效果展示

  • OPTICS算法对 eps 和 min_pts 不敏感,所以 eps 一般设置为 np.inf,min_pts 一般设置为10(稠密数据要在此基础上增加、稀疏数据则要减小)

(1)人造数据集

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import OPTICS


# Load the two-column sample file; the CSV has no header row.
raw_data = pd.read_csv('./dataset/438-3.csv', header=None)
raw_data.columns = ['X', 'Y']
x_axis = 'X'
y_axis = 'Y'

examples_num = raw_data.shape[0]
train_data = raw_data[[x_axis, y_axis]].values.reshape(examples_num, 2)

# Parameters: generating radius / neighbourhood size for the ordering, and
# the radius at which flat clusters are cut out of the reachability plot.
eps = np.inf
min_pts = 4
extraction_eps = 1.3

order, core_distance, reachability_distance = OPTICS.optics(train_data, eps, min_pts)
cluster = OPTICS.cluster_extraction(
    train_data.shape[0], extraction_eps, order, core_distance, reachability_distance
)
order = np.array(order)
core_distance = np.array(core_distance)
reachability_distance = np.array(reachability_distance)
# Reachability distances arranged in processing order.
data = reachability_distance[order]

plt.figure(figsize=(12, 5), dpi=80)

# Left panel: reachability plot with the extraction threshold drawn on top.
plt.subplot(1, 2, 1)
plt.plot(range(len(data)), data)
plt.plot([0, len(data)], [extraction_eps, extraction_eps])
plt.title("decision graph")

# Right panel: clustering result.
plt.subplot(1, 2, 2)

class1_X, class1_Y = [], []
class2_X, class2_Y = [], []
class3_X, class3_Y = [], []
noise_X, noise_Y = [], []  # noise points

# Label -> coordinate lists; any other label is not plotted.
points_by_label = {
    0: (class1_X, class1_Y),
    1: (class2_X, class2_Y),
    2: (class3_X, class3_Y),
    -1: (noise_X, noise_Y),
}

for index, value in enumerate(cluster):
    target = points_by_label.get(value)
    if target is None:
        continue
    xs, ys = target
    xs.append(train_data[index][0])
    ys.append(train_data[index][1])

for xs, ys, colour, name in (
    (class1_X, class1_Y, 'g', 'class1'),
    (class2_X, class2_Y, 'r', 'class2'),
    (class3_X, class3_Y, 'blue', 'class3'),
    (noise_X, noise_Y, 'black', 'noise'),
):
    plt.scatter(xs, ys, c=colour, label=name)

plt.title('OPTICS')
plt.legend()
plt.show()

(2)Square数据集

(3)Arrevation

(4)Gaussian

(5)Jain数据集

(6)Lineblobs

(7)Melon数据集

(8)Spiral数据集

(9)Threecircles数据集

(10)788Points数据集

以上是关于第二节2:OPTICS算法Python实现和效果展示的主要内容,如果未能解决你的问题,请参考以下文章

第二节2:K-Means算法及其Python实现(算法实现结果展示)

第二节4:K-Means算法及其Python实现(初始中心点的选择和K-Means++算法)

OPTICS(聚类)算法的 Python 实现

第二节:谱聚类算法之切图聚类算法流程及其实现

optics聚类算法

无约束优化算法-第二节:梯度类算法