当前位置: 代码迷 >> 综合 >> 【一般聚类/时序聚类】python实现多密度自适应聚类:Multi-DBSCAN
  详细解决方案

【一般聚类/时序聚类】python实现多密度自适应聚类:Multi-DBSCAN

热度:22   发布时间:2023-12-01 14:44:16.0

本文代码基于该篇进行魔改,功能调用更加方便,速度经测试快了几十倍

import math
import copy
import numpy as np
from sklearn.cluster import DBSCAN
import sklearn.metrics.pairwise as pairwiseclass Adapter_DBSCAN():# 默认统计聚类个数在2-25之间的聚类情况, 参数符合python左闭右开def __init__(self,num_cluster_range=(2,26)):self.num_cluster_range = num_cluster_rangedef returnEpsCandidate(self,dataSet):""":param dataSet: 数据集:return: eps候选集合"""#self.DistMatrix = self.CalculateDistMatrix(dataSet)self.DistMatrix = pairwise.euclidean_distances(dataSet)tmp_matrix = copy.deepcopy(self.DistMatrix)for i in range(len(tmp_matrix)):tmp_matrix[i].sort()EpsCandidate = []for k in range(1,len(dataSet)):#Dk = self.returnDk(tmp_matrix,k)Dk = tmp_matrix[:,k]# DkAverage = self.returnDkAverage(Dk)# 快160+倍DkAverage = np.mean(Dk)EpsCandidate.append(DkAverage)return EpsCandidatedef returnMinptsCandidate(self,DistMatrix,EpsCandidate,X):""":param DistMatrix: 距离矩阵:param EpsCandidate: Eps候选列表:return: Minpts候选列表"""MinptsCandidate = []for k in range(len(EpsCandidate)):tmp_eps = EpsCandidate[k]tmp_count = 0# for i in range(len(DistMatrix)):#     for j in range(len(DistMatrix[i])):#         if DistMatrix[i][j] <= tmp_eps:#             tmp_count = tmp_count + 1# 快250+倍tmp_count = np.sum(DistMatrix <= tmp_eps)MinptsCandidate.append(tmp_count/len(X))return MinptsCandidatedef fit(self,X):self.EpsCandidate = self.returnEpsCandidate(X)self.MinptsCandidate = self.returnMinptsCandidate(self.DistMatrix,self.EpsCandidate,X)self.do_multi_dbscan(X)self.set_num_clusters_range(self.num_cluster_range)def do_multi_dbscan(self,X):self.all_predict_dict = {}self.all_param_dict = {}for i in range(len(self.EpsCandidate)):eps = self.EpsCandidate[i]minpts = self.MinptsCandidate[i]db = DBSCAN(eps=eps,min_samples=minpts).fit(X)num_clusters = max(db.labels_) + 1# 统计符合范围的聚类情况if num_clusters not in self.all_predict_dict.keys():self.all_predict_dict[num_clusters] = []self.all_param_dict[num_clusters] = []self.all_predict_dict[num_clusters].append(db.labels_)self.all_param_dict[num_clusters].append({"eps":eps,"minpts":minpts})# 筛选聚类个数,比如Multi-DBSCAN共产生了3聚类、15聚类、136聚类三种情况# 我想只看10~20的聚类情况,就可以设置set_num_clusters_range(10~21)后调用get_predict_dict()def set_num_clusters_range(self,num_cluster_range:tuple):self.predict_dict = {}self.param_dict = {}# 统计符合范围的聚类情况for num_cluster, labels, params in zip(self.all_predict_dict.keys(),\self.all_predict_dict.values(), self.all_param_dict.values()):if num_cluster >= num_cluster_range[0] and \num_cluster < num_cluster_range[1]:self.predict_dict[num_cluster] = labelsself.param_dict[num_cluster] = params# 获取当前Multi-DBSCAN的聚类预测信息,格式为{聚类个数:[[预测可能1],[预测可能2],...]}# 比如聚类个数为3的情况可能有多种,所以有预测可能1、预测可能2...def get_predict_dict(self):if self.predict_dict is None:raise RuntimeError("get_predict_dict before fit")return self.predict_dict# 获取当前Multi-DBSCAN的聚类参数信息,格式为{聚类个数:[{"eps":x1,"minpts":y1},{"eps":x2,"minpts":y2},...]}def get_info_dict(self):if self.param_dict is None:raise RuntimeError("get_info_dict before fit")return self.param_dict

实验用到的数据集格式,共788行:

测试:

def loadDataSet(fileName):"""输入:文件名输出:数据集描述:从文件读入数据集"""dataSet = []with open(fileName) as fr:for line in fr.readlines():curline = line.strip().split(",")fltline = list(map(float, curline))dataSet.append(fltline)return np.array(dataSet)if __name__ == '__main__':X = loadDataSet('common/788points.txt')DB = Adapter_DBSCAN()DB.fit(X)# 输出15聚类的情况DB.set_num_clusters_range((15,16))# label预测信息predict_dict = DB.get_predict_dict()# 参数信息info_dict = DB.get_info_dict()print(predict_dict)print("================================")print(info_dict)# 注意到整个Multi-DBSCAN过程中eps、minpts参数都不需要你手工设置
# 还可以控制输出聚类个数范围,是不是很方便?

打印:

{15: [array([-1, -1, -1,  0,  0,  0,  0,  0, -1, -1,  0,  0,  0, -1, -1, -1,  0,0,  0,  0,  0,  0, -1,  0,  0, -1, -1, -1, -1, -1,  1,  1,  1,  1,1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,1,  1,  1,  1,  3,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,1, -1,  1,  2, -1,  1, -1, -1, -1,  2,  2,  2,  2,  2,  2,  2,  2,2,  2,  2,  2,  2, -1,  4,  4,  2,  2,  2,  2,  3,  3,  3,  4,  4,4,  4,  5,  5,  5,  5, -1,  6,  3,  6,  6,  6,  6,  6,  6, -1,  7,7,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -1, -1,  3,  3,  3,3,  3,  3,  3,  3,  3,  3,  3, -1, -1, -1,  3,  3,  3,  6,  7,  7,3, -1, -1, -1,  7,  7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,8,  8,  8, -1,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,-1, -1,  9,  9, -1,  9,  9,  9,  9,  9,  9,  9,  9, -1,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, -1,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, -1,9,  9,  9,  9,  9,  9, -1,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, -1, -1,  9,  9,9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9, -1,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9, 10, 10, 10, 10, 10, -1, 10, -1, -1, -1, 11, -1, 10, 10, 10, 10,10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1, 10, -1, 10, 10, 11,11, 11, -1, -1, 11, 11, 11, 11, 11, -1, 11, 11, 11, 11, 11, 11, 11,11, 11, 11, 11, -1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,11, 11, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,12, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1,-1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, 13, 13, 13, 13,13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,-1, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, 13, 13, 13, 13, -1,13, -1, -1, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,14, 14, 14, 14, 14, 14]), array([-1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0,0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1, -1, -1, -1,  1,  1,  1,1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,1,  1,  1,  1,  1,  1,  1,  1, -1,  2,  2,  1,  1,  1,  1,  1,  1,1,  2,  2, -1, -1, -1, -1,  3,  2,  2,  3,  3,  3,  3,  3,  3,  3,3, -1, -1, -1, -1, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1, -1,-1,  3,  3,  3,  3,  3,  3,  3,  3,  3, -1, -1, -1, -1, -1, -1, -1,-1,  3,  3,  3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,-1, -1, -1, -1, -1, -1,  5, -1,  5,  5,  5,  5,  5,  5, -1, -1,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, -1,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, -1,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, -1,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,-1,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,5,  5,  5,  5, -1,  5, -1,  5,  5,  5, -1,  5,  5,  5,  5,  5,  5,5,  6,  6, -1,  6, -1, -1, -1, -1, -1,  7,  7, -1, -1,  6,  6,  6,6,  6,  6,  6,  6,  6,  6, -1, -1,  6,  6, -1, -1, -1, -1,  6,  8,8,  6, -1, -1, -1,  7,  7,  7,  7,  7,  7,  7,  9, -1,  8,  8,  8,8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,-1,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,9,  9,  9,  9,  8,  8,  8,  8,  8,  8, -1, -1, -1, -1,  9,  9,  9,9, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,10, 10, 10, 10, 10, 10, 10, 11, 10, 10, 11, 11, 11, 11, 11, 11, -1,11, 12, 12, 12, 12, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1,10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1, 10, 10, 10,10, 10, 10, 10, 10, 10, -1, 10, 10, 10, 10, 10, 10, 10, 12, 10, 12,12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1,12, 12, 10, 10, -1, -1, -1, -1, -1, -1, -1, 10, 13, 13, 13, 13, 13,13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, -1,-1, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, 13, 13, 13, 13, -1,13, 13, -1, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,14, 14, 14, 14, 14, 14])]}
================================
{15: [{'eps': 1.0547224554276167, 'minpts': 5.916243654822335}, {'eps': 1.238569836383374, 'minpts': 8.055837563451776}]}

上述结果显示了Multi-DBSCAN在15聚类情况下的2种可能

时序聚类测试

再测试时序聚类,选取东海、渤海、黄海等海面温等数据,样本比较少,60多个,每个样本单要素,2800+时间点(已降维成100),形成shape=(60+,100)的数据

import time
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics.pairwise as pairwise
from sklearn.cluster import DBSCAN
from common.my_clustering import Adapter_DBSCAN
from sklearn.metrics.cluster import \silhouette_score as sklearn_silhouette_score# 自动根据聚类个数布局行列
def multip_decomposition(num_cluster):s = math.sqrt(num_cluster)col = math.ceil(s)row = int(s)flag = Falsewhile(col * row < num_cluster):if flag:row += 1flag = Trueelse:col += 1flag = Falsereturn row, coldef test_multi_dbscan(config):X = np.loadtxt(config["data_path"],dtype=np.float32)adb = Adapter_DBSCAN(num_cluster_range=(2,30))t0 = time.time()adb.fit(X)t1 = time.time()predict_dict = adb.get_predict_dict()info_dict = adb.get_info_dict()for num_cluster, labels, infos in zip(predict_dict.keys(),predict_dict.values(),info_dict.values()):for possible,y_pred in enumerate(labels):# 时序图横着放更好看,所以是col,row而不是row,colcol,row = multip_decomposition(num_cluster + 2)y_pred_valid = []X_valid = []for yi in range(num_cluster):plt.subplot(row, col, yi + 1)clusters_yi = X[y_pred == yi]for xs in clusters_yi:X_valid.append(xs.tolist())y_pred_valid.append(yi)for xx in clusters_yi:plt.plot(xx.ravel(), "k-", alpha=.3)plt.plot(np.mean(clusters_yi,axis=0).ravel(), "r-")X_valid = np.array(X_valid)y_pred_valid = np.array(y_pred_valid)print(X_valid.shape)print(y_pred_valid.shape)plt.subplot(row, col, num_cluster + 1)clusters_noise = X[y_pred == -1]for xx in clusters_noise:plt.plot(xx.ravel(), "k-", alpha=.3)plt.plot(np.mean(clusters_noise,axis=0).ravel(), "r-")plt.text(0.55, 0.85,'noise',transform=plt.gca().transAxes)plt.subplot(row, col, num_cluster + 2)clusters_uncertain = X[y_pred == -2]for xx in clusters_uncertain:plt.plot(xx.ravel(), "k-", alpha=.3)plt.plot(np.mean(clusters_uncertain,axis=0).ravel(), "r-")plt.text(0.55, 0.85,'uncertain',transform=plt.gca().transAxes)plt.tight_layout()# prefix, suffix = filepath.split(".")# prefix = data_process.exp_replace(prefix,"exp4") # img_save_path = "{}_{}_{}minpts_{}clusters_possible{}.png".format(\#   prefix,config["test_name"],"%.2f" % infos[possible]["minpts"],num_cluster,possible)# logger.log_img("聚类结果图: {}".format(img_save_path),img_save_path)# 自己找地方保存plt的图像# 轮廓系数t2 = time.time()dists = pairwise.pairwise_distances(X_valid,metric=config["cdists_method"],n_jobs=-1)score = sklearn_silhouette_score(dists,y_pred_valid,metric=config["method"])t3 = time.time()# logger.log("silhouette_score="+str(score)+", time cost: "+str(t3 - t2)+"s\n\n")# 自己找地方保存轮廓系数# 测试
filepath=...
config = {"method":"precomputed","cdists_method":"l2", "test_name":"multidbscan", "data_path":filepath,"seed":0}
test_multi_dbscan(config)

由于该部分数据海域选取的地理点较近,且样本较少,较杂且较难区分

实验结果:

实验日志显示除noise外的数据仍有较好的轮廓系数0.5+,说明聚类结果算理想(相比于KMeans系列轮廓系数只有0.2+):