提问者:小点点

使用Python实现DBSCAN可视化


我在SKL中使用Iris数据集和DBSCAN群集学习如何对数据集中的不同数据点进行群集,然后根据使用Python 3中的matplotlib在数据集上训练的DBSCAN对群集数据点进行着色。

我的密码如下-

import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns


# Load CSV dataset-
iris_data = pd.read_csv("iris.csv")

# Get dimension of dataset-
iris_data.shape
# (150, 5)

# Get data types of all attributes in dataset-
iris_data.dtypes
'''
sepallength    float64
sepalwidth     float64
petallength    float64
petalwidth     float64
class           object
dtype: object
'''

# Do label encoding for 'class' attribute-
le = LabelEncoder()
encoded_class = le.fit_transform(iris_data['class'])

# Delete 'class' attribute-
iris_data.drop('class', axis = 1, inplace=True)

# Add 'encoded_class' attribute-
iris_data['encoded_class'] = encoded_class


# Create an instance of DBSCAN with default values for
# 'eps' and 'min_samples' parameters-
dbscan = DBSCAN()

# Check default parameters being used-
dbscan
'''
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)
'''

# Train the dateset using DBSCAN clustering algorithm-
dbscan.fit(iris_data)

# To check outcome of DBSCAN clustering algorithm-
dbscan.labels_
# Noisy samples are given the label -1

# To get count of unique labels assigned to the different data points in dataset-
np.unique(dbscan.labels_)
array([-1, 0, 1, 2])

# A dict to count number of data points assigned to different labels by DBSCAN algorithm-
# label : number of data points assigned to label
count_elements = {}

count_elements[-1] = 0
count_elements[0] = 0
count_elements[1] = 0
count_elements[2] = 0

for i in dbscan.labels_:
    if i == -1:
        count_elements[-1] += 1
    elif i == 0:
        count_elements[0] += 1
    elif i == 1:
        count_elements[1] += 1
    elif i == 2:
        count_elements[2] += 1


count_elements
# {-1: 18, 0: 49, 1: 44, 2: 39}



print("\nNumber of data points and their computed labels are:\n{0}\n".format(count_elements))
'''
Number of data points and their computed labels are:
{-1: 18, 0: 49, 1: 44, 2: 39}
'''


# Visualize iris dataset using 'petallength' and 'petalwidth' attributes-
plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c=iris_data['encoded_class'])

plt.xlabel('petal length')
plt.ylabel('petal width')
plt.show()


plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = dbscan.labels_)

plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.legend(list(dbscan.labels_))
plt.show()


for i in range(iris_data.shape[0]):
    if dbscan.labels_[i] == 0:
        c1 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'green', marker = '+')
    elif dbscan.labels_[i] == 1:
        c2 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'red', marker = 'o')
    elif dbscan.labels_[i] == 2:
        c3 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'blue', marker = '*')
    elif dbscan.labels_[i] == -1:
        c4 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'black', marker = '.') 

plt.legend([c1, c2, c3, c4], ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Noise'])
plt.title("DBSCAN clustering finds 3 clusters and noise")
plt.show()

用于可视化分配给每个数据点的簇的最后一个代码不起作用。怎么了?

谢谢


共1个答案

匿名用户

data = np.load('clusterable_data.npy')
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(data)
pal = sns.color_palette('deep', 8)
colors = [sns.desaturate(pal[col], sat) for col, sat in zip(clusterer.labels_,
                                                            clusterer.probabilities_)]
plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);