I am using the Iris dataset with DBSCAN clustering in scikit-learn to learn how to cluster the different data points in the dataset, and then to color the clustered data points according to the DBSCAN model trained on the dataset, using matplotlib in Python 3.
My code is as follows -
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
# Load CSV dataset-
iris_data = pd.read_csv("iris.csv")
# Get dimension of dataset-
iris_data.shape
# (150, 5)
# Get data types of all attributes in dataset-
iris_data.dtypes
'''
sepallength float64
sepalwidth float64
petallength float64
petalwidth float64
class object
dtype: object
'''
# Do label encoding for 'class' attribute-
le = LabelEncoder()
encoded_class = le.fit_transform(iris_data['class'])
# Delete 'class' attribute-
iris_data.drop('class', axis = 1, inplace=True)
# Add 'encoded_class' attribute-
iris_data['encoded_class'] = encoded_class
# Create an instance of DBSCAN with default values for
# 'eps' and 'min_samples' parameters-
dbscan = DBSCAN()
# Check default parameters being used-
dbscan
'''
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
metric_params=None, min_samples=5, n_jobs=None, p=None)
'''
# Train the dataset using the DBSCAN clustering algorithm-
dbscan.fit(iris_data)
# To check outcome of DBSCAN clustering algorithm-
dbscan.labels_
# Noisy samples are given the label -1
# To get the unique labels assigned to the different data points in the dataset-
np.unique(dbscan.labels_)
# array([-1,  0,  1,  2])
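# As an aside (a small sketch using only the labels computed above), the number
# of clusters found can be derived directly; label -1 marks noise and is excluded-
n_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
print("Estimated number of clusters: {0}".format(n_clusters))
# Estimated number of clusters: 3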
# A dict to count number of data points assigned to different labels by DBSCAN algorithm-
# label : number of data points assigned to label
count_elements = {}

count_elements[-1] = 0
count_elements[0] = 0
count_elements[1] = 0
count_elements[2] = 0

for i in dbscan.labels_:
    if i == -1:
        count_elements[-1] += 1
    elif i == 0:
        count_elements[0] += 1
    elif i == 1:
        count_elements[1] += 1
    elif i == 2:
        count_elements[2] += 1

count_elements
# {-1: 18, 0: 49, 1: 44, 2: 39}
print("\nNumber of data points and their computed labels are:\n{0}\n".format(count_elements))
'''
Number of data points and their computed labels are:
{-1: 18, 0: 49, 1: 44, 2: 39}
'''
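# The same per-label counts can also be obtained more compactly, for example with
# collections.Counter (an alternative sketch, equivalent to the counting loop above)-
from collections import Counter
print(dict(Counter(dbscan.labels_)))
# {-1: 18, 0: 49, 1: 44, 2: 39}  (key order may differ)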
# Visualize iris dataset using 'petallength' and 'petalwidth' attributes-
plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c=iris_data['encoded_class'])
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.show()
plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = dbscan.labels_)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.legend(list(dbscan.labels_))
plt.show()
for i in range(iris_data.shape[0]):
    if dbscan.labels_[i] == 0:
        c1 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'green', marker = '+')
    elif dbscan.labels_[i] == 1:
        c2 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'red', marker = 'o')
    elif dbscan.labels_[i] == 2:
        c3 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'blue', marker = '*')
    elif dbscan.labels_[i] == -1:
        c4 = plt.scatter(iris_data['petallength'], iris_data['petalwidth'], c = 'black', marker = '.')

plt.legend([c1, c2, c3, c4], ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Noise'])
plt.title("DBSCAN clustering finds 3 clusters and noise")
plt.show()
The last block of code, which is meant to visualize the cluster assigned to each data point, does not work. What's wrong?
Thanks
import hdbscan

data = np.load('clusterable_data.npy')
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, prediction_data=True).fit(data)
# Colour each point by its cluster label, desaturated by its membership probability-
pal = sns.color_palette('deep', 8)
colors = [sns.desaturate(pal[col], sat)
          for col, sat in zip(clusterer.labels_, clusterer.probabilities_)]
# 'plot_kwds' is assumed to be a dict of extra plt.scatter keyword arguments defined elsewhere-
plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds);
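For comparison, here is a minimal sketch of drawing each DBSCAN cluster with its own scatter call by boolean-masking the rows carrying that label, rather than looping over individual points; it reuses iris_data and dbscan.labels_ from the code above, and the colours, markers and names are arbitrary illustrations -

labels = np.asarray(dbscan.labels_)
styles = {0: ('green', '+'), 1: ('red', 'o'), 2: ('blue', '*'), -1: ('black', '.')}
names = {0: 'Cluster 1', 1: 'Cluster 2', 2: 'Cluster 3', -1: 'Noise'}

for lab, (colour, marker) in styles.items():
    mask = (labels == lab)  # boolean mask selecting the rows with this label
    plt.scatter(iris_data['petallength'][mask], iris_data['petalwidth'][mask],
                c=colour, marker=marker, label=names[lab])

plt.legend()
plt.title("DBSCAN clustering finds 3 clusters and noise")
plt.show()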