Python source code examples: sklearn.metrics.silhouette_samples()
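All of the examples below use sklearn.metrics.silhouette_samples(X, labels), which returns the Silhouette Coefficient of every sample, in contrast to silhouette_score, which returns the mean over all samples. A minimal, self-contained sketch first (the toy data and cluster count here are illustrative, not taken from any example below):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Toy data: two well-separated blobs.
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.5, (50, 2)),
               rng.normal(5, 0.5, (50, 2))])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

per_sample = silhouette_samples(X, labels)  # shape (n_samples,), values in [-1, 1]
print(per_sample[:5])
print(per_sample.mean())            # equals silhouette_score(X, labels)
print(silhouette_score(X, labels))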
Example 1
def test_knn(datasets_dimred, genes, labels, idx, distr, xlabels):
    knns = [ 5, 10, 50, 100 ]
    len_distr = len(distr)
    for knn in knns:
        integrated = assemble(datasets_dimred[:], knn=knn, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(knn))
        print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('knn'))
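Note: in Examples 1 through 5, which come from the same parameter-sensitivity script, sil is an alias for silhouette_samples, presumably imported in the source project as:

from sklearn.metrics import silhouette_samples as sil

assemble and fit_tsne are helpers from that project, and ttest_ind comes from scipy.stats.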
Example 2
def test_sigma(datasets_dimred, genes, labels, idx, distr, xlabels):
    sigmas = [ 10, 50, 100, 200 ]
    len_distr = len(distr)
    for sigma in sigmas:
        integrated = assemble(datasets_dimred[:], sigma=sigma)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(sigma))
        print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('sigma'))
Example 3
def test_alpha(datasets_dimred, genes, labels, idx, distr, xlabels):
    alphas = [ 0, 0.05, 0.20, 0.50 ]
    len_distr = len(distr)
    for alpha in alphas:
        integrated = assemble(datasets_dimred[:], alpha=alpha, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(alpha))
        print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('alpha'))
Example 4
def test_approx(datasets_dimred, genes, labels, idx, distr, xlabels):
    integrated = assemble(datasets_dimred[:], approx=False, sigma=150)
    X = np.concatenate(integrated)
    distr.append(sil(X[idx, :], labels[idx]))
    len_distr = len(distr)
    for d in distr[:len_distr]:
        print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
    xlabels.append('Exact NN')
    print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('approx'))
Example 5
def test_perplexity(datasets_dimred, genes, labels, idx,
                    distr, xlabels):
    X = np.concatenate(datasets_dimred)
    perplexities = [ 10, 100, 500, 2000 ]
    len_distr = len(distr)
    for perplexity in perplexities:
        embedding = fit_tsne(X, perplexity=perplexity)
        distr.append(sil(embedding[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(perplexity))
        print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('perplexity'))
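fit_tsne in Example 5 is a helper from the source project rather than a scikit-learn function. A rough, hypothetical stand-in built on scikit-learn's own TSNE (the n_components and random_state choices here are assumptions):

from sklearn.manifold import TSNE

def fit_tsne(X, perplexity=30):
    # Hypothetical stand-in for the project's helper: embed X in two dimensions.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
    return tsne.fit_transform(X)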
Example 6
def gen_eval(self, output_dir, assigned_clusters, quick=False):
    if quick:
        self.silhouette_avg = 0
        return
    if self.distances is not None:
        self.silhouette_values = silhouette_samples(self.distances,
                                                    assigned_clusters,
                                                    metric='precomputed')
    else:
        features = self.instances.features.get_values()
        self.silhouette_values = silhouette_samples(features,
                                                    assigned_clusters)
    self.silhouette_avg = np.mean(self.silhouette_values)
    self.display_silhouette(output_dir, assigned_clusters)

# Code from a scikit-learn example:
# Selecting the number of clusters with silhouette analysis on KMeans clustering
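Example 6 shows the metric='precomputed' calling convention: instead of a feature matrix, silhouette_samples receives a square pairwise-distance matrix. A minimal sketch of that convention (the toy data is illustrative):

import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_samples

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
labels = np.array([0, 0, 1, 1])

# Square, symmetric distance matrix; silhouette is computed from it directly.
D = pairwise_distances(X, metric='euclidean')
print(silhouette_samples(D, labels, metric='precomputed'))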
Example 7
def fit(self, df, options):
    """Do the clustering & merge labels with original data."""
    # Make a copy of the input data
    X = df.copy()

    # Use the df_util prepare_features method to
    # - drop null columns & rows
    # - convert categorical columns into dummy indicator columns
    # X is our cleaned data, nans is a mask of the null value locations
    X, nans, columns = df_util.prepare_features(X, self.feature_variables)

    # Do the actual clustering
    y_hat = self.estimator.fit_predict(X.values)

    # Attach the silhouette coefficient score for each row
    silhouettes = silhouette_samples(X, y_hat)

    # Combine the two arrays and transpose them
    y_hat = np.vstack([y_hat, silhouettes]).T

    # Assign default output names
    default_name = 'cluster'

    # Get the value from the as-clause if present
    output_name = options.get('output_name', default_name)

    # There are two columns - one for the labels, one for the silhouette scores
    output_names = [output_name, 'silhouette_score']

    # Use the predictions & nans-mask to create a new dataframe
    output_df = df_util.create_output_dataframe(y_hat, nans, output_names)

    # Merge the dataframe with the original input data
    df = df_util.merge_predictions(df, output_df)
    return df
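Example 7 depends on Splunk MLTK's df_util helpers, which are not shown here. The same pattern of attaching a cluster label and a per-row silhouette score to a DataFrame can be sketched with plain pandas (column names and data are illustrative):

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

df = pd.DataFrame({'x': [0.0, 0.2, 5.0, 5.2],
                   'y': [0.0, 0.1, 5.0, 5.1]})
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(df.values)

# One new column per output: the cluster label and its silhouette coefficient.
df['cluster'] = labels
df['silhouette_score'] = silhouette_samples(df[['x', 'y']].values, labels)
print(df)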
Example 8
def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)
    print(sil_score[0])
    print(sil.sample_scores[:10])

    print(silhouette_score(X, ypred, sample_size=None))
    print(silhouette_samples(X, ypred)[:10])
Example 9
def test_silhouette_samples(self):
    result = self.df.metrics.silhouette_samples()
    expected = metrics.silhouette_samples(self.data, self.pred)

    self.assertTrue(isinstance(result, pdml.ModelSeries))
    tm.assert_index_equal(result.index, self.df.index)
    self.assert_numpy_array_almost_equal(result.values, expected)
Example 10
def silhouette_confidence(audio_signal, features, num_sources, threshold=95,
                          max_points=1000, **kwargs):
    """
    Uses the silhouette score to compute the clusterability of the feature space.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (a) and the mean nearest-cluster distance (b) for each sample.
    The Silhouette Coefficient for a sample is (b - a) / max(a, b). To clarify,
    b is the distance between a sample and the nearest cluster that the sample
    is not a part of. Note that the Silhouette Coefficient is only defined if
    the number of labels satisfies 2 <= n_labels <= n_samples - 1.

    References:

    Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition.
    Diss. Northwestern University, 2019.

    Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the
    Interpretation and Validation of Cluster Analysis”. Computational and
    Applied Mathematics 20: 53-65.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
            the mask over which to compute the confidence measure. This can be None
            if and only if ``representation`` is passed as a keyword argument to this
            function.
        features (np.ndarray): Numpy array containing the features to be clustered.
            Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold
            are excluded from being used in the confidence measure. Defaults to 95.
        max_points (int, optional): Maximum number of points to compute the silhouette
            score for, as it is a costly operation. Defaults to 1000.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, ``representation``
            can go here as a keyword argument.

    Returns:
        float: Confidence given by the silhouette score.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    if features.shape[0] > max_points:
        idx = np.random.choice(
            np.arange(features.shape[0]), max_points,
            replace=False)
        features = features[idx]

    kmeans = KMeans(num_sources)
    labels = kmeans.fit_predict(features)
    confidence = silhouette_samples(features, labels)
    return confidence.mean()
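The (b - a) / max(a, b) formula in the docstring above can be verified by hand against silhouette_samples on a tiny one-dimensional example (all numbers here are illustrative):

import numpy as np
from sklearn.metrics import silhouette_samples

X = np.array([[0.0], [1.0], [10.0], [11.0]])
labels = np.array([0, 0, 1, 1])

# For sample 0: a = mean distance to its own cluster (excluding itself),
# b = mean distance to the nearest cluster it does not belong to.
a = np.mean([abs(0.0 - 1.0)])                    # 1.0
b = np.mean([abs(0.0 - 10.0), abs(0.0 - 11.0)])  # 10.5
manual = (b - a) / max(a, b)                     # about 0.905

print(manual)
print(silhouette_samples(X, labels)[0])  # should match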
Example 11
def silhouette(adata_name, cluster_annot, value='X_pca', metric='euclidean',
               key_added=None, copy=False):
    """
    Compute silhouette scores.

    Computes the overall silhouette score as well as a silhouette score for every
    cell, according to the cell cluster assigned to it.

    Parameters
    ----------
    adata_name: AnnData object
    cluster_annot: observational variable corresponding to a cell clustering
    value: embedding used to compute the scores ('X_pca', 'X_tsne' or 'X_umap')
    metric: distance metric, 'euclidean' by default
    key_added: key under which to store the computed silhouette scores
    copy: if True, operate on and return a copy of the AnnData object

    Returns
    -------
    overall silhouette score, stored in 'uns' of the AnnData object
    individual silhouette scores, stored in 'obs' of the AnnData object

    Adapted from the scikit-learn example:
    https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
    """
    if copy:
        adata_name = adata_name.copy()

    X = adata_name.obsm[value]
    cluster_labels = adata_name.obs[cluster_annot]
    n_clusters = len(set(adata_name.obs[cluster_annot]))

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric=metric)

    if key_added:
        adata_name.obs[key_added] = sample_silhouette_values
        adata_name.uns[key_added] = silhouette_avg
    else:
        adata_name.obs['silhouette_samples'] = sample_silhouette_values
        adata_name.uns['silhouette_samples_avg'] = silhouette_avg

    if copy:
        return adata_name
Example 12
def evaluate_scores(div_ent_code, sil_code, cell_labels, dataset_labels,
                    num_datasets, div_ent_dim, sil_dim, sil_dist):
    """ Calculate three proposed evaluation metrics.

    Args:
        div_ent_code: num_cells * num_features, embedding for divergence and
            entropy calculation, usually with dim of 2
        sil_code: num_cells * num_features, embedding for silhouette score calculation
        cell_labels: cell-type label of each cell
        dataset_labels: dataset label of each cell
        num_datasets: number of datasets
        div_ent_dim: if dimension of div_ent_code > div_ent_dim, apply PCA first
        sil_dim: if dimension of sil_code > sil_dim, apply PCA first
        sil_dist: distance metric for silhouette score calculation
    Returns:
        div_score: divergence score
        ent_score: entropy score
        sil_score: silhouette score
    """
    # Note: cal_min, estimate and cal_entropy are defined elsewhere in the source module.
    # calculate divergence and entropy
    if div_ent_code.shape[1] > div_ent_dim:
        div_ent_code = PCA(n_components=div_ent_dim).fit_transform(div_ent_code)
    div_pq = []  # divergence between datasets p and q
    div_qp = []  # divergence between datasets q and p
    ent = []  # entropy
    # iterate over pairs of datasets
    for d1 in range(1, num_datasets + 1):
        for d2 in range(d1 + 1, num_datasets + 1):
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2
            labels = np.intersect1d(np.unique(cell_labels[idx1]),
                                    np.unique(cell_labels[idx2]))
            idx1_mutual = np.logical_and(idx1, np.isin(cell_labels, labels))
            idx2_mutual = np.logical_and(idx2, np.isin(cell_labels, labels))
            idx_specific = np.logical_and(np.logical_or(idx1, idx2),
                                          np.logical_not(np.isin(cell_labels, labels)))
            # divergence
            if np.sum(idx1_mutual) >= cal_min and np.sum(idx2_mutual) >= cal_min:
                div_pq.append(max(estimate(div_ent_code[idx1_mutual, :],
                                           div_ent_code[idx2_mutual, :], cal_min), 0))
                div_qp.append(max(estimate(div_ent_code[idx2_mutual, :],
                                           div_ent_code[idx1_mutual, :], cal_min), 0))
            # entropy
            if sum(idx_specific) > 0:
                ent_tmp = cal_entropy(div_ent_code, idx_specific, dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))
    if len(ent) == 0:  # if there are no dataset-specific cell types, store entropy as -1
        ent.append(-1)

    # calculate silhouette score
    if sil_code.shape[1] > sil_dim:
        sil_code = PCA(n_components=sil_dim).fit_transform(sil_code)
    sil_scores = silhouette_samples(sil_code, cell_labels, metric=sil_dist)

    # average the scores
    div_score = (sum(div_pq) / len(div_pq) + sum(div_qp) / len(div_qp)) / 2
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)
    return div_score, ent_score, sil_score