Python源码示例:sklearn.metrics.silhouette_samples()

示例1
def test_knn(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Plot how the scores change with the ``knn`` argument of ``assemble``.

    For each neighbor count in a fixed list, the datasets are integrated
    with ``assemble`` (``sigma`` fixed at 150), the integrated matrices are
    concatenated, and ``sil`` is evaluated on the rows selected by ``idx``.
    Every result is appended to ``distr`` and its tick label to ``xlabels``
    (both lists are mutated in place). A t-test against each distribution
    that was already in ``distr`` on entry is printed, and a box plot of all
    distributions is written to ``param_sensitivity_knn.svg``.

    Args:
        datasets_dimred: Per-dataset matrices; a shallow copy is handed to
            ``assemble``.
        genes: Unused by this function.
        labels: Labels, indexed with ``idx``.
        idx: Row selector for the samples that are scored.
        distr: List of score distributions; extended in place.
        xlabels: Tick labels matching ``distr``; extended in place.
    """
    knns = [5, 10, 50, 100]
    # Only distributions present before this call serve as baselines.
    len_distr = len(distr)
    for knn in knns:
        integrated = assemble(datasets_dimred[:], knn=knn, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            # NOTE(review): this tests the raw integrated coordinates, not
            # the scores just appended, against the baseline -- confirm
            # that this is intended.
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(knn))
    print('')

    plt.figure()
    # whis=(0, 100) puts the whiskers at the data min/max. The 'range'
    # string that used to request this is no longer accepted by current
    # matplotlib releases.
    plt.boxplot(distr, showmeans=True, whis=(0, 100))
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('knn'))
示例2
def test_sigma(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Plot how the scores change with the ``sigma`` argument of ``assemble``.

    For each sigma in a fixed list, the datasets are integrated with
    ``assemble``, the integrated matrices are concatenated, and ``sil`` is
    evaluated on the rows selected by ``idx``. Every result is appended to
    ``distr`` and its tick label to ``xlabels`` (both lists are mutated in
    place). A t-test against each distribution that was already in ``distr``
    on entry is printed, and a box plot of all distributions is written to
    ``param_sensitivity_sigma.svg``.

    Args:
        datasets_dimred: Per-dataset matrices; a shallow copy is handed to
            ``assemble``.
        genes: Unused by this function.
        labels: Labels, indexed with ``idx``.
        idx: Row selector for the samples that are scored.
        distr: List of score distributions; extended in place.
        xlabels: Tick labels matching ``distr``; extended in place.
    """
    sigmas = [10, 50, 100, 200]
    # Only distributions present before this call serve as baselines.
    len_distr = len(distr)
    for sigma in sigmas:
        integrated = assemble(datasets_dimred[:], sigma=sigma)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            # NOTE(review): this tests the raw integrated coordinates, not
            # the scores just appended, against the baseline -- confirm
            # that this is intended.
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(sigma))
    print('')

    plt.figure()
    # whis=(0, 100) puts the whiskers at the data min/max. The 'range'
    # string that used to request this is no longer accepted by current
    # matplotlib releases.
    plt.boxplot(distr, showmeans=True, whis=(0, 100))
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('sigma'))
示例3
def test_alpha(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Plot how the scores change with the ``alpha`` argument of ``assemble``.

    For each alpha in a fixed list, the datasets are integrated with
    ``assemble`` (``sigma`` fixed at 150), the integrated matrices are
    concatenated, and ``sil`` is evaluated on the rows selected by ``idx``.
    Every result is appended to ``distr`` and its tick label to ``xlabels``
    (both lists are mutated in place). A t-test against each distribution
    that was already in ``distr`` on entry is printed, and a box plot of all
    distributions is written to ``param_sensitivity_alpha.svg``.

    Args:
        datasets_dimred: Per-dataset matrices; a shallow copy is handed to
            ``assemble``.
        genes: Unused by this function.
        labels: Labels, indexed with ``idx``.
        idx: Row selector for the samples that are scored.
        distr: List of score distributions; extended in place.
        xlabels: Tick labels matching ``distr``; extended in place.
    """
    alphas = [0, 0.05, 0.20, 0.50]
    # Only distributions present before this call serve as baselines.
    len_distr = len(distr)
    for alpha in alphas:
        integrated = assemble(datasets_dimred[:], alpha=alpha, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            # NOTE(review): this tests the raw integrated coordinates, not
            # the scores just appended, against the baseline -- confirm
            # that this is intended.
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(alpha))
    print('')

    plt.figure()
    # whis=(0, 100) puts the whiskers at the data min/max. The 'range'
    # string that used to request this is no longer accepted by current
    # matplotlib releases.
    plt.boxplot(distr, showmeans=True, whis=(0, 100))
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('alpha'))
示例4
def test_approx(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Score an ``assemble`` run with ``approx=False`` and plot it.

    The datasets are integrated once with ``assemble`` (``approx=False``,
    ``sigma`` fixed at 150), the integrated matrices are concatenated, and
    ``sil`` is evaluated on the rows selected by ``idx``. The result is
    appended to ``distr`` and the label ``'Exact NN'`` to ``xlabels`` (both
    lists are mutated in place). A t-test against every entry of ``distr``
    is printed, and a box plot of all distributions is written to
    ``param_sensitivity_approx.svg``.

    Args:
        datasets_dimred: Per-dataset matrices; a shallow copy is handed to
            ``assemble``.
        genes: Unused by this function.
        labels: Labels, indexed with ``idx``.
        idx: Row selector for the samples that are scored.
        distr: List of score distributions; extended in place.
        xlabels: Tick labels matching ``distr``; extended in place.
    """
    integrated = assemble(datasets_dimred[:], approx=False, sigma=150)
    X = np.concatenate(integrated)
    distr.append(sil(X[idx, :], labels[idx]))
    # NOTE(review): the length is taken after the append, so the loop below
    # also visits the distribution that was just added. The sibling tests
    # take the length before appending -- confirm which is intended.
    len_distr = len(distr)
    for d in distr[:len_distr]:
        # NOTE(review): this tests the raw integrated coordinates, not the
        # scores just appended -- confirm that this is intended.
        print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
    xlabels.append('Exact NN')
    print('')

    plt.figure()
    # whis=(0, 100) puts the whiskers at the data min/max. The 'range'
    # string that used to request this is no longer accepted by current
    # matplotlib releases.
    plt.boxplot(distr, showmeans=True, whis=(0, 100))
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('approx'))
示例5
def test_perplexity(datasets_dimred, genes, labels, idx,
                    distr, xlabels):
    """Plot how the scores change with the ``perplexity`` of ``fit_tsne``.

    The datasets are concatenated once. For each perplexity in a fixed list
    an embedding is computed with ``fit_tsne`` and ``sil`` is evaluated on
    the embedding rows selected by ``idx``. Every result is appended to
    ``distr`` and its tick label to ``xlabels`` (both lists are mutated in
    place). A t-test against each distribution that was already in ``distr``
    on entry is printed, and a box plot of all distributions is written to
    ``param_sensitivity_perplexity.svg``.

    Args:
        datasets_dimred: Per-dataset matrices that are concatenated.
        genes: Unused by this function.
        labels: Labels, indexed with ``idx``.
        idx: Row selector for the samples that are scored.
        distr: List of score distributions; extended in place.
        xlabels: Tick labels matching ``distr``; extended in place.
    """
    X = np.concatenate(datasets_dimred)

    perplexities = [10, 100, 500, 2000]
    # Only distributions present before this call serve as baselines.
    len_distr = len(distr)
    for perplexity in perplexities:
        embedding = fit_tsne(X, perplexity=perplexity)
        distr.append(sil(embedding[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            # NOTE(review): this tests the un-embedded input ``X`` (which is
            # identical in every iteration), not the embedding or its
            # scores -- confirm that this is intended.
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(perplexity))
    print('')

    plt.figure()
    # whis=(0, 100) puts the whiskers at the data min/max. The 'range'
    # string that used to request this is no longer accepted by current
    # matplotlib releases.
    plt.boxplot(distr, showmeans=True, whis=(0, 100))
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('perplexity'))
示例6
def gen_eval(self, output_dir, assigned_clusters, quick=False):
        """Compute per-sample silhouette values, store their mean, and plot.

        In quick mode only ``self.silhouette_avg`` is set (to 0) and nothing
        else happens. Otherwise ``self.silhouette_values`` is filled by
        ``silhouette_samples`` -- from ``self.distances`` as a precomputed
        distance matrix when it is available, else from the instance feature
        values -- ``self.silhouette_avg`` is set to their mean, and
        ``self.dispaly_silhouette`` is invoked.

        Args:
            output_dir: Forwarded to ``self.dispaly_silhouette``.
            assigned_clusters: Cluster label of each sample.
            quick: When true, skip the silhouette computation.
        """
        if quick:
            # Quick mode: record a placeholder average and stop here.
            self.silhouette_avg = 0
            return

        if self.distances is None:
            # No distance matrix: score directly on the feature values.
            feature_values = self.instances.features.get_values()
            self.silhouette_values = silhouette_samples(feature_values,
                                                        assigned_clusters)
        else:
            self.silhouette_values = silhouette_samples(
                self.distances, assigned_clusters, metric='precomputed')

        self.silhouette_avg = np.mean(self.silhouette_values)
        self.dispaly_silhouette(output_dir, assigned_clusters)

    # Code from a scikit-learn example:
    # Selecting the number of clusters with silhouette analysis on KMeans
    # clustering 
示例7
def fit(self, df, options):
        """Cluster the rows of ``df`` and merge labels and silhouette scores.

        Args:
            df: Input dataframe; a copy of it is passed to feature
                preparation.
            options: Mapping; its optional ``'output_name'`` entry names the
                cluster-label column (default ``'cluster'``).

        Returns:
            The result of ``df_util.merge_predictions`` applied to ``df`` and
            the two-column prediction frame (label, ``'silhouette_score'``).
        """
        # prepare_features receives a copy of the input and returns the
        # cleaned data together with the mask of null locations.
        X, nans, _columns = df_util.prepare_features(df.copy(),
                                                     self.feature_variables)

        # Cluster, then score every row against its assigned cluster.
        cluster_ids = self.estimator.fit_predict(X.values)
        row_silhouettes = silhouette_samples(X, cluster_ids)

        # One row per sample: [label, silhouette score].
        predictions = np.vstack([cluster_ids, row_silhouettes]).T

        # The as-clause value, when present, overrides the default name.
        label_column = options.get('output_name', 'cluster')
        column_names = [label_column, 'silhouette_score']

        # Re-expand the predictions using the null mask, then merge them
        # with the original input data.
        predictions_df = df_util.create_output_dataframe(predictions, nans,
                                                         column_names)
        return df_util.merge_predictions(df, predictions_df)
示例8
def test_gmm():
    """Print pyclust and scikit-learn silhouette results for comparison.

    Uses the module-level ``X`` and ``ypred``. Four values are printed in
    order: the first element of the pyclust score, the first ten pyclust
    per-sample scores, the scikit-learn overall score, and the first ten
    scikit-learn per-sample scores.
    """
    validator = pyclust.validate.Silhouette()
    overall = validator.score(X, ypred, sample_size=None)

    # pyclust results.
    print(overall[0])
    print(validator.sample_scores[:10])

    # scikit-learn results for the same data and labels.
    print(silhouette_score(X, ypred, sample_size=None))
    print(silhouette_samples(X, ypred)[:10])
示例9
def test_silhouette_samples(self):
        """Check the frame accessor against ``metrics.silhouette_samples``.

        The accessor result must be a ``pdml.ModelSeries`` that shares the
        frame's index and whose values match the reference computed from
        ``self.data`` and ``self.pred``.
        """
        actual = self.df.metrics.silhouette_samples()
        reference = metrics.silhouette_samples(self.data, self.pred)

        is_model_series = isinstance(actual, pdml.ModelSeries)
        self.assertTrue(is_model_series)
        tm.assert_index_equal(actual.index, self.df.index)
        self.assert_numpy_array_almost_equal(actual.values, reference)
示例10
def silhouette_confidence(audio_signal, features, num_sources, threshold=95, 
                          max_points=1000, **kwargs):
    """
    Measure the clusterability of a feature space via the silhouette score.

    The loud bins of the features are clustered with KMeans and the mean of
    the per-sample Silhouette Coefficients is returned. For one sample the
    coefficient is (b - a) / max(a, b), where a is the mean intra-cluster
    distance and b is the mean distance to the nearest cluster the sample
    does not belong to. The coefficient is only defined when
    2 <= n_labels <= n_samples - 1.

    References:

    Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition. 
    Diss. Northwestern University, 2019.

    Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the 
    Interpretation and Validation of Cluster Analysis". Computational and 
    Applied Mathematics 20: 53-65.

    Args:
        audio_signal (AudioSignal): AudioSignal object used to compute the
          mask over which the confidence measure is computed. May be None if
          and only if ``representation`` is passed as a keyword argument.
        features (np.ndarray): Features to be clustered. Should have the
          same dimensions as the representation.
        num_sources (int): Number of clusters for KMeans.
        threshold (int, optional): Loudness threshold. Points below it are
          excluded from the confidence measure. Defaults to 95.
        max_points (int, optional): Maximum number of points to score; a
          random subset (without replacement) is drawn when there are more,
          because the silhouette score is costly. Defaults to 1000.
        kwargs: Keyword arguments forwarded to `_get_loud_bins_mask`, e.g.
          ``representation``.

    Returns:
        float: Mean Silhouette Coefficient over the scored points.
    """
    loud_mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)

    # Flatten the selected bins to (num_points, embedding_dim).
    embedding_dim = features.shape[-1]
    points = features[loud_mask].reshape(-1, embedding_dim)

    num_points = points.shape[0]
    if num_points > max_points:
        # Subsample without replacement to bound the cost of scoring.
        candidates = np.arange(num_points)
        keep = np.random.choice(candidates, max_points, replace=False)
        points = points[keep]

    assignments = KMeans(num_sources).fit_predict(points)
    per_point_scores = silhouette_samples(points, assignments)
    return per_point_scores.mean()
示例11
def silhouette(adata_name, cluster_annot, value='X_pca', metric='euclidean',
               key_added=None, copy=False):
    """
    Compute silhouette scores for a cell clustering.

    Computes the overall silhouette score as well as one silhouette score
    per cell, according to the cluster assigned to that cell, and stores
    both on the AnnData object.

    Parameters
    ----------
    adata_name: AnnData object

    cluster_annot: key in ``obs`` holding the cell clustering

    value: key in ``obsm`` holding the coordinates to score
        (e.g. X_pca, X_tsne, X_umap)

    metric: distance metric forwarded to scikit-learn, default 'euclidean'

    key_added: key under which the scores are stored; when not given, the
        per-cell scores go to ``obs['silhouette_samples']`` and the overall
        score to ``uns['silhouette_samples_avg']``

    copy: when true, work on and return a copy instead of modifying
        ``adata_name`` in place

    Return
    ------
    The annotated copy when ``copy`` is true, otherwise an empty tuple.
    In both cases the overall score is written to ``uns`` and the per-cell
    scores to ``obs`` of the AnnData object.

    Credit to the scikit-learn example:
    https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
    """
    if copy:
        adata_name = adata_name.copy()

    X = adata_name.obsm[value]
    cluster_labels = adata_name.obs[cluster_annot]

    # ``metric`` is keyword-only in current scikit-learn releases; passing
    # it positionally raises a TypeError there.
    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    sample_silhouette_values = silhouette_samples(X, cluster_labels,
                                                  metric=metric)

    if key_added:
        # A user-supplied key is used for both the obs and the uns entry.
        obs_key = uns_key = key_added
    else:
        obs_key = 'silhouette_samples'
        uns_key = 'silhouette_samples_avg'
    adata_name.obs[obs_key] = sample_silhouette_values
    adata_name.uns[uns_key] = silhouette_avg

    if copy:
        return adata_name
    # Kept as an empty tuple (not None) so existing callers see the same
    # return value as before.
    return ()
示例12
def evaluate_scores(div_ent_code, sil_code, cell_labels, dataset_labels, num_datasets, div_ent_dim, sil_dim, sil_dist):
    """ Calculate three proposed evaluation metrics
    Args:
        div_ent_code: num_cells * num_features, embedding for divergence and entropy calculation, usually with dim of 2
        sil_code: num_cells * num_features, embedding for silhouette score calculation
        cell_labels: cell type label of each cell
        dataset_labels: dataset label of each cell; compared against the integers 1..num_datasets
        num_datasets: number of datasets
        div_ent_dim: if dimension of div_ent_code > div_ent_dim, apply PCA first
        sil_dim: if dimension of sil_code > sil_dim, apply PCA first
        sil_dist: distance metric for silhouette score calculation
    Returns:
        div_score: divergence score, averaged over both directions and all qualifying dataset pairs;
            -1 if no dataset pair had at least cal_min shared-type cells on both sides
        ent_score: entropy score, averaged over dataset pairs; -1 if there are no dataset specific cell types
        sil_score: silhouette score, averaged over all cells
    """
    # calculate divergence and entropy
    if div_ent_code.shape[1] > div_ent_dim:
        div_ent_code = PCA(n_components=div_ent_dim).fit_transform(div_ent_code)
    div_pq = []  # divergence dataset p, q
    div_qp = []  # divergence dataset q, p
    ent = []  # entropy
    # pairs of datasets
    for d1 in range(1, num_datasets + 1):
        for d2 in range(d1 + 1, num_datasets + 1):
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2
            # cell types present in both datasets of the pair
            labels = np.intersect1d(np.unique(cell_labels[idx1]), np.unique(cell_labels[idx2]))
            # membership mask computed once and reused for all three selections
            is_shared = np.isin(cell_labels, labels)
            idx1_mutual = np.logical_and(idx1, is_shared)
            idx2_mutual = np.logical_and(idx2, is_shared)
            idx_specific = np.logical_and(np.logical_or(idx1, idx2), np.logical_not(is_shared))
            # divergence, clipped below at 0; needs at least cal_min cells on both sides
            if np.sum(idx1_mutual) >= cal_min and np.sum(idx2_mutual) >= cal_min:
                div_pq.append(max(estimate(div_ent_code[idx1_mutual, :], div_ent_code[idx2_mutual, :], cal_min), 0))
                div_qp.append(max(estimate(div_ent_code[idx2_mutual, :], div_ent_code[idx1_mutual, :], cal_min), 0))
            # entropy, only when the pair has dataset specific cells
            if np.any(idx_specific):
                ent_tmp = cal_entropy(div_ent_code, idx_specific, dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))
    if len(ent) == 0:  # if no dataset specific cell types, store entropy as -1
        ent.append(-1)

    # calculate silhouette_score
    if sil_code.shape[1] > sil_dim:
        sil_code = PCA(n_components=sil_dim).fit_transform(sil_code)
    sil_scores = silhouette_samples(sil_code, cell_labels, metric=sil_dist)

    # average for scores
    if div_pq:
        div_score = (sum(div_pq) / len(div_pq) + sum(div_qp) / len(div_qp)) / 2
    else:
        # No pair qualified (div_pq and div_qp are always filled together):
        # use the same -1 sentinel as entropy instead of dividing by zero.
        div_score = -1
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)

    return div_score, ent_score, sil_score