Python源码示例:sklearn.metrics.mutual_info_score()

示例1
def ami(x, y=None, n_bins=10):
    """Return the average mutual information of two time series.

    The joint distribution of $x(t)$ and $y(t)$ is estimated with a
    two-dimensional histogram, which is then handed to
    `sklearn.metrics.mutual_info_score` as a contingency table.

    Parameters
    ----------
    x : array-like
    y : array-like, optional
        The series $x(t)$ and $y(t)$.
        When `y` is omitted, `x` must have two columns: the first is
        taken as $x(t)$ and the second as $y(t)$.
    n_bins : int
        Number of bins used for the joint histogram.

    Returns
    -------
    scalar
        Average mutual information between $x(t)$ and $y(t)$, in nats
        (i.e. computed with the natural logarithm).

    Raises
    ------
    ValueError
        If the two series do not have the same length.

    See Also
    --------
    lagged_ami

    References
    ----------
    Abarbanel, H. D. I. (1996). *Analysis of Observed Chaotic Data* (p. 28). New York: Springer.

    """
    x, y = _vector_pair(x, y)
    if x.shape[0] != y.shape[0]:
        raise ValueError('timeseries must have the same length')

    # Only the bin counts are needed; the bin edges are discarded.
    joint_counts, _, _ = np.histogram2d(x, y, bins=n_bins)
    return metrics.mutual_info_score(None, None, contingency=joint_counts)
示例2
def mutual_information(x, y, bins=8):
    """Compute the mutual information of two features via binning.

    Thin wrapper around `sklearn.metrics.mutual_info_score`: the inputs
    are binned with `np.histogram2d` and the resulting counts are used as
    the contingency table.
    Credit: `Warren Weckesser <https://stackoverflow.com/a/20505476/3996580>`_.

    Parameters
    ----------
    x : array-like, shape=[n_samples]
        Input data (feature 1)
    y : array-like, shape=[n_samples]
        Input data (feature 2)
    bins : int or array-like, (default: 8)
        Bin specification forwarded to `np.histogram2d`.

    Returns
    -------
    mi : float
        Mutual information between x and y.

    Examples
    --------
    >>> import scprep
    >>> data = scprep.io.load_csv("my_data.csv")
    >>> mi = scprep.stats.mutual_information(data['GENE1'], data['GENE2'])
    """
    x, y = _vector_coerce_two_dense(x, y)
    # histogram2d returns (counts, x_edges, y_edges); only counts are used.
    contingency, _, _ = np.histogram2d(x, y, bins)
    return metrics.mutual_info_score(None, None, contingency=contingency)
示例3
def calc_MI(x, y, bins):
    """Return the mutual information of `x` and `y` after 2-D binning.

    `bins` is forwarded unchanged to `np.histogram2d`; the resulting
    count matrix is passed to `mutual_info_score` as the contingency table.
    """
    joint_counts, _, _ = np.histogram2d(x, y, bins)
    return mutual_info_score(None, None, contingency=joint_counts)
示例4
def test_mutual_info_score(self):
    """Check the `df.metrics` accessor against a direct sklearn call.

    The accessor result must equal `metrics.mutual_info_score` computed
    from the fixture's `target` and `pred` attributes.
    """
    self.assertEqual(
        self.df.metrics.mutual_info_score(),
        metrics.mutual_info_score(self.target, self.pred),
    )
示例5
def permutation_test_ct2(data, num_samples=10000):
    """
    Monte-Carlo permutation test for a 2-way contingency table

    The table is expanded into one (row label, column label) pair per
    observation. The mutual information of the observed pairing is then
    compared with the mutual information obtained after randomly
    shuffling the row labels `num_samples` times.

    Parameters
    ----------
    data : array-like, shape=[n_rows, n_cols]
        the contingency table (pandas DataFrame, numpy array or nested
        sequence) holding non-negative integer counts. Integer-valued
        floats such as ``3.0`` are accepted.

    num_samples : int
        the number of random permutations to perform

    Returns
    -------
    pval : float
        the p-value: the fraction of permutations whose mutual information
        strictly exceeds the observed one, floored at ``1 / num_samples``
        so that it is never reported as exactly zero.

    Raises
    ------
    ValueError
        If `data` is not two-dimensional or contains negative or
        non-integer entries.

    References
    ----------
    https://en.wikipedia.org/wiki/Resampling_(statistics)
    """
    # np.asarray accepts DataFrames, ndarrays and nested sequences alike.
    table = np.asarray(data)
    if table.ndim != 2:
        raise ValueError("data must be a 2-way (two-dimensional) contingency table")

    # Counts are used as repetition factors, so they must be whole and >= 0.
    counts = table.astype(np.int64)
    if not np.array_equal(counts, table) or (counts < 0).any():
        raise ValueError("contingency table must contain non-negative integer counts")

    # Expand every cell (i, j) into counts[i, j] observations labelled (i, j),
    # in row-major order.
    row_idx, col_idx = np.indices(counts.shape)
    flat_counts = counts.ravel()
    labels_x = np.repeat(row_idx.ravel(), flat_counts)
    labels_y = np.repeat(col_idx.ravel(), flat_counts)

    stat_0 = metrics.mutual_info_score(labels_x, labels_y)

    # Count the permutations that look more dependent than the observed data.
    exceed = 0
    for _ in range(num_samples):
        np.random.shuffle(labels_x)
        if metrics.mutual_info_score(labels_x, labels_y) > stat_0:
            exceed += 1

    pval = float(exceed) / num_samples
    return max(pval, 1.0 / num_samples)
示例6
async def accuracy(self, sources: Sources) -> Accuracy:
    """Score the trained clustering model on the records from `sources`.

    This must be a coroutine because the records are consumed with
    ``async for``, which is only valid inside ``async def``.

    When the estimator is a clusterer and ``self.parent.config.tcluster``
    is set, that feature is used as ground truth and the score is
    ``mutual_info_score``; otherwise ``silhouette_score`` is computed on
    the input data. Labels come from ``self.clf.predict`` when the
    estimator has that method, and from ``self.clf.labels_`` otherwise
    (in which case the records passed in must be the training data).

    Parameters
    ----------
    sources : Sources
        Record source; iterated via ``sources.with_features(self.features)``.

    Returns
    -------
    The computed score, which is also stored on ``self.confidence``.

    Raises
    ------
    ModelNotTrained
        If the model file ``self._filepath`` does not exist.
    """
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before assessing for accuracy.")

    # A ground-truth column is only looked up for clusterers that have one
    # configured; in every other case `target` stays empty.
    target = []
    if (
        self.clf._estimator_type == "clusterer"
        and self.parent.config.tcluster is not None
    ):
        target = [self.parent.config.tcluster.name]

    xdata = []
    ydata = []
    async for record in sources.with_features(self.features):
        xdata.append(list(record.features(self.features).values()))
        ydata.append(list(record.features(target).values()))
    xdata = self.np.array(xdata)
    self.logger.debug("Number of input records: {}".format(len(xdata)))

    if hasattr(self.clf, "predict"):
        # Inductive clusterer: xdata can be training data or unseen data.
        labels = self.clf.predict(xdata)
    else:
        # Transductive clusterer: labels_ only describes the training data.
        self.logger.critical(
            "Accuracy found transductive clusterer, ensure data being passed is training data"
        )
        labels = self.clf.labels_

    if target:
        # Ground truth available: compare it with the cluster assignment.
        ydata = self.np.array(ydata).flatten()
        self.confidence = mutual_info_score(ydata, labels)
    else:
        # No ground truth: fall back to an internal cluster-quality measure.
        self.confidence = silhouette_score(xdata, labels)

    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
示例7
def one_way_mi(df, feature_list, group_column, y_var, bins):
    """
    Calculates the one-way mutual information of each feature with a
    group variable and with a target variable (y).

    Each feature is binned jointly with the group column, and separately
    with the target column, using `np.histogramdd`; the resulting counts
    are passed to `mutual_info_score` as a contingency table.

    Parameters
    ----------
    df : pandas DataFrame
        df with features used to train model, plus a target variable
        and a group column.
    feature_list : list
        List of strings, feature names.
    group_column : string
        name of column for testing bias, should contain numeric categories
    y_var : string
        name of target variable column
    bins : tuple
        number of bins for each dimension

    Returns
    -------
    mi_table : pandas DataFrame
        data frame with mutual information values, with one row per feature
        in the feature_list, columns for group and y, plus a
        ``<name>_scaled`` column for each of the two in which the values
        are divided by that column's maximum.
    """

    def _mi_per_feature(categories):
        # Mutual information between every listed feature and `categories`.
        scores = []
        for name in feature_list:
            counts, _edges = np.histogramdd(
                [np.array(df[name]), categories], bins=bins
            )
            scores.append(mutual_info_score(None, None, contingency=counts))
        return scores

    mi_g = _mi_per_feature(df[group_column].values)
    mi_y = _mi_per_feature(df[y_var].values)

    mi_table = pd.DataFrame({'feature': feature_list,
                             group_column: mi_g,
                             y_var: mi_y})

    # NOTE: each MI column is rescaled so its largest value becomes 1, which
    # makes the relative importance of features easier to compare.
    for column in (group_column, y_var):
        mi_table["{}_scaled".format(column)] = (
            mi_table[column] / mi_table[column].max()
        )

    return mi_table