Python源码示例:sklearn.preprocessing.OrdinalEncoder()

示例1
def test_gp_regressor():
    """Check that a GP with a ``HammingKernel`` reproduces its training targets.

    The regressor is fitted on three rows of categorical strings and must
    predict the training targets both on the full input and on the first two
    rows.  When the module-level ``UseOrdinalEncoder`` flag is truthy, every
    input is passed through a fitted ``OrdinalEncoder`` first; otherwise the
    raw string array is used directly.
    """
    rng = np.random.RandomState(0)
    X = np.asarray([
        ["ham", "spam", "ted"],
        ["ham", "ted", "ted"],
        ["ham", "spam", "spam"]])
    y = rng.randn(3)
    kernel = HammingKernel(length_scale=[1.0, 1.0, 1.0])

    # Pick the input preparation step once so the checks below are shared.
    if UseOrdinalEncoder:
        encoder = OrdinalEncoder()
        encoder.fit(X)
        prepare = encoder.transform
    else:
        def prepare(rows):
            return rows

    regressor = GaussianProcessRegressor(kernel)
    regressor.fit(prepare(X), y)
    assert_array_almost_equal(regressor.predict(prepare(X)), y)
    assert_array_almost_equal(regressor.predict(prepare(X[:2])), y[:2])
示例2
def make_xgboost_column_transformer(dtypes, missing_value_aware = True):
	"""Construct a ColumnTransformer for feeding complex data into an XGBModel.

	Categorical columns (as decided by ``_is_categorical``) get a
	``PMMLLabelBinarizer(sparse_output = True)`` when ``missing_value_aware``
	is true, and an ``OrdinalEncoder`` -> ``OneHotEncoder`` pipeline otherwise.
	All other columns are passed through unchanged.

	Parameters
	----------

	dtypes: mapping of column to dtype, or iterable of tuples (column, dtype)
		Any object exposing ``items()`` is read through it; anything else is
		iterated directly and must yield ``(column, dtype)`` pairs.

	missing_value_aware: boolean
		If true, use missing value aware transformers.

	Returns
	-------
	ColumnTransformer
		One transformer per column, named ``str(column)``; columns that are
		not listed are dropped (``remainder = "drop"``).

	"""
	# The original code required ``items()`` although the documentation
	# promised an iterable of tuples; accept both forms.
	column_dtypes = dtypes.items() if hasattr(dtypes, "items") else dtypes
	transformers = []
	for column, dtype in column_dtypes:
		if _is_categorical(dtype):
			if missing_value_aware:
				transformer = PMMLLabelBinarizer(sparse_output = True)
			else:
				transformer = Pipeline([("ordinal_encoder", OrdinalEncoder()), ("one_hot_encoder", OneHotEncoder())])
		else:
			transformer = "passthrough"
		transformers.append((str(column), transformer, [column]))
	return ColumnTransformer(transformers, remainder = "drop")
示例3
def test_model_ordinal_encoder(self):
        """Convert an int64 ``OrdinalEncoder`` to ONNX and dump it with its data."""
        samples = np.array(
            [[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.int64)
        encoder = OrdinalEncoder(dtype=np.int64)
        encoder.fit(samples)

        input_types = [("input", Int64TensorType([None, 3]))]
        onnx_model = convert_sklearn(
            encoder, "scikit-learn ordinal encoder", input_types)
        self.assertIsNotNone(onnx_model)

        # Condition string handed to the dump helper as ``allow_failure``.
        failure_condition = (
            "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')")
        dump_data_and_model(
            samples,
            encoder,
            onnx_model,
            basename="SklearnOrdinalEncoderInt64-SkipDim1",
            allow_failure=failure_condition,
        )
示例4
def test_ordinal_encoder_onecat(self):
        """Convert an encoder fitted on a single string category to ONNX."""
        samples = [["cat"], ["cat"]]
        encoder = OrdinalEncoder(categories="auto")
        encoder.fit(samples)

        input_types = [("input1", StringTensorType([None, 1]))]
        onnx_model = convert_sklearn(
            encoder, "ordinal encoder one string cat", input_types)
        self.assertIsNotNone(onnx_model)

        # Condition string handed to the dump helper as ``allow_failure``.
        failure_condition = (
            "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')")
        dump_data_and_model(
            samples,
            encoder,
            onnx_model,
            basename="SklearnOrdinalEncoderOneStringCat",
            allow_failure=failure_condition,
        )
示例5
def test_ordinal_encoder_twocats(self):
        """Convert an encoder fitted on two string categories to ONNX."""
        samples = [["cat2"], ["cat1"]]
        encoder = OrdinalEncoder(categories="auto")
        encoder.fit(samples)

        input_types = [("input1", StringTensorType([None, 1]))]
        onnx_model = convert_sklearn(
            encoder, "ordinal encoder two string cats", input_types)
        self.assertIsNotNone(onnx_model)

        # Condition string handed to the dump helper as ``allow_failure``.
        failure_condition = (
            "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')")
        dump_data_and_model(
            samples,
            encoder,
            onnx_model,
            basename="SklearnOrdinalEncoderTwoStringCat",
            allow_failure=failure_condition,
        )
示例6
def test_model_ordinal_encoder_cat_list(self):
        """Convert an encoder with explicitly listed categories to ONNX."""
        category_lists = [[0, 1, 4, 5],
                          [1, 2, 3, 5],
                          [0, 3, 4, 6]]
        encoder = OrdinalEncoder(categories=category_lists)
        samples = np.array(
            [[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.int64)
        encoder.fit(samples)

        input_types = [("input", Int64TensorType([None, 3]))]
        onnx_model = convert_sklearn(
            encoder, "scikit-learn ordinal encoder", input_types)
        self.assertIsNotNone(onnx_model)

        # Condition string handed to the dump helper as ``allow_failure``.
        failure_condition = (
            "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')")
        dump_data_and_model(
            samples,
            encoder,
            onnx_model,
            basename="SklearnOrdinalEncoderCatList",
            allow_failure=failure_condition,
        )
示例7
def encode_variables(data):
  """Return a copy of ``data`` whose ``kBytesIndices`` columns are ordinal-encoded.

  The input itself is left untouched: the work happens on ``np.copy(data)``.
  The selected columns are replaced by the output of a freshly fitted
  ``OrdinalEncoder``; every other column is returned as copied.
  """
  encoded = np.copy(data)
  columns = kBytesIndices
  ordinal = OrdinalEncoder()
  encoded[:, columns] = ordinal.fit_transform(encoded[:, columns])
  return encoded
示例8
def test_ordinal_encoder(X):
    """Check the encoded values for the default dtype and for ``dtype='int64'``."""
    expected = np.array([[0, 1, 0],
                         [1, 0, 0]], dtype='int64')

    # Without an explicit dtype the result is compared against the float64
    # version of the expected codes.
    default_encoder = OrdinalEncoder()
    assert_array_equal(default_encoder.fit_transform(X),
                       expected.astype('float64'))

    int_encoder = OrdinalEncoder(dtype='int64')
    assert_array_equal(int_encoder.fit_transform(X), expected)
示例9
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Check an encoder built with explicit ``categories``.

    ``X`` must encode to ``[[0.], [1.]]``, the fitted categories must match
    ``cats`` in value and have dtype ``cat_dtype``, and fitting on ``X2`` must
    raise ``ValueError`` for unknown categories.
    """
    encoder = OrdinalEncoder(categories=cats)
    expected_codes = np.array([[0.], [1.]])
    assert_array_equal(encoder.fit_transform(X), expected_codes)

    expected_categories = list(cats[0])
    assert list(encoder.categories[0]) == expected_categories
    assert encoder.categories_[0].tolist() == expected_categories
    # Categories given as plain lists are expected to take on the dtype of
    # the data once coerced.
    assert encoder.categories_[0].dtype == cat_dtype

    # With manually specified categories, unknown values must already be
    # rejected at fit time.
    strict_encoder = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict_encoder.fit(X2)
示例10
def test_ordinal_encoder_inverse():
    """Check the ``inverse_transform`` round trip and its shape validation."""
    rows = [['abc', 2, 55], ['def', 1, 55]]
    encoder = OrdinalEncoder()
    codes = encoder.fit_transform(rows)
    expected = np.array(rows, dtype=object)
    assert_array_equal(encoder.inverse_transform(codes), expected)

    # Four columns instead of the three seen during fit must be rejected.
    too_wide = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    pattern = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(
        ValueError, pattern, encoder.inverse_transform, too_wide)
示例11
def test_ordinal_encoder_raise_missing(X):
    """Check that NaN input is rejected by ``fit``, ``fit_transform`` and ``transform``."""
    encoder = OrdinalEncoder()

    # Both fitting entry points must refuse the full input.
    for fit_method in (encoder.fit, encoder.fit_transform):
        with pytest.raises(ValueError, match="Input contains NaN"):
            fit_method(X)

    # Fit on the first row only, then make sure transform still rejects the
    # full input.
    encoder.fit(X[:1, :])

    with pytest.raises(ValueError, match="Input contains NaN"):
        encoder.transform(X)
示例12
def test_ordinal_encoder_raise_categories_shape():
    """Check that a flat ``categories`` list is rejected when fitting."""
    # A single column of five values, built by transposing a one-row array.
    levels = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']],
                      dtype=object).T
    # Flat list of strings rather than one list of categories per column.
    flat_categories = ['Low', 'Medium', 'High']
    encoder = OrdinalEncoder(categories=flat_categories)
    expected_message = "Shape mismatch: if categories is an array,"

    with pytest.raises(ValueError, match=expected_message):
        encoder.fit(levels)
示例13
def make_lightgbm_column_transformer(dtypes, missing_value_aware = True):
	"""Construct a ColumnTransformer for feeding complex data into a LGBMModel.

	Categorical columns (as decided by ``_is_categorical``) get a
	``PMMLLabelEncoder(missing_values = -1)`` when ``missing_value_aware`` is
	true, and an ``OrdinalEncoder`` otherwise.  All other columns are passed
	through unchanged.

	Parameters
	----------

	dtypes: mapping of column to dtype, or iterable of tuples (column, dtype)
		Any object exposing ``items()`` is read through it; anything else is
		iterated directly and must yield ``(column, dtype)`` pairs.

	missing_value_aware: boolean
		If true, use missing value aware transformers.

	Returns
	-------
	Tuple (ColumnTransformer, list of categorical column indices)
		The indices are positions in the iteration order of ``dtypes``.

	"""
	# The original code required ``items()`` although the documentation
	# promised an iterable of tuples; accept both forms.
	column_dtypes = dtypes.items() if hasattr(dtypes, "items") else dtypes
	transformers = []
	categorical_features = []
	for index, (column, dtype) in enumerate(column_dtypes):
		if _is_categorical(dtype):
			transformer = PMMLLabelEncoder(missing_values = -1) if missing_value_aware else OrdinalEncoder()
			categorical_features.append(index)
		else:
			transformer = "passthrough"
		transformers.append((str(column), transformer, [column]))
	return (ColumnTransformer(transformers, remainder = "drop"), categorical_features)
示例14
def test_ordinal_encoder_mixed_string_int_drop(self):
        """Convert an encoder fitted on two string columns and one int column.

        The converter is given two inputs: a two-column string tensor and a
        one-column int64 tensor.  A single separate row is used for the dump.
        """
        training_rows = [
            ["c0.4", "c0.2", 3],
            ["c1.4", "c1.2", 0],
            ["c0.2", "c2.2", 1],
            ["c0.2", "c2.2", 1],
            ["c0.2", "c2.2", 1],
            ["c0.2", "c2.2", 1],
        ]
        check_rows = [["c0.2", "c2.2", 1]]
        encoder = OrdinalEncoder(categories="auto")
        encoder.fit(training_rows)

        input_types = [
            ("input1", StringTensorType([None, 2])),
            ("input2", Int64TensorType([None, 1])),
        ]
        onnx_model = convert_sklearn(encoder, "ordinal encoder", input_types)
        self.assertIsNotNone(onnx_model)

        # Condition string handed to the dump helper as ``allow_failure``.
        failure_condition = (
            "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')")
        dump_data_and_model(
            check_rows,
            encoder,
            onnx_model,
            basename="SklearnOrdinalEncoderMixedStringIntDrop",
            allow_failure=failure_condition,
        )
示例15
def create_data(X: dt.Frame = None):
        """Select features whose importance beats a shuffled ("null") baseline.

        The frame is converted to pandas, its non-numeric columns (other than
        ``target`` and ``cols2ignore``) are ordinal-encoded, and
        ``get_feature_importances`` is called once with ``shuffle=False`` and
        ``number_of_iterations`` times with ``shuffle=True``.  Each feature is
        scored as the log of its actual importance over a percentile of its
        shuffled importances; only features with a positive score are kept.

        ``target``, ``cols2ignore``, ``number_of_iterations``, ``threshold``
        and ``get_feature_importances`` come from the enclosing scope.

        Parameters
        ----------
        X : dt.Frame, optional
            Input frame.  When ``None`` an empty list is returned.

        Returns
        -------
        list or pandas.DataFrame
            ``[]`` when ``X`` is ``None``; otherwise a fresh, un-encoded
            pandas copy of ``X`` restricted to ``cols2ignore``, the selected
            features and ``target``, in that order.
        """
        if X is None:
            return []

        data = X.to_pandas().copy()

        # Identify the categorical (non-numeric) columns and ordinal-encode them
        excluded = [target] + cols2ignore
        cats = [x for x in data.select_dtypes(exclude=np.number).columns if x not in excluded]

        for c in cats:
            data[c] = OrdinalEncoder().fit_transform(data[c].astype(str).values.reshape(-1, 1))

        # Get the actual importance, i.e. without shuffling
        actual_imp_df = get_feature_importances(data=data, cats=cats, shuffle=False, seed=42)

        # Fix the global NumPy seed so the per-run seeds are reproducible
        np.random.seed(123)

        seeds = np.random.randint(0, 2 ** 30, size=number_of_iterations)

        # Collect the per-run frames and concatenate once: concatenating inside
        # the loop re-copies every earlier run on each iteration.
        null_imp_frames = []
        for run, s in enumerate(seeds, start=1):
            # Get current run importances
            imp_df = get_feature_importances(data=data, cats=cats, shuffle=True, seed=s)
            imp_df['run'] = run
            null_imp_frames.append(imp_df)
        # pd.concat rejects an empty list; keep the original empty-frame result
        null_imp_df = pd.concat(null_imp_frames, axis=0) if null_imp_frames else pd.DataFrame()

        # The percentile is clamped to [75, 99] and is the same for every feature
        null_percentile = max(75, min(99, threshold))

        feature_scores = []
        for _f in actual_imp_df['feature'].unique():
            f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance'].values
            f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance'].mean()
            # 1e-10 keeps the log finite when the actual importance is zero
            _score = np.log(
                1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, null_percentile)))

            feature_scores.append((_f, _score))

        scores_df = pd.DataFrame(feature_scores, columns=['feature', 'score'])
        # final feature selection
        selected_features = scores_df[scores_df['score'] > 0]['feature'].values.tolist()
        selected_features = np.unique(selected_features).tolist()

        # Re-read the frame so the returned columns hold the original values
        # rather than the ordinal codes written into ``data`` above.
        data = X.to_pandas().copy()
        return data[cols2ignore + selected_features + [target]]
示例16
def inverse_transform(self, X):
        """Convert the data back to the original representation.

        A code equal to the number of categories of its column marks an
        unrecognised category.  If any such code is present, the result is
        cast to an object array and those slots are set to None.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.  It is not modified.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.

        Raises
        ------
        ValueError
            If the number of columns of ``X`` differs from the number of
            fitted features.

        Notes
        -----
        Most of the logic is copied from sklearn.preprocessing.OrdinalEncoder.inverse_transform. The difference is in
        handling unknown values.

        """
        check_is_fitted(self, "categories_")
        X = check_array(X, dtype="numeric")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = "Shape of the passed X data is not correct. Expected {0} " "columns, got {1}."
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # Create the resulting array of appropriate dtype.  This replaces
        # ``np.find_common_type(dtypes, [])``, which NumPy 2.0 removed, and is
        # meant to pick the same dtypes: a single dtype is kept as is, purely
        # numeric/boolean dtypes are promoted, anything else becomes object.
        cat_dtypes = [cat.dtype for cat in self.categories_]
        if len(cat_dtypes) == 1:
            common_dtype = cat_dtypes[0]
        elif cat_dtypes and all(d.kind in "biufc" for d in cat_dtypes):
            common_dtype = np.result_type(*cat_dtypes)
        else:
            common_dtype = np.dtype(object)
        X_tr = np.empty((n_samples, n_features), dtype=common_dtype)

        found_unknown = {}
        for i, categories in enumerate(self.categories_):
            codes = X[:, i].astype("int64", copy=False)
            known_mask = codes != categories.shape[0]
            # Unknown codes are mapped to index 0 as a placeholder so the
            # lookup stays in range.  np.where builds a new array: ``codes``
            # can be a view of the caller's data when X is already int64, so
            # an in-place multiply would overwrite the input.
            labels = np.where(known_mask, codes, 0)
            X_tr[:, i] = categories[labels]
            if not np.all(known_mask):
                found_unknown[i] = ~known_mask

        # if unknown are found cast to an object array and transform the missing values to None
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, unknown_mask in found_unknown.items():
                X_tr[unknown_mask, idx] = None

        return X_tr
示例17
def string_index(self, columns=None):
        """Ordinal-encode the string columns of the wrapped dataset in place.

        The work is done at most once: the ``_string_indexed`` flag is set on
        the first call and later calls return the cached ``_column_indexer``.

        :param columns: Optional subset of column labels to restrict the indexing to.
            When given, only all-string columns whose label is in ``columns`` are encoded.
        :type columns: list
        :return: The fitted transformer used to index the dataset. ``None`` is returned
            when the dataset is ``DenseData`` or sparse, or when scikit-learn's
            ``ColumnTransformer``/``OrdinalEncoder`` cannot be imported.
        :rtype: ColumnTransformer
        """
        if self._string_indexed:
            return self._column_indexer
        # Mark as done up front so the operation is not repeated on the same dataset
        self._string_indexed = True
        # A DenseData or sparse dataset is treated as having no categorical
        # string columns, so there is nothing to index.
        if isinstance(self._dataset, DenseData) or issparse(self._dataset):
            return None
        # Without a scikit-learn that provides OrdinalEncoder, skip the encoding
        try:
            from sklearn.compose import ColumnTransformer
            from sklearn.preprocessing import OrdinalEncoder
        except ImportError:
            return None

        # Temporarily view numpy data as a DataFrame for uniform string handling
        frame = self._dataset
        if isinstance(self._dataset, np.ndarray):
            frame = pd.DataFrame(self._dataset, dtype=self._dataset.dtype)

        # A column counts as categorical when every one of its values is a str
        column_labels = np.array(list(frame))
        all_str_mask = (frame.applymap(type) == str).all(0)
        categorical_col_names = list(column_labels[all_str_mask])
        if not categorical_col_names:
            return self._column_indexer

        all_columns = frame.columns
        if columns is not None:
            wanted_names = [name for name in categorical_col_names if name in columns]
        else:
            wanted_names = categorical_col_names
        categorical_col_indices = [all_columns.get_loc(name) for name in wanted_names]

        indexer = ColumnTransformer([('ord', OrdinalEncoder(), categorical_col_indices)], remainder='drop')
        encoded = indexer.fit_transform(frame)
        # Inplace replacement of columns
        # (danger: using remainder=passthrough with ColumnTransformer will change column order!)
        for position, col_index in enumerate(categorical_col_indices):
            self._dataset[:, col_index] = encoded[:, position]
        self._column_indexer = indexer
        return self._column_indexer