Python source code examples: sklearn.preprocessing.OneHotEncoder()
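The examples below are collected from real projects and test suites, so they target several different scikit-learn versions. As a baseline for reading them, here is a minimal, self-contained sketch of the usual fit/transform cycle (the commented output is what a recent scikit-learn prints):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([['red'], ['green'], ['red']])
enc = OneHotEncoder(handle_unknown='ignore')  # output is sparse by default
enc.fit(X)
print(enc.categories_)             # one array per feature: ['green', 'red']
print(enc.transform(X).toarray())  # [[0. 1.], [1. 0.], [0. 1.]]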
Example 1
def cat_onehot_encoder(df, y, col, selection=True):
    feat_x = df.values.reshape(-1, 1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    # LabelEncoder expects a 1-D array, so flatten the column vector
    le.fit(feat_x.ravel())
    feat_x = le.transform(feat_x.ravel())
    mlbs = OneHotEncoder(sparse=True).fit(feat_x.reshape(-1, 1))
    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(feat_x.reshape(-1, 1))
    features_tmp = csr_matrix(features_tmp, dtype=float).tocsr()
    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)
    #new_feature = pd.DataFrame(features_tmp, columns=["mul_feature_" + col])
    new_feature = features_tmp
    return new_feature, mlbs, models, auc_score, le
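Note that Example 1 passes sparse=True, the pre-1.2 spelling of the parameter: scikit-learn 1.2 renamed it to sparse_output, and 1.4 removed sparse entirely. A small version-tolerant helper, sketched here with a function name of our own choosing:

import sklearn
from sklearn.preprocessing import OneHotEncoder

def make_sparse_ohe():
    # 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2
    # and removed in 1.4, so branch on the installed version.
    major, minor = (int(v) for v in sklearn.__version__.split('.')[:2])
    if (major, minor) >= (1, 2):
        return OneHotEncoder(sparse_output=True)
    return OneHotEncoder(sparse=True)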
Example 2
def loadmodel(self, prefix):
    """ Load the model.

    :param prefix: prefix of the model path
    :type prefix: str
    :return: None
    """
    self.dictionary = Dictionary.load(prefix + '_vocabs.gensimdict')
    parameters = json.load(open(prefix + '_config.json', 'r'))
    self.operation = parameters['operation']
    self.alph = parameters['alph']
    self.specialsignals = parameters['special_signals']
    self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
    self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph)
    self.batchsize = parameters['batchsize']
    self.nb_hiddenunits = parameters['nb_hiddenunits']
    self.onehotencoder = OneHotEncoder()
    self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
    self.model = kerasio.load_model(prefix)
    self.trained = True
Example 3
def get_X_y(**kwargs):
    """Simple wrapper around pd.read_csv that extracts features and labels.

    Some systematic preprocessing is also carried out to avoid doing this
    transformation repeatedly in the code.
    """
    global label_encoder
    df = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(df, label_encoder)

###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer labels.
# Additionally, |APS| requires a binary version of the labels. For these two
# purposes, we create:
#
# * a |LabelEncoder|, pre-fitted on the known :code:`y` classes,
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times.
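A minimal sketch of the pattern those comments describe, with both encoders pre-fitted up front so that only their transform methods are needed later (the class labels here are illustrative, not from the original project):

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

known_classes = ['ham', 'spam']
label_encoder = LabelEncoder().fit(known_classes)
onehot_encoder = OneHotEncoder(categories='auto').fit(
    label_encoder.transform(known_classes).reshape(-1, 1))

# At prediction time, only transform is called:
y = np.array(['spam', 'ham', 'spam'])
y_int = label_encoder.transform(y)                      # integer labels
y_bin = onehot_encoder.transform(y_int.reshape(-1, 1))  # sparse binary indicator matrix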
Example 4
def __call__(self, data):
    if 'metadata' not in data:
        raise TransformException(
            f"Expected metadata in data, got {list(data.keys())}")
    if 'labels' not in data['metadata']:
        raise TransformException(
            f"Expected labels in data['metadata'], got "
            f"{list(data['metadata'].keys())}")
    enc = OneHotEncoder(categories=[data['metadata']['labels']])
    sources = data[self.source_key]
    source_keys = [k.split('::')[0] for k in list(sources.keys())]
    source_labels = [[l] for l in sorted(source_keys)]
    one_hot_labels = enc.fit_transform(source_labels)
    data['one_hot_labels'] = one_hot_labels.toarray()
    return data
Example 5
def __init__(self,
             maptimes=10,
             enhencetimes=10,
             map_function='linear',
             enhence_function='linear',
             batchsize='auto',
             reg=0.001):
    self._maptimes = maptimes
    self._enhencetimes = enhencetimes
    self._batchsize = batchsize
    self._reg = reg
    self._map_function = map_function
    self._enhence_function = enhence_function
    self.W = 0
    self.pesuedoinverse = 0
    self.normalscaler = scaler()
    self.onehotencoder = preprocessing.OneHotEncoder(sparse=False)
    self.mapping_generator = node_generator()
    self.enhence_generator = node_generator(whiten=True)
Example 6
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])
    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])
    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
Example 7
def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray())
Example 8
def test_one_hot_encoder_force_new_behaviour():
    # ambiguous integer case (non-consecutive range of categories)
    X = np.array([[1, 2]]).T
    X2 = np.array([[0, 1]]).T
    # without argument -> by default using legacy behaviour with warnings
    enc = OneHotEncoder()
    with ignore_warnings(category=FutureWarning):
        enc.fit(X)
        res = enc.transform(X2)
    exp = np.array([[0, 0], [1, 0]])
    assert_array_equal(res.toarray(), exp)
    # with explicit 'auto' argument -> don't use legacy behaviour
    # (so will raise an error on unseen value within range)
    enc = OneHotEncoder(categories='auto')
    enc.fit(X)
    assert_raises(ValueError, enc.transform, X2)
Example 9
def test_one_hot_encoder_categorical_features():
    X = np.array([[3, 2, 1], [0, 1, 1]])
    X2 = np.array([[1, 1, 1]])

    cat = [True, False, False]
    _check_one_hot(X, X2, cat, 4)

    # Edge case: all non-categorical
    cat = [False, False, False]
    _check_one_hot(X, X2, cat, 3)

    # Edge case: all categorical
    cat = [True, True, True]
    _check_one_hot(X, X2, cat, 5)

    # check error raised if also specifying categories
    oh = OneHotEncoder(categories=[range(3)],
                       categorical_features=[True, False, False])
    assert_raises(ValueError, oh.fit, X)
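The categorical_features argument used above was deprecated in scikit-learn 0.20 and removed in 0.22; the replacement is to route only the chosen columns through the encoder with a ColumnTransformer. A rough modern equivalent of "encode column 0, pass the rest through" (a sketch, not part of the original test suite):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = np.array([[3, 2, 1], [0, 1, 1]])
ct = ColumnTransformer(
    [('onehot', OneHotEncoder(), [0])],  # one-hot encode column 0 only
    remainder='passthrough',             # keep the other columns unchanged
    sparse_threshold=0)                  # force a dense result
print(ct.fit_transform(X))               # 2 one-hot columns + 2 passthrough columns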
Example 10
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OneHotEncoder(categories=cats)
    exp = np.array([[1., 0., 0.],
                    [0., 1., 0.]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OneHotEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
    enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
    exp = np.array([[1., 0., 0.], [0., 0., 0.]])
    assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
Example 11
def test_one_hot_encoder_unsorted_categories():
    X = np.array([['a', 'b']], dtype=object).T

    enc = OneHotEncoder(categories=[['b', 'a', 'c']])
    exp = np.array([[0., 1., 0.],
                    [1., 0., 0.]])
    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ['b', 'a', 'c']
    assert np.issubdtype(enc.categories_[0].dtype, np.object_)

    # unsorted passed categories still raise for numerical values
    X = np.array([[1, 2]]).T
    enc = OneHotEncoder(categories=[[2, 1, 3]])
    msg = 'Unsorted categories are not supported'
    with pytest.raises(ValueError, match=msg):
        enc.fit_transform(X)
Example 12
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
    if as_data_frame:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    if as_data_frame:
        X_partial = X.iloc[:1, :]
    else:
        X_partial = X[:1, :]

    ohe.fit(X_partial)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)
Example 13
def test_encoder_dtypes():
    # check that dtypes are preserved when determining categories
    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')

    for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
              np.array([[1, 2], [3, 4]], dtype='float64'),
              np.array([['a', 'b'], ['c', 'd']]),  # string dtype
              np.array([[1, 'a'], [3, 'b']], dtype='object')]:
        enc.fit(X)
        assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
        assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 2], [3, 4]]
    enc.fit(X)
    assert all([np.issubdtype(enc.categories_[i].dtype, np.integer)
                for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = [[1, 'a'], [3, 'b']]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
    assert_array_equal(enc.transform(X).toarray(), exp)
Example 14
def test_encoder_dtypes_pandas():
    # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
    pd = pytest.importorskip('pandas')

    enc = OneHotEncoder(categories='auto')
    exp = np.array([[1., 0., 1., 0., 1., 0.],
                    [0., 1., 0., 1., 0., 1.]], dtype='float64')

    X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
    enc.fit(X)
    # all three columns are int64, so every categories_ entry should be too
    assert all([enc.categories_[i].dtype == 'int64' for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp)

    X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
    X_type = [int, object, float]
    enc.fit(X)
    assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
    assert_array_equal(enc.transform(X).toarray(), exp)
Example 15
def test_one_hot_encoder_drop_manual():
    cats_to_drop = ['def', 12, 3, 56]
    enc = OneHotEncoder(drop=cats_to_drop)
    X = [['abc', 12, 2, 55],
         ['def', 12, 1, 55],
         ['def', 12, 3, 56]]
    trans = enc.fit_transform(X).toarray()
    exp = [[1, 0, 1, 1],
           [0, 1, 0, 1],
           [0, 0, 0, 0]]
    assert_array_equal(trans, exp)
    dropped_cats = [cat[feature]
                    for cat, feature in zip(enc.categories_,
                                            enc.drop_idx_)]
    assert_array_equal(dropped_cats, cats_to_drop)
    assert_array_equal(np.array(X, dtype=object),
                       enc.inverse_transform(trans))
Example 16
def test_categories(density, drop):
    ohe_base = OneHotEncoder(sparse=density)
    ohe_test = OneHotEncoder(sparse=density, drop=drop)
    X = [['c', 1, 'a'],
         ['a', 2, 'b']]
    ohe_base.fit(X)
    ohe_test.fit(X)
    assert_array_equal(ohe_base.categories_, ohe_test.categories_)
    if drop == 'first':
        assert_array_equal(ohe_test.drop_idx_, 0)
    else:
        for drop_cat, drop_idx, cat_list in zip(drop,
                                                ohe_test.drop_idx_,
                                                ohe_test.categories_):
            assert cat_list[drop_idx] == drop_cat
    assert isinstance(ohe_test.drop_idx_, np.ndarray)
    assert ohe_test.drop_idx_.dtype == np.int_
Example 17
def fit_transform(self, X, y=None, sample_weight=None):
    X = check_array(X, accept_sparse=['csc'], ensure_2d=False)
    if sp.issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()
    X_, y_ = generate_discriminative_dataset(X)
    super(RandomForestEmbedding, self).fit(X_, y_,
                                           sample_weight=sample_weight)
    self.one_hot_encoder_ = OneHotEncoder(sparse=True)
    if self.sparse_output:
        return self.one_hot_encoder_.fit_transform(self.apply(X))
    return self.apply(X)
Example 18
def test_binarizer_remove_first(self):
    """...Test binarizer fit when remove_first=True
    """
    n_cuts = 3
    one_hot_encoder = OneHotEncoder(sparse=True)
    expected_binarization = one_hot_encoder.fit_transform(
        self.default_expected_intervals)

    binarizer = FeaturesBinarizer(method='quantile', n_cuts=n_cuts,
                                  detect_column_type="auto",
                                  remove_first=True)
    binarizer.fit(self.features)
    binarized_array = binarizer.transform(self.features)
    self.assertEqual(binarized_array.__class__, csr.csr_matrix)

    expected_binarization_without_first = \
        np.delete(expected_binarization.toarray(), [0, 4, 8, 10], 1)

    np.testing.assert_array_equal(expected_binarization_without_first,
                                  binarized_array.toarray())
    return
Example 19
def load_cifar10_image(root='dataset', labels=False):
    helpers.create(root, 'cifar10')
    droot = root + '/' + 'cifar10'
    if not os.path.exists('{}/cifar10.pkl'.format(droot)):
        from downloader import download_cifar10
        download_cifar10(droot)
    f = lambda d: d.astype(floatX)
    filename = '{}/cifar10.pkl'.format(droot)
    # pickle files must be opened in binary mode on Python 3
    tr_x, tr_y, te_x, te_y = pickle.load(open(filename, 'rb'))
    if tr_x.max() == 255:
        tr_x = tr_x / 256.
        te_x = te_x / 256.
    if labels:
        # the positional argument is the legacy n_values=10 API
        # (removed in scikit-learn 0.22; use categories instead)
        enc = OneHotEncoder(10)
        tr_y = enc.fit_transform(tr_y).toarray().reshape(50000, 10).astype(int)
        te_y = enc.fit_transform(te_y).toarray().reshape(10000, 10).astype(int)
        return (f(d) for d in [tr_x, tr_y, te_x, te_y])
    else:
        return (f(d) for d in [tr_x, te_x])
Example 20
def test_conversion_many_columns(self):
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data_multiple_cols)
    spec = sklearn.convert(
        scikit_model, ["feature_1", "feature_2"], "out"
    ).get_spec()

    test_data = [
        {"feature_1": row[0], "feature_2": row[1]}
        for row in self.scikit_data_multiple_cols
    ]
    scikit_output = [
        {"out": row}
        for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    # assertEquals is a deprecated alias removed in Python 3.12
    self.assertEqual(metrics["num_errors"], 0)
Example 21
def test_conversion_one_column_of_several(self):
    # categorical_features is the pre-0.22 scikit-learn API
    scikit_model = OneHotEncoder(categorical_features=[0])
    scikit_model.fit(copy(self.scikit_data_multiple_cols))
    spec = sklearn.convert(
        scikit_model, ["feature_1", "feature_2"], "out"
    ).get_spec()

    test_data = [
        {"feature_1": row[0], "feature_2": row[1]}
        for row in self.scikit_data_multiple_cols
    ]
    scikit_output = [
        {"out": row}
        for row in scikit_model.transform(self.scikit_data_multiple_cols).toarray()
    ]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEqual(metrics["num_errors"], 0)
Example 22
def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output
        # dimension handling is correct.
        model = Pipeline(
            [
                ("OHE", OneHotEncoder(categorical_features=categorical_features)),
                ("Normalizer", Normalizer()),
            ]
        )
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, "out").get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
Example 23
def summonehot(corpus):
    allwords = []
    annotated = {}
    for sent in corpus:
        for word in wt(sent):
            allwords.append(word.lower())
    print(len(set(allwords)), "unique characters in corpus")
    #maxcorp=int(input("Enter desired number of vocabulary: "))
    maxcorp = int(len(set(allwords)) / 1.1)
    wordcount = Counter(allwords).most_common(maxcorp)
    allwords = []
    for p in wordcount:
        allwords.append(p[0])
    allwords = list(set(allwords))
    print(len(allwords), "unique characters in corpus after max corpus cut")
    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)
    # one hot
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    # make look-up dict
    for k in range(len(onehot_encoded)):
        inverted = cleantext(label_encoder.inverse_transform(
            [argmax(onehot_encoded[k, :])])[0]).strip()
        annotated[inverted] = onehot_encoded[k]
    return label_encoder, onehot_encoded, annotated
Example 24
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_openml(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except Exception:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if (
        len(target.shape) > 1 and target.shape[1] > X.shape[1]
    ):  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != "data" and k != "target" and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot encode categorical (string) features
    if onehot_encode_strings:
        cat_ft = [
            i
            for i in range(X.shape[1])
            if "str" in str(type(unpack(X[0, i])))
            or "unicode" in str(type(unpack(X[0, i])))
        ]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            # categorical_features is the pre-0.22 scikit-learn API
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except AttributeError:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
Example 25
def one_hot_encode(sequences):
    sequence_length = len(sequences[0])
    # integer width depends on the Python version
    integer_type = np.int8 if sys.version_info[0] == 2 else np.int32
    integer_array = LabelEncoder().fit(
        np.array(('ACGTN',)).view(integer_type)).transform(
            sequences.view(integer_type)).reshape(
                len(sequences), sequence_length)
    # n_values is the legacy (pre-0.22) way to fix the number of categories
    one_hot_encoding = OneHotEncoder(
        sparse=False, n_values=5, dtype=integer_type).fit_transform(integer_array)
    return one_hot_encoding.reshape(len(sequences), 1, sequence_length,
                                    5).swapaxes(2, 3)[:, :, [0, 1, 2, 4], :]
Example 26
def __encode(self, X):
    Xenc = X.copy(deep=True)
    if self._label_encoder is None or self._onehot_encoder is None:
        self._label_encoder = [None] * len(Xenc.columns)
        self._onehot_encoder = [None] * len(Xenc.columns)
    del_columns = []
    for i in range(len(Xenc.columns)):
        if Xenc.dtypes[i] == np.dtype('O'):
            if self._label_encoder[i] is None:
                self._label_encoder[i] = LabelEncoder().fit(Xenc.iloc[:, i])
            col_enc = self._label_encoder[i].transform(Xenc.iloc[:, i])
            if self._onehot_encoder[i] is None:
                self._onehot_encoder[i] = OneHotEncoder(categories='auto').fit(
                    col_enc.reshape(-1, 1))
            col_onehot = np.array(self._onehot_encoder[i].transform(
                col_enc.reshape(-1, 1)).todense())
            col_names = [str(Xenc.columns[i]) + '_' + c
                         for c in self._label_encoder[i].classes_]
            col_onehot = pd.DataFrame(col_onehot, columns=col_names,
                                      index=Xenc.index)
            Xenc = pd.concat([Xenc, col_onehot], axis=1)
            del_columns.append(Xenc.columns[i])
    for col in del_columns:
        del Xenc[col]
    return Xenc, del_columns
Example 27
def get_Processed_NHANES_Data(filename):
    """
    Args:
        filename (str): Enter NHANES filename

    Returns:
        One hot encoded features and original input
    """
    # returns original and one hot encoded data
    # Input: XPT filename, e.g. 2_H.XPT
    # Output:
    #   one hot encoded, e.g. (5924 x 145)
    #   original, e.g. (5924 x 9)
    with open(filename, 'rb') as f:
        original = xport.to_numpy(f)

    # replace nan's with 0's
    original[np.isnan(original)] = 0

    # delete 1st column (contains sequence numbers)
    original = original[:, 1:]

    # one hot encoding of all columns/features
    onehot_encoder = OneHotEncoder(sparse=False)
    onehot_encoded = onehot_encoder.fit_transform(original)

    # return one hot encoded and original data
    return (onehot_encoded, original)
Example 28
def fit(self, list_all_action):
    """
    Fit the LabelEncoder so that it can map an integer to an action key,
    then fit the OneHotEncoder on the resulting integers.

    :param list_all_action: list of all possible action keys in the game
    :return: None
    """
    self.le = preprocessing.LabelEncoder()
    list_all_action = self.le.fit_transform(list_all_action)
    self.shape_all_actions = len(list_all_action)
    # the positional argument is the legacy n_values API (removed in 0.22)
    self.onehot_encoder = OneHotEncoder(self.shape_all_actions, sparse=False)
    list_all_action = list_all_action.reshape(len(list_all_action), 1)
    self.onehot_encoder.fit(list_all_action)
    self.create_mirror_dict()
Example 29
def oneHotEncodingForFastFM(X: pd.DataFrame):
    numeric_table = X[[c for c in X.columns
                       if c.startswith(CONSTANT.NUMERICAL_PREFIX)
                       or c.startswith(CONSTANT.TIME_PREFIX)]]
    X = X[[c for c in X.columns
           if not (c.startswith(CONSTANT.NUMERICAL_PREFIX)
                   or c.startswith(CONSTANT.TIME_PREFIX))]]
    # min-max scale the numeric columns before stacking them back on
    numeric_table = (numeric_table - numeric_table.min()) / \
        (numeric_table.max() - numeric_table.min())
    enc = OneHotEncoder(sparse=True, dtype=np.float32, categories="auto")
    X = enc.fit_transform(X)
    X = hstack((X, numeric_table.values), dtype=np.float32).tocsr()
    return X
Example 30
def _normalize(x, norm=None):
    """
    Apply one-hot encoding or z-score to a list of node features
    """
    if norm == 'ohe':
        fnorm = OneHotEncoder(sparse=False, categories='auto')
    elif norm == 'zscore':
        fnorm = StandardScaler()
    else:
        return x
    return fnorm.fit_transform(x)