Python source code examples: sklearn.preprocessing.LabelEncoder()
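LabelEncoder maps class labels to the integers 0 .. n_classes-1 and back. Before the project examples below, here is a minimal self-contained sketch of that round trip (the toy labels are illustrative, not taken from any of the examples):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(["cat", "dog", "cat", "bird"])  # array([1, 2, 1, 0])
print(le.classes_)              # ['bird' 'cat' 'dog'] -- sorted unique labels
print(le.inverse_transform(y))  # back to the original strings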
Example 1

def test_explain_model_local_with_predicted_label(self):
    """
    Test for explain_local of classical explainer
    :return:
    """
    X_train, X_test, y_train, y_test = setup_mnli_test_train_split()
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    explainer = ClassicalTextExplainer()
    classifier, best_params = explainer.fit(X_train, y_train)
    explainer.preprocessor.labelEncoder = label_encoder
    y = classifier.predict(DOCUMENT)
    predicted_label = label_encoder.inverse_transform(y)
    local_explanation = explainer.explain_local(DOCUMENT, predicted_label)
    assert len(local_explanation.local_importance_values) == len(local_explanation.features)
Example 2

def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.
    :param dtype: data type used when building the feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.
    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse
        matrix"). The default value is True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
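For context, the constructor above pairs a DictVectorizer (feature dicts to numeric matrices) with a LabelEncoder (labels to integers). A hedged sketch of how such a wrapper's training method might use them (the `train` method and `labeled_featuresets` argument are illustrative assumptions, not part of the original snippet):

def train(self, labeled_featuresets):
    # labeled_featuresets: iterable of (feature_dict, label) pairs
    featuresets, labels = zip(*labeled_featuresets)
    X = self._vectorizer.fit_transform(featuresets)  # dicts -> (sparse) numeric matrix
    y = self._encoder.fit_transform(labels)          # labels -> integers 0..n-1
    self._clf.fit(X, y)
    return self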
Example 3

def __init__(self,
             corpus,
             sherlock_features: List[str] = None,
             topic_feature: str = None,
             label_enc: LabelEncoder = None,
             id_filter: List[str] = None,
             max_col_count: int = None,
             shuffle_group: str = None):
    super().__init__(corpus,
                     sherlock_features,
                     topic_feature,
                     label_enc,
                     id_filter,
                     max_col_count)
    l = len(self.df_header)
    self.tempcorpus = corpus
    self.shuffle_group = shuffle_group
    self.prng = np.random.RandomState(SEED)
    self.shuffle_order = self.prng.permutation(l)
Example 4

def __init__(self,
             df_dict: Dict[str, pd.DataFrame] = None,
             tensor_dict: Dict[str, torch.FloatTensor] = None,
             labels: List[str] = [],
             label_enc: LabelEncoder = None,
             shuffle_group: str = None):
    super().__init__(df_dict,
                     tensor_dict,
                     labels,
                     label_enc)
    l = self.__len__()
    self.shuffle_group = shuffle_group
    prng = np.random.RandomState(SEED)
    self.shuffle_order = prng.permutation(l)
Example 5

def cat_onehot_encoder(df, y, col, selection=True):
    feat_x = df.values.reshape(-1)  # LabelEncoder expects a 1-D array
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(feat_x)
    feat_x = le.transform(feat_x)
    mlbs = OneHotEncoder(sparse=True).fit(feat_x.reshape(-1, 1))
    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(feat_x.reshape(-1, 1))
    features_tmp = csr_matrix(features_tmp, dtype=float).tocsr()
    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)
    # new_feature = pd.DataFrame(features_tmp, columns=["mul_feature_" + col])
    new_feature = features_tmp
    return new_feature, mlbs, models, auc_score, le
Example 6

def preprocessData(dataset):
    le = preprocessing.LabelEncoder()
    # guard against divide-by-zero
    dataset.Open[dataset.Open == 0] = 1
    # add prediction target: next day Up/Down
    threshold = 0.000
    dataset['UpDown'] = (dataset['Close'] - dataset['Open']) / dataset['Open']
    dataset.UpDown[dataset.UpDown >= threshold] = 'Up'
    dataset.UpDown[dataset.UpDown < threshold] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)
    dataset.UpDown = dataset.UpDown.shift(-1)  # shift by 1, so y is actually the next day's up/down
    dataset = dataset.drop(dataset.index[-1])  # drop the last row because it has no up/down value
    return dataset
Example 7

def get_query_y(self, Qy, Qyc, class_label):
    """
    Returns labeled representation of classes of Query set and a list of labels.
    """
    labels = []
    m = len(Qy)
    for i in range(m):
        labels += [Qy[i]] * Qyc[i]
    labels = np.array(labels).reshape(len(labels), 1)
    label_encoder = LabelEncoder()
    Query_y = torch.Tensor(
        label_encoder.fit_transform(labels).astype(int)).long()
    if self.gpu:
        Query_y = Query_y.cuda()
    Query_y_labels = np.unique(labels)
    return Query_y, Query_y_labels
Example 8

def get_cars_data():
    """
    Load the cars dataset, split it into X and y, and then call the label encoder to get an integer y column.
    :return:
    """
    df = pd.read_csv('source_data/cars/car.data.txt')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))
    mapping = [
        {'col': 'buying', 'mapping': [('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]},
        {'col': 'maint', 'mapping': [('vhigh', 0), ('high', 1), ('med', 2), ('low', 3)]},
        {'col': 'doors', 'mapping': [('2', 0), ('3', 1), ('4', 2), ('5more', 3)]},
        {'col': 'persons', 'mapping': [('2', 0), ('4', 1), ('more', 2)]},
        {'col': 'lug_boot', 'mapping': [('small', 0), ('med', 1), ('big', 2)]},
        {'col': 'safety', 'mapping': [('high', 0), ('med', 1), ('low', 2)]},
    ]
    return X, y, mapping
Example 9

def get_mushroom_data():
    """
    Load the mushroom dataset, split it into X and y, and then call the label encoder to get an integer y column.
    :return:
    """
    df = pd.read_csv('source_data/mushrooms/agaricus-lepiota.csv')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))
    # this data is truly categorical, with no known concept of ordering
    mapping = None
    return X, y, mapping
Example 10

def get_splice_data():
    """
    Load the splice dataset, split it into X and y, and then call the label encoder to get an integer y column.
    :return:
    """
    df = pd.read_csv('source_data/splice/splice.csv')
    X = df.reindex(columns=[x for x in df.columns.values if x != 'class'])
    X['dna'] = X['dna'].map(lambda x: list(str(x).strip()))
    for idx in range(60):
        X['dna_%d' % (idx, )] = X['dna'].map(lambda x: x[idx])
    del X['dna']
    y = df.reindex(columns=['class'])
    y = preprocessing.LabelEncoder().fit_transform(y.values.reshape(-1, ))
    # this data is truly categorical, with no known concept of ordering
    mapping = None
    return X, y, mapping
Example 11

def get_X_y(**kwargs):
    """Simple wrapper around pd.read_csv that extracts features and labels.
    Some systematic preprocessing is also carried out to avoid doing this
    transformation repeatedly in the code.
    """
    global label_encoder
    df = pd.read_csv(info['path'], sep='\t', **kwargs)
    return preprocess(df, label_encoder)

###############################################################################
# Classifier objects in |sklearn| often require :code:`y` to be integer labels.
# Additionally, |APS| requires a binary version of the labels. For these two
# purposes, we create:
#
# * a |LabelEncoder|, that we pre-fit on the known :code:`y` classes
# * a |OneHotEncoder|, pre-fitted on the resulting integer labels.
#
# Their |transform| methods can then be called at appropriate times.
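A minimal sketch of the two pre-fitted encoders described in the comment above, assuming an illustrative set of known classes (the class names are made up; `sparse=False` matches the OneHotEncoder usage elsewhere on this page):

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

known_classes = ['functioning', 'broken']            # assumed label set
label_encoder = LabelEncoder().fit(known_classes)
int_labels = label_encoder.transform(known_classes).reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse=False).fit(int_labels)

y_int = label_encoder.transform(['broken', 'functioning'])  # integer labels
y_bin = onehot_encoder.transform(y_int.reshape(-1, 1))      # binary indicator matrix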
Example 12

def fit(self, X, y):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import compute_class_weight
    label_encoder = LabelEncoder().fit(y)
    classes = label_encoder.classes_
    class_weight = compute_class_weight(self.class_weight, classes, y)
    # Intentionally modify the balanced class_weight
    # to simulate a bug and raise an exception
    if self.class_weight == "balanced":
        class_weight += 1.
    # Simply assigning coef_ to the class_weight
    self.coef_ = class_weight
    return self
Example 13

def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)
    method = "binary"
    if len(labels) > 2:
        predicted = np.argmax(predicted, axis=1)
        method = "micro"
    else:
        predicted = (predicted > self._threshold)
    f4_score = fbeta_score(actual, predicted, labels=labels, average=method, sample_weight=sample_weight, beta=4)
    return f4_score
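Since beta=4 weights recall much more heavily than precision, the score above tracks recall closely; a quick standalone check of the same fbeta_score call with toy labels:

from sklearn.metrics import fbeta_score

y_true = [0, 1, 1, 1]
y_pred = [0, 1, 0, 1]  # one missed positive: precision 1.0, recall 2/3
print(fbeta_score(y_true, y_pred, beta=4))  # 0.68, close to the recall of 0.667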
Example 14

def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    # label actuals as 1 or 0
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)
    # label predictions as 1 or 0
    predicted = predicted >= self._threshold
    # use sklearn to get fp and fn
    cm = confusion_matrix(actual, predicted, sample_weight=sample_weight, labels=labels)
    tn, fp, fn, tp = cm.ravel()
    # calculate `$1*FP + $2*FN`, divided by the total weighted count
    # to make the loss invariant to data size
    return ((fp * self.__class__._fp_cost) + (fn * self.__class__._fn_cost)) / (
        tn + fp + fn + tp)
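The cm.ravel() unpacking above relies on scikit-learn's fixed [[tn, fp], [fn, tp]] layout for a binary confusion matrix; a small worked check with toy labels:

from sklearn.metrics import confusion_matrix

cm = confusion_matrix([0, 1, 1, 0], [0, 1, 0, 0])
tn, fp, fn, tp = cm.ravel()  # tn=2, fp=0, fn=1, tp=1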
Example 15

def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[np.array] = None,
          **kwargs) -> float:
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)
    actual = lb.transform(actual)
    method = "binary"
    if len(labels) > 2:
        predicted = np.argmax(predicted, axis=1)
        method = "micro"
    else:
        predicted = (predicted > self._threshold)
    f3_score = fbeta_score(actual, predicted, labels=labels, average=method, sample_weight=sample_weight, beta=3)
    return f3_score
Example 16

def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    lb = LabelEncoder()
    lb.fit(self.labels)
    y = lb.transform(y)
    orig_cols = list(X.names)
    XX = X.to_pandas()
    params = {
        'train_dir': user_dir(),
        'allow_writing_files': False,
        'thread_count': 10,
        # 'loss_function': 'Logloss'
    }
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(**params)
    model.fit(XX, y=y, sample_weight=sample_weight, verbose=False,
              cat_features=list(X[:, [str, int]].names))  # Amazon specific, also no early stopping
    # must always set best_iterations
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=model.feature_importances_,
                              iterations=0)
Example 17

def test_transactional_to_iid():
    ret = TransactionalToIID.create_data()
    for name, X in ret.items():
        le = LabelEncoder()
        y = le.fit_transform(X[target]).ravel()
        print(name)
        print(X.head(10))
        print(X.tail(10))
        for col in X.names:
            if "_past_" in col:
                auc = roc_auc_score(y, X[col].to_numpy().ravel())
                print("%s: auc = %f" % (col, auc))
                if "leaky" not in col:
                    assert auc > 0.53  # all lags must have signal
                    assert auc < 0.8   # but not too much
                else:
                    assert auc > 0.75  # all leaky lags must have a lot of signal
Example 18

def get_feature_importances(data, shuffle, cats=[], seed=None):
    # Gather real features
    train_features = [f for f in data if f not in [target] + cols2ignore]
    # Shuffle target if required
    y = data[target].copy()
    if shuffle:
        y = data[target].copy().sample(frac=1.0, random_state=seed + 4)
    from h2oaicore.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
    import lightgbm as lgbm
    if is_regression:
        model = lgbm.LGBMRegressor(random_state=seed, importance_type=importance, **lgbm_params)
    else:
        model = lgbm.LGBMClassifier(random_state=seed, importance_type=importance, **lgbm_params)
        y = LabelEncoder().fit_transform(y)
    # Fit LightGBM in RF mode; it's quicker than sklearn's RandomForest
    model.fit(data[train_features], y, categorical_feature=cats)
    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance"] = model.feature_importances_
    return imp_df
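This function supports the "null importances" pattern: importances computed on the true target are compared against importances obtained with a shuffled target. A hedged usage sketch (it assumes the module-level globals used above, such as `data`, `target`, `cols2ignore`, `is_regression`, `importance` and `lgbm_params`, are already configured):

actual_imp = get_feature_importances(data, shuffle=False, seed=42)
null_imps = pd.concat(
    [get_feature_importances(data, shuffle=True, seed=i) for i in range(20)])
# features whose real importance does not clear the null distribution are candidates to drop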
Example 19

def summonehot(corpus):
    allwords = []
    annotated = {}
    for sent in corpus:
        for word in wt(sent):
            allwords.append(word.lower())
    print(len(set(allwords)), "unique tokens in corpus")
    # maxcorp = int(input("Enter desired number of vocabulary: "))
    maxcorp = int(len(set(allwords)) / 1.1)
    wordcount = Counter(allwords).most_common(maxcorp)
    allwords = []
    for p in wordcount:
        allwords.append(p[0])
    allwords = list(set(allwords))
    print(len(allwords), "unique tokens in corpus after max corpus cut")
    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)
    # one hot
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    # make look-up dict
    for k in range(len(onehot_encoded)):
        inverted = cleantext(label_encoder.inverse_transform([argmax(onehot_encoded[k, :])])[0]).strip()
        annotated[inverted] = onehot_encoded[k]
    return label_encoder, onehot_encoded, annotated
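A hedged usage sketch, assuming the helper functions `wt` (a tokenizer such as nltk.word_tokenize) and `cleantext` used above are available in the module:

corpus = ["the cat sat on the mat", "the dog ran"]
label_encoder, onehot_encoded, annotated = summonehot(corpus)
word, vec = next(iter(annotated.items()))
print(word, vec)  # a token and its one-hot row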
Example 20

def __init__(self):
    self.tok_raw = Tokenizer()
    self.le = {}
    self.cat_cols = ["brand_name", "subcat_0", "subcat_1", "subcat_2"]
    self.cat_vocab = {}
    for cat in self.cat_cols:
        self.le[cat] = LabelEncoder()
    self.freqs = {}
    self.max_freqs = {}
    self.voc = None
Example 21

def __init__(self):
    """Initializes the Encoder object and sets the internal tokenizer,
    labelEncoder and vectorizer using predefined objects.
    """
    self.tokenizer = BOWTokenizer(
        English()
    )  # the tokenizer must have a tokenize() and parse() function.
    self.labelEncoder = LabelEncoder()
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizer.tokenize, ngram_range=(1, 1)
    )
    self.decode_params = {}
    # The keep_ids flag is used by explain_local in the explainer to decode
    # importances over raw features.
Example 22

def test_explain_model_local_default(self):
    """
    Test for explain_local of classical explainer
    :return:
    """
    X_train, X_test, y_train, y_test = setup_mnli_test_train_split()
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    explainer = ClassicalTextExplainer()
    classifier, best_params = explainer.fit(X_train, y_train)
    explainer.preprocessor.labelEncoder = label_encoder
    local_explanation = explainer.explain_local(DOCUMENT)
    assert len(local_explanation.local_importance_values) == len(local_explanation.features)
Example 23

def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(data_set.target)
    print(db_name)

    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))

    print('ELM', hid_num)
    e = ELM(hid_num)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))
Example 24

def __init__(self,
             df_dict: Dict[str, pd.DataFrame] = None,
             tensor_dict: Dict[str, torch.FloatTensor] = None,
             labels: List[str] = [],
             label_enc: LabelEncoder = None):
    assert not (df_dict is None and tensor_dict is None), \
        "df_dict and tensor_dict can't both be None"
    assert len(labels) > 0, "labels can't be empty"
    if label_enc is None:
        label_enc = LabelEncoder()
        label_enc.fit(labels)
    self.label_enc = label_enc
    self.label_ids = self.label_enc.transform(labels)
    if tensor_dict is not None:
        self.name_tensor_dict = tensor_dict
        self.f_g_names = list(tensor_dict.keys())
        self.len = tensor_dict[self.f_g_names[0]].shape[0]
    else:
        self.f_g_names = df_dict.keys()
        self.len = len(list(df_dict.values())[0])
        # df_dict must have at least one key-value pair
        assert len(df_dict) > 0
        # Make sure each df has the same size
        for name, df in df_dict.items():
            assert len(df) == len(list(df_dict.values())[0])
        # Convert each dataframe into a FloatTensor up front to avoid on-the-fly conversion
        self.name_tensor_dict = {}
        for name, df in df_dict.items():
            self.name_tensor_dict[name] = torch.FloatTensor(df.values.astype('float'))
Example 25

def encode_categorical_features(cls, df):
    cat_feature_map = OrderedDict()
    for pos, f in enumerate(df):
        if not np.issubdtype(df[f].dtype, np.number):
            encoder = LabelEncoder()
            df[f] = encoder.fit_transform(df[f])
            # TODO: must ensure the mapping is consistent
            cat_feature_map[pos] = encoder.classes_.tolist()
    return cat_feature_map
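A hedged usage sketch (`FeatureEncoder` is a hypothetical owner class for the classmethod above): the DataFrame is modified in place, and the returned map records, per column position, the original categories.

import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
cat_map = FeatureEncoder.encode_categorical_features(df)
print(df['color'].tolist())  # [1, 0, 1] -- integer codes written in place
print(cat_map)               # OrderedDict([(0, ['blue', 'red'])])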
Example 26

def label_encode(df, features, name):
    df[name] = df[name].astype('str')
    if name in transformers:  # test
        df[name] = transformers[name].transform(df[name])
    else:  # train
        transformers[name] = LabelEncoder()
        df[name] = transformers[name].fit_transform(df[name])
    features.append(name)
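The function relies on a module-level `transformers` dict so the encoder fitted during the train pass is reused verbatim on the test pass; note that a test category unseen in training would make transform() raise a ValueError. A hedged usage sketch:

import pandas as pd

transformers, features = {}, []
train_df = pd.DataFrame({'city': ['NY', 'LA', 'NY']})
test_df = pd.DataFrame({'city': ['LA', 'NY']})
label_encode(train_df, features, 'city')  # fits and caches the encoder
label_encode(test_df, features, 'city')   # reuses the cached encoder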
Example 27

def one_hot_encode(sequences):
    sequence_length = len(sequences[0])
    # integer width depends on the Python version
    integer_type = np.int8 if sys.version_info[0] == 2 else np.int32
    integer_array = LabelEncoder().fit(
        np.array(('ACGTN',)).view(integer_type)).transform(
            sequences.view(integer_type)).reshape(
                len(sequences), sequence_length)
    one_hot_encoding = OneHotEncoder(
        sparse=False, n_values=5, dtype=integer_type).fit_transform(integer_array)
    return one_hot_encoding.reshape(len(sequences), 1, sequence_length,
                                    5).swapaxes(2, 3)[:, :, [0, 1, 2, 4], :]
Example 28

def _encode_y(self, y):
    # encode classes into 0 ... n_classes - 1 and set the attributes
    # classes_ and n_trees_per_iteration_
    check_classification_targets(y)
    label_encoder = LabelEncoder()
    encoded_y = label_encoder.fit_transform(y)
    self.classes_ = label_encoder.classes_
    n_classes = self.classes_.shape[0]
    # only 1 tree for binary classification. For multiclass classification,
    # we build 1 tree per class.
    self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes
    encoded_y = encoded_y.astype(np.float32, copy=False)
    return encoded_y
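For intuition on the class-count rule above, a toy check of what the encoder exposes (the labels are illustrative):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit_transform(['spam', 'ham', 'eggs', 'spam'])  # -> [2, 1, 0, 2]
n_classes = le.classes_.shape[0]                   # 3
print(1 if n_classes <= 2 else n_classes)          # 3: one tree per class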
Example 29

def __encode(self, X):
    Xenc = X.copy(deep=True)
    if self._label_encoder is None or self._onehot_encoder is None:
        self._label_encoder = [None] * len(Xenc.columns)
        self._onehot_encoder = [None] * len(Xenc.columns)
    del_columns = []
    for i in range(len(Xenc.columns)):
        if Xenc.dtypes[i] == np.dtype('O'):
            if self._label_encoder[i] is None:
                self._label_encoder[i] = LabelEncoder().fit(Xenc.iloc[:, i])
            col_enc = self._label_encoder[i].transform(Xenc.iloc[:, i])
            if self._onehot_encoder[i] is None:
                self._onehot_encoder[i] = OneHotEncoder(categories='auto').fit(
                    col_enc.reshape(-1, 1))
            col_onehot = np.array(self._onehot_encoder[i].transform(
                col_enc.reshape(-1, 1)).todense())
            col_names = [str(Xenc.columns[i]) + '_' + c
                         for c in self._label_encoder[i].classes_]
            col_onehot = pd.DataFrame(col_onehot, columns=col_names,
                                      index=Xenc.index)
            Xenc = pd.concat([Xenc, col_onehot], axis=1)
            del_columns.append(Xenc.columns[i])
    for col in del_columns:
        del Xenc[col]
    return Xenc, del_columns
Example 30

def preprocess_labels(labels, encoder=None, categorical=True):
    if not encoder:
        encoder = LabelEncoder()
        encoder.fit(labels)
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        y = np_utils.to_categorical(y)
    return y, encoder
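A hedged usage sketch (np_utils refers to the Keras utility module this snippet assumes; the encoder fitted on the first call can be reused for validation labels):

y_train, enc = preprocess_labels(['dog', 'cat', 'dog'])
y_val, _ = preprocess_labels(['cat', 'dog'], encoder=enc)
print(y_train.shape)  # (3, 2) -- one-hot rows over classes ['cat', 'dog']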