Python source examples: sklearn.preprocessing.Imputer()
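Note: Imputer was deprecated in scikit-learn 0.20 and removed in 0.22 in favor of sklearn.impute.SimpleImputer, so most of the examples below require scikit-learn <= 0.21 (Examples 4 and 9 show code that handles both APIs). A minimal migration sketch, assuming only NumPy and scikit-learn >= 0.20:

import numpy as np
from sklearn.impute import SimpleImputer

data = [[1, 2], [np.nan, 3], [7, 6]]

# Old API (scikit-learn <= 0.21):
#     from sklearn.preprocessing import Imputer
#     imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# New API: missing values are given as np.nan instead of the string
# 'NaN', and the axis argument is gone (imputation is always
# column-wise, the old axis=0 behavior).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
print(imp.fit_transform(data))  # the NaN becomes the column mean, 4.0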
Example 1
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Imputer(strategy="most_frequent", axis=0)
    scikit_data["data"][1, 8] = np.NaN

    input_data = scikit_data["data"][:, 8].reshape(-1, 1)
    scikit_model.fit(input_data, scikit_data["target"])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
Example 2
def readFile(inpath):
    if os.path.isfile(inpath):
        dataset = genfromtxt(open(inpath, 'r'), delimiter=',', dtype='f8')[0:]
        # fill in the missing values with the mean of each column
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        transformedData = imp.fit_transform(dataset)
        rmvedCols = imp.statistics_
        # take the indices of the nan columns
        idxRmved = np.where(np.isnan(rmvedCols))
        # check if the target is a nan column
        nanTarget = dataset.shape[1] - 1 in idxRmved[0]
        if nanTarget:
            raise ValueError("The target variable contains only nan values or inf")
    else:
        raise ValueError("File does not exist")
    return transformedData

# parameters: vector 'target' which is the target variable
# returns: the dataset which includes the previous values of the target
Example 3
def test_imputer_float_inputs(self):
    model = Imputer(missing_values="NaN", strategy="mean", axis=0)
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                 [("input", FloatTensorType([None, 2]))])
    self.assertTrue(model_onnx.graph.node is not None)

    # should contain only one node
    self.assertEqual(len(model_onnx.graph.node), 1)

    # last node should contain the Imputer
    outputs = model_onnx.graph.output
    self.assertEqual(len(outputs), 1)
    self.assertEqual(outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)

    dump_data_and_model(
        np.array(data, dtype=np.float32),
        model,
        model_onnx,
        basename="SklearnImputerMeanFloat32",
    )
Example 4
def test_simple_imputer_float_inputs(self):
    model = SimpleImputer(strategy="mean", fill_value="nan")
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    model_onnx = convert_sklearn(
        model,
        "scikit-learn simple imputer",
        [("input", FloatTensorType([None, 2]))],
        target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx.graph.node is not None)

    # should contain only one node
    self.assertEqual(len(model_onnx.graph.node), 1)

    # last node should contain the Imputer
    outputs = model_onnx.graph.output
    self.assertEqual(len(outputs), 1)
    self.assertEqual(
        outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)
    dump_data_and_model(
        np.array(data, dtype=np.float32),
        model, model_onnx,
        basename="SklearnSimpleImputerMeanFloat32")
Example 5
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer,
                  pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example 6
def test_transform_1d_frame_int(self):
    arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
    idx = pd.Index('a b c d e f g h i'.split(' '))
    df = pdml.ModelFrame(arr, index=idx, columns=['X'])
    self.assertEqual(len(df.columns), 1)

    # reshape arr to 2d
    arr = arr.reshape(-1, 1)

    if pd.compat.PY3:
        # MinMaxScaler raises TypeError in ufunc under Python 3
        models = ['Binarizer', 'Imputer', 'StandardScaler']
    else:
        models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']
    for model in models:
        mod1 = getattr(df.preprocessing, model)()
        mod2 = getattr(pp, model)()
        self._assert_transform(df, arr, mod1, mod2)

        mod1 = getattr(df.preprocessing, model)()
        mod2 = getattr(pp, model)()
        self._assert_fit_transform(df, arr, mod1, mod2)
Example 7
def test_Imputer(self):
    arr = np.array([1, np.nan, 3, 2])
    s = pdml.ModelSeries(arr)

    mod1 = s.pp.Imputer(axis=0)
    s.fit(mod1)
    result = s.transform(mod1)
    expected = np.array([1, 2, 3, 2])

    self.assertIsInstance(result, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(result.values, expected)

    mod1 = s.pp.Imputer(axis=0)
    result = s.fit_transform(mod1)

    self.assertIsInstance(result, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(result.values, expected)
Example 8
def FeatureCombination(Df, s='', num_feature=2):
    feature_set = []
    for c in Df.columns:
        if c.startswith(s):
            feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0, num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, axis=1, inplace=True)
    return Df
Example 9
def test_imputer(self):
    try:
        model = Imputer(missing_values='NaN', strategy='mean', axis=0)
    except TypeError:
        # scikit-learn >= 0.22: the 'NaN' string and the axis argument are gone
        model = Imputer(missing_values=np.nan, strategy='mean')
        model.axis = 0
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    from onnxmltools.convert.coreml.convert import convert
    import coremltools  # noqa

    try:
        model_coreml = coremltools.converters.sklearn.convert(model)
    except ValueError as e:
        if 'not supported' in str(e):
            # Python 2.7 + scikit-learn 0.22
            return
        raise
    model_onnx = convert(model_coreml.get_spec())
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(np.array(data, dtype=np.float32),
                        model, model_onnx, basename="CmlImputerMeanFloat32")
Example 10
def impute_data(self, x):
    """Imputes a data set containing NaN values."""
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    return imp.fit_transform(x)
Example 11
def __init__(self, params, dataset):
    """Initializes a UMAPTransformer object.

    Args:
        params (Namespace): Contains parameters used to instantiate the transformer.
        dataset (Dataset): Dataset used to "train" the projection mapping.
    """
    # TODO: decide whether to make n_epochs a parameter
    # default_n_epochs = None
    default_n_epochs = 500

    if params.prediction_type == 'classification':
        target_metric = 'categorical'
    else:
        target_metric = 'l2'
    self.scaler = RobustScaler()
    # Use Imputer to replace missing values (NaNs) with means for each column
    self.imputer = Imputer()
    scaled_X = self.scaler.fit_transform(self.imputer.fit_transform(dataset.X))
    self.mapper = umap.UMAP(n_neighbors=params.umap_neighbors,
                            n_components=params.umap_dim,
                            metric=params.umap_metric,
                            target_metric=target_metric,
                            target_weight=params.umap_targ_wt,
                            min_dist=params.umap_min_dist,
                            n_epochs=default_n_epochs)
    # TODO: How to deal with multitask data?
    self.mapper.fit(scaled_X, y=dataset.y.flatten())
Example 12
def get_clf_pipeline():
    clf = models.DefaultClassifier(
        GradientBoostingClassifier(
            loss='deviance', learning_rate=0.01, n_estimators=3000,
            subsample=0.6, min_samples_split=12, min_samples_leaf=12,
            max_depth=6, random_state=1357, verbose=0)
    )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps)
Example 13
def get_reg_pipeline():
    clf = models.PartialRegressor(
        GradientBoostingRegressor(loss='ls', learning_rate=0.0075, n_estimators=5000,
                                  subsample=0.5, min_samples_split=20,
                                  min_samples_leaf=20, max_leaf_nodes=30,
                                  random_state=9753, verbose=0)
    )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps)
Example 14
def __init__(self, max_iter=10, initial_strategy='mean', tol=1e-3, f_model="RandomForest"):
    self.max_iter = max_iter
    self.initial_strategy = initial_strategy
    self.initial_imputer = Imputer(strategy=initial_strategy)
    self.tol = tol
    self.f_model = f_model
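Example 14 uses Imputer only for the initial mean fill of an iterative, model-based imputation loop, in which the f_model regressor then refines the filled values. Recent scikit-learn versions ship this pattern as sklearn.impute.IterativeImputer (still experimental, so it must be enabled explicitly). A rough sketch of the equivalent setup, with RandomForestRegressor standing in for f_model="RandomForest":

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# max_iter, tol and initial_strategy line up with the arguments above;
# the estimator refines each column's missing entries from the others.
imp = IterativeImputer(estimator=RandomForestRegressor(n_estimators=10),
                       max_iter=10, tol=1e-3, initial_strategy='mean')
X = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0], [7.0, np.nan]])
print(imp.fit_transform(X))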
Example 15
def data_handlemissing(dataframe, pipeline):
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=0, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns=dataframe.columns)
        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e))
Example 16
def test_conversion_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    sh = scikit_data.data.shape

    rn.seed(0)
    missing_value_indices = [
        (rn.randint(sh[0]), rn.randint(sh[1])) for _ in range(sh[0])
    ]

    for strategy in ["mean", "median", "most_frequent"]:
        for missing_value in [0, "NaN", -999]:
            X = np.array(scikit_data.data).copy()
            for i, j in missing_value_indices:
                X[i, j] = missing_value

            model = Imputer(missing_values=missing_value, strategy=strategy)
            model = model.fit(X)
            tr_X = model.transform(X.copy())

            spec = converter.convert(model, scikit_data.feature_names, "out")

            input_data = [dict(zip(scikit_data.feature_names, row)) for row in X]
            output_data = [{"out": row} for row in tr_X]
            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
Example 17
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = Imputer()
        spec = converter.convert(model, "data", "out")

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        spec = converter.convert(model, "data", "out")
Example 18
def __init__(self):
    self.supports_output = True
    self.default_transformation_pipeline = [Imputer(strategy='mean'), StandardScaler()]
Example 19
def createAuto(target):
    # window size: take the previous 12 values of the target
    # (range(1, 13) runs from 1 to 12, excluding 13)
    win = 13
    dataAuto = np.empty((len(target), win - 1))
    for i in range(1, win):
        dataAuto[:, i - 1] = shift2(target, i)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    transformedDataAuto = imp.fit_transform(dataAuto)
    X_auto = transformedDataAuto
    return X_auto

# parameters: 'X' the predictors, 'y' the target, 'cvFolds' number of folds, 'estimator' machine learning algorithm
# returns: the R squared for each fold
Example 20
def fit(self, X, y=None):
    self.imp = Imputer(strategy=self.strategy)
    self.imp.fit(X)
    self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
    return self
Example 21
def train_model(titanic_data_path, model_output_path):
    print('Loading the data...')
    try:
        with tf.gfile.Open(titanic_data_path, 'r') as data_file:
            train_df = pd.read_csv(data_file)
        print('Number of samples: {}'.format(train_df.shape[0]))

        target_name = 'Survived'
        feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']

        print('Preparing the features...')
        train_features = train_df[feature_names].copy()
        train_features['Age'] = Imputer().fit_transform(
            train_features['Age'].values.reshape(-1, 1))
        embarked = train_features['Embarked']
        train_features['Embarked'] = embarked.fillna(embarked.mode()[0])
        train_features = pd.get_dummies(train_features)
        train_target = train_df[target_name]

        print('Training the model...')
        parameters = {'max_depth': [2, 3, 4, 5, 6, 7],
                      'n_estimators': [50, 100, 150, 200]}
        gsc = GridSearchCV(GradientBoostingClassifier(), parameters, n_jobs=-1, cv=5)
        gsc.fit(train_features, train_target)
        print('Best Hyper Parameters: {}'.format(gsc.best_params_))
        print('Accuracy: {}'.format(gsc.best_score_))

        with tf.gfile.Open(model_output_path, 'wb') as model_file:
            joblib.dump(gsc.best_estimator_, model_file, protocol=1)
    except Exception as e:
        print('Error: {}'.format(e))
Example 22
def test_model_imputer(self):
    model = Imputer(missing_values="NaN", strategy="mean", axis=0)
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    # The conversion works but internally scikit-learn converts
    # everything into float before looking into missing values.
    # There is no nan integer. The runtime is not tested
    # in this case.
    model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                 [("input", Int64TensorType([None, 2]))])
    self.assertTrue(model_onnx is not None)
Example 23
def test_imputer_int_inputs(self):
    model = Imputer(missing_values="NaN", strategy="mean", axis=0)
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)

    model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                 [("input", Int64TensorType([None, 2]))])
    self.assertEqual(len(model_onnx.graph.node), 2)

    # Last node should be the Imputer
    outputs = model_onnx.graph.output
    self.assertEqual(len(outputs), 1)
    self.assertEqual(outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)
Example 24
def test_transform_series_int(self):
    arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
    s = pdml.ModelSeries(arr, index='a b c d e f g h i'.split(' '))

    # reshape arr to 2d
    arr = arr.reshape(-1, 1)

    if pd.compat.PY3:
        # MinMaxScaler raises TypeError in ufunc under Python 3
        models = ['Binarizer', 'Imputer', 'StandardScaler']
    else:
        models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']
    for model in models:
        mod1 = getattr(s.preprocessing, model)()
        mod2 = getattr(pp, model)()

        s.fit(mod1)
        mod2.fit(arr)

        result = s.transform(mod1)
        expected = mod2.transform(arr).flatten()

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)

        mod1 = getattr(s.preprocessing, model)()
        mod2 = getattr(pp, model)()

        result = s.fit_transform(mod1)
        expected = mod2.fit_transform(arr).flatten()

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)
Example 25
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    # imputer = Imputer(strategy='mean', axis=0)
    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)
    return df
Example 26
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)
    return df
Example 27
def impute_and_scale(df, scaling='std', imputing='mean', dropna='all'):
    """Impute missing values and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    imputing : 'mean', 'median', 'most_frequent', or None, optional (default 'mean')
        imputation strategy; None skips imputation
    dropna : 'all', 'any', or None, optional (default 'all')
        how to drop columns containing missing values; None keeps them
        and zero-fills the completely empty ones
    """
    if dropna:
        df = df.dropna(axis=1, how=dropna)
    else:
        empty_cols = df.columns[df.notnull().sum() == 0]
        df[empty_cols] = 0

    if imputing is None or imputing.lower() == 'none':
        mat = df.values
    else:
        imputer = Imputer(strategy=imputing)
        mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)
    return df
Example 28
def impute_and_scale_array(mat, scaling=None):
    """Impute missing values with mean and scale data included in numpy array.

    Parameters
    ----------
    mat : numpy array
        Array to scale
    scaling : string
        String describing type of scaling to apply.
        Options recognized: 'maxabs', 'minmax', 'std'.
        'maxabs' : scales data to range [-1 to 1].
        'minmax' : scales data to range [0 to 1].
        'std' : scales data to normal variable with mean 0 and standard deviation 1.
        (Default: None, no scaling).

    Returns
    -------
    Returns the numpy array imputed with the mean value of the
    column and scaled by the method specified. If no scaling method is
    specified, it returns the imputed numpy array.
    """
    # imputer = Imputer(strategy='mean', axis=0, copy=False)
    # imputer = SimpleImputer(strategy='mean', copy=False)
    # Next line is from conditional import. axis=0 is default
    # in old version so it is not necessary.
    imputer = Imputer(strategy='mean', copy=False)
    imputer.fit_transform(mat)
    return scale_array(mat, scaling)
Example 29
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5)
Example 30
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(p, X, y, cv=5)