Python source examples: sklearn.metrics.get_scorer()
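Before the examples, a minimal usage sketch (the dataset and classifier below are placeholders, not taken from any of the examples): get_scorer(name) looks up a scorer by its string name and returns a callable that is invoked as scorer(estimator, X, y); recent scikit-learn releases also provide get_scorer_names() to list the valid names.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

scorer = get_scorer('accuracy')     # look up a scorer by name
print(scorer(clf, X_test, y_test))  # a scorer is called as scorer(estimator, X, y)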
Example 1
def test_regression_scorers():
    # Test regression scorers.
    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = Ridge()
    clf.fit(X_train, y_train)
    score1 = get_scorer('r2')(clf, X_test, y_test)
    score2 = r2_score(y_test, clf.predict(X_test))
    assert_almost_equal(score1, score2)
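A point the example above relies on implicitly: scorers returned by get_scorer always follow a "greater is better" convention, so error metrics are exposed under 'neg_*' names. A small sketch of that convention, reusing the diabetes data purely for illustration:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import get_scorer, mean_squared_error
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = Ridge().fit(X_train, y_train)

# 'neg_mean_squared_error' is the MSE with its sign flipped, so higher is better.
neg_mse = get_scorer('neg_mean_squared_error')(reg, X_test, y_test)
assert abs(-neg_mse - mean_squared_error(y_test, reg.predict(X_test))) < 1e-8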
Example 2
def test_thresholded_scorers_multilabel_indicator_data():
    # Test that the scorers work with multilabel-indicator format
    # for multilabel and multi-output multi-class classifiers.
    X, y = make_multilabel_classification(allow_unlabeled=False,
                                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Multi-output multi-class predict_proba
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multi-output multi-class decision_function
    # TODO Is there any yet?
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    clf._predict_proba = clf.predict_proba
    clf.predict_proba = None
    clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]

    y_proba = clf.decision_function(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multilabel predict_proba
    clf = OneVsRestClassifier(DecisionTreeClassifier())
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
    assert_almost_equal(score1, score2)

    # Multilabel decision function
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    assert_almost_equal(score1, score2)
Example 3
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
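CLUSTER_SCORERS and cluster_module above are names local to scikit-learn's test module. Outside the tests, the same supervised clustering metrics are simply requested by name; a sketch assuming 'adjusted_rand_score' as the scorer string:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score, get_scorer
from sklearn.model_selection import train_test_split

X, y = make_blobs(random_state=0, centers=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_train)

# The scorer compares km.predict(X_test) against the reference labels y_test.
s1 = get_scorer('adjusted_rand_score')(km, X_test, y_test)
s2 = adjusted_rand_score(y_test, km.predict(X_test))
assert abs(s1 - s2) < 1e-12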
Example 4
def test(dataset_loader_test, model_persister,
         scoring=None, model_version=None):
    with timer(logger.info, "Loading data"):
        X, y = dataset_loader_test()

    with timer(logger.info, "Reading model"):
        model = model_persister.read(version=model_version)
        logger.info(
            'Loaded model version {}'.format(model.__metadata__['version']))

    if not (hasattr(model, 'score') or scoring is not None):
        raise ValueError(
            "Your model doesn't seem to implement a 'score' method. You may "
            "want to define a 'scoring' option in the configuration."
        )

    with timer(logger.info, "Applying model"):
        scores = []
        if scoring is not None:
            if not isinstance(scoring, dict):
                scoring = {'score': scoring}
            for key, scorer in scoring.items():
                scorer = get_scorer(scorer)
                scores.append("{}: {}".format(key, scorer(model, X, y)))
        else:
            scores.append("score: {}".format(model.score(X, y)))

    logger.info("Score: {}.".format('\n '.join(scores)))
    return scores
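The function above accepts either a single scoring spec or a dict of named specs and resolves each one through get_scorer before applying it to the loaded model. A stripped-down sketch of the same pattern outside that codebase (the scoring dict and the fitted model here are hypothetical):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression(random_state=0).fit(X_train, y_train)

scoring = {'acc': 'accuracy', 'auc': 'roc_auc'}  # hypothetical configuration
for key, spec in scoring.items():
    scorer = get_scorer(spec)                    # resolve the name to a callable
    print("{}: {}".format(key, scorer(model, X_test, y_test)))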
Example 5
def test_regression_scorers():
    # Test regression scorers.
    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = Ridge()
    clf.fit(X_train, y_train)
    score1 = get_scorer('r2')(clf, X_test, y_test)
    score2 = r2_score(y_test, clf.predict(X_test))
    assert_almost_equal(score1, score2)
Example 6
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
Example 7
def test_thresholded_scorers_multilabel_indicator_data():
    # Test that the scorers work with multilabel-indicator format
    # for multilabel and multi-output multi-class classifiers.
    X, y = make_multilabel_classification(allow_unlabeled=False,
                                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Multi-output multi-class predict_proba
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multi-output multi-class decision_function
    # TODO Is there any yet?
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    clf._predict_proba = clf.predict_proba
    clf.predict_proba = None
    clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]

    y_proba = clf.decision_function(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multilabel predict_proba
    clf = OneVsRestClassifier(DecisionTreeClassifier())
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
    assert_almost_equal(score1, score2)

    # Multilabel decision function
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    assert_almost_equal(score1, score2)
Example 8
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
Example 9
def test_classification_scores():
    # Test classification scorers.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    for prefix, metric in [('f1', f1_score), ('precision', precision_score),
                           ('recall', recall_score),
                           ('jaccard', jaccard_score)]:

        score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='weighted')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_macro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='macro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_micro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='micro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=1)
        assert_almost_equal(score1, score2)

    # test fbeta score that takes an argument
    scorer = make_scorer(fbeta_score, beta=2)
    score1 = scorer(clf, X_test, y_test)
    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
    assert_almost_equal(score1, score2)

    # test that custom scorer can be pickled
    unpickled_scorer = pickle.loads(pickle.dumps(scorer))
    score3 = unpickled_scorer(clf, X_test, y_test)
    assert_almost_equal(score1, score3)

    # smoke test the repr:
    repr(fbeta_score)
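make_scorer, used above to bind beta=2, produces an object that can be passed anywhere a scoring name is accepted, for example to cross_val_score. A short sketch of that hand-off (the dataset and estimator are placeholders):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import cross_val_score

X, y = make_classification(random_state=0)
f2_scorer = make_scorer(fbeta_score, beta=2)  # extra kwargs are bound into the scorer
scores = cross_val_score(LogisticRegression(random_state=0), X, y,
                         scoring=f2_scorer, cv=5)
print(scores.mean())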
Example 10
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
Example 11
def fit(dataset_loader_train, model, model_persister, persist=True,
        activate=True, dataset_loader_test=None, evaluate=False,
        persist_if_better_than=None, scoring=None):
    if persist_if_better_than is not None:
        evaluate = True
        if dataset_loader_test is None:
            raise ValueError(
                "When using 'persist_if_better_than', make sure you also "
                "provide a 'dataset_loader_test'."
            )

    if evaluate and not (hasattr(model, 'score') or scoring is not None):
        raise ValueError(
            "Your model doesn't seem to implement a 'score' method. You may "
            "want to define a 'scoring' option in the configuration."
        )

    if scoring is not None:
        scorer = get_scorer(scoring)
    else:
        def scorer(model, X, y):
            return model.score(X, y)

    with timer(logger.info, "Loading data"):
        X, y = dataset_loader_train()

    with timer(logger.info, "Fitting model"):
        model.fit(X, y)

    if evaluate:
        with timer(logger.debug, "Evaluating model on train set"):
            score_train = scorer(model, X, y)
        annotate(model, {'score_train': score_train})
        logger.info("Train score: {}".format(score_train))

    X, y = None, None
    gc.collect()

    score_test = None
    if evaluate and dataset_loader_test is not None:
        with timer(logger.info, "Loading test data"):
            X_test, y_test = dataset_loader_test()
        with timer(logger.debug, "Evaluating model on test set"):
            score_test = scorer(model, X_test, y_test)
        annotate(model, {'score_test': score_test})
        logger.info("Test score: {}".format(score_test))

    if persist:
        if (persist_if_better_than is not None and
                score_test < persist_if_better_than):
            logger.info("Not persisting model that has a test score "
                        "{} < {}".format(score_test, persist_if_better_than))
        else:
            _persist_model(model, model_persister, activate=activate)

    return model
Example 12
def __init__(self, model, dataset, metric, shuffle_seed=0, data_root=None):
    """Build class that wraps sklearn classifier/regressor CV score for use as an objective function.

    Parameters
    ----------
    model : str
        Which classifier to use, must be key in `MODELS_CLF` or `MODELS_REG` dict depending on if dataset is
        classification or regression.
    dataset : str
        Which data set to use, must be key in `DATA_LOADERS` dict, or name of custom csv file.
    metric : str
        Which sklearn scoring metric to use, in `SCORERS_CLF` list or `SCORERS_REG` dict depending on if dataset is
        classification or regression.
    shuffle_seed : int
        Random seed to use when splitting the data into train and validation in the cross-validation splits. This
        is needed in order to keep the split constant across calls. Otherwise there would be extra noise in the
        objective function for varying splits.
    data_root : str
        Root directory to look for all custom csv files.
    """
    TestFunction.__init__(self)

    data, target, problem_type = load_data(dataset, data_root=data_root)
    assert problem_type in (ProblemType.clf, ProblemType.reg)
    self.is_classifier = problem_type == ProblemType.clf

    # Do some validation on loaded data
    assert isinstance(data, np.ndarray)
    assert isinstance(target, np.ndarray)
    assert data.ndim == 2 and target.ndim == 1
    assert data.shape[0] == target.shape[0]
    assert data.size > 0
    assert data.dtype == np.float_
    assert np.all(np.isfinite(data))  # also catch nan
    assert target.dtype == (np.int_ if self.is_classifier else np.float_)
    assert np.all(np.isfinite(target))  # also catch nan

    model_lookup = MODELS_CLF if self.is_classifier else MODELS_REG
    base_model, fixed_params, api_config = model_lookup[model]

    # New members for model
    self.base_model = base_model
    self.fixed_params = fixed_params
    self.api_config = api_config

    # Always shuffle your data to be safe. Use fixed seed for reprod.
    self.data_X, self.data_Xt, self.data_y, self.data_yt = train_test_split(
        data, target, test_size=0.2, random_state=shuffle_seed, shuffle=True
    )

    assert metric in METRICS, "Unknown metric %s" % metric
    assert metric in METRICS_LOOKUP[problem_type], "Incompatible metric %s with problem type %s" % (
        metric,
        problem_type,
    )
    self.scorer = get_scorer(SklearnModel._METRIC_MAP[metric])
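SklearnModel._METRIC_MAP above is defined elsewhere in that project; it translates the project's own metric names into scikit-learn scorer strings before calling get_scorer. A purely hypothetical sketch of such a mapping (the names and entries are illustrative, not the project's actual table):

from sklearn.metrics import get_scorer

# Hypothetical metric-name -> scorer-name table; the real _METRIC_MAP may differ.
METRIC_MAP = {"acc": "accuracy", "nll": "neg_log_loss", "mse": "neg_mean_squared_error"}
scorer = get_scorer(METRIC_MAP["acc"])  # resolved once, reused for every evaluation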
Example 13
def evaluate(self, params):
    """Evaluate the sklearn CV objective at a particular parameter setting.

    Parameters
    ----------
    params : dict(str, object)
        The varying (non-fixed) parameter dict to the sklearn model.

    Returns
    -------
    cv_loss : float
        Average loss over CV splits for sklearn model when tested using the settings in params.
    """
    params = dict(params)  # copy to avoid modification of original
    params.update(self.fixed_params)  # add in fixed params

    # now build the skl object
    clf = self.base_model(**params)

    assert np.all(np.isfinite(self.data_X)), "all features must be finite"
    assert np.all(np.isfinite(self.data_y)), "all targets must be finite"

    # Do the x-val, ignore user warn since we expect BO to try weird stuff
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        S = cross_val_score(clf, self.data_X, self.data_y, scoring=self.scorer, cv=CV_SPLITS)
    # Take the mean score across all x-val splits
    cv_score = np.mean(S)

    # Now let's get the generalization error for same hypers
    clf = self.base_model(**params)
    clf.fit(self.data_X, self.data_y)
    generalization_score = self.scorer(clf, self.data_Xt, self.data_yt)

    # get_scorer makes everything a score not a loss, so we need to negate to get the loss back
    cv_loss = -cv_score
    assert np.isfinite(cv_loss), "loss not even finite"
    generalization_loss = -generalization_score
    assert np.isfinite(generalization_loss), "loss not even finite"

    # Unbox to basic float to keep it simple
    cv_loss = cv_loss.item()
    assert isinstance(cv_loss, float)
    generalization_loss = generalization_loss.item()
    assert isinstance(generalization_loss, float)

    # For now, score with same objective. We can later add generalization error
    return cv_loss, generalization_loss
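The negations above work because scorers are uniformly "greater is better", so a loss is recovered by flipping the sign of whatever the scorer returns. A compact sketch of the same idea with cross_val_score (dataset and model are placeholders):

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_val_score

X, y = load_diabetes(return_X_y=True)
scorer = get_scorer('neg_mean_squared_error')
cv_scores = cross_val_score(Ridge(), X, y, scoring=scorer, cv=5)
cv_loss = -np.mean(cv_scores)  # negate the mean score to recover the mean squared error as a loss
print(cv_loss)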
Example 14
def test_classification_scores():
    # Test classification scorers.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    for prefix, metric in [('f1', f1_score), ('precision', precision_score),
                           ('recall', recall_score)]:

        score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='weighted')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_macro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='macro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_micro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='micro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=1)
        assert_almost_equal(score1, score2)

    # test fbeta score that takes an argument
    scorer = make_scorer(fbeta_score, beta=2)
    score1 = scorer(clf, X_test, y_test)
    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
    assert_almost_equal(score1, score2)

    # test that custom scorer can be pickled
    unpickled_scorer = pickle.loads(pickle.dumps(scorer))
    score3 = unpickled_scorer(clf, X_test, y_test)
    assert_almost_equal(score1, score3)

    # smoke test the repr:
    repr(fbeta_score)