Python source code examples: sklearn.datasets.load_breast_cancer()
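The snippets below are gathered from real projects that call sklearn.datasets.load_breast_cancer(). As a quick orientation before the project examples, here is a minimal, self-contained sketch of the two calling conventions that recur throughout: getting a Bunch object, and getting a plain (X, y) pair via return_X_y=True. Variable names here are illustrative only.

from sklearn.datasets import load_breast_cancer

# As a Bunch: data, target, feature_names, target_names are attributes.
bunch = load_breast_cancer()
print(bunch.data.shape)      # (569, 30)
print(bunch.target_names)    # ['malignant' 'benign']

# As a plain (X, y) pair, the form most examples below use.
X, y = load_breast_cancer(return_X_y=True)
print(X.shape, y.shape)      # (569, 30) (569,)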
Example 1
def setUp(self) -> None:
    self.random_state = 0
    d: dict = load_breast_cancer()
    X: DataFrame = DataFrame(d['data'], columns=d['feature_names'])
    self.col_ordinal = X.columns.to_list()
    np.random.seed(self.random_state)
    s = np.array(['a', 'b', 'c'])
    X['cat alpha'] = s[np.random.randint(0, 3, len(X))]
    X['cat num'] = np.random.randint(0, 3, len(X))
    self.col_categorical = ['cat alpha', 'cat num']
    s = np.array(['a', 'b'])
    X['bin alpha'] = s[np.random.randint(0, 2, len(X))]
    X['bin num'] = np.random.randint(0, 2, len(X))
    self.col_binary = ['bin alpha', 'bin num']
    self.X = X
    self.y: ndarray = d['target']
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(self.X, self.y, test_size=0.4, random_state=self.random_state)
Example 2
def setUp(self):
    self.roc_floor = 0.9
    self.accuracy_floor = 0.9
    random_state = 42
    X, y = load_breast_cancer(return_X_y=True)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)
    classifiers = [DecisionTreeClassifier(random_state=random_state),
                   LogisticRegression(random_state=random_state),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=random_state),
                   GradientBoostingClassifier(random_state=random_state)]
    self.clf = DES_LA(classifiers, local_region_size=30)
    self.clf.fit(self.X_train, self.y_train)
Example 3
def setUp(self):
    self.X, self.y = load_breast_cancer(return_X_y=True)
    self.n_clusters = 5
    self.n_estimators = 3
    # Initialize a set of estimators
    estimators = [KMeans(n_clusters=self.n_clusters),
                  MiniBatchKMeans(n_clusters=self.n_clusters),
                  AgglomerativeClustering(n_clusters=self.n_clusters)]
    # Clusterer ensemble without initializing a new class
    self.original_labels = np.zeros([self.X.shape[0], self.n_estimators])
    for i, estimator in enumerate(estimators):
        estimator.fit(self.X)
        self.original_labels[:, i] = estimator.labels_
Example 4
def setUp(self):
    self.roc_floor = 0.9
    self.accuracy_floor = 0.9
    random_state = 42
    X, y = load_breast_cancer(return_X_y=True)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)
    classifiers = [DecisionTreeClassifier(random_state=random_state),
                   LogisticRegression(random_state=random_state),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=random_state),
                   GradientBoostingClassifier(random_state=random_state)]
    self.clf = Stacking(classifiers, n_folds=4)
    self.clf.fit(self.X_train, self.y_train)
Example 5
def setUp(self):
    self.roc_floor = 0.9
    self.accuracy_floor = 0.9
    random_state = 42
    X, y = load_breast_cancer(return_X_y=True)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)
    classifiers = [DecisionTreeClassifier(random_state=random_state),
                   LogisticRegression(random_state=random_state),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=random_state),
                   GradientBoostingClassifier(random_state=random_state)]
    self.clf = SimpleClassifierAggregator(classifiers, method='average')
Example 6
def setUp(self):
    self.roc_floor = 0.9
    self.accuracy_floor = 0.9
    random_state = 42
    X, y = load_breast_cancer(return_X_y=True)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)
    classifiers = [DecisionTreeClassifier(random_state=random_state),
                   LogisticRegression(random_state=random_state),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=random_state),
                   GradientBoostingClassifier(random_state=random_state)]
    self.clf = SimpleClassifierAggregator(classifiers, method='average')
    self.clf.fit(self.X_train, self.y_train)
Example 7
def setUp(self):
    self.roc_floor = 0.9
    self.accuracy_floor = 0.9
    random_state = 42
    X, y = load_breast_cancer(return_X_y=True)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)
    clf_weights = np.array([0.1, 0.4, 0.1, 0.2, 0.2])
    classifiers = [DecisionTreeClassifier(random_state=random_state),
                   LogisticRegression(random_state=random_state),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=random_state),
                   GradientBoostingClassifier(random_state=random_state)]
    self.clf = SimpleClassifierAggregator(classifiers, method='average',
                                          weights=clf_weights)
    self.clf.fit(self.X_train, self.y_train)
Example 8
def setUp(self):
    self.roc_floor = 0.9
    self.accuracy_floor = 0.9
    random_state = 42
    X, y = load_breast_cancer(return_X_y=True)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)
    classifiers = [DecisionTreeClassifier(random_state=random_state),
                   LogisticRegression(random_state=random_state),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=random_state),
                   GradientBoostingClassifier(random_state=random_state)]
    self.clf = SimpleClassifierAggregator(classifiers,
                                          method='maximization')
    self.clf.fit(self.X_train, self.y_train)
Example 9
def setUp(self):
    self.roc_floor = 0.9
    self.accuracy_floor = 0.9
    random_state = 42
    X, y = load_breast_cancer(return_X_y=True)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)
    classifiers = [DecisionTreeClassifier(random_state=random_state),
                   LogisticRegression(random_state=random_state),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=random_state),
                   GradientBoostingClassifier(random_state=random_state)]
    self.clf = SimpleClassifierAggregator(classifiers,
                                          method='median')
    self.clf.fit(self.X_train, self.y_train)
Example 10
def main():
    dataset = datasets.load_breast_cancer()
    features = dataset.data
    labels = dataset.target
    num_features = features.shape[1]
    features = StandardScaler().fit_transform(features)
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels
    )
    model = NearestNeighbor(train_features, train_labels, num_features)
    model.predict(test_features, test_labels, result_path="./results/nearest_neighbor/")
Example 11
def test_fit_2(self):
    """Tests GridSearchCV fit() with different data."""
    x_np, y_np = datasets.load_breast_cancer(return_X_y=True)
    x = ds.array(x_np, block_size=(100, 10))
    x = StandardScaler().fit_transform(x)
    y = ds.array(y_np.reshape(-1, 1), block_size=(100, 1))
    parameters = {'c': [0.1], 'gamma': [0.1]}
    csvm = CascadeSVM()
    searcher = GridSearchCV(csvm, parameters, cv=5)
    searcher.fit(x, y)
    self.assertTrue(hasattr(searcher, 'best_estimator_'))
    self.assertTrue(hasattr(searcher, 'best_score_'))
    self.assertTrue(hasattr(searcher, 'best_params_'))
    self.assertTrue(hasattr(searcher, 'best_index_'))
    self.assertTrue(hasattr(searcher, 'scorer_'))
    self.assertEqual(searcher.n_splits_, 5)
Example 12
def test_save_load_classifier(self):
    X, y = datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    k = 4
    classifier_before = pyfms.Classifier(X.shape[1], k=k)
    classifier_before.fit(X_train, y_train, nb_epoch=1000)
    weights_before = classifier_before.get_weights()
    accuracy_before = accuracy_score(y_test, classifier_before.predict(X_test))
    classifier_file = os.path.join(self.workspace, 'classifier.fm')
    classifier_before.save_weights(classifier_file)
    classifier_after = pyfms.Classifier(X.shape[1])
    classifier_after.load_weights(classifier_file)
    weights_after = classifier_after.get_weights()
    accuracy_after = accuracy_score(y_test, classifier_after.predict(X_test))
    for wb, wa in zip(weights_before, weights_after):
        np.testing.assert_array_equal(wb, wa)
    self.assertEqual(accuracy_before, accuracy_after)
Example 13
def test_select_fdr_int(self):
    model = SelectFdr()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fdr",
        [("input", Int64TensorType([None, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnSelectFdr",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')",
    )
Example 14
def test_select_fwe_int(self):
    model = SelectFwe()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fwe",
        [("input", Int64TensorType([None, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnSelectFwe",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')",
    )
Example 15
def test_select_fdr_float(self):
    model = SelectFdr()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fdr",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSelectFdr",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')",
    )
Example 16
def test_select_fwe_float(self):
    model = SelectFwe()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fwe",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSelectFwe",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')",
    )
Example 17
def test_not_labels():
    data = load_breast_cancer()
    X = data.data
    y = data.target
    # convert class values to [0,2]
    # y = y * 2
    # Splitting data into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)
    # sklearn
    clf_sklearn = linear_model.LogisticRegression()
    clf_sklearn.fit(X_train, y_train)
    y_pred_sklearn = clf_sklearn.predict(X_test)
    # h2o
    clf_h2o = h2o4gpu.LogisticRegression()
    clf_h2o.fit(X_train, y_train)
    y_pred_h2o = clf_h2o.predict(X_test)
    assert np.allclose(accuracy_score(y_test, y_pred_sklearn),
                       accuracy_score(y_test, y_pred_h2o.squeeze()))
Example 18
def load_dataset(encode_labels, rng):
    # Generate a classification dataset
    data = load_breast_cancer()
    X = data.data
    y = data.target
    if encode_labels is not None:
        y = np.take(encode_labels, y)
    # Split the data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=rng)
    # Scale the variables to have 0 mean and unit variance
    scalar = StandardScaler()
    X_train = scalar.fit_transform(X_train)
    X_test = scalar.transform(X_test)
    # Split the data into training and DSEL for DS techniques
    X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train,
                                                        test_size=0.5,
                                                        random_state=rng)
    # Considering a pool composed of 10 base classifiers
    # Calibrating Perceptrons to estimate probabilities
    return X_dsel, X_test, X_train, y_dsel, y_test, y_train
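The trailing comments in this helper mention a pool of 10 base classifiers built from calibrated Perceptrons, but the snippet returns before constructing it. A minimal sketch of what such a pool could look like follows; the use of BaggingClassifier and CalibratedClassifierCV is an assumption for illustration, not code from the original project.

from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import Perceptron

# Hypothetical pool: 10 bagged Perceptrons, calibrated so they expose predict_proba.
calibrated_perceptron = CalibratedClassifierCV(Perceptron(max_iter=100), cv=5)
pool_classifiers = BaggingClassifier(calibrated_perceptron, n_estimators=10, random_state=42)
# pool_classifiers.fit(X_train, y_train)  # using the training split returned by load_dataset()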
Example 19
def test_meta_no_pool_of_classifiers(knn_methods):
    rng = np.random.RandomState(123456)
    data = load_breast_cancer()
    X = data.data
    y = data.target
    # Split the data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=rng)
    # Scale the variables to have 0 mean and unit variance
    scalar = StandardScaler()
    X_train = scalar.fit_transform(X_train)
    X_test = scalar.transform(X_test)
    meta_des = METADES(knn_classifier=knn_methods, random_state=rng,
                       DSEL_perc=0.5)
    meta_des.fit(X_train, y_train)
    assert np.isclose(meta_des.score(X_test, y_test), 0.9095744680851063)
Example 20
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array
        y (array-like): Labels array
        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
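A possible invocation of this helper for the breast-cancer branch is sketched below; the exact shape of dataset_properties is inferred from the branches above, so treat it as an assumption.

# Hypothetical usage of get_sample_dataset for the 'breast_cancer' branch.
X, y, splits = get_sample_dataset({'type': 'breast_cancer'})
for train_idx, test_idx in splits:
    print(train_idx.shape, test_idx.shape)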
Example 21
def setUp(self):
    data = load_breast_cancer()
    Xtr, Xte = train_test_split(data.data, shuffle=True, test_size=.2)
    self.Xtr = torch.tensor(Xtr)
    self.Xte = torch.tensor(Xte)
    self.Str = ['aaba', 'bac', 'abac', 'waibba', 'aaiicaaac']
    self.Ste = ['aaa', 'bac', 'bababbwa']
Example 22
def setUp(self):
    data = load_breast_cancer()
    self.Xtr, self.Xte, self.Ytr, self.Yte = train_test_split(data.data, data.target, shuffle=True, train_size=50)
    self.Xtr = preprocessing.normalization(self.Xtr)
    self.Xte = preprocessing.normalization(self.Xte)
    self.KLtr = [pairwise_mk.homogeneous_polynomial_kernel(self.Xtr, degree=d) for d in range(1, 6)]
    self.KLte = [pairwise_mk.homogeneous_polynomial_kernel(self.Xte, self.Xtr, degree=d) for d in range(1, 6)]
    self.KLtr_g = HPK_generator(self.Xtr, degrees=range(1, 6))
    self.KLte_g = HPK_generator(self.Xte, self.Xtr, degrees=range(1, 6))
Example 23
def setUp(self):
    data = load_breast_cancer()
    self.Xtr, self.Xte, self.Ytr, self.Yte = train_test_split(data.data, data.target, shuffle=True, train_size=50)
Example 24
def main(export_dir):
    ## load dataset
    x, y = load_dataset(return_X_y=True)
    ## train xgb
    xgbc = xgb.XGBClassifier(n_estimators=100, max_depth=7)
    xgbc.fit(x, y)
    # transpile model
    os.mkdir(os.path.join(export_dir, "xgb"))
    transpiler = Transpiler(xgbc)
    transpiler.transpile(package_name="xgb", method_name="predict", export_method=True)
    transpiler.write(os.path.join(export_dir, "xgb"))
    print("xgb done.")
    ## train rfc
    rfc = RFC(n_estimators=100, max_depth=7)
    rfc.fit(x, y)
    # transpile model
    os.mkdir(os.path.join(export_dir, "rfc"))
    transpiler = Transpiler(rfc)
    transpiler.transpile(package_name="rfc", method_name="predict", export_method=True)
    transpiler.write(os.path.join(export_dir, "rfc"))
    print("rfc done.")
Example 25
def fbt_vs_fb_cancer(iterations=30, treeNum=1, treeDepth=4, numThresh=9, filename=None):
    bc = load_breast_cancer()
    X = pd.DataFrame(bc.data, columns=bc.feature_names)
    y = bc.target
    X.drop(columns=get_corr_columns(X), inplace=True)
    return fbt_vs_fb(X, y, iterations=iterations, treeNum=treeNum, treeDepth=treeDepth, numThresh=numThresh,
                     filename=filename)
Example 26
def setUpClass(cls):
    cls.bc = load_breast_cancer()
Example 27
def setUpClass(cls):
    cls.bc = load_breast_cancer()
Example 28
def run():
    X, y = datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2
    )
    clf = get_model()  # Parameters are injected automatically.
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
Example 29
def run(_config):
    X, y = datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2
    )
    clf = get_model(
        _config["C"], _config["gamma"], _config["kernel"]
    )  # Parameters are passed explicitly.
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
Example 30
def load_breast_cancer_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the breast cancer dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    bc = load_breast_cancer()
    X = pd.DataFrame.from_records(data=bc.data, columns=bc.feature_names)
    if include_tgt:
        X[tgt_name] = bc.target
    return X if not shuffle else shuffle_dataframe(X)
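A short usage sketch of this helper follows; the call and its printed shapes are illustrative, and shuffle_dataframe is assumed to be a utility available in the same module as in the original project.

# Hypothetical usage: load the frame with the target appended under a custom name.
df = load_breast_cancer_df(include_tgt=True, tgt_name="label")
print(df.shape)                    # (569, 31): 30 features plus the 'label' column
print(df["label"].value_counts())  # class balance of the target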