Python source code examples: sklearn.datasets.fetch_openml()
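The snippets below show how open-source projects call sklearn.datasets.fetch_openml(). As a quick orientation, here is a minimal sketch of the basic call, assuming only the public scikit-learn API ('mnist_784' is an illustrative dataset name that also appears in several examples below):

from sklearn.datasets import fetch_openml

# The result is a Bunch with .data, .target, .DESCR, .details, etc.
mnist = fetch_openml('mnist_784', version=1)

# With return_X_y=True, only the (X, y) pair is returned.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)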
Example 1
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)

    tmp = data_set.target
    tmpL = [1 if i == "tested_positive" else -1 for i in tmp]
    data_set.target = tmpL

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test))
Example 2
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())

    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)

    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
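The test above pins down the caching contract. As a hedged usage sketch (the dataset id and cache directory are illustrative, not taken from the test), the same behavior looks like this from a user's perspective:

from sklearn.datasets import fetch_openml

# The first call downloads from OpenML and writes to data_home; the second
# call with the same data_home is served from the on-disk cache.
X1, y1 = fetch_openml(data_id=61, cache=True, data_home='/tmp/sk_cache',
                      return_X_y=True)
X2, y2 = fetch_openml(data_id=61, cache=True, data_home='/tmp/sk_cache',
                      return_X_y=True)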
Example 3
def test_warn_ignore_attribute(monkeypatch, gzip_response):
    data_id = 40966
    expected_row_id_msg = "target_column={} has flag is_row_identifier."
    expected_ignore_msg = "target_column={} has flag is_ignore."
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column='MouseID',
                         cache=False)
    assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column='Genotype',
                         cache=False)
    # multi column test
    assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column=['MouseID', 'class'],
                         cache=False)
    assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column=['Genotype', 'class'],
                         cache=False)
Example 4
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
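This helper appears to be adapted from scikit-learn's Gaussian process regression example on the Mauna Loa CO2 data (OpenML data_id 41187); it collapses the raw measurements into per-month averages. A minimal, hedged usage sketch (the plot is illustrative, not part of the original snippet):

import matplotlib.pyplot as plt

months, avg_ppmvs = load_mauna_loa_atmospheric_co2()
plt.plot(months, avg_ppmvs)  # monthly-averaged CO2 concentration over time
plt.xlabel('Year (fractional month)')
plt.ylabel('CO2 (ppmv)')
plt.show()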
Example 5
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import ShuffleSplit, KFold, cross_val_score

    db_name = 'australian'
    hid_nums = [100, 200, 300]

    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)
    data_set.target = [1 if i == 1 else -1
                       for i in data_set.target.astype(int)]

    for hid_num in hid_nums:
        print(hid_num, end=' ')
        e = ELM(hid_num)
        ave = 0
        for i in range(10):
            cv = KFold(n_splits=5, shuffle=True)
            scores = cross_val_score(
                e, data_set.data, data_set.target,
                cv=cv, scoring='accuracy', n_jobs=-1)
            ave += scores.mean()
        ave /= 10
        print("Accuracy: %0.3f " % (ave))
Example 6
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(data_set.target)

    print(db_name)

    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))

    print('ELM', hid_num)
    e = ELM(hid_num)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))
Example 7
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_openml(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except Exception:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if (
        len(target.shape) > 1 and target.shape[1] > X.shape[1]
    ):  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != "data" and k != "target" and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [
            i
            for i in range(X.shape[1])
            if "str" in str(type(unpack(X[0, i])))
            or "unicode" in str(type(unpack(X[0, i])))
        ]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            # NOTE: categorical_features was removed in scikit-learn 0.24;
            # recent versions would use a ColumnTransformer instead.
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except Exception:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
Example 8
def fetch_employee_salaries():
    """Fetch the employee_salaries dataset.

    The employee_salaries dataset contains information about annual salaries
    (year 2016) for more than 9,000 employees of Montgomery County
    (Maryland, US).

    Returns
    -------
    dict
        A dictionary containing:

        - a short description of the dataset (under the ``DESCR`` key)
        - the tabular data (under the ``data`` key)
        - the target (under the ``target`` key)

    References
    ----------
    https://catalog.data.gov/dataset/employee-salaries-2016
    """
    data = fetch_openml(data_id=42125, as_frame=True)
    data.data['Current Annual Salary'] = data['target']
    return data
    # link dead.
    # return fetch_dataset(EMPLOYEE_SALARIES_CONFIG, show_progress=False)
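A hedged usage sketch (not from the original project): with as_frame=True the data attribute is a pandas DataFrame, so the salary column added above can be inspected directly. This assumes pandas is installed, which as_frame=True requires.

salaries = fetch_employee_salaries()
print(salaries.DESCR[:200])                           # short dataset description
print(salaries.data['Current Annual Salary'].head())  # target merged into the frame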
Example 9
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [None if is_scalar_nan(idx) else cat[int(idx)]
                      for idx in data_bunch.data[:, col_idx]]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
        # missing values
        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i))
Example 10
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id, target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None
Example 11
def test_fetch_openml_inactive(monkeypatch, gzip_response):
    # fetch inactive dataset by id
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    glas2 = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=data_id, cache=False)
    assert glas2.data.shape == (163, 9)

    # fetch inactive dataset by name and version
    glas2_by_version = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=None, name="glass2", version=1, cache=False)
    assert int(glas2_by_version.details['id']) == data_id
Example 12
def test_fetch_nonexiting(monkeypatch, gzip_response):
    # there is no active version of glass2
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # Note that we only want to search by name (not data id)
    assert_raise_message(ValueError, "No active dataset glass2 found",
                         fetch_openml, name='glass2', cache=False)
Example 13
def test_string_attribute(monkeypatch, gzip_response):
    data_id = 40945
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_raise_message(ValueError,
                         'STRING attributes are not yet supported',
                         fetch_openml, data_id=data_id, cache=False)
Example 14
def test_dataset_with_openml_error(monkeypatch, gzip_response):
    data_id = 1
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML registered a problem with the dataset. It might be unusable. "
        "Error:",
        fetch_openml, data_id=data_id, cache=False
    )
Example 15
def test_dataset_with_openml_warning(monkeypatch, gzip_response):
    data_id = 3
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML raised a warning on the dataset. It might be unusable. "
        "Warning:",
        fetch_openml, data_id=data_id, cache=False
    )
Example 16
def test_illegal_column(monkeypatch, gzip_response):
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column='undefined', cache=False)

    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column=['undefined', 'class'],
                         cache=False)
Example 17
def test_fetch_openml_raises_illegal_argument():
    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name=None,
                         version="version")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name",
                         version="version")

    assert_raise_message(ValueError, "Neither name nor data_id are provided. "
                         "Please provide name or data_id.", fetch_openml)
Example 18
def __init__(self, k):
    self.mnist = datasets.fetch_openml('mnist_784', data_home='mnist_dataset/')
    self.data, self.target = self.mnist.data, self.mnist.target
    # Make an array of indices the size of MNIST to use for making the data sets.
    # This array is in random order, so we can use it to scramble up the MNIST data.
    self.indx = np.random.choice(len(self.target), 70000, replace=False)
    # Initialising the classifier
    self.classifier = KNeighborsClassifier(n_neighbors=k)
Example 19
def get_data(num_samples):
    mnist = fetch_openml('mnist_784')
    torch.manual_seed(0)
    X = mnist.data.astype('float32').reshape(-1, 1, 28, 28)
    y = mnist.target.astype('int64')
    X, y = shuffle(X, y)
    X, y = X[:num_samples], y[:num_samples]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    X_train /= 255
    X_test /= 255
    return X_train, X_test, y_train, y_test
Example 20
def get_mnist():
    x, y = fetch_openml('mnist_784', data_home='~', version=1, return_X_y=True)
    # np.int was removed in NumPy 1.24; the built-in int behaves the same here.
    y = y.astype(int)
    return x, y
Example 21
def __call__(self):
    dataset = fetch_openml(self.name)
    return dataset.data, dataset.target
Example 22
def objective(trial):
    fmnist = fetch_openml(name="Fashion-MNIST", version=1)
    classes = list(set(fmnist.target))

    # For demonstration purposes, only use a subset of the dataset.
    n_samples = 4000
    data = fmnist.data[:n_samples]
    target = fmnist.target[:n_samples]

    x_train, x_valid, y_train, y_valid = train_test_split(data, target)

    clf = MLPClassifier(
        hidden_layer_sizes=tuple(
            [trial.suggest_int("n_units_l{}".format(i), 32, 64) for i in range(3)]
        ),
        learning_rate_init=trial.suggest_loguniform("lr_init", 1e-5, 1e-1),
    )

    for step in range(100):
        clf.partial_fit(x_train, y_train, classes=classes)
        value = clf.score(x_valid, y_valid)

        # Report intermediate objective value.
        trial.report(value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.TrialPruned()

    return value
Example 23
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X /= 255.
    X -= X.mean(axis=0)

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return model_selection.train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.))
Example 24
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)
    np.random.shuffle(X)
    x_train, x_test = X[:60000], X[60000:]
    return x_train, x_test
Example 25
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)
    np.random.shuffle(X)
    x_train_2d, x_test_2d = X[:60000], X[60000:]
    x_train_4d = x_train_2d.reshape((60000, 28, 28, 1))
    x_test_4d = x_test_2d.reshape((10000, 28, 28, 1))
    return x_train_4d, x_test_4d
Example 26
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = X / 255.
    X -= X.mean(axis=0)
    x_train, x_test = model_selection.train_test_split(
        X.astype(np.float32),
        test_size=(1 / 7.)
    )
    return x_train, x_test
Example 27
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = X.reshape(-1, 28, 28, 1)
    X /= 255.

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.)
    )