Python source code examples: sklearn.datasets.fetch_mldata()
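Note: fetch_mldata depended on the mldata.org service, which is no longer online; the function was deprecated in scikit-learn 0.20 and removed in 0.22, so the snippets below only run on older versions. On current scikit-learn the closest equivalent is fetch_openml; a minimal sketch, using the OpenML dataset name 'mnist_784' (the same 70000 x 784 MNIST data):

from sklearn.datasets import fetch_openml

# fetch_openml returns string labels for MNIST; cast them to int
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target.astype(int)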
Example 1
def load_data(dtype=np.float32, order='F'):
"""Load the data, then cache and memmap the train/test split"""
######################################################################
# Load dataset
safe_print("Loading dataset...")
data = fetch_mldata('MNIST original')
X = check_array(data['data'], dtype=dtype, order=order)
y = data["target"]
# Normalize features
X = X / 255
# Create train-test split (as [Joachims, 2006])
safe_print("Creating train-test split...")
n_train = 60000
X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
y_test = y[n_train:]
return X_train, X_test, y_train, y_test
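With 'MNIST original' the first 60000 rows are the official training set and the last 10000 the test set, but the rows are not shuffled, so order-sensitive estimators benefit from a random permutation of the training split. A minimal sketch, reusing X_train and y_train from above:

import numpy as np

rng = np.random.RandomState(42)        # fixed seed for reproducibility
perm = rng.permutation(len(X_train))   # random row order
X_train, y_train = X_train[perm], y_train[perm]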
Example 2
def test_download(tmpdata):
"""Test that fetch_mldata is able to download and cache a data set."""
_urlopen_ref = datasets.mldata.urlopen
datasets.mldata.urlopen = mock_mldata_urlopen({
'mock': {
'label': sp.ones((150,)),
'data': sp.ones((150, 4)),
},
})
try:
mock = assert_warns(DeprecationWarning, fetch_mldata,
'mock', data_home=tmpdata)
for n in ["COL_NAMES", "DESCR", "target", "data"]:
assert_in(n, mock)
assert_equal(mock.target.shape, (150,))
assert_equal(mock.data.shape, (150, 4))
assert_raises(datasets.mldata.HTTPError,
assert_warns, DeprecationWarning,
fetch_mldata, 'not_existing_name')
finally:
datasets.mldata.urlopen = _urlopen_ref
Example 3
def test_fetch_one_column(tmpdata):
_urlopen_ref = datasets.mldata.urlopen
try:
dataname = 'onecol'
# create fake data set in cache
x = sp.arange(6).reshape(2, 3)
datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})
dset = fetch_mldata(dataname, data_home=tmpdata)
for n in ["COL_NAMES", "DESCR", "data"]:
assert_in(n, dset)
assert_not_in("target", dset)
assert_equal(dset.data.shape, (2, 3))
assert_array_equal(dset.data, x)
# transposing the data array
dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdata)
assert_equal(dset.data.shape, (3, 2))
finally:
datasets.mldata.urlopen = _urlopen_ref
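The second fetch in this test exercises the transpose_data flag: mldata.org served arrays column-wise, so fetch_mldata transposed them by default, and passing transpose_data=False skips that step, which is why the same mock data comes back as (3, 2) instead of (2, 3).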
Example 4
def mnist(missingness="mcar", thr=0.2):
    """Load a corrupted copy of MNIST.
    Parameters
    ----------
    missingness: one of 'mcar', 'mar', 'mnar'
        Type of missingness to introduce into the dataset
    thr: float in [0, 1]
        Fraction of entries to mark as missing
    Returns
    -------
    dict with keys "X" (corrupted data) and "Y" (target labels)
    """
from sklearn.datasets import fetch_mldata
dataset = fetch_mldata('MNIST original')
corruptor = Corruptor(dataset.data, thr=thr)
data = getattr(corruptor, missingness)()
return {"X": data, "Y": dataset.target}
Example 5
def train(self, n_epochs, batch_size=128, save_interval=50):
mnist = fetch_mldata('MNIST original')
X = mnist.data
y = mnist.target
# Rescale [-1, 1]
X = (X.astype(np.float32) - 127.5) / 127.5
for epoch in range(n_epochs):
# Select a random half batch of images
idx = np.random.randint(0, X.shape[0], batch_size)
imgs = X[idx]
# Train the Autoencoder
loss, _ = self.autoencoder.train_on_batch(imgs, imgs)
# Display the progress
print ("%d [D loss: %f]" % (epoch, loss))
# If at save interval => save generated image samples
if epoch % save_interval == 0:
self.save_imgs(epoch, X)
Example 6
def test_download():
"""Test that fetch_mldata is able to download and cache a data set."""
_urlopen_ref = datasets.mldata.urlopen
datasets.mldata.urlopen = mock_mldata_urlopen({
'mock': {
'label': sp.ones((150,)),
'data': sp.ones((150, 4)),
},
})
try:
mock = fetch_mldata('mock', data_home=tmpdir)
for n in ["COL_NAMES", "DESCR", "target", "data"]:
assert_in(n, mock)
assert_equal(mock.target.shape, (150,))
assert_equal(mock.data.shape, (150, 4))
assert_raises(datasets.mldata.HTTPError,
fetch_mldata, 'not_existing_name')
finally:
datasets.mldata.urlopen = _urlopen_ref
Example 7
def test_fetch_one_column():
_urlopen_ref = datasets.mldata.urlopen
try:
dataname = 'onecol'
# create fake data set in cache
x = sp.arange(6).reshape(2, 3)
datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})
dset = fetch_mldata(dataname, data_home=tmpdir)
for n in ["COL_NAMES", "DESCR", "data"]:
assert_in(n, dset)
assert_not_in("target", dset)
assert_equal(dset.data.shape, (2, 3))
assert_array_equal(dset.data, x)
# transposing the data array
dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
assert_equal(dset.data.shape, (3, 2))
finally:
datasets.mldata.urlopen = _urlopen_ref
Example 8
def get_mnist():
""" Gets MNIST dataset """
np.random.seed(1234) # set seed for deterministic ordering
data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
data_path = os.path.join(data_path, '../../data')
mnist = fetch_mldata('MNIST original', data_home=data_path)
p = np.random.permutation(mnist.data.shape[0])
X = mnist.data[p].astype(np.float32)*0.02
Y = mnist.target[p]
return X, Y
Example 9
def get_mnist():
np.random.seed(1234) # set seed for deterministic ordering
data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
data_path = os.path.join(data_path, '../../data')
mnist = fetch_mldata('MNIST original', data_home=data_path)
p = np.random.permutation(mnist.data.shape[0])
X = mnist.data[p].astype(np.float32)*0.02
Y = mnist.target[p]
return X, Y
Example 10
def MNIST_dataload():
from sklearn.datasets import fetch_mldata
import numpy as np
mnist = fetch_mldata('MNIST original')
Data = mnist.data
label = mnist.target
return Data,label
Example 11
def load_data_target(name):
"""
Loads data and target given the name of the dataset.
"""
if name == "Boston":
data = load_boston()
elif name == "Housing":
data = fetch_california_housing()
dataset_size = 1000 # this is necessary so that SVR does not slow down too much
data["data"] = data["data"][:dataset_size]
data["target"] =data["target"][:dataset_size]
elif name == "digits":
data = load_digits()
elif name == "Climate Model Crashes":
try:
data = fetch_mldata("climate-model-simulation-crashes")
        except HTTPError:
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat"
            data = urlopen(url).read().decode().split('\n')[1:]
            data = [[float(v) for v in d.split()] for d in data if d.strip()]
            samples = np.array(data)
            data = dict()
            data["data"] = samples[:, :-1]
            data["target"] = np.array(samples[:, -1], dtype=int)
else:
raise ValueError("dataset not supported.")
return data["data"], data["target"]
Example 12
def training_data():
"""Get the `MNIST original` training data."""
_np.random.seed(1)
permutation = _np.random.permutation(range(60000))
mnist = _fetch_mldata('MNIST original',
data_home=_os.path.join(_DATA_FOLDER,
'MNIST_original'))
return (mnist.data[:60000, :][permutation, :].reshape((60000, 1, 28, 28)).astype('float32'),
mnist.target[:60000][permutation].reshape((60000, 1)).astype('float32'))
Example 13
def test_data():
"""Get the `MNIST original` test data."""
mnist = _fetch_mldata('MNIST original',
data_home=_os.path.join(_DATA_FOLDER,
'MNIST_original'))
return (mnist.data[60000:, :].reshape((10000, 1, 28, 28)).astype('float32'),
mnist.target[60000:].reshape((10000, 1)).astype('float32'))
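Both helpers rely on the fixed ordering of 'MNIST original' (first 60000 rows train, last 10000 test); the reshape to (N, 1, 28, 28) turns each flat 784-pixel row into the (batch, channel, height, width) layout expected by convolutional layers.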
Example 14
def main():
from sklearn.datasets import load_digits, fetch_mldata
SMALL_MNIST = False
if SMALL_MNIST:
mnist_digits = load_digits()
n_input = np.prod(mnist_digits.images.shape[1:])
n_images = len(mnist_digits.images) # 1797
data_images = mnist_digits.images.reshape(n_images, -1) / 16. # -> 1797 x 64
data_targets = mnist_digits.target
# im_size_x, im_size_y = 8, 8
else:
mnist_digits = fetch_mldata('MNIST original')
n_input = np.prod(mnist_digits.data.shape[1:])
        data_images = mnist_digits.data / 255.  # -> 70000 x 784
data_targets = mnist_digits.target
# im_size_x, im_size_y = 28, 28
n_hidden, n_output = 5, 10
nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)
weight_shapes = nn.get_weights_shapes()
weights = []
for weight_shape in weight_shapes:
weights.append(np.random.randn(*weight_shape))
nn.set_weights(*weights)
score = nn.score(data_images, data_targets)
print("Score is: ", score)
Example 15
def __init__(self, traj, parameters):
super().__init__(traj)
if parameters.use_small_mnist:
# 8 x 8 images
mnist_digits = load_digits()
n_input = np.prod(mnist_digits.images.shape[1:])
n_images = len(mnist_digits.images) # 1797
data_images = mnist_digits.images.reshape(n_images, -1) / 16. # -> 1797 x 64
data_targets = mnist_digits.target
else:
# 28 x 28 images
mnist_digits = fetch_mldata('MNIST original')
n_input = np.prod(mnist_digits.data.shape[1:])
            data_images = mnist_digits.data / 255.  # -> 70000 x 784
n_images = len(data_images)
data_targets = mnist_digits.target
self.n_images = n_images
self.data_images, self.data_targets = data_images, data_targets
seed = parameters.seed
n_hidden = parameters.n_hidden
seed = np.uint32(seed)
self.random_state = np.random.RandomState(seed=seed)
n_output = 10 # This is always true for mnist
self.nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)
self.random_state = np.random.RandomState(seed=seed)
        # create_individual can be called because __init__ is complete except for traj initialization
indiv_dict = self.create_individual()
for key, val in indiv_dict.items():
traj.individual.f_add_parameter(key, val)
traj.individual.f_add_parameter('seed', seed)
Example 16
def load_mnist(params):
mnist = fetch_mldata('MNIST original')
mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=params.random_seed)
mnist_X = mnist_X / 255.0
print("MNIST data prepared")
mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')
def flatten_img(images):
'''
images: shape => (n, rows, columns)
output: shape => (n, rows*columns)
'''
n_rows = images.shape[1]
n_columns = images.shape[2]
for num in range(n_rows):
if num % 2 != 0:
images[:, num, :] = images[:, num, :][:, ::-1]
output = images.reshape(-1, n_rows*n_columns)
return output
time_steps = 28*28
if params.dataset.startswith("mnist.permute"):
print "permuate MNIST"
mnist_X = mnist_X.reshape((-1, time_steps))
perm = np.random.permutation(time_steps)
        for i in range(len(mnist_X)):
mnist_X[i] = mnist_X[i][perm]
if len(params.dataset) > len("mnist.permute."):
time_steps = int(params.dataset[len("mnist.permute."):])
else:
if len(params.dataset) > len("mnist."): # mnist.xx
time_steps = int(params.dataset[len("mnist."):])
print "time_steps = ", time_steps
mnist_X = mnist_X.reshape((-1, time_steps, 28*28/time_steps))
#mnist_X = flatten_img(mnist_X) # X.shape => (n_samples, seq_len)
print "mnist_X.shape = ", mnist_X.shape
#mnist_X = mnist_X[:, :, np.newaxis] # X.shape => (n_samples, seq_len, n_features)
mnist_y_one_hot = np.zeros((mnist_y.shape[0], 10))
for i in xrange(len(mnist_y)):
mnist_y_one_hot[i][mnist_y[i]] = 1
print "mnist_y.shape = ", mnist_y_one_hot.shape
# split to training and testing set
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y_one_hot,
test_size=0.2,
random_state=params.random_seed)
# need to set parameters according to dataset
params.time_steps = train_X.shape[1]
params.input_size = train_X.shape[2]
params.output_size = 10
params.regression_flag = False
return train_X, test_X, train_y, test_y
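A dataset name of the form 'mnist.permute[.xx]' produces the permuted-MNIST benchmark: every image's 784 pixels are scrambled by one fixed random permutation before being cut into time steps, while plain 'mnist.xx' just reshapes each image into xx steps of 784/xx features for sequence models.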
Example 17
def get_mldata(dataset):
# Use scikit to grab datasets and save them save_dir.
save_dir = FLAGS.save_dir
filename = os.path.join(save_dir, dataset[1]+'.pkl')
if not gfile.Exists(save_dir):
gfile.MkDir(save_dir)
if not gfile.Exists(filename):
if dataset[0][-3:] == 'csv':
data = get_csv_data(dataset[0])
elif dataset[0] == 'breast_cancer':
data = load_breast_cancer()
elif dataset[0] == 'iris':
data = load_iris()
elif dataset[0] == 'newsgroup':
# Removing header information to make sure that no newsgroup identifying
# information is included in data
      data = fetch_20newsgroups_vectorized(subset='all', remove=('headers',))
tfidf = TfidfTransformer(norm='l2')
X = tfidf.fit_transform(data.data)
data.data = X
elif dataset[0] == 'rcv1':
sklearn.datasets.rcv1.URL = (
'http://www.ai.mit.edu/projects/jmlr/papers/'
'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
sklearn.datasets.rcv1.URL_topics = (
'http://www.ai.mit.edu/projects/jmlr/papers/'
'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
data = sklearn.datasets.fetch_rcv1(
data_home='/tmp')
elif dataset[0] == 'wikipedia_attack':
data = get_wikipedia_talk_data()
elif dataset[0] == 'cifar10':
data = get_cifar10()
elif 'keras' in dataset[0]:
data = get_keras_data(dataset[0])
else:
      try:
        data = fetch_mldata(dataset[0])
      except Exception:
        raise Exception('ERROR: failed to fetch data from mldata.org')
X = data.data
y = data.target
if X.shape[0] != y.shape[0]:
X = np.transpose(X)
assert X.shape[0] == y.shape[0]
data = {'data': X, 'target': y}
pickle.dump(data, gfile.GFile(filename, 'w'))
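The shape check before the assert likely compensates for mldata.org datasets whose feature matrix was served column-wise: when the number of rows in X does not match y, the matrix is transposed so that samples sit on the first axis before the dict is cached.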
Example 18
def main():
mnist = fetch_mldata('MNIST original')
X = mnist.data / 255.0
y = mnist.target
# Select the samples of the digit 2
X = X[y == 2]
# Limit dataset to 500 samples
idx = np.random.choice(range(X.shape[0]), size=500, replace=False)
X = X[idx]
rbm = RBM(n_hidden=50, n_iterations=200, batch_size=25, learning_rate=0.001)
rbm.fit(X)
# Training error plot
training, = plt.plot(range(len(rbm.training_errors)), rbm.training_errors, label="Training Error")
plt.legend(handles=[training])
plt.title("Error Plot")
plt.ylabel('Error')
plt.xlabel('Iterations')
plt.show()
# Get the images that were reconstructed during training
gen_imgs = rbm.training_reconstructions
# Plot the reconstructed images during the first iteration
fig, axs = plt.subplots(5, 5)
plt.suptitle("Restricted Boltzmann Machine - First Iteration")
cnt = 0
for i in range(5):
for j in range(5):
axs[i,j].imshow(gen_imgs[0][cnt].reshape((28, 28)), cmap='gray')
axs[i,j].axis('off')
cnt += 1
fig.savefig("rbm_first.png")
plt.close()
# Plot the images during the last iteration
fig, axs = plt.subplots(5, 5)
plt.suptitle("Restricted Boltzmann Machine - Last Iteration")
cnt = 0
for i in range(5):
for j in range(5):
axs[i,j].imshow(gen_imgs[-1][cnt].reshape((28, 28)), cmap='gray')
axs[i,j].axis('off')
cnt += 1
fig.savefig("rbm_last.png")
plt.close()