Python source code examples: sklearn.datasets.load_svmlight_file()
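load_svmlight_file parses a file in the LibSVM / svmlight text format (one "label index:value ..." record per line) and returns the features as a SciPy CSR sparse matrix together with the target array; with query_id=True it additionally returns the qid column, and with multilabel=True the labels come back as tuples. As a minimal, hedged sketch of the call itself before the project examples below (the path "data.svm" is only a placeholder, not a file shipped with scikit-learn):

from sklearn.datasets import load_svmlight_file

# "data.svm" is a placeholder path for illustration only
X, y = load_svmlight_file("data.svm")   # X: scipy.sparse CSR matrix, y: numpy array
print(X.shape, y.shape)

# keyword arguments that recur in the examples below:
#   n_features=...   force a fixed feature dimension (useful when loading slices of a dataset)
#   dtype=...        storage dtype of the values, e.g. numpy.float32
#   query_id=True    also return the qid column (learning-to-rank data)
#   multilabel=True  parse the label field as a comma-separated tuple of labels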
Example 1
def read_year_prediction_data(fileName):
    feature_dim = 90
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))
    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std
    return feature_dim, train_features, train_labels
Example 2
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional flag indicating whether the returned
            matrix should be dense. By default, it is False.

    Returns:
        Data matrix X and target vector y
    """
    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}
    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)
    if dense and sparse.issparse(X):
        X = X.todense()
    return X, y
Example 3
def test_dump(self):
    tmpfile = "tmp_dump.txt"
    try:
        # loads from file
        Xs, y = load_svmlight_file(datafile)
        # dumps to file
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)
        # loads them as CSR MATRIX
        X2, y2 = sk_load_svmlight_file(tmpfile)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)
        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example 4
def test_dump_qid(self):
    tmpfile = "/tmp/tmp_dump.txt"
    try:
        # loads from file
        Xs, y, q = load_svmlight_file(qid_datafile, query_id=True)
        # dumps to file
        dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False)
        # loads them as CSR MATRIX with scikit-learn
        X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)
        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
        assert_array_equal(q, q2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example 5
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train))
Example 6
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    if as_tensor: [dat.convert_to_tensor() for dat in res]
    return res


# noinspection PyPep8Naming
Example 7
def _read_svmlight(lines, out_blocks, col_size, n_features, store_sparse):
    from tempfile import SpooledTemporaryFile
    from sklearn.datasets import load_svmlight_file

    # Creating a tmp file to use load_svmlight_file method should be more
    # efficient than parsing the lines manually
    tmp_file = SpooledTemporaryFile(mode="wb+", max_size=2e8)
    tmp_file.writelines(lines)
    tmp_file.seek(0)

    x, y = load_svmlight_file(tmp_file, n_features)
    if not store_sparse:
        x = x.toarray()

    # tried also converting to csc/ndarray first for faster splitting but it's
    # not worth. Position 0 contains the X
    for i in range(ceil(n_features / col_size)):
        out_blocks[0][i] = x[:, i * col_size:(i + 1) * col_size]

    # Position 1 contains the y block
    out_blocks[1][0] = y.reshape(-1, 1)
Example 8
def test_load_svmlight_file(self):
    """ Tests loading a LibSVM file """
    file_ = "tests/files/libsvm/1"
    x_np, y_np = load_svmlight_file(file_, n_features=780)

    # Load SVM and store in sparse
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=True)
    self.assertTrue(_equal_arrays(x.collect(), x_np))
    self.assertTrue(_equal_arrays(y.collect(), y_np))

    # Load SVM and store in dense
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=False)
    self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
    self.assertTrue(_equal_arrays(y.collect(), y_np))
Example 9
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = "datasets.1.17.2019"
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + ".zip"
        urlretrieve(
            "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
            zipfilename,
        )
        with zipfile.ZipFile(zipfilename, "r") as unzip:
            unzip.extractall(".")
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == ".npz":
        # sparse format file
        import scipy.sparse as sparse
        return sparse.load_npz(filepath)
    elif extension == ".svmlight":
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == ".json":
        import json
        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
        return dataset
    elif extension == ".csv":
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception("Unrecognized file extension: " + extension)
Example 10
def train(self, depgraphs, modelfile):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=True,
            probability=True)

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example 11
def import_libsvm_sparse(filename):
    """Imports dataset file in libsvm sparse format"""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(filename)
    return Dataset(X.toarray(), y)
Example 12
def setUp(self):
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'datasets/yeast_train.svm')
    X, y = load_svmlight_file(dataset_filepath, multilabel=True)
    self.X = X.todense().tolist()
    self.y = MultiLabelBinarizer().fit_transform(y).tolist()
    self.quota = 10
Example 13
def read_data(filename, header=True, dtype='float32', zero_based=True):
    """Read data in sparse format

    Arguments
    ---------
    filename: str
        input file name
    header: bool, default=True
        whether a header line is present
    dtype: str, default='float32'
        data type of values
    zero_based: bool, default=True
        zero-based indices?

    Returns
    -------
    features: csr_matrix
        features matrix
    labels: csr_matrix
        labels matrix
    num_samples: int
        #instances
    num_feat: int
        #features
    num_labels: int
        #labels
    """
    with open(filename, 'rb') as f:
        _l_shape = None
        if header:
            line = f.readline().decode('utf-8').rstrip("\n")
            line = line.split(" ")
            num_samples, num_feat, num_labels = int(
                line[0]), int(line[1]), int(line[2])
            _l_shape = (num_samples, num_labels)
        else:
            num_samples, num_feat, num_labels = None, None, None
        features, labels = load_svmlight_file(f, multilabel=True)
    labels = ll_to_sparse(
        labels, dtype=dtype, zero_based=zero_based, shape=_l_shape)
    return features, labels, num_samples, num_feat, num_labels
Example 14
def load_svmlight_file(self):
    """
    Use sklearn.datasets.load_svmlight_file to load data.svmlight.
    """
    file_name = os.path.join(self.data_dir, "data.svmlight")
    datasets.load_svmlight_file(file_name)
Example 15
def get_year_prediction_data(dirname=None):
    feature_dim = 90
    if dirname is None:
        dirname = os.path.join(os.path.dirname(__file__), 'data')
    filename = 'YearPredictionMSD'
    download_filename = os.path.join(dirname, "%s.bz2" % filename)
    extracted_filename = os.path.join(dirname, filename)
    if not os.path.isfile(download_filename):
        print("Downloading data...")
        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)
    if not os.path.isfile(extracted_filename):
        print("Extracting data...")
        with bz2.BZ2File(download_filename) as fr, open(extracted_filename, "wb") as fw:
            shutil.copyfileobj(fr, fw)
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))
    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std
    return feature_dim, train_features, train_labels
Example 16
def load_dataset(dataset_path, data_home=None, n_features=None):
    """Load dataset from given path

    Parameters
    ----------
    dataset_path : `str`
        Dataset relative path

    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with the TICK_DATASETS environment variable,
        all tick datasets are stored in '~/tick_datasets' subfolders.

    n_features : `int`, optional, default=None
        The number of features to use. If None, it will be inferred. This
        argument is useful to load several files that are subsets of a
        bigger sliced dataset: each subset might not have examples of
        every feature, hence the inferred shape might vary from one
        slice to another.

    Returns
    -------
    output : `np.ndarray` or `dict` or `tuple`
        Dataset. Its format will depend on queried dataset.
    """
    data_home = get_data_home(data_home)
    cache_path = os.path.join(data_home, dataset_path)

    if cache_path.endswith(".npz"):
        dataset = np.load(cache_path, allow_pickle=True)
        # If we have only one numpy array we return it directly, otherwise
        # we return the raw dictionary
        if len(dataset) == 1:
            key_0 = list(dataset.keys())[0]
            dataset = dataset[key_0]
        else:
            dataset = dataset.items()
    else:
        dataset = load_svmlight_file(cache_path, n_features=n_features)

    return dataset
Example 17
def load_url_dataset_day(cache_path, days):
    """Loads url dataset from a tar file

    Parameters
    ----------
    cache_path : `str`
        Path to the tar file

    days : `list` or `range`
        Days to be loaded

    Returns
    -------
    X : `np.ndarray`
        A sparse matrix containing the features

    y : `np.ndarray`
        An array containing the labels
    """
    tar_file = tarfile.open(cache_path, "r:gz")

    X, y = None, None
    for day in days:
        data_filename = 'url_svmlight/Day{}.svm'.format(day)
        with tar_file.extractfile(data_filename) as data_file:
            X_day, y_day = load_svmlight_file(data_file,
                                              n_features=_N_FEATURES)
        if X is None:
            X, y = X_day, y_day
        else:
            X = scipy.sparse.vstack((X, X_day))
            y = np.hstack((y, y_day))

    return X, y
Example 18
def load_file(self, file_path):
    data = load_svmlight_file(file_path)
    return data[0], data[1]
Example 19
def run_test(self, pipeline, name, path):
    print("download " + name)
    update_name = self.download(name, path)
    X, y = load_svmlight_file(update_name)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)
    pipeline.fit(X_train, y_train)
    print("[Benchmark " + name + " Score]: ", pipeline.score(X_test, y_test))
Example 20
def __init__(self, file_path):
    self.file_path = file_path
    X, Y = load_svmlight_file(file_path)  # X is a sparse matrix
    # L = [X[i].nonzero()[0].shape[0] for i in range(X.shape[0])]
    X = X.todense().astype(np.float32)
    Y = np.array((Y + 1) / 2, dtype=int)
    self.X = torch.from_numpy(X)
    self.Y = torch.from_numpy(Y)
Example 21
def __init__(self, file_path, domain=0):
    self.file_path = file_path
    X, Y = load_svmlight_file(file_path)  # Y is synthetic label, not used
    X = X.todense().astype(np.float32)
    self.X = torch.from_numpy(X)
    self.Y = torch.LongTensor([domain] * self.X.shape[0])
Example 22
def get_data(file_input, separator='\t'):
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)
    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1]
Example 23
def homepage_xgb_model(model_path, training_set='True'):
    training_set = './data/%s_features.svm.txt' % training_set
    model = xgb.XGBClassifier(learning_rate=0.1,
                              n_estimators=200,
                              max_depth=5,
                              min_child_weight=1,
                              gamma=0.3,
                              subsample=0.7,
                              colsample_bytree=0.7,
                              objective='binary:logistic',
                              scale_pos_weight=1)
    X, y = load_svmlight_file(training_set)
    model.fit(X, y)
    pickle.dump(model, open(model_path, 'wb'))
    return model
Example 24
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = 'datasets.12.18.2019'
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + '.zip'
        urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)
        with zipfile.ZipFile(zipfilename, 'r') as unzip:
            unzip.extractall('.')
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == '.npz':
        # sparse format file
        from scipy.sparse import load_npz
        return load_npz(filepath)
    elif extension == '.svmlight':
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == '.json':
        import json
        with open(filepath, encoding='utf-8') as f:
            dataset = json.load(f)
        return dataset
    elif extension == '.csv':
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception('Unrecognized file extension: ' + extension)
Example 25
def train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False
        )

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=verbose,
            probability=True,
        )

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example 26
def from_svm_file(cls, svm_file_path, transform=None):
    """
    Instantiate a LibSVMDataset from a LibSVM file path.
    :param svm_file_path: LibSVM file path
    :param transform: a callable defining an optional transformation called on the dataset
    :return: LibSVMDataset instantiated from a given file and with an optional transformation defined
    """
    x, y, query_ids = load_svmlight_file(svm_file_path, query_id=True)
    logger.info("loaded dataset from {} and got x shape {}, y shape {} and query_ids shape {}".format(
        svm_file_path, x.shape, y.shape, query_ids.shape))
    return cls(x, y, query_ids, transform)
Example 27
def get_X_y(dataset, compressed_path, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.

    If X and y already exist as npz and npy, they are not redownloaded unless
    replace=True."""
    ext = '.npz' if multilabel else '.npy'
    y_path = pjoin(CELER_PATH, "%s_target%s" % (NAMES[dataset], ext))
    X_path = pjoin(CELER_PATH, "%s_data.npz" % NAMES[dataset])
    if replace or not os.path.isfile(y_path) or not os.path.isfile(X_path):
        tmp_path = pjoin(CELER_PATH, "%s" % NAMES[dataset])

        decompressor = BZ2Decompressor()
        print("Decompressing...")
        with open(tmp_path, "wb") as f, open(compressed_path, "rb") as g:
            for data in iter(lambda: g.read(100 * 1024), b''):
                f.write(decompressor.decompress(data))

        n_features_total = N_FEATURES[dataset]
        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(
                f, n_features_total, multilabel=multilabel)

        os.remove(tmp_path)
        X = sparse.csc_matrix(X)
        X.sort_indices()
        sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y
        else:
            np.save(y_path, y)
    else:
        X = sparse.load_npz(X_path)
        y = np.load(y_path)
    return X, y
Example 28
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname

    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]
    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)
    #X = X.toarray()
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)

    # learn
    model = MSCWIIDiag(C=1, eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in xrange(0, n_samples):
        if i % 1000 == 0:
            print "#samples = %d" % i
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)

    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accuracy: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm))
Example 29
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    #fname = "/home/kzk/datasets/news20/news20.dat"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname

    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]
    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)
    #X = X.toarray()

    # learn
    model = MCWVarDiag(eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in xrange(0, n_samples):
        if i % 1000 == 0:
            print "#samples = %d" % i
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)

    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accuracy: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm))
Example 30
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname

    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]
    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)
    #X = X.toarray()
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)

    # learn
    model = MSCWIDiag(C=1, eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in xrange(0, n_samples):
        if i % 1000 == 0:
            print "#samples = %d" % i
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)

    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accuracy: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm))