Python source code examples: sklearn.datasets.fetch_20newsgroups()
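The examples below are collected from various open-source projects. As a quick orientation before the examples, here is a minimal, self-contained sketch of the typical call pattern; the category list and variable names are illustrative only and are not taken from any example below.

from sklearn.datasets import fetch_20newsgroups

# Download (and cache) the training split, restricted to two topics,
# stripping headers/footers/quotes so models cannot key on metadata.
bunch = fetch_20newsgroups(
    subset='train',                           # 'train', 'test' or 'all'
    categories=['sci.space', 'rec.autos'],    # None loads all 20 topics
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

print(len(bunch.data))       # list of raw documents (str)
print(bunch.target[:10])     # integer labels, indices into bunch.target_names
print(bunch.target_names)    # category names for this subset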
Example 1
def __init__(self,
             cache: bool = False,
             transform: Dict[str, Union[Field, Dict]] = None) -> None:
    """Initialize the NewsGroupDataset builtin."""
    try:
        from sklearn.datasets import fetch_20newsgroups
    except ImportError:
        raise ImportError("Install sklearn to use the NewsGroupDataset")

    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')

    train = [(' '.join(d.split()), str(t)) for d, t in zip(train['data'], train['target'])]
    test = [(' '.join(d.split()), str(t)) for d, t in zip(test['data'], test['target'])]

    named_cols = ['text', 'label']
    super().__init__(
        train=train,
        val=None,
        test=test,
        cache=cache,
        named_columns=named_cols,
        transform=transform
    )
Example 2
def test_validate_sklearn_sgd_with_text_cv(self):
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_CountVec_.pmml'
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
Example 3
def load(self):
    categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
    newsgroups_train = fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
    newsgroups_test = fetch_20newsgroups(
        subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)
    vectorizer = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.20)
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)
    x1 = vectors
    y1 = newsgroups_train.target
    x2 = vectors_test
    y2 = newsgroups_test.target
    x = np.array(np.r_[x1.todense(), x2.todense()])
    y = np.r_[y1, y2]
    return x, y
Example 4
def _te_ss_t_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer

    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
Example 5
def test_MinHashEncoder(n_sample=70, minmax_hash=False):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_sample]
    for minmax_hash in [True, False]:
        for hashing in ['fast', 'murmur']:
            if minmax_hash and hashing == 'murmur':
                continue  # not implemented

            # Test output shape
            encoder = MinHashEncoder(n_components=50, hashing=hashing)
            encoder.fit(X)
            y = encoder.transform(X)
            assert y.shape == (n_sample, 50), str(y.shape)
            assert len(set(y[0])) == 50

            # Test that the same seed returns the same output
            encoder = MinHashEncoder(50, hashing=hashing)
            encoder.fit(X)
            y2 = encoder.transform(X)
            np.testing.assert_array_equal(y, y2)

            # Test min property
            if not minmax_hash:
                X_substring = [x[:x.find(' ')] for x in X]
                encoder = MinHashEncoder(50, hashing=hashing)
                encoder.fit(X_substring)
                y_substring = encoder.transform(X_substring)
                np.testing.assert_array_less(y - y_substring, 0.0001)
Example 6
def test_validate_sklearn_sgd_with_text(self):
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
Example 7
def ng(partitions=['train', 'test']):
    '''loads the 20 Newsgroups topic classification dataset
    Args:
        partitions: component(s) of the data to load; can be a string (for one partition) or a list of strings
    Returns:
        ((list of documents, list of labels) for each partition)
    '''
    if type(partitions) == str:
        data = fetch_20newsgroups(subset=partitions)
        return data['data'], list(data['target'])
    output = []
    for partition in partitions:
        data = fetch_20newsgroups(subset=partition)
        output.append((data['data'], list(data['target'])))
    return output
Example 8
def create_binary_newsgroups_data():
    categories = ["alt.atheism", "soc.religion.christian"]
    newsgroups_train = fetch_20newsgroups(subset="train", categories=categories)
    newsgroups_test = fetch_20newsgroups(subset="test", categories=categories)
    class_names = ["atheism", "christian"]
    return newsgroups_train, newsgroups_test, class_names
Example 9
def fetch_data(path):
    from sklearn.datasets import fetch_20newsgroups
    categories = ['comp.graphics', 'rec.sport.baseball', 'talk.politics.guns']
    # `path` is passed positionally as data_home, the download/cache directory
    dataset = fetch_20newsgroups(path, categories=categories)
    return dataset
Example 10
def test_fast_hash():
    from sklearn import datasets
    data = datasets.fetch_20newsgroups()
    a = data.data[0]

    min_hash = ngram_min_hash(a, seed=0)
    min_hash2 = ngram_min_hash(a, seed=0)
    assert min_hash == min_hash2

    list_min_hash = [ngram_min_hash(a, seed=seed) for seed in range(50)]
    assert len(set(list_min_hash)) > 45, 'Too many hash collisions'

    min_hash4 = ngram_min_hash(a, seed=0, return_minmax=True)
    assert len(min_hash4) == 2
Example 11
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
Example 12
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
Example 13
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Example 14
def setUp(self):
    """Load the test data (20 Newsgroups corpus)."""
    newsdata = fetch_20newsgroups(data_home="./data/")
    self.ids = [str(i) for i in range(len(newsdata.target))]
    self.texts = newsdata.data
    self.labels = [newsdata.target_names[idx] for idx in newsdata.target]
    self.tc = TextClassifier(self.texts, self.ids)
Example 15
def load_newsgroups():
    """20 News Groups Dataset.

    The data of this dataset is a 1d numpy array vector containing the texts
    from 11314 newsgroups posts, and the target is a 1d numpy integer array
    containing the label of one of the 20 topics that they are about.
    """
    dataset = datasets.fetch_20newsgroups()
    return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target,
                   accuracy_score, stratify=True)
Example 16
def download_articles(name, categories, subset):
    data = []
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data.append({'text': line, 'group': newsgroups_data['target_names'][newsgroups_data['target'][i]]})
    print(len(data))
    raw_data_dir = os.path.join('data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_jsonlist(data, os.path.join(raw_data_dir, subset + '.jsonlist'))
Example 17
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups
    :param subset: train, test or all
    :param categories: list of newsgroup names
    :param shuffle: shuffle the list or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of the newsgroup
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
    return datasets
Example 18
def __init__(self, cfg=None):
    super().__init__()
    self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'], categories=cfg['categories'],
                                          shuffle=cfg['shuffle'], random_state=cfg['random_state'])
Example 19
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups
    :param subset: train, test or all
    :param categories: list of newsgroup names
    :param shuffle: shuffle the list or not
    :param random_state: seed integer to shuffle the dataset
    :return: data and labels of the newsgroup
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
    return datasets
Example 20
def load_newsgroup_data(V, cats, sort_data=True):
    from sklearn.datasets import fetch_20newsgroups
    print("Downloading newsgroups data...")
    print('cats = %s' % cats)
    newsgroups = fetch_20newsgroups(
        subset="train", categories=cats, remove=('headers', 'footers', 'quotes'))
    return get_sparse_repr(newsgroups.data, V, sort_data)
Example 21
def download_articles(name, categories, subset):
    data = {}
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data[str(len(data))] = {'text': line, 'label': newsgroups_data['target_names'][newsgroups_data['target'][i]]}
    print(len(data))
    raw_data_dir = os.path.join('..', 'data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_to_json(data, os.path.join(raw_data_dir, subset + '.json'))
Example 22
def test_20_newsgroups():
    data = fetch_20newsgroups()
    X, y = data.data, data.target
    r = dask_ml.model_selection.train_test_split(X, y)
    X_train, X_test, y_train, y_test = r
    for X in [X_train, X_test]:
        assert isinstance(X, list)
        assert isinstance(X[0], str)
    for y in [y_train, y_test]:
        assert isinstance(y, np.ndarray)
        assert y.dtype == int
Example 23
def _get_train_test_dataset(cats_to_fetch, limit=100):
    newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), shuffle=True,
                                    categories=cats_to_fetch)
    X = newsgroups.data[:limit]
    y = newsgroups.target[:limit]
    X = [six.text_type(x) for x in X]  # Ensure all strings are unicode for Python 2.7 compatibility
    # Category 0 is comp.graphics, 1 is rec.sport.baseball. We can treat it as a binary class.
    cats = [{"comp.graphics": not bool(el), "rec.sport.baseball": bool(el)} for el in y]
    split = int(len(X) * 0.8)
    return X[:split], cats[:split], X[split:], cats[split:]
Example 24
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups

    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y
Example 25
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data
Example 26
def test_tfidf_20newsgroups(self):
    data = fetch_20newsgroups()
    X, y = np.array(data.data)[:100], np.array(data.target)[:100]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42)

    model = TfidfVectorizer().fit(X_train)
    onnx_model = convert_sklearn(
        model, 'cv', [('input', StringTensorType(X_test.shape))])
    dump_data_and_model(
        X_test, model, onnx_model,
        basename="SklearnTfidfVectorizer20newsgroups",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')")
Example 27
def test_tfidf_20newsgroups_nolowercase(self):
    data = fetch_20newsgroups()
    X, y = np.array(data.data)[:100], np.array(data.target)[:100]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42)

    model = TfidfVectorizer(lowercase=False).fit(X_train)
    onnx_model = convert_sklearn(
        model, 'cv', [('input', StringTensorType(X_test.shape))])
    dump_data_and_model(
        X_test, model, onnx_model,
        basename="SklearnTfidfVectorizer20newsgroupsNOLower",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')")
Example 28
def test_model_tfidf_transform_bug(self):
    categories = [
        "alt.atheism",
        "soc.religion.christian",
        "comp.graphics",
        "sci.med",
    ]
    twenty_train = fetch_20newsgroups(subset="train",
                                      categories=categories,
                                      shuffle=True,
                                      random_state=0)
    text_clf = Pipeline([("vect", CountVectorizer()),
                         ("tfidf", TfidfTransformer())])
    twenty_train.data[0] = "bruît " + twenty_train.data[0]
    text_clf.fit(twenty_train.data, twenty_train.target)
    model_onnx = convert_sklearn(
        text_clf,
        name="DocClassifierCV-Tfidf",
        initial_types=[("input", StringTensorType([5]))],
    )
    dump_data_and_model(
        twenty_train.data[5:10],
        text_clf,
        model_onnx,
        basename="SklearnPipelineTfidfTransformer",
        # Operator mul is not implemented in onnxruntime
        allow_failure="StrictVersion(onnx.__version__)"
                      " <= StrictVersion('1.5')",
    )
Example 29
def test_pipeline_tfidf(self):
    categories = ["alt.atheism", "talk.religion.misc"]
    train = fetch_20newsgroups(random_state=1,
                               subset="test",
                               categories=categories)
    train_data = SubjectBodyExtractor().fit_transform(train.data)
    tfi = TfidfVectorizer(min_df=30)
    tdata = train_data[:300, :1]
    tfi.fit(tdata.ravel())
    extra = {
        TfidfVectorizer: {
            "separators": [
                " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
            ]
        }
    }
    model_onnx = convert_sklearn(
        tfi,
        "tfidf",
        initial_types=[("input", StringTensorType([1]))],
        options=extra,
    )
    dump_data_and_model(
        tdata[:5],
        tfi,
        model_onnx,
        basename="SklearnDocumentationTfIdf-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.4.0')",
    )
Example 30
def load_20newsgroups(validation_ratio, normalization):
    """Load text network (20 news group)

    Arguments:
        validation_ratio (float): Ratio of validation split
        normalization (str): Variant of normalization method to use.

    Returns:
        adj (chainer.utils.sparse.CooMatrix): (Node, Node) shape
            normalized adjacency matrix.
        labels (np.ndarray): (Node, ) shape labels array
        idx_train (np.ndarray): Indices of the train array
        idx_val (np.ndarray): Indices of the val array
        idx_test (np.ndarray): Indices of the test array
    """
    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')
    adj = create_text_adjacency_matrix(
        [tokenize(t) for t in (train['data'] + test['data'])])

    if normalization == 'gcn':
        adj = normalize(adj)
    else:
        adj = normalize_pygcn(adj)

    n_train = int(len(train['data']) * (1.0 - validation_ratio))
    n_all = len(train['data']) + len(test['data'])
    idx_train = np.array(list(range(n_train)), np.int32)
    idx_val = np.array(list(range(n_train, len(train['data']))), np.int32)
    idx_test = np.array(list(range(len(train['data']), n_all)), np.int32)

    labels = np.concatenate(
        (train['target'], test['target'], np.full([adj.shape[0] - n_all], -1)))
    labels = labels.astype(np.int32)
    adj = to_chainer_sparse_variable(adj)
    return adj, labels, idx_train, idx_val, idx_test
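Putting the pieces together, here is a minimal end-to-end text-classification sketch in the spirit of the pipeline examples above; the category list and the choice of SGDClassifier are illustrative assumptions, not taken from any single example.

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Load a two-category subset with metadata stripped (illustrative choice).
categories = ['alt.atheism', 'sci.space']
train = fetch_20newsgroups(subset='train', categories=categories,
                           remove=('headers', 'footers', 'quotes'))
test = fetch_20newsgroups(subset='test', categories=categories,
                          remove=('headers', 'footers', 'quotes'))

# TF-IDF features feeding a linear classifier, trained on raw document strings.
clf = Pipeline([('tfidf', TfidfVectorizer()),
                ('sgd', SGDClassifier(random_state=0))])
clf.fit(train.data, train.target)
print('test accuracy:', clf.score(test.data, test.target))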