Python source code examples: sklearn.datasets.fetch_20newsgroups()

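fetch_20newsgroups downloads the 20 newsgroups text corpus on first use (caching it under data_home, by default ~/scikit_learn_data) and returns a Bunch whose fields (data, target, target_names, filenames) can be read either as attributes (bunch.data) or as dictionary keys (bunch['data']); both styles appear in the examples below. A minimal usage sketch follows; the parameters are part of the documented scikit-learn API, while the category choice and variable names are only illustrative.

from sklearn.datasets import fetch_20newsgroups

# Restricting categories keeps the example small; remove strips headers,
# footers, and quoted replies, which otherwise make classification too easy.
train = fetch_20newsgroups(
    subset='train',                          # 'train', 'test' or 'all'
    categories=['rec.autos', 'sci.space'],   # None loads all 20 groups
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
print(len(train.data))     # number of documents in the subset
print(train.target_names)  # ['rec.autos', 'sci.space'] (sorted alphabetically)
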
Example 1
def __init__(self,
                 cache: bool = False,
                 transform: Dict[str, Union[Field, Dict]] = None) -> None:
        """Initialize the NewsGroupDataset builtin."""
        try:
            from sklearn.datasets import fetch_20newsgroups
        except ImportError:
            raise ImportError("Install sklearn to use the NewsGroupDataset")

        train = fetch_20newsgroups(subset='train')
        test = fetch_20newsgroups(subset='test')

        train = [(' '.join(d.split()), str(t)) for d, t in zip(train['data'], train['target'])]
        test = [(' '.join(d.split()), str(t)) for d, t in zip(test['data'], test['target'])]

        named_cols = ['text', 'label']
        super().__init__(
            train=train,
            val=None,
            test=test,
            cache=cache,
            named_columns=named_cols,
            transform=transform
        ) 
Example 2
def test_validate_sklearn_sgd_with_text_cv(self):
        categories = ['alt.atheism', 'talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_CountVec_.pmml'
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features, target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example 3
def load(self):
        categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
        newsgroups_train = fetch_20newsgroups(
            subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
        newsgroups_test = fetch_20newsgroups(
            subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)
        vectorizer = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.20)
        vectors = vectorizer.fit_transform(newsgroups_train.data)
        vectors_test = vectorizer.transform(newsgroups_test.data)
        x1 = vectors
        y1 = newsgroups_train.target
        x2 = vectors_test
        y2 = newsgroups_test.target
        x = np.array(np.r_[x1.todense(), x2.todense()])
        y = np.r_[y1, y2]
        return x, y 
Example 4
def _te_ss_t_build(self):
		from sklearn.datasets import fetch_20newsgroups
		from sklearn.feature_extraction.text import CountVectorizer

		newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
		corpus = CorpusFromScikit(
			X=X_counts,
			y=newsgroups_train.target,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=newsgroups_train.target_names,
			raw_texts=newsgroups_train.data
		).build()
		self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
		self.assertEqual(corpus
		                 .get_term_freq_df()
		                 .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
		self.assertGreater(len(corpus.get_texts()[0]), 5) 
Example 5
def test_MinHashEncoder(n_sample=70, minmax_hash=False):
    X_txt = fetch_20newsgroups(subset='train')['data']
    X = X_txt[:n_sample]

    for minmax_hash in [True, False]:
        for hashing in ['fast', 'murmur']:

            if minmax_hash and hashing == 'murmur':
                continue  # minmax_hash with 'murmur' hashing is not implemented; skip

            # Test output shape
            encoder = MinHashEncoder(n_components=50, hashing=hashing)
            encoder.fit(X)
            y = encoder.transform(X)
            assert y.shape == (n_sample, 50), str(y.shape)
            assert len(set(y[0])) == 50

            # Test same seed return the same output
            encoder = MinHashEncoder(50, hashing=hashing)
            encoder.fit(X)
            y2 = encoder.transform(X)
            np.testing.assert_array_equal(y, y2)

            # Test min property
            if not minmax_hash:
                X_substring = [x[:x.find(' ')] for x in X]
                encoder = MinHashEncoder(50, hashing=hashing)
                encoder.fit(X_substring)
                y_substring = encoder.transform(X_substring)
                np.testing.assert_array_less(y - y_substring, 0.0001) 
Example 6
def test_validate_sklearn_sgd_with_text(self):
        categories = ['alt.atheism', 'talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
        pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features, target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example 7
def ng(partitions=['train', 'test']):
  '''loads the 20 Newsgroups topic classification dataset
  Args:
    partitions: component(s) of data to load; can be a string (for one partition) or list of strings
  Returns:
    ((list of documents, list of labels) for each partition)
  '''

  if isinstance(partitions, str):
    data = fetch_20newsgroups(subset=partitions)
    return data['data'], list(data['target'])
  output = []
  for partition in partitions:
    data = fetch_20newsgroups(subset=partition)
    output.append((data['data'], list(data['target'])))
  return output 
Example 8
def create_binary_newsgroups_data():
    categories = ["alt.atheism", "soc.religion.christian"]
    newsgroups_train = fetch_20newsgroups(subset="train", categories=categories)
    newsgroups_test = fetch_20newsgroups(subset="test", categories=categories)
    class_names = ["atheism", "christian"]
    return newsgroups_train, newsgroups_test, class_names 
Example 9
def fetch_data(path):
    from sklearn.datasets import fetch_20newsgroups
    categories = ['comp.graphics', 'rec.sport.baseball', 'talk.politics.guns']
    dataset = fetch_20newsgroups(data_home=path, categories=categories)  # path is the download/cache directory
    return dataset 
Example 10
def test_fast_hash():

    from sklearn import datasets
    data = datasets.fetch_20newsgroups()
    a = data.data[0]

    min_hash = ngram_min_hash(a, seed=0)
    min_hash2 = ngram_min_hash(a, seed=0)
    assert min_hash == min_hash2

    list_min_hash = [ngram_min_hash(a, seed=seed) for seed in range(50)]
    assert len(set(list_min_hash)) > 45, 'Too many hash collisions'

    min_hash4 = ngram_min_hash(a, seed=0, return_minmax=True)
    assert len(min_hash4) == 2 
Example 11
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2) 
Example 12
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames)) 
Example 13
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert sp.isspmatrix_csr(bunch.data)
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64) 
Example 14
def setUp(self):
        """Carga de los datos de prueba (20 Newsgroups corpus)."""
        newsdata = fetch_20newsgroups(data_home="./data/")
        self.ids = [str(i) for i in range(len(newsdata.target))]
        self.texts = newsdata.data
        self.labels = [newsdata.target_names[idx] for idx in newsdata.target]
        self.tc = TextClassifier(self.texts, self.ids) 
Example 15
def load_newsgroups():
    """20 News Groups Dataset.

    The data of this dataset is a 1-d numpy array containing the texts of
    11314 newsgroup posts, and the target is a 1-d numpy integer array
    encoding which of the 20 topics each post belongs to.
    """
    dataset = datasets.fetch_20newsgroups()
    return Dataset(load_newsgroups.__doc__, np.array(dataset.data), dataset.target,
                   accuracy_score, stratify=True) 
Example 16
def download_articles(name, categories, subset):

    data = []
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())

    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data.append({'text': line, 'group': newsgroups_data['target_names'][newsgroups_data['target'][i]]})

    print(len(data))
    raw_data_dir = os.path.join('data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_jsonlist(data, os.path.join(raw_data_dir, subset + '.jsonlist')) 
Example 17
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups
    :param subset: train, test or all
    :param categories: list of newsgroup names
    :param shuffle: whether to shuffle the returned documents
    :param random_state: integer seed used when shuffling
    :return: data and labels of the newsgroups
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
    return datasets 
Example 18
def __init__(self, cfg=None):
        super().__init__()
        self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'], categories=cfg['categories'],
                                              shuffle=cfg['shuffle'], random_state=cfg['random_state']) 
Example 19
def load_newsgroup_data(V, cats, sort_data=True):
    from sklearn.datasets import fetch_20newsgroups
    print("Downloading newsgroups data...")
    print('cats = %s' % cats)
    newsgroups = fetch_20newsgroups(
        subset="train", categories=cats, remove=('headers', 'footers', 'quotes'))
    return get_sparse_repr(newsgroups.data, V, sort_data) 
Example 20
def download_articles(name, categories, subset):

    data = {}
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())

    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data[str(len(data))] = {'text': line, 'label': newsgroups_data['target_names'][newsgroups_data['target'][i]]}

    print(len(data))
    raw_data_dir = os.path.join('..', 'data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_to_json(data, os.path.join(raw_data_dir, subset + '.json')) 
Example 21
def test_20_newsgroups():
    data = fetch_20newsgroups()
    X, y = data.data, data.target
    r = dask_ml.model_selection.train_test_split(X, y)
    X_train, X_test, y_train, y_test = r
    for X in [X_train, X_test]:
        assert isinstance(X, list)
        assert isinstance(X[0], str)
    for y in [y_train, y_test]:
        assert isinstance(y, np.ndarray)
        assert y.dtype == int 
Example 22
def _get_train_test_dataset(cats_to_fetch, limit=100):
    newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), shuffle=True,
                                    categories=cats_to_fetch)
    X = newsgroups.data[:limit]
    y = newsgroups.target[:limit]

    X = [six.text_type(x) for x in X]  # Ensure all strings to unicode for python 2.7 compatibility

    # Category 0 is comp.graphics and 1 is rec.sport.baseball, so we can treat this as a binary classification task.
    cats = [{"comp.graphics": not bool(el), "rec.sport.baseball": bool(el)} for el in y]

    split = int(len(X) * 0.8)
    return X[:split], cats[:split], X[split:], cats[split:] 
Example 23
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups
    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y 
Example 24
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data 
Example 25
def test_tfidf_20newsgroups(self):
        data = fetch_20newsgroups()
        X, y = np.array(data.data)[:100], np.array(data.target)[:100]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=42)

        model = TfidfVectorizer().fit(X_train)
        onnx_model = convert_sklearn(
            model, 'cv', [('input', StringTensorType(X_test.shape))])
        dump_data_and_model(
            X_test, model, onnx_model,
            basename="SklearnTfidfVectorizer20newsgroups",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')") 
Example 26
def test_tfidf_20newsgroups_nolowercase(self):
        data = fetch_20newsgroups()
        X, y = np.array(data.data)[:100], np.array(data.target)[:100]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=42)

        model = TfidfVectorizer(lowercase=False).fit(X_train)
        onnx_model = convert_sklearn(
            model, 'cv', [('input', StringTensorType(X_test.shape))])
        dump_data_and_model(
            X_test, model, onnx_model,
            basename="SklearnTfidfVectorizer20newsgroupsNOLower",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')") 
Example 27
def test_model_tfidf_transform_bug(self):
        categories = [
            "alt.atheism",
            "soc.religion.christian",
            "comp.graphics",
            "sci.med",
        ]
        twenty_train = fetch_20newsgroups(subset="train",
                                          categories=categories,
                                          shuffle=True,
                                          random_state=0)
        text_clf = Pipeline([("vect", CountVectorizer()),
                             ("tfidf", TfidfTransformer())])
        twenty_train.data[0] = "bruît " + twenty_train.data[0]
        text_clf.fit(twenty_train.data, twenty_train.target)
        model_onnx = convert_sklearn(
            text_clf,
            name="DocClassifierCV-Tfidf",
            initial_types=[("input", StringTensorType([5]))],
        )
        dump_data_and_model(
            twenty_train.data[5:10],
            text_clf,
            model_onnx,
            basename="SklearnPipelineTfidfTransformer",
            # Operator mul is not implemented in onnxruntime
            allow_failure="StrictVersion(onnx.__version__)"
                          " <= StrictVersion('1.5')",
        ) 
Example 28
def test_pipeline_tfidf(self):
        categories = ["alt.atheism", "talk.religion.misc"]
        train = fetch_20newsgroups(random_state=1,
                                   subset="test",
                                   categories=categories)
        train_data = SubjectBodyExtractor().fit_transform(train.data)
        tfi = TfidfVectorizer(min_df=30)
        tdata = train_data[:300, :1]
        tfi.fit(tdata.ravel())
        extra = {
            TfidfVectorizer: {
                "separators": [
                    " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
                ]
            }
        }
        model_onnx = convert_sklearn(
            tfi,
            "tfidf",
            initial_types=[("input", StringTensorType([1]))],
            options=extra,
        )
        dump_data_and_model(
            tdata[:5],
            tfi,
            model_onnx,
            basename="SklearnDocumentationTfIdf-OneOff-SklCol",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')",
        ) 
Example 29
def load_20newsgroups(validation_ratio, normalization):
    """Load text network (20 news group)

    Arguments:
        validation_ratio (float): Ratio of validation split
        normalization (str): Variant of normalization method to use.

    Returns:
        adj (chainer.utils.sparse.CooMatrix): (Node, Node) shape
            normalized adjacency matrix.
        labels (np.ndarray): (Node, ) shape labels array
        idx_train (np.ndarray): Indices of the training nodes
        idx_val (np.ndarray): Indices of the validation nodes
        idx_test (np.ndarray): Indices of the test nodes
    """
    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')
    adj = create_text_adjacency_matrix(
        [tokenize(t) for t in (train['data'] + test['data'])])
    if normalization == 'gcn':
        adj = normalize(adj)
    else:
        adj = normalize_pygcn(adj)
    n_train = int(len(train['data']) * (1.0 - validation_ratio))
    n_all = len(train['data']) + len(test['data'])
    idx_train = np.array(list(range(n_train)), np.int32)
    idx_val = np.array(list(range(n_train, len(train['data']))), np.int32)
    idx_test = np.array(list(range(len(train['data']), n_all)), np.int32)

    labels = np.concatenate(
        (train['target'], test['target'], np.full([adj.shape[0] - n_all], -1)))
    labels = labels.astype(np.int32)
    adj = to_chainer_sparse_variable(adj)

    return adj, labels, idx_train, idx_val, idx_test