Skip to content

Commit f4882b5

Browse files
committed
DOC + TST vocabulary arg in CountVect docstring
Somewhere during the last refactoring, the documentation for the `vocabulary` argument went missing. Also, check for a `Mapping` subclass instead of `hasattr(vocabulary, "get")`, and test with a few different vocabulary types.
1 parent 1601b27 commit f4882b5

File tree

2 files changed

+24
-15
lines changed

2 files changed

+24
-15
lines changed

sklearn/feature_extraction/tests/test_text.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,12 @@
2121
from numpy.testing import assert_array_equal
2222
from numpy.testing import assert_raises
2323

24+
from collections import defaultdict, Mapping
25+
from functools import partial
2426
import pickle
2527
from StringIO import StringIO
2628

29+
2730
JUNK_FOOD_DOCS = (
2831
"the pizza pizza beer copyright",
2932
"the pizza burger beer copyright",
@@ -189,20 +192,20 @@ def test_char_ngram_analyzer():
189192

190193

191194
def test_countvectorizer_custom_vocabulary():
192-
what_we_like = ["pizza", "beer"]
193-
vect = CountVectorizer(vocabulary=what_we_like)
194-
vect.fit(JUNK_FOOD_DOCS)
195-
assert_equal(set(vect.vocabulary_), set(what_we_like))
196-
X = vect.transform(JUNK_FOOD_DOCS)
197-
assert_equal(X.shape[1], len(what_we_like))
198-
199-
# try again with a dict vocabulary
200195
vocab = {"pizza": 0, "beer": 1}
201-
vect = CountVectorizer(vocabulary=vocab)
202-
vect.fit(JUNK_FOOD_DOCS)
203-
assert_equal(vect.vocabulary_, vocab)
204-
X = vect.transform(JUNK_FOOD_DOCS)
205-
assert_equal(X.shape[1], len(what_we_like))
196+
terms = set(vocab.keys())
197+
198+
# Try a few of the supported types.
199+
for typ in [dict, list, iter, partial(defaultdict, int)]:
200+
v = typ(vocab)
201+
vect = CountVectorizer(vocabulary=v)
202+
vect.fit(JUNK_FOOD_DOCS)
203+
if isinstance(v, Mapping):
204+
assert_equal(vect.vocabulary_, vocab)
205+
else:
206+
assert_equal(set(vect.vocabulary_), terms)
207+
X = vect.transform(JUNK_FOOD_DOCS)
208+
assert_equal(X.shape[1], len(terms))
206209

207210

208211
def test_countvectorizer_custom_vocabulary_pipeline():

sklearn/feature_extraction/text.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
build feature vectors from text documents.
1111
"""
1212

13+
from collections import Mapping
14+
from operator import itemgetter
1315
import re
1416
import unicodedata
15-
from operator import itemgetter
1617
import warnings
1718

1819
import numpy as np
@@ -167,6 +168,11 @@ class CountVectorizer(BaseEstimator):
167168
168169
This parameter is ignored if vocabulary is not None.
169170
171+
vocabulary: Mapping or iterable, optional
172+
Either a Mapping (e.g., a dict) where keys are terms and values are
173+
indices in the feature matrix, or an iterable over terms. If not
174+
given, a vocabulary is determined from the input documents.
175+
170176
binary: boolean, False by default.
171177
If True, all non zero counts are set to 1. This is useful for discrete
172178
probabilistic models that model binary events rather than integer
@@ -201,7 +207,7 @@ def __init__(self, input='content', charset='utf-8',
201207
self.max_features = max_features
202208
if vocabulary is not None:
203209
self.fixed_vocabulary = True
204-
if not hasattr(vocabulary, 'get'):
210+
if not isinstance(vocabulary, Mapping):
205211
vocabulary = dict((t, i) for i, t in enumerate(vocabulary))
206212
self.vocabulary_ = vocabulary
207213
else:

0 commit comments

Comments
 (0)