FIX set vectorizer vocabulary outside of init

vene · vene · commit 143e5ffc37e1 · 2014-08-13T10:54:44.000+02:00
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -18,6 +18,7 @@
 from sklearn.pipeline import Pipeline
 from sklearn.svm import LinearSVC
 
+from sklearn.base import clone
 
 import numpy as np
 from nose import SkipTest
@@ -283,7 +284,8 @@ def test_countvectorizer_stop_words():
 
 def test_countvectorizer_empty_vocabulary():
     try:
-        CountVectorizer(vocabulary=[])
+        vect = CountVectorizer(vocabulary=[])
+        vect.fit(["foo"])
         assert False, "we shouldn't get here"
     except ValueError as e:
         assert_in("empty vocabulary", str(e).lower())
@@ -440,10 +442,10 @@ def test_vectorizer():
     # (equivalent to term count vectorizer + tfidf transformer)
     train_data = iter(ALL_FOOD_DOCS[:-1])
     tv = TfidfVectorizer(norm='l1')
-    assert_false(tv.fixed_vocabulary)
 
     tv.max_df = v1.max_df
     tfidf2 = tv.fit_transform(train_data).toarray()
+    assert_false(tv.fixed_vocabulary)
     assert_array_almost_equal(tfidf, tfidf2)
 
     # test the direct tfidf vectorizer with new data
@@ -777,7 +779,6 @@ def test_vectorizer_pipeline_cross_validation():
     # label junk food as -1, the others as +1
     target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)
 
-
     pipeline = Pipeline([('vect', TfidfVectorizer()),
                          ('svc', LinearSVC())])
 
@@ -824,7 +825,6 @@ def test_tfidf_vectorizer_with_fixed_vocabulary():
     # non regression smoke test for inheritance issues
     vocabulary = ['pizza', 'celeri']
     vect = TfidfVectorizer(vocabulary=vocabulary)
-    assert_true(vect.fixed_vocabulary)
     X_1 = vect.fit_transform(ALL_FOOD_DOCS)
     X_2 = vect.transform(ALL_FOOD_DOCS)
     assert_array_almost_equal(X_1.toarray(), X_2.toarray())
@@ -870,7 +870,8 @@ def test_pickling_transformer():
 
 def test_non_unique_vocab():
     vocab = ['a', 'b', 'c', 'a', 'a']
-    assert_raises(ValueError, CountVectorizer, vocabulary=vocab)
+    vect = CountVectorizer(vocabulary=vocab)
+    assert_raises(ValueError, vect.fit, [])
 
 
 def test_hashingvectorizer_nan_in_docs():
@@ -901,3 +902,11 @@ def test_tfidfvectorizer_export_idf():
     vect = TfidfVectorizer(use_idf=True)
     vect.fit(JUNK_FOOD_DOCS)
     assert_array_almost_equal(vect.idf_, vect._tfidf.idf_)
+
+
+def test_vectorizer_vocab_clone():
+    vect_vocab = TfidfVectorizer(vocabulary=["the"])
+    vect_vocab_clone = clone(vect_vocab)
+    vect_vocab.fit(ALL_FOOD_DOCS)
+    vect_vocab_clone.fit(ALL_FOOD_DOCS)
+    assert_equal(vect_vocab_clone.vocabulary_, vect_vocab.vocabulary_)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -236,6 +236,32 @@ def build_analyzer(self):
             raise ValueError('%s is not a valid tokenization scheme/analyzer' %
                              self.analyzer)
 
+    def _check_vocabulary(self):
+        vocabulary = self.vocabulary
+        if vocabulary is not None:
+            if not isinstance(vocabulary, Mapping):
+                vocab = {}
+                for i, t in enumerate(vocabulary):
+                    if vocab.setdefault(t, i) != i:
+                        msg = "Duplicate term in vocabulary: %r" % t
+                        raise ValueError(msg)
+                vocabulary = vocab
+            else:
+                indices = set(six.itervalues(vocabulary))
+                if len(indices) != len(vocabulary):
+                    raise ValueError("Vocabulary contains repeated indices.")
+                for i in xrange(len(vocabulary)):
+                    if i not in indices:
+                        msg = ("Vocabulary of size %d doesn't contain index "
+                               "%d." % (len(vocabulary), i))
+                        raise ValueError(msg)
+            if not vocabulary:
+                raise ValueError("empty vocabulary passed to fit")
+            self.fixed_vocabulary = True
+            self.vocabulary_ = dict(vocabulary)
+        else:
+            self.fixed_vocabulary = False
+
 
 class HashingVectorizer(BaseEstimator, VectorizerMixin):
     """Convert a collection of text documents to a matrix of token occurrences
@@ -616,29 +642,7 @@ def __init__(self, input='content', encoding='utf-8',
                     "max_features=%r, neither a positive integer nor None"
                     % max_features)
         self.ngram_range = ngram_range
-        if vocabulary is not None:
-            if not isinstance(vocabulary, Mapping):
-                vocab = {}
-                for i, t in enumerate(vocabulary):
-                    if vocab.setdefault(t, i) != i:
-                        msg = "Duplicate term in vocabulary: %r" % t
-                        raise ValueError(msg)
-                vocabulary = vocab
-            else:
-                indices = set(six.itervalues(vocabulary))
-                if len(indices) != len(vocabulary):
-                    raise ValueError("Vocabulary contains repeated indices.")
-                for i in xrange(len(vocabulary)):
-                    if i not in indices:
-                        msg = ("Vocabulary of size %d doesn't contain index "
-                               "%d." % (len(vocabulary), i))
-                        raise ValueError(msg)
-            if not vocabulary:
-                raise ValueError("empty vocabulary passed to fit")
-            self.fixed_vocabulary = True
-            self.vocabulary_ = dict(vocabulary)
-        else:
-            self.fixed_vocabulary = False
+        self.vocabulary = vocabulary
         self.binary = binary
         self.dtype = dtype
 
@@ -773,6 +777,7 @@ def fit_transform(self, raw_documents, y=None):
         # We intentionally don't call the transform method to make
         # fit_transform overridable without unwanted side effects in
         # TfidfVectorizer.
+        self._check_vocabulary()
         max_df = self.max_df
         min_df = self.min_df
         max_features = self.max_features
@@ -820,6 +825,9 @@ def transform(self, raw_documents):
         X : sparse matrix, [n_samples, n_features]
             Document-term matrix.
         """
+        if not hasattr(self, 'vocabulary_'):
+            self._check_vocabulary()
+
         if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0:
             raise ValueError("Vocabulary wasn't fitted or is empty!")