Commit f8985f2

Merge pull request scikit-learn#5038 from jnothman/textdoc

DOC tweaks for feature_extraction.text

2 parents: 24e962c + 0b83c98


sklearn/feature_extraction/text.py: 16 additions & 8 deletions
@@ -365,6 +365,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     tokenizer : callable or None (default)
         Override the string tokenization step while preserving the
         preprocessing and n-grams generation steps.
+        Only applies if ``analyzer == 'word'``.
 
     ngram_range : tuple (min_n, max_n), default=(1, 1)
         The lower and upper boundary of the range of n-values for different
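As an aside (not part of the commit), a minimal sketch of the note being added: a custom tokenizer takes effect for the word analyzer but is ignored for character analyzers. The documents and n_features value are arbitrary.

from sklearn.feature_extraction.text import HashingVectorizer

docs = ["Feature extraction from text", "Text feature extraction"]

# Custom tokenizer is used: analyzer == 'word'.
hv_word = HashingVectorizer(analyzer='word', tokenizer=str.split, n_features=32)
X_word = hv_word.transform(docs)

# Custom tokenizer is ignored: the char analyzer works on raw characters.
hv_char = HashingVectorizer(analyzer='char', tokenizer=str.split, n_features=32)
X_char = hv_char.transform(docs)

print(X_word.shape, X_char.shape)  # (2, 32) (2, 32); only tokenization differs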
@@ -376,13 +377,14 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
 
         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
+        Only applies if ``analyzer == 'word'``.
 
     lowercase : boolean, default=True
         Convert all characters to lowercase before tokenizing.
 
     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `analyzer == 'word'`. The default regexp selects tokens of 2
+        if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).
 
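To illustrate the default token_pattern this hunk refers to (a sketch, not from the commit): scikit-learn's default pattern is r"(?u)\b\w\w+\b", which keeps only runs of two or more alphanumeric characters.

import re

default_pattern = r"(?u)\b\w\w+\b"  # scikit-learn's default token_pattern
print(re.findall(default_pattern, "A cat, a hat; 2 hats!"))
# ['cat', 'hat', 'hats'] -- single characters and punctuation are dropped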
@@ -544,6 +546,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
 
         If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.
+        Only applies if ``analyzer == 'word'``.
 
     preprocessor : callable or None (default)
         Override the preprocessing (string transformation) stage while
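A hedged sketch of the callable-analyzer case mentioned in this hunk: the callable (here a hypothetical bigram_words helper) receives each raw document and returns the whole feature sequence itself, so the 'word'-only parameters do not come into play.

from sklearn.feature_extraction.text import CountVectorizer

def bigram_words(doc):
    # Hypothetical helper: naive whitespace tokens plus word bigrams.
    words = doc.lower().split()
    return words + [" ".join(pair) for pair in zip(words, words[1:])]

cv = CountVectorizer(analyzer=bigram_words)
cv.fit(["the quick brown fox"])
print(sorted(cv.vocabulary_))
# ['brown', 'brown fox', 'fox', 'quick', 'quick brown', 'the', 'the quick']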
@@ -552,6 +555,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     tokenizer : callable or None (default)
         Override the string tokenization step while preserving the
         preprocessing and n-grams generation steps.
+        Only applies if ``analyzer == 'word'``.
 
     ngram_range : tuple (min_n, max_n)
         The lower and upper boundary of the range of n-values for different
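A quick sketch of the ngram_range bounds documented here (again, illustrative only): (1, 2) extracts both unigrams and bigrams.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1, 2))
cv.fit(["feature extraction from text"])
print(sorted(cv.vocabulary_))
# ['extraction', 'extraction from', 'feature', 'feature extraction',
#  'from', 'from text', 'text']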
@@ -563,6 +567,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
 
         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
+        Only applies if ``analyzer == 'word'``.
 
         If None, no stop words will be used. max_df can be set to a value
         in the range [0.7, 1.0) to automatically detect and filter stop
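A minimal sketch of the stop-word list behaviour this hunk annotates: the listed words are removed from the word tokens (and, per the added note, would be ignored entirely under a char analyzer).

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=['the', 'on'])
cv.fit(["the cat sat on the mat"])
print(sorted(cv.vocabulary_))  # ['cat', 'mat', 'sat']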
@@ -573,7 +578,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
 
     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `tokenize == 'word'`. The default regexp select tokens of 2
+        if ``analyzer == 'word'``. The default regexp select tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).
 
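And a sketch of overriding token_pattern, since the default quoted above drops single-character tokens:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b")  # keep 1-char tokens too
cv.fit(["a b ab"])
print(sorted(cv.vocabulary_))  # ['a', 'ab', 'b']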
@@ -1090,6 +1095,7 @@ class TfidfVectorizer(CountVectorizer):
     tokenizer : callable or None (default)
         Override the string tokenization step while preserving the
         preprocessing and n-grams generation steps.
+        Only applies if ``analyzer == 'word'``.
 
     ngram_range : tuple (min_n, max_n)
         The lower and upper boundary of the range of n-values for different
@@ -1103,6 +1109,7 @@ class TfidfVectorizer(CountVectorizer):
 
         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
+        Only applies if ``analyzer == 'word'``.
 
         If None, no stop words will be used. max_df can be set to a value
         in the range [0.7, 1.0) to automatically detect and filter stop
@@ -1113,21 +1120,22 @@ class TfidfVectorizer(CountVectorizer):
 
     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `analyzer == 'word'`. The default regexp selects tokens of 2
+        if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).
 
     max_df : float in range [0.0, 1.0] or int, default=1.0
-        When building the vocabulary ignore terms that have a document frequency
-        strictly higher than the given threshold (corpus specific stop words).
+        When building the vocabulary ignore terms that have a document
+        frequency strictly higher than the given threshold (corpus-specific
+        stop words).
         If float, the parameter represents a proportion of documents, integer
         absolute counts.
         This parameter is ignored if vocabulary is not None.
 
     min_df : float in range [0.0, 1.0] or int, default=1
-        When building the vocabulary ignore terms that have a document frequency
-        strictly lower than the given threshold.
-        This value is also called cut-off in the literature.
+        When building the vocabulary ignore terms that have a document
+        frequency strictly lower than the given threshold. This value is also
+        called cut-off in the literature.
         If float, the parameter represents a proportion of documents, integer
         absolute counts.
         This parameter is ignored if vocabulary is not None.
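Finally, a sketch of the rewrapped max_df/min_df semantics (illustrative corpus, not from the commit): floats are proportions of documents, integers absolute counts, and both thresholds are strict in the directions stated above.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["apple banana", "apple cherry", "apple banana date", "apple fig"]
# 'apple' (df = 1.0, strictly above max_df=0.5) is dropped as a
# corpus-specific stop word; 'cherry', 'date', 'fig' (in 1 document,
# strictly below min_df=2) fall under the cut-off. 'banana' (in exactly
# 2 of 4 documents) sits on both thresholds and survives.
tv = TfidfVectorizer(max_df=0.5, min_df=2)
tv.fit(docs)
print(sorted(tv.vocabulary_))  # ['banana']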
