@@ -365,6 +365,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     tokenizer : callable or None (default)
         Override the string tokenization step while preserving the
         preprocessing and n-grams generation steps.
+        Only applies if ``analyzer == 'word'``.

     ngram_range : tuple (min_n, max_n), default=(1, 1)
         The lower and upper boundary of the range of n-values for different
@@ -376,13 +377,14 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):

         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
+        Only applies if ``analyzer == 'word'``.

     lowercase : boolean, default=True
         Convert all characters to lowercase before tokenizing.

     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `analyzer == 'word'`. The default regexp selects tokens of 2
+        if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).

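A minimal sketch (not part of the patch) of the behavior the added notes describe: ``tokenizer``, ``stop_words`` and ``token_pattern`` take effect only when ``analyzer == 'word'`` and are otherwise ignored (newer scikit-learn versions emit a warning noting this). ``CountVectorizer`` is used below because its fitted vocabulary is easy to inspect; ``HashingVectorizer`` shares the same analyzer logic.

```python
from sklearn.feature_extraction.text import CountVectorizer

docs = ["red apples", "green apples"]

# With the default analyzer='word', the custom tokenizer takes effect.
word_vec = CountVectorizer(analyzer='word', tokenizer=str.split).fit(docs)
print(sorted(word_vec.vocabulary_))      # ['apples', 'green', 'red']

# With analyzer='char', the same tokenizer is ignored and features
# become character n-grams instead of word tokens.
char_vec = CountVectorizer(analyzer='char', tokenizer=str.split).fit(docs)
print(sorted(char_vec.vocabulary_)[:5])  # [' ', 'a', 'd', 'e', 'g']
```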
@@ -544,6 +546,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):

         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.
+        Only applies if ``analyzer == 'word'``.

     preprocessor : callable or None (default)
         Override the preprocessing (string transformation) stage while
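In contrast, a callable ``analyzer`` replaces the whole extraction pipeline. A short sketch of that path, assuming any reasonably recent scikit-learn:

```python
from sklearn.feature_extraction.text import CountVectorizer

def word_bigrams(doc):
    # A callable analyzer receives each raw document and returns the
    # feature sequence directly, bypassing preprocessing, tokenization
    # and stop-word removal.
    words = doc.lower().split()
    return [" ".join(pair) for pair in zip(words, words[1:])]

vec = CountVectorizer(analyzer=word_bigrams).fit(["the cat sat down"])
print(sorted(vec.vocabulary_))  # ['cat sat', 'sat down', 'the cat']
```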
@@ -552,6 +555,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     tokenizer : callable or None (default)
         Override the string tokenization step while preserving the
         preprocessing and n-grams generation steps.
+        Only applies if ``analyzer == 'word'``.

     ngram_range : tuple (min_n, max_n)
         The lower and upper boundary of the range of n-values for different
@@ -563,6 +567,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):

         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
+        Only applies if ``analyzer == 'word'``.

         If None, no stop words will be used. max_df can be set to a value
         in the range [0.7, 1.0) to automatically detect and filter stop
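A quick example of the list form of ``stop_words`` (again, only effective when ``analyzer == 'word'``):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Every token that appears in the stop-word list is removed.
vec = CountVectorizer(stop_words=["the", "a"]).fit(["the cat saw a dog"])
print(sorted(vec.vocabulary_))  # ['cat', 'dog', 'saw']
```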
@@ -573,7 +578,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):

     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `tokenize == 'word'`. The default regexp select tokens of 2
+        if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).

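To illustrate the corrected sentence above: the default pattern is ``r"(?u)\b\w\w+\b"``, so single-character tokens are dropped unless ``token_pattern`` is overridden. A minimal sketch:

```python
from sklearn.feature_extraction.text import CountVectorizer

doc = ["a quick test, x y z"]

# The default token_pattern, r"(?u)\b\w\w+\b", keeps only tokens of two
# or more alphanumeric characters; 'a', 'x', 'y' and 'z' are dropped.
default_vec = CountVectorizer().fit(doc)
print(sorted(default_vec.vocabulary_))  # ['quick', 'test']

# Relaxing the pattern to \w+ keeps single-character tokens as well.
relaxed_vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b").fit(doc)
print(sorted(relaxed_vec.vocabulary_))  # ['a', 'quick', 'test', 'x', 'y', 'z']
```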
@@ -1090,6 +1095,7 @@ class TfidfVectorizer(CountVectorizer):
     tokenizer : callable or None (default)
         Override the string tokenization step while preserving the
         preprocessing and n-grams generation steps.
+        Only applies if ``analyzer == 'word'``.

     ngram_range : tuple (min_n, max_n)
         The lower and upper boundary of the range of n-values for different
@@ -1103,6 +1109,7 @@ class TfidfVectorizer(CountVectorizer):

         If a list, that list is assumed to contain stop words, all of which
         will be removed from the resulting tokens.
+        Only applies if ``analyzer == 'word'``.

         If None, no stop words will be used. max_df can be set to a value
         in the range [0.7, 1.0) to automatically detect and filter stop
@@ -1113,21 +1120,22 @@ class TfidfVectorizer(CountVectorizer):

     token_pattern : string
         Regular expression denoting what constitutes a "token", only used
-        if `analyzer == 'word'`. The default regexp selects tokens of 2
+        if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
         and always treated as a token separator).

     max_df : float in range [0.0, 1.0] or int, default=1.0
-        When building the vocabulary ignore terms that have a document frequency
-        strictly higher than the given threshold (corpus specific stop words).
+        When building the vocabulary ignore terms that have a document
+        frequency strictly higher than the given threshold (corpus-specific
+        stop words).
         If float, the parameter represents a proportion of documents, integer
         absolute counts.
         This parameter is ignored if vocabulary is not None.

     min_df : float in range [0.0, 1.0] or int, default=1
-        When building the vocabulary ignore terms that have a document frequency
-        strictly lower than the given threshold.
-        This value is also called cut-off in the literature.
+        When building the vocabulary ignore terms that have a document
+        frequency strictly lower than the given threshold. This value is also
+        called cut-off in the literature.
         If float, the parameter represents a proportion of documents, integer
         absolute counts.
         This parameter is ignored if vocabulary is not None.
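A small sketch of the ``max_df``/``min_df`` thresholds as reworded above. With three documents, ``max_df=0.9`` removes any term whose document frequency is strictly above 0.9 * 3 = 2.7, and ``min_df=2`` removes terms seen in fewer than two documents:

```python
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "the cat sat",
    "the dog sat",
    "the bird flew",
]

# Document frequencies: the=3, sat=2, everything else=1.
# min_df=2 drops df < 2; max_df=0.9 drops df > 2.7 (i.e. 'the').
vec = CountVectorizer(min_df=2, max_df=0.9).fit(docs)
print(sorted(vec.vocabulary_))  # ['sat']
```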