|
31 | 31 | from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES |
32 | 32 | from ..utils import _IS_32BIT |
33 | 33 | from ..utils.fixes import _astype_copy_false |
| 34 | +from ..exceptions import ChangedBehaviorWarning |
34 | 35 |
|
35 | 36 |
|
36 | 37 | __all__ = ['HashingVectorizer', |
@@ -304,10 +305,34 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): |
304 | 305 | self._stop_words_id = id(self.stop_words) |
305 | 306 | return 'error' |
306 | 307 |
|
| 308 | + def _validate_custom_analyzer(self): |
| 309 | +        # This is to check if the given custom analyzer expects a file or a |
| 310 | + # filename instead of data. |
| 311 | + # Behavior changed in v0.21, function could be removed in v0.23 |
| 312 | + import tempfile |
| 313 | + with tempfile.NamedTemporaryFile() as f: |
| 314 | + fname = f.name |
| 315 | + # now we're sure fname doesn't exist |
| 316 | + |
| 317 | + msg = ("Since v0.21, vectorizers pass the data to the custom analyzer " |
| 318 | + "and not the file names or the file objects. This warning " |
| 319 | + "will be removed in v0.23.") |
| 320 | + try: |
| 321 | + self.analyzer(fname) |
| 322 | + except FileNotFoundError: |
| 323 | + warnings.warn(msg, ChangedBehaviorWarning) |
| 324 | + except AttributeError as e: |
| 325 | + if str(e) == "'str' object has no attribute 'read'": |
| 326 | + warnings.warn(msg, ChangedBehaviorWarning) |
| 327 | + except Exception: |
| 328 | + pass |
| 329 | + |
307 | 330 | def build_analyzer(self): |
308 | 331 | """Return a callable that handles preprocessing and tokenization""" |
309 | 332 | if callable(self.analyzer): |
310 | | - return self.analyzer |
| 333 | + if self.input in ['file', 'filename']: |
| 334 | + self._validate_custom_analyzer() |
| 335 | + return lambda doc: self.analyzer(self.decode(doc)) |
311 | 336 |
|
312 | 337 | preprocess = self.build_preprocessor() |
313 | 338 |
|
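A minimal sketch of what this validation catches, assuming a legacy callable analyzer that still opens the path itself (the names below are illustrative and not part of the patch): the probe filename from `_validate_custom_analyzer` does not exist, so the analyzer raises FileNotFoundError and the vectorizer emits a ChangedBehaviorWarning.

    from sklearn.feature_extraction.text import CountVectorizer

    def legacy_analyzer(path):
        # pre-0.21 style: the analyzer opens the file itself; called with the
        # non-existent probe name this raises FileNotFoundError
        with open(path) as f:
            return f.read().split()

    vect = CountVectorizer(input='filename', analyzer=legacy_analyzer)
    analyzer = vect.build_analyzer()  # emits ChangedBehaviorWarning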
@@ -490,6 +515,11 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): |
490 | 515 | If a callable is passed it is used to extract the sequence of features |
491 | 516 | out of the raw, unprocessed input. |
492 | 517 |
|
| 518 | + .. versionchanged:: 0.21 |
| 519 | + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is |
| 520 | + first read from the file and then passed to the given callable |
| 521 | + analyzer. |
| 522 | +
|
493 | 523 | n_features : integer, default=(2 ** 20) |
494 | 524 | The number of features (columns) in the output matrices. Small numbers |
495 | 525 | of features are likely to cause hash collisions, but large numbers |
@@ -745,6 +775,11 @@ class CountVectorizer(BaseEstimator, VectorizerMixin): |
745 | 775 | If a callable is passed it is used to extract the sequence of features |
746 | 776 | out of the raw, unprocessed input. |
747 | 777 |
|
| 778 | + .. versionchanged:: 0.21 |
| 779 | + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is |
| 780 | + first read from the file and then passed to the given callable |
| 781 | + analyzer. |
| 782 | +
|
748 | 783 | max_df : float in range [0.0, 1.0] or int, default=1.0 |
749 | 784 | When building the vocabulary ignore terms that have a document |
750 | 785 | frequency strictly higher than the given threshold (corpus-specific |
@@ -1369,6 +1404,11 @@ class TfidfVectorizer(CountVectorizer): |
1369 | 1404 | If a callable is passed it is used to extract the sequence of features |
1370 | 1405 | out of the raw, unprocessed input. |
1371 | 1406 |
|
| 1407 | + .. versionchanged:: 0.21 |
| 1408 | + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is |
| 1409 | + first read from the file and then passed to the given callable |
| 1410 | + analyzer. |
| 1411 | +
|
1372 | 1412 | stop_words : string {'english'}, list, or None (default=None) |
1373 | 1413 | If a string, it is passed to _check_stop_list and the appropriate stop |
1374 | 1414 | list is returned. 'english' is currently the only supported string |
|
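The three ``versionchanged`` notes describe the same behavior change; a hedged usage sketch (the file names are hypothetical): with ``input='filename'`` the callable analyzer now receives the decoded file contents rather than the path.

    from sklearn.feature_extraction.text import CountVectorizer

    def char_bigrams(text):
        # since v0.21 `text` is the decoded file contents, not the filename
        return [text[i:i + 2] for i in range(len(text) - 1)]

    vect = CountVectorizer(input='filename', analyzer=char_bigrams)
    # X = vect.fit_transform(['doc1.txt', 'doc2.txt'])  # hypothetical files on disk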