Skip to content

Commit 250e509

Browse files
committed
ENH O(1) stop-word lookup when list provided
The docstring says stop_words can be a list, but membership lookups against a list are O(n); convert a user-supplied collection to a frozenset so that stop-word lookups are O(1).
1 parent e5934cf commit 250e509

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

sklearn/feature_extraction/tests/test_text.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def test_countvectorizer_stop_words():
     assert_raises(ValueError, cv.get_stop_words)
     stoplist = ['some', 'other', 'words']
     cv.set_params(stop_words=stoplist)
-    assert_equal(cv.get_stop_words(), stoplist)
+    assert_equal(cv.get_stop_words(), set(stoplist))
 
 
 def test_countvectorizer_empty_vocabulary():

sklearn/feature_extraction/text.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,10 @@ def _check_stop_list(stop):
         return ENGLISH_STOP_WORDS
     elif isinstance(stop, six.string_types):
         raise ValueError("not a built-in stop list: %s" % stop)
+    elif stop is None:
+        return None
     else:  # assume it's a collection
-        return stop
+        return frozenset(stop)
 
 
 class VectorizerMixin(object):

0 commit comments

Comments
 (0)