Skip to content

Commit 058265a

Browse files
committed
Allow individual records to be skipped while indexing
Previously there was no easy way to skip specific objects other than filtering the queryset. This change allows a prepare method to raise `SkipDocument` after calling methods or making other checks which cannot easily be expressed as database filters.

Thanks to Felipe Prenholato (@chronossc) for the patch.

Closes django-haystack#380
Closes django-haystack#1191
1 parent 3fe4e96 commit 058265a

File tree

8 files changed

+116
-29
lines changed

8 files changed

+116
-29
lines changed

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,4 @@ Thanks to
9999
* Andrei Fokau (andreif) for adding support for ``SQ`` in ``SearchQuerySet.narrow()``
100100
* Phill Tornroth (phill-tornroth) for several patches improving UnifiedIndex and ElasticSearch support
101101
* Philippe Luickx (philippeluickx) for documenting how to provide backend-specific facet options
102+
* Felipe Prenholato (@chronossc) for a patch making it easy to exclude documents from indexing using custom logic

haystack/backends/elasticsearch_backend.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import haystack
1515
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, log_query
1616
from haystack.constants import DEFAULT_OPERATOR, DJANGO_CT, DJANGO_ID, ID
17-
from haystack.exceptions import MissingDependency, MoreLikeThisError
17+
from haystack.exceptions import MissingDependency, MoreLikeThisError, SkipDocument
1818
from haystack.inputs import Clean, Exact, PythonData, Raw
1919
from haystack.models import SearchResult
2020
from haystack.utils import log as logging
@@ -172,6 +172,8 @@ def update(self, index, iterable, commit=True):
172172
final_data['_id'] = final_data[ID]
173173

174174
prepped_docs.append(final_data)
175+
except SkipDocument:
176+
self.log.debug(u"Indexing for object `%s` skipped", obj)
175177
except elasticsearch.TransportError as e:
176178
if not self.silently_fail:
177179
raise

haystack/backends/solr_backend.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
1313
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
14-
from haystack.exceptions import MissingDependency, MoreLikeThisError
14+
from haystack.exceptions import MissingDependency, MoreLikeThisError, SkipDocument
1515
from haystack.inputs import Clean, Exact, PythonData, Raw
1616
from haystack.models import SearchResult
1717
from haystack.utils import log as logging
@@ -54,6 +54,8 @@ def update(self, index, iterable, commit=True):
5454
for obj in iterable:
5555
try:
5656
docs.append(index.full_prepare(obj))
57+
except SkipDocument:
58+
self.log.debug(u"Indexing for object `%s` skipped", obj)
5759
except UnicodeDecodeError:
5860
if not self.silently_fail:
5961
raise

haystack/backends/whoosh_backend.py

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
1818
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
19-
from haystack.exceptions import MissingDependency, SearchBackendError
19+
from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
2020
from haystack.inputs import Clean, Exact, PythonData, Raw
2121
from haystack.models import SearchResult
2222
from haystack.utils import log as logging
@@ -192,32 +192,35 @@ def update(self, index, iterable, commit=True):
192192
writer = AsyncWriter(self.index)
193193

194194
for obj in iterable:
195-
doc = index.full_prepare(obj)
196-
197-
# Really make sure it's unicode, because Whoosh won't have it any
198-
# other way.
199-
for key in doc:
200-
doc[key] = self._from_python(doc[key])
201-
202-
# Document boosts aren't supported in Whoosh 2.5.0+.
203-
if 'boost' in doc:
204-
del doc['boost']
205-
206195
try:
207-
writer.update_document(**doc)
208-
except Exception as e:
209-
if not self.silently_fail:
210-
raise
211-
212-
# We'll log the object identifier but won't include the actual object
213-
# to avoid the possibility of that generating encoding errors while
214-
# processing the log message:
215-
self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
216-
"data": {
217-
"index": index,
218-
"object": get_identifier(obj)
219-
}
220-
})
196+
doc = index.full_prepare(obj)
197+
except SkipDocument:
198+
self.log.debug(u"Indexing for object `%s` skipped", obj)
199+
else:
200+
# Really make sure it's unicode, because Whoosh won't have it any
201+
# other way.
202+
for key in doc:
203+
doc[key] = self._from_python(doc[key])
204+
205+
# Document boosts aren't supported in Whoosh 2.5.0+.
206+
if 'boost' in doc:
207+
del doc['boost']
208+
209+
try:
210+
writer.update_document(**doc)
211+
except Exception as e:
212+
if not self.silently_fail:
213+
raise
214+
215+
# We'll log the object identifier but won't include the actual object
216+
# to avoid the possibility of that generating encoding errors while
217+
# processing the log message:
218+
self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
219+
"data": {
220+
"index": index,
221+
"object": get_identifier(obj)
222+
}
223+
})
221224

222225
if len(iterable) > 0:
223226
# For now, commit no matter what, as we run into locking issues otherwise.

haystack/exceptions.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,47 @@ class HaystackError(Exception):
77
"""A generic exception for all others to extend."""
88
pass
99

10+
1011
class SearchBackendError(HaystackError):
1112
"""Raised when a backend can not be found."""
1213
pass
1314

15+
1416
class SearchFieldError(HaystackError):
1517
"""Raised when a field encounters an error."""
1618
pass
1719

20+
1821
class MissingDependency(HaystackError):
1922
"""Raised when a library a backend depends on can not be found."""
2023
pass
2124

25+
2226
class NotHandled(HaystackError):
2327
"""Raised when a model is not handled by the router setup."""
2428
pass
2529

30+
2631
class MoreLikeThisError(HaystackError):
2732
"""Raised when a model instance has not been provided for More Like This."""
2833
pass
2934

35+
3036
class FacetingError(HaystackError):
3137
"""Raised when incorrect arguments have been provided for faceting."""
3238
pass
3339

40+
3441
class SpatialError(HaystackError):
3542
"""Raised when incorrect arguments have been provided for spatial."""
3643
pass
3744

45+
3846
class StatsError(HaystackError):
3947
"Raised when incorrect arguments have been provided for stats"
4048
pass
49+
50+
51+
class SkipDocument(HaystackError):
52+
"""Raised when a document should be skipped while updating"""
53+
pass

test_haystack/elasticsearch_tests/test_elasticsearch_backend.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from django.utils import unittest
1414

1515
from haystack import connections, indexes, reset_search_queries
16+
from haystack.exceptions import SkipDocument
1617
from haystack.inputs import AutoQuery
1718
from haystack.models import SearchResult
1819
from haystack.query import RelatedSearchQuerySet, SearchQuerySet, SQ
@@ -57,6 +58,14 @@ def get_model(self):
5758
return MockModel
5859

5960

61+
class ElasticsearchMockSearchIndexWithSkipDocument(ElasticsearchMockSearchIndex):
62+
63+
def prepare_text(self, obj):
64+
if obj.author == 'daniel3':
65+
raise SkipDocument
66+
return u"Indexed!\n%s" % obj.id
67+
68+
6069
class ElasticsearchMockSpellingIndex(indexes.SearchIndex, indexes.Indexable):
6170
text = indexes.CharField(document=True)
6271
name = indexes.CharField(model_attr='author', faceted=True)
@@ -207,6 +216,7 @@ def get_model(self):
207216

208217

209218
class TestSettings(TestCase):
219+
210220
def test_kwargs_are_passed_on(self):
211221
from haystack.backends.elasticsearch_backend import ElasticsearchSearchBackend
212222
backend = ElasticsearchSearchBackend('alias', **{
@@ -230,6 +240,7 @@ def setUp(self):
230240
self.old_ui = connections['elasticsearch'].get_unified_index()
231241
self.ui = UnifiedIndex()
232242
self.smmi = ElasticsearchMockSearchIndex()
243+
self.smmidni = ElasticsearchMockSearchIndexWithSkipDocument()
233244
self.smtmmi = ElasticsearchMaintainTypeMockSearchIndex()
234245
self.ui.build(indexes=[self.smmi])
235246
connections['elasticsearch']._index = self.ui
@@ -335,6 +346,18 @@ def test_update(self):
335346
}
336347
])
337348

349+
def test_update_with_SkipDocument_raised(self):
350+
self.sb.update(self.smmidni, self.sample_objs)
351+
352+
# Check what Elasticsearch thinks is there.
353+
res = self.raw_search('*:*')['hits']
354+
self.assertEqual(res['total'], 2)
355+
self.assertListEqual(
356+
sorted([x['_source']['id'] for x in res['hits']]),
357+
['core.mockmodel.1', 'core.mockmodel.2']
358+
)
359+
360+
338361
def test_remove(self):
339362
self.sb.update(self.smmi, self.sample_objs)
340363
self.assertEqual(self.raw_search('*:*')['hits']['total'], 3)

test_haystack/solr_tests/test_solr_backend.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from mock import patch
1515

1616
from haystack import connections, indexes, reset_search_queries
17+
from haystack.exceptions import SkipDocument
1718
from haystack.inputs import AltParser, AutoQuery, Raw
1819
from haystack.models import SearchResult
1920
from haystack.query import RelatedSearchQuerySet, SearchQuerySet, SQ
@@ -50,6 +51,14 @@ def get_model(self):
5051
return MockModel
5152

5253

54+
class SolrMockSearchIndexWithSkipDocument(SolrMockSearchIndex):
55+
56+
def prepare_text(self, obj):
57+
if obj.author == 'daniel3':
58+
raise SkipDocument
59+
return u"Indexed!\n%s" % obj.id
60+
61+
5362
class SolrMockOverriddenFieldNameSearchIndex(indexes.SearchIndex, indexes.Indexable):
5463
text = indexes.CharField(document=True, use_template=True)
5564
name = indexes.CharField(model_attr='author', faceted=True, index_fieldname='name_s')
@@ -200,6 +209,7 @@ def setUp(self):
200209
self.old_ui = connections['solr'].get_unified_index()
201210
self.ui = UnifiedIndex()
202211
self.smmi = SolrMockSearchIndex()
212+
self.smmidni = SolrMockSearchIndexWithSkipDocument()
203213
self.smtmmi = SolrMaintainTypeMockSearchIndex()
204214
self.smofnmi = SolrMockOverriddenFieldNameSearchIndex()
205215
self.ui.build(indexes=[self.smmi])
@@ -285,6 +295,18 @@ def test_update(self):
285295
}
286296
])
287297

298+
def test_update_with_SkipDocument_raised(self):
299+
self.sb.update(self.smmidni, self.sample_objs)
300+
301+
res = self.raw_solr.search('*:*')
302+
303+
# Check what Solr thinks is there.
304+
self.assertEqual(res.hits, 2)
305+
self.assertListEqual(
306+
sorted([x['id'] for x in res.docs]),
307+
['core.mockmodel.1', 'core.mockmodel.2']
308+
)
309+
288310
def test_remove(self):
289311
self.sb.update(self.smmi, self.sample_objs)
290312
self.assertEqual(self.raw_solr.search('*:*').hits, 3)

test_haystack/whoosh_tests/test_whoosh_backend.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from whoosh.qparser import QueryParser
1616

1717
from haystack import connections, indexes, reset_search_queries
18-
from haystack.exceptions import SearchBackendError
18+
from haystack.exceptions import SkipDocument, SearchBackendError
1919
from haystack.inputs import AutoQuery
2020
from haystack.models import SearchResult
2121
from haystack.query import SearchQuerySet, SQ
@@ -35,6 +35,14 @@ def get_model(self):
3535
return MockModel
3636

3737

38+
class WhooshMockSearchIndexWithSkipDocument(WhooshMockSearchIndex):
39+
40+
def prepare_text(self, obj):
41+
if obj.author == 'daniel3':
42+
raise SkipDocument
43+
return obj.author
44+
45+
3846
class WhooshAnotherMockSearchIndex(indexes.SearchIndex, indexes.Indexable):
3947
text = indexes.CharField(document=True)
4048
name = indexes.CharField(model_attr='author')
@@ -115,6 +123,7 @@ def setUp(self):
115123
self.old_ui = connections['whoosh'].get_unified_index()
116124
self.ui = UnifiedIndex()
117125
self.wmmi = WhooshMockSearchIndex()
126+
self.wmmidni = WhooshMockSearchIndexWithSkipDocument()
118127
self.wmtmmi = WhooshMaintainTypeMockSearchIndex()
119128
self.ui.build(indexes=[self.wmmi])
120129
self.sb = connections['whoosh'].get_backend()
@@ -172,6 +181,18 @@ def test_update(self):
172181
self.assertEqual(len(self.whoosh_search(u'*')), 23)
173182
self.assertEqual([doc.fields()['id'] for doc in self.whoosh_search(u'*')], [u'core.mockmodel.%s' % i for i in range(1, 24)])
174183

184+
def test_update_with_SkipDocument_raised(self):
185+
self.sb.update(self.wmmidni, self.sample_objs)
186+
187+
# Check what Whoosh thinks is there.
188+
res = self.whoosh_search(u'*')
189+
self.assertEqual(len(res), 14)
190+
ids = [1, 2, 5, 6, 7, 8, 9, 11, 12, 14, 15, 18, 20, 21]
191+
self.assertListEqual(
192+
[doc.fields()['id'] for doc in res],
193+
[u'core.mockmodel.%s' % i for i in ids]
194+
)
195+
175196
def test_remove(self):
176197
self.sb.update(self.wmmi, self.sample_objs)
177198
self.assertEqual(self.sb.index.doc_count(), 23)

0 commit comments

Comments
 (0)