Skip to content

Commit a79cdbe

Browse files
committed
Added the ability to remove objects from the index that are no longer in the database to the update_index management command.
1 parent 3994bfa commit a79cdbe

File tree

2 files changed

+55
-4
lines changed

2 files changed

+55
-4
lines changed

docs/management_commands.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ arguments::
4141
Number of items to index at once. Default is 1000.
4242
``--site``:
4343
The site object to use when reindexing (like `search_sites.mysite`).
44+
``--remove``:
45+
Remove objects from the index that are no longer present in the
46+
database.
4447
``--verbosity``:
4548
If provided, dumps out more information about what's being done.
4649
@@ -76,6 +79,9 @@ of the arguments of the following arguments::
7679
``--noinput``:
7780
If provided, the interactive prompts are skipped and the index is
7881
unceremoniously wiped out.
82+
``--remove``:
83+
Remove objects from the index that are no longer present in the
84+
database.
7985
``--verbosity``:
8086
If provided, dumps out more information about what's being done.
8187

haystack/management/commands/update_index.py

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,15 @@
44
from django.core.management.base import AppCommand, CommandError
55
from django.db import reset_queries
66
from django.utils.encoding import smart_str
7+
from haystack.query import SearchQuerySet
78
try:
89
from django.utils import importlib
910
except ImportError:
1011
from haystack.utils import importlib
12+
try:
13+
set
14+
except NameError:
15+
from sets import Set as set
1116

1217

1318
DEFAULT_BATCH_SIZE = getattr(settings, 'HAYSTACK_BATCH_SIZE', 1000)
@@ -28,6 +33,9 @@ class Command(AppCommand):
2833
make_option('-s', '--site', action='store', dest='site',
2934
type='string', help='The site object to use when reindexing (like `search_sites.mysite`).'
3035
),
36+
make_option('-r', '--remove', action='store_true', dest='remove',
37+
default=False, help='Remove objects from the index that are no longer present in the database.'
38+
),
3139
)
3240
option_list = AppCommand.option_list + base_options
3341

@@ -51,6 +59,7 @@ def handle(self, *apps, **options):
5159
self.batchsize = options.get('batchsize', DEFAULT_BATCH_SIZE)
5260
self.age = options.get('age', DEFAULT_AGE)
5361
self.site = options.get('site')
62+
self.remove = options.get('remove', False)
5463

5564
if not apps:
5665
from django.db.models import get_app
@@ -111,16 +120,52 @@ def handle_app(self, app, **options):
111120
if self.verbosity >= 1:
112121
print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))
113122

123+
pks_seen = set()
124+
114125
for start in range(0, total, self.batchsize):
115126
end = min(start + self.batchsize, total)
116127

117-
if self.verbosity >= 2:
118-
print " indexing %s - %d of %d." % (start+1, end, total)
119-
120128
# Get a clone of the QuerySet so that the cache doesn't bloat up
121129
# in memory. Useful when reindexing large amounts of data.
122130
small_cache_qs = qs.all()
123-
index.backend.update(index, small_cache_qs[start:end])
131+
current_qs = small_cache_qs[start:end]
132+
133+
for obj in current_qs:
134+
pks_seen.add(smart_str(obj.pk))
135+
136+
if self.verbosity >= 2:
137+
print " indexing %s - %d of %d." % (start+1, end, total)
138+
139+
index.backend.update(index, current_qs)
124140

125141
# Clear out the DB connections queries because it bloats up RAM.
126142
reset_queries()
143+
144+
if self.remove:
145+
if self.age or total <= 0:
146+
# They're using a reduced set, which may not incorporate
147+
# all pks. Rebuild the list with everything.
148+
pks_seen = set()
149+
qs = index.get_queryset().values_list('pk', flat=True)
150+
total = qs.count()
151+
152+
for pk in qs:
153+
pks_seen.add(smart_str(pk))
154+
155+
for start in range(0, total, self.batchsize):
156+
upper_bound = start + self.batchsize
157+
158+
# Fetch a list of results.
159+
# Can't do a pk range, because ids are strings (thanks comments
160+
# & UUIDs!).
161+
stuff_in_the_index = SearchQuerySet().models(model)[start:upper_bound]
162+
163+
# Iterate over those results.
164+
for result in stuff_in_the_index:
165+
# Be careful not to hit the DB.
166+
if not smart_str(result.pk) in pks_seen:
167+
# The id is NOT in pks_seen (i.e. no longer in the database), issue a delete.
168+
if self.verbosity >= 2:
169+
print " removing %s." % result.pk
170+
171+
index.backend.remove(".".join([result.app_label, result.model_name, result.pk]))

0 commit comments

Comments
 (0)