Skip to content

Commit 534a7b7

Browse files
committed
implement new iter_by_chunks() in items
1 parent 92ff096 commit 534a7b7

File tree

2 files changed

+44
-0
lines changed

2 files changed

+44
-0
lines changed

scrapinghub/client/items.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,30 @@ def _modify_iter_params(self, params):
5959
if offset:
6060
params['start'] = '{}/{}'.format(self.key, offset)
6161
return params
62+
63+
def iter_by_chunks(self, chunksize=10000, *args, **kwargs):
    """Read items chunk by chunk, returning a generator of item lists.

    This is a convenient method for cases when processing a large amount of
    items from a job isn't ideal in one go due to the large memory needed.
    Instead, this allows you to process it chunk by chunk.

    You can reduce I/O overhead by increasing ``chunksize``, but that
    also increases the memory consumption.

    :param chunksize: maximum number of items in each yielded list;
        must be at least 1.
    :param args: extra positional arguments forwarded to :meth:`iter`.
    :param kwargs: extra keyword arguments forwarded to :meth:`iter`.
        ``count`` and ``start`` are set by this method on every request,
        so passing them here raises :class:`TypeError`.
    :return: a generator of non-empty lists holding at most ``chunksize``
        items each.
    :rtype: :class:`collections.Iterable`
    :raises ValueError: if ``chunksize`` is smaller than 1 (raised when
        the generator is first advanced).
    """
    # A chunk can never be shorter than a non-positive chunksize, so the
    # termination check below would never fire; reject such values early.
    if chunksize < 1:
        raise ValueError(
            'chunksize must be a positive integer, got {!r}'.format(chunksize))

    processed = 0
    while True:
        # Resume right after the last item that has been handed out.
        next_key = '{}/{}'.format(self.key, processed)
        items = list(
            self.iter(*args, count=chunksize, start=next_key, **kwargs))
        # Nothing left: happens for an empty job or when the total is an
        # exact multiple of chunksize. Stop without yielding an empty list.
        if not items:
            break
        yield items
        # A short chunk means the source is exhausted; skip the extra request.
        if len(items) < chunksize:
            break
        processed += len(items)

tests/client/test_items.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,20 @@ def test_items_list(spider, json_and_msgpack):
3636
assert o[0] == {'id': 0, 'data': 'data0'}
3737
assert o[1] == {'id': 1, 'data': 'data1'}
3838
assert o[2] == {'id': 2, 'data': 'data2'}
39+
40+
41+
def test_items_iter_by_chunks(spider, json_and_msgpack):
    """Check that iter_by_chunks() yields fixed-size lists and then stops.

    The expected chunks below cover three items read with a chunk size of
    two: one full chunk followed by one short chunk.
    """
    job = spider.jobs.run(meta={'state': 'running'})
    _add_test_items(job)

    o = job.items.iter_by_chunks(2)
    assert next(o) == [
        {'id': 0, 'data': 'data0'},
        {'id': 1, 'data': 'data1'},
    ]
    assert next(o) == [
        {'id': 2, 'data': 'data2'},
    ]
    # The short second chunk exhausts the generator, so the very next
    # advance must raise; an unguarded next(o) here would abort the test.
    with pytest.raises(StopIteration):
        next(o)

0 commit comments

Comments
 (0)