Skip to content

Commit 0246578

Browse files
author
jennifersmith
committed
Implements json streaming for array output
This is to address issue wireservice#355 and to allow json to be 'streamed' - that is, each object in the array be written out separated by a newline, rather than output as one single array. Allowing this means we can convert large csv streams to json without waiting until the end to see the output. Many json-processing tools can deal with streaming json (for instance jq).
1 parent 30db4b4 commit 0246578

File tree

2 files changed

+40
-17
lines changed

2 files changed

+40
-17
lines changed

csvkit/utilities/csvjson.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,27 @@ def add_arguments(self):
3131
self.argparser.add_argument('--crs', dest='crs', type=str, default=None,
3232
help='A coordinate reference system string to be included with GeoJSON output. Only valid if --lat and --lon are also specified.')
3333

34+
self.argparser.add_argument('--stream', dest='streamOutput', action='store_true',
35+
help='Output JSON as a stream of newline-separated objects, rather than an as an array.')
36+
3437
def main(self):
38+
if six.PY2:
39+
stream = codecs.getwriter('utf-8')(self.output_file)
40+
else:
41+
stream = self.output_file
42+
43+
json_kwargs = {
44+
'ensure_ascii': False,
45+
'indent': self.args.indent,
46+
}
47+
48+
if six.PY2:
49+
json_kwargs['encoding'] = 'utf-8'
50+
51+
def dump_json (data,newline=False):
52+
json.dump(data, stream, **json_kwargs)
53+
if newline: stream.write("\n")
54+
3555
"""
3656
Convert CSV to JSON.
3757
"""
@@ -47,11 +67,6 @@ def main(self):
4767
rows = CSVKitReader(self.input_file, **self.reader_kwargs)
4868
column_names = next(rows)
4969

50-
if six.PY2:
51-
stream = codecs.getwriter('utf-8')(self.output_file)
52-
else:
53-
stream = self.output_file
54-
5570
# GeoJSON
5671
if self.args.lat and self.args.lon:
5772
features = []
@@ -129,6 +144,7 @@ def main(self):
129144
'name': self.args.crs
130145
})
131146
])
147+
dump_json(output)
132148
# Keyed JSON
133149
elif self.args.key:
134150
output = OrderedDict()
@@ -145,10 +161,10 @@ def main(self):
145161
raise NonUniqueKeyColumnException('Value %s is not unique in the key column.' % six.text_type(k))
146162

147163
output[k] = data
164+
dump_json(output)
148165
# Boring JSON
149166
else:
150167
output = []
151-
152168
for row in rows:
153169
data = OrderedDict()
154170

@@ -157,18 +173,13 @@ def main(self):
157173
data[column] = row[i]
158174
except IndexError:
159175
data[column] = None
176+
if(self.args.streamOutput):
177+
dump_json(data,newline=True)
178+
else:
179+
output.append(data)
180+
if not self.args.streamOutput:
181+
dump_json(output)
160182

161-
output.append(data)
162-
163-
kwargs = {
164-
'ensure_ascii': False,
165-
'indent': self.args.indent,
166-
}
167-
168-
if six.PY2:
169-
kwargs['encoding'] = 'utf-8'
170-
171-
json.dump(output, stream, **kwargs)
172183

173184

174185
def launch_new_instance():

tests/test_utilities/test_csvjson.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,15 @@ def test_geojson_with_crs(self):
123123
self.assertEqual(crs['type'], 'name')
124124
self.assertEqual(crs['properties']['name'], 'EPSG:4269')
125125

126+
def test_json_streaming(self):
127+
args = ['--stream', 'examples/dummy3.csv']
128+
output_file = six.StringIO()
129+
130+
utility = CSVJSON(args, output_file)
131+
utility.main()
132+
133+
result = map(json.loads, output_file.getvalue().splitlines())
134+
self.assertEqual(len(result), 2)
135+
self.assertDictEqual(result[0], {"a": "1", "c": "3", "b": "2"})
136+
self.assertDictEqual(result[1], {"a": "1", "c": "5", "b": "4"})
137+

0 commit comments

Comments
 (0)