Skip to content

Commit fa6bade

Browse files
committed
Refactor ndjson into an in2csv format. wireservice#329.
1 parent aceb1e7 commit fa6bade

File tree

9 files changed

+87
-30
lines changed

9 files changed

+87
-30
lines changed

CHANGELOG

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
1.0.0
22
-----
33

4+
* Add Antonio Lima to AUTHORS.
5+
* Add support for ndjson. (#329)
46
* Add missing docs for csvcut -C. (#227)
57
* Reorganize docs so TOC works better. (#339)
68
* Render docs locally with RTD theme.

csvkit/convert/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
from csvkit.convert.fixed import fixed2csv
77
from csvkit.convert.geojs import geojson2csv
88
from csvkit.convert.js import json2csv
9+
from csvkit.convert.ndjs import ndjson2csv
910
from csvkit.convert.xls import xls2csv
1011
from csvkit.convert.xlsx import xlsx2csv
1112

12-
SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json', 'geojson']
13+
SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json', 'geojson', 'ndjson']
1314

1415
# DBF is supported for Python 2 only
1516
if six.PY2:
@@ -38,6 +39,8 @@ def convert(f, format, schema=None, key=None, **kwargs):
3839
return xlsx2csv(f, **kwargs)
3940
elif format == 'json':
4041
return json2csv(f, key, **kwargs)
42+
elif format == 'ndjson':
43+
return ndjson2csv(f, **kwargs)
4144
elif format == 'geojson':
4245
return geojson2csv(f, **kwargs)
4346
elif format == 'csv':

csvkit/convert/js.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,19 +37,9 @@ def json2csv(f, key=None, **kwargs):
3737
"""
3838
Convert a JSON document into CSV format.
3939
40-
Supports both JSON and "Newline-delimited JSON".
41-
4240
The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
4341
"""
44-
first_line = f.readline()
45-
46-
# Test for newline delimited JSON
47-
try:
48-
first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
49-
js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))
50-
except ValueError:
51-
document = first_line + f.read()
52-
js = json.loads(document, object_pairs_hook=OrderedDict)
42+
js = json.load(f, object_pairs_hook=OrderedDict)
5343

5444
if isinstance(js, dict):
5545
if not key:

csvkit/convert/ndjs.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/usr/bin/env python
2+
3+
try:
4+
from collections import OrderedDict
5+
import json
6+
except ImportError:
7+
from ordereddict import OrderedDict
8+
import simplejson as json
9+
10+
import itertools
11+
import six
12+
13+
from csvkit import CSVKitWriter
14+
15+
def parse_object(obj, path=''):
    """
    Recursively flatten a parsed JSON value into a single-level mapping of paths to leaf values.

    Dictionaries contribute their keys and lists/tuples contribute their indices as path segments; segments are joined with "/". Any other value is treated as a leaf and stored under the path accumulated so far. Empty containers contribute no entries.

    The result is an OrderedDict so that leaves keep the order in which they were encountered in the input.

    Inspired by JSONPipe (https://github.com/dvxhouse/jsonpipe).
    """
    if isinstance(obj, dict):
        iterator = obj.items()
    elif isinstance(obj, (list, tuple)):
        iterator = enumerate(obj)
    else:
        # Leaf value: drop the separator(s) surrounding the accumulated path.
        return OrderedDict([(path.strip('/'), obj)])

    d = OrderedDict()

    for key, value in iterator:
        # List indices are ints; normalise every segment to text before joining.
        key = six.text_type(key)
        d.update(parse_object(value, path + key + '/'))

    return d
35+
36+
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a "Newline-delimited JSON" document into CSV format.

    Every non-blank line of the input must be a complete JSON value (normally an object). Each value is flattened with parse_object, so nested values end up in columns named by their "/"-separated path. Columns are emitted in the order in which they are first seen; a row that lacks a column gets an empty value for it.

    f is an iterable of text lines, such as an open file. key and any extra keyword arguments are accepted for interface compatibility but are not used.

    Returns the CSV output as a string. Raises ValueError if a non-blank line is not valid JSON.
    """
    # Blank lines carry no record, so skip them instead of failing to parse them.
    rows = [
        parse_object(json.loads(line, object_pairs_hook=OrderedDict))
        for line in f
        if line.strip()
    ]

    # Build the header from the flattened keys -- the same keys the rows are
    # looked up by below -- so that nested values are not dropped.
    fields = []
    seen = set()

    for row in rows:
        for name in row:
            if name not in seen:
                seen.add(name)
                fields.append(name)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for row in rows:
        writer.writerow([row.get(field, None) for field in fields])

    output = o.getvalue()
    o.close()

    return output
76+

docs/scripts/in2csv.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ The header line is required though the columns may be in any order::
3131
-f FORMAT, --format FORMAT
3232
The format of the input file. If not specified will be
3333
inferred from the file type. Supported formats: csv,
34-
dbf, fixed, geojson, json, xls, xlsx.
34+
dbf, fixed, geojson, json, ndjson, xls, xlsx.
3535
-s SCHEMA, --schema SCHEMA
3636
Specifies a CSV-formatted schema file for converting
3737
fixed-width files. See documentation for details.
@@ -47,7 +47,7 @@ See also: :doc:`../common_arguments`.
4747

4848
.. note::
4949

50-
The "json" format supports both standard JSON as well as "newline delimited JSON", such as is output by the many streaming APIs.
50+
The "ndjson" format refers to "newline-delimited JSON", such as that output by many streaming APIs.
5151

5252
.. note::
5353

examples/testjson_multiline_doc_converted.csv

Whitespace-only changes.

examples/testjson_multiline_document.json

Lines changed: 0 additions & 2 deletions
This file was deleted.

examples/testjson_multiline_document_converted.csv

Lines changed: 0 additions & 3 deletions
This file was deleted.

tests/test_utilities/test_in2csv.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ def test_convert_json(self):
6161
target_output = open('examples/testjson_converted.csv', 'r').read()
6262
self.assertEqual(output_file.getvalue(), target_output)
6363

64-
def test_convert_json_multiline(self):
65-
args = ['examples/testjson_multiline.json']
64+
def test_convert_ndjson(self):
65+
args = ['examples/testjson_multiline.json', '-f', 'ndjson']
6666
output_file = six.StringIO()
6767

6868
utility = In2CSV(args, output_file)
@@ -71,12 +71,3 @@ def test_convert_json_multiline(self):
7171
target_output = open('examples/testjson_multiline_converted.csv', 'r').read()
7272
self.assertEqual(output_file.getvalue(), target_output)
7373

74-
def test_convert_json_multiline_document(self):
75-
args = ['examples/testjson_multiline_document.json']
76-
output_file = six.StringIO()
77-
78-
utility = In2CSV(args, output_file)
79-
utility.main()
80-
81-
target_output = open('examples/testjson_multiline_document_converted.csv', 'r').read()
82-
self.assertEqual(output_file.getvalue(), target_output)

0 commit comments

Comments
 (0)