Refactor ndjson into an in2csv format. wireservice#329.

onyxfish · onyxfish · commit fa6bade025cf · 2014-11-27T09:20:05.000-07:00
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,8 @@
 1.0.0
 -----
 
+* Add Antonio Lima to AUTHORS.
+* Add support for ndjson. (#329)
 * Add missing docs for csvcut -C. (#227)
 * Reorganize docs so TOC works better. (#339)
 * Render docs locally with RTD theme.
diff --git a/csvkit/convert/__init__.py b/csvkit/convert/__init__.py
@@ -6,10 +6,11 @@
 from csvkit.convert.fixed import fixed2csv
 from csvkit.convert.geojs import geojson2csv
 from csvkit.convert.js import json2csv
+from csvkit.convert.ndjs import ndjson2csv
 from csvkit.convert.xls import xls2csv
 from csvkit.convert.xlsx import xlsx2csv
 
-SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json', 'geojson']
+SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json', 'geojson', 'ndjson']
 
 # DBF is supported for Python 2 only
 if six.PY2:
@@ -38,6 +39,8 @@ def convert(f, format, schema=None, key=None, **kwargs):
         return xlsx2csv(f, **kwargs)
     elif format == 'json':
         return json2csv(f, key, **kwargs)
+    elif format == 'ndjson':
+        return ndjson2csv(f, **kwargs)
     elif format == 'geojson':
         return geojson2csv(f, **kwargs)
     elif format == 'csv':
diff --git a/csvkit/convert/js.py b/csvkit/convert/js.py
@@ -37,19 +37,9 @@ def json2csv(f, key=None, **kwargs):
     """
     Convert a JSON document into CSV format.
 
-    Supports both JSON and "Newline-delimited JSON".
-
     The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
     """
-    first_line = f.readline()
-
-    # Test for newline delimited JSON
-    try:
-        first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
-        js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))
-    except ValueError:
-        document = first_line + f.read()
-        js = json.loads(document, object_pairs_hook=OrderedDict)
+    js = json.load(f, object_pairs_hook=OrderedDict)
 
     if isinstance(js, dict):
         if not key:
diff --git a/csvkit/convert/ndjs.py b/csvkit/convert/ndjs.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+try:
+    from collections import OrderedDict
+    import json
+except ImportError:
+    from ordereddict import OrderedDict
+    import simplejson as json
+
+import itertools
+import six
+
+from csvkit import CSVKitWriter
+
+def parse_object(obj, path=''):
+    """
+    Recursively parse a JSON object and return a flat dictionary mapping slash-delimited paths to values.
+
+    Inspired by JSONPipe (https://github.com/dvxhouse/jsonpipe).
+    """
+    if isinstance(obj, dict):
+        iterator = obj.items()
+    elif isinstance(obj, (list, tuple)):
+        iterator = enumerate(obj)
+    else:
+        return { path.strip('/'): obj }
+
+    d = {}
+
+    for key, value in iterator:
+        key = six.text_type(key)
+        d.update(parse_object(value, path + key + '/'))
+
+    return d
+
+def ndjson2csv(f, key=None, **kwargs):
+    """
+    Convert a newline-delimited JSON document into CSV format.
+
+    Each line of the input must be a separate JSON object.
+
+    Column names are taken from the top-level keys of the objects, in order of first appearance. The key argument is accepted but not used.
+    """
+    first_line = f.readline()
+
+    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
+    js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))
+
+    fields = []
+    flat = []
+
+    for obj in js:
+        flat.append(parse_object(obj)) 
+
+        for key in obj.keys():
+            if key not in fields:
+                fields.append(key)
+
+    o = six.StringIO()
+    writer = CSVKitWriter(o)
+
+    writer.writerow(fields)
+
+    for i in flat:
+        row = []
+
+        for field in fields:
+            row.append(i.get(field, None))
+
+        writer.writerow(row)
+
+    output = o.getvalue()
+    o.close()
+
+    return output
+
diff --git a/docs/scripts/in2csv.rst b/docs/scripts/in2csv.rst
@@ -31,7 +31,7 @@ The header line is required though the columns may be in any order::
       -f FORMAT, --format FORMAT
                             The format of the input file. If not specified will be
                             inferred from the file type. Supported formats: csv,
-                            dbf, fixed, geojson, json, xls, xlsx.
+                            dbf, fixed, geojson, json, ndjson, xls, xlsx.
       -s SCHEMA, --schema SCHEMA
                             Specifies a CSV-formatted schema file for converting
                             fixed-width files. See documentation for details.
@@ -47,7 +47,7 @@ See also: :doc:`../common_arguments`.
 
 .. note::
 
-    The "json" format supports both standard JSON as well as "newline delimited JSON", such as is output by the many streaming APIs.
+    The "ndjson" format refers to "newline-delimited JSON", such as that output by many streaming APIs.
 
 .. note::
 
diff --git a/examples/testjson_multiline_doc_converted.csv b/examples/testjson_multiline_doc_converted.csv
diff --git a/examples/testjson_multiline_document.json b/examples/testjson_multiline_document.json
diff --git a/examples/testjson_multiline_document_converted.csv b/examples/testjson_multiline_document_converted.csv
diff --git a/tests/test_utilities/test_in2csv.py b/tests/test_utilities/test_in2csv.py
@@ -61,8 +61,8 @@ def test_convert_json(self):
         target_output = open('examples/testjson_converted.csv', 'r').read()
         self.assertEqual(output_file.getvalue(), target_output)
 
-    def test_convert_json_multiline(self):
-        args = ['examples/testjson_multiline.json']
+    def test_convert_ndjson(self):
+        args = ['examples/testjson_multiline.json', '-f', 'ndjson']
         output_file = six.StringIO()
         
         utility = In2CSV(args, output_file)
@@ -71,12 +71,3 @@ def test_convert_json_multiline(self):
         target_output = open('examples/testjson_multiline_converted.csv', 'r').read()
         self.assertEqual(output_file.getvalue(), target_output)
 
-    def test_convert_json_multiline_document(self):
-        args = ['examples/testjson_multiline_document.json']
-        output_file = six.StringIO()
-        
-        utility = In2CSV(args, output_file)
-        utility.main()
-        
-        target_output = open('examples/testjson_multiline_document_converted.csv', 'r').read()
-        self.assertEqual(output_file.getvalue(), target_output)