Merge pull request wireservice#783 from wireservice/writesheets

James McKinney · web-flow · commit 66e52a2e5eec · 2017-02-28T23:27:57.000-05:00
in2csv adds a --write-sheets option to write the named Excel sheets to files
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -16,6 +16,7 @@ Improvements:
 * :doc:`/scripts/csvsql` accepts a file name for the :code:`--query` option.
 * :doc:`/scripts/csvstat` adds a :code:`--freq-count` option to set the maximum number of frequent values to display.
 * :doc:`/scripts/in2csv` adds a :code:`--names` flag to print Excel sheet names.
+* :doc:`/scripts/in2csv` adds a :code:`--write-sheets` option to write the named Excel sheets to files.
 * :doc:`/scripts/sql2csv` adds an :code:`--encoding` option to specify the encoding of the input query file.
 
 Fixes:
diff --git a/csvkit/cli.py b/csvkit/cli.py
@@ -5,8 +5,8 @@
 import codecs
 import gzip
 import itertools
-import os.path
 import sys
+from os.path import splitext
 
 import agate
 import six
@@ -213,7 +213,7 @@ def _open_input_file(self, path):
             else:
                 f = sys.stdin
         else:
-            (_, extension) = os.path.splitext(path)
+            extension = splitext(path)[1]
 
             if extension == '.gz':
                 f = LazyFile(gzip.open, path, mode, **kwargs)
diff --git a/csvkit/utilities/csvsql.py b/csvkit/utilities/csvsql.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 
-import os
-from pkg_resources import iter_entry_points
+import os.path
 import sys
+from pkg_resources import iter_entry_points
 
 import agate
 import agatesql  # noqa
@@ -109,7 +109,7 @@ def _failsafe_main(self):
                     table_name = "stdin"
                 else:
                     # Use filename as table name
-                    table_name = os.path.splitext(os.path.split(f.name)[1])[0]
+                    table_name = os.path.splitext(os.path.basename(f.name))[0]
 
             table = None
 
diff --git a/csvkit/utilities/csvstack.py b/csvkit/utilities/csvstack.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-import os
+import os.path
 
 import agate
 
@@ -31,7 +31,7 @@ def main(self):
             self.argparser.error('You must specify at least one file to stack.')
 
         if self.args.group_by_filenames:
-            groups = [os.path.split(f.name)[1] for f in self.input_files]
+            groups = [os.path.basename(f.name) for f in self.input_files]
         elif self.args.groups:
             groups = self.args.groups.split(',')
 
diff --git a/csvkit/utilities/in2csv.py b/csvkit/utilities/in2csv.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import sys
+from os.path import splitext
 
 import agate
 import agatedbf  # noqa
@@ -42,11 +43,28 @@ def option_parser(bytestring):
                                     help='Display sheet names from the input Excel file.')
         self.argparser.add_argument('--sheet', dest='sheet', type=option_parser,
                                     help='The name of the Excel sheet to operate on.')
+        self.argparser.add_argument('--write-sheets', dest='write_sheets', type=option_parser,
+                                    help='The names of the Excel sheets to write to files, or "-" to write all sheets.')
         self.argparser.add_argument('-y', '--snifflimit', dest='sniff_limit', type=int,
                                     help='Limit CSV dialect sniffing to the specified number of bytes. Specify "0" to disable sniffing entirely.')
         self.argparser.add_argument('-I', '--no-inference', dest='no_inference', action='store_true',
                                     help='Disable type inference (and --locale, --date-format, --datetime-format) when parsing CSV input.')
 
+    def open_excel_input_file(self, path):
+        if not path or path == '-':
+            if six.PY2:
+                return six.BytesIO(sys.stdin.read())
+            else:
+                return six.BytesIO(sys.stdin.buffer.read())
+        else:
+            return open(path, 'rb')
+
+    def sheet_names(self, filetype):
+        if filetype == 'xls':
+            return xlrd.open_workbook(file_contents=self.input_file.read()).sheet_names()
+        elif filetype == 'xlsx':
+            return openpyxl.load_workbook(self.input_file, read_only=True, data_only=True).sheetnames
+
     def main(self):
         path = self.args.input_path
 
@@ -71,25 +89,15 @@ def main(self):
 
         # Set the input file.
         if filetype in ('xls', 'xlsx'):
-            if not path or path == '-':
-                if six.PY2:
-                    self.input_file = six.BytesIO(sys.stdin.read())
-                else:
-                    self.input_file = six.BytesIO(sys.stdin.buffer.read())
-            else:
-                self.input_file = open(path, 'rb')
+            self.input_file = self.open_excel_input_file(path)
         else:
             self.input_file = self._open_input_file(path)
 
         if self.args.names_only:
-            sheet_names = None
-            if filetype == 'xls':
-                sheet_names = xlrd.open_workbook(file_contents=self.input_file.read()).sheet_names()
-            elif filetype == 'xlsx':
-                sheet_names = openpyxl.load_workbook(self.input_file, read_only=True, data_only=True).sheetnames
-            if sheet_names:
-                for name in sheet_names:
-                    self.output_file.write('%s\n' % name)
+            sheets = self.sheet_names(filetype)
+            if sheets:
+                for sheet in sheets:
+                    self.output_file.write('%s\n' % sheet)
             else:
                 self.argparser.error('You cannot use the -n or --names options with non-Excel files.')
             self.input_file.close()
@@ -103,9 +111,6 @@ def main(self):
         elif filetype == 'fixed':
             raise ValueError('schema must not be null when format is "fixed"')
 
-        if self.args.sheet:
-            kwargs['sheet'] = self.args.sheet
-
         if filetype == 'csv':
             kwargs.update(self.reader_kwargs)
             kwargs['sniff_limit'] = self.args.sniff_limit
@@ -133,15 +138,36 @@ def main(self):
             elif filetype == 'ndjson':
                 table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
             elif filetype == 'xls':
-                table = agate.Table.from_xls(self.input_file, **kwargs)
+                table = agate.Table.from_xls(self.input_file, sheet=self.args.sheet, **kwargs)
             elif filetype == 'xlsx':
-                table = agate.Table.from_xlsx(self.input_file, **kwargs)
+                table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
             elif filetype == 'dbf':
                 if not hasattr(self.input_file, 'name'):
                     raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
                 table = agate.Table.from_dbf(self.input_file.name, **kwargs)
             table.to_csv(self.output_file)
 
+        if self.args.write_sheets:
+            # Close and re-open the file, as the file object has been mutated or closed.
+            self.input_file.close()
+
+            self.input_file = self.open_excel_input_file(path)
+
+            if self.args.write_sheets == '-':
+                sheets = self.sheet_names(filetype)
+            else:
+                sheets = [int(sheet) if sheet.isdigit() else sheet for sheet in self.args.write_sheets.split(',')]
+
+            if filetype == 'xls':
+                tables = agate.Table.from_xls(self.input_file, sheet=sheets, **kwargs)
+            elif filetype == 'xlsx':
+                tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)
+
+            base = splitext(self.input_file.name)[0]
+            for i, table in enumerate(tables.values()):
+                with open('%s_%d.csv' % (base, i), 'w') as f:
+                    table.to_csv(f)
+
         self.input_file.close()
 
         if self.args.schema:
diff --git a/examples/testxls_unicode_converted.csv b/examples/testxls_unicode_converted.csv
@@ -0,0 +1,2 @@
+a,b,c
+1.0,2.0,3.0
diff --git a/examples/testxlsx_noinference_converted.csv b/examples/testxlsx_noinference_converted.csv
@@ -0,0 +1,6 @@
+text,date,integer,boolean,float,datetime,empty_column,h
+Chicago Reader,1971-01-01,40,True,1,1971-01-01 04:14:00,,
+Chicago Sun-Times,1948-01-01,63,True,1.27,1948-01-01 14:57:13,,Extra data beyond headers will be trimmed
+Chicago Tribune,1920-01-01,164,False,41800000.01,1920-01-01,,
+This row has blanks,,,,,,,
+Unicode! Σ,,,,,,,
diff --git a/examples/testxlsx_unicode_converted.csv b/examples/testxlsx_unicode_converted.csv
@@ -0,0 +1,2 @@
+a,b,c
+1,2,3
diff --git a/setup.py b/setup.py
@@ -4,10 +4,10 @@
 from setuptools import setup
 
 install_requires = [
-    'agate>=1.5.5',
-    'agate-excel>=0.2.0',
+    'agate>=1.6.0',
+    'agate-excel>=0.2.1',
     'agate-dbf>=0.2.0',
-    'agate-sql>=0.5.0',
+    'agate-sql>=0.5.1',
     'six>=1.6.1'
 ]
 
diff --git a/tests/test_utilities/test_csvsql.py b/tests/test_utilities/test_csvsql.py
@@ -134,15 +134,9 @@ def test_query(self):
         with stdin_as_string(input_file):
             sql = self.get_output(['--query', 'SELECT m.usda_id, avg(i.sepal_length) AS mean_sepal_length FROM iris AS i JOIN irismeta AS m ON (i.species = m.species) GROUP BY m.species', 'examples/iris.csv', 'examples/irismeta.csv'])
 
-            if six.PY2:
-                self.assertTrue('usda_id,mean_sepal_length' in sql)
-                self.assertTrue('IRSE,5.006' in sql)
-                self.assertTrue('IRVE2,5.936' in sql)
-                self.assertTrue('IRVI,6.588' in sql)
-            else:
-                self.assertTrue('usda_id,mean_sepal_length' in sql)
-                self.assertTrue('IRSE,5.005' in sql)
-                self.assertTrue('IRVE2,5.936' in sql)
-                self.assertTrue('IRVI,6.587' in sql)
+            self.assertTrue('usda_id,mean_sepal_length' in sql)
+            self.assertTrue('IRSE,5.00' in sql)
+            self.assertTrue('IRVE2,5.936' in sql)
+            self.assertTrue('IRVI,6.58' in sql)
 
         input_file.close()
diff --git a/tests/test_utilities/test_in2csv.py b/tests/test_utilities/test_in2csv.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import os
 import sys
 
 import six
@@ -170,3 +171,35 @@ def test_names_xlsx(self):
 
         self.assertEqual(next(output), 'not this one\n')
         self.assertEqual(next(output), 'data\n')
+
+    def test_convert_xls_with_write_sheets(self):
+        try:
+            self.assertConverted('xls', 'examples/sheets.xls', 'examples/testxls_converted.csv', ['--sheet', 'data', '--write-sheets', "ʤ,1"])
+            with open('examples/sheets_0.csv', 'r') as f:
+                with open('examples/testxls_unicode_converted.csv', 'r') as g:
+                    self.assertEqual(f.read(), g.read())
+            with open('examples/sheets_1.csv', 'r') as f:
+                with open('examples/testxls_converted.csv', 'r') as g:
+                    self.assertEqual(f.read(), g.read())
+            self.assertFalse(os.path.exists('examples/sheets_2.csv'))
+        finally:
+            for suffix in (0, 1):
+                path = 'examples/sheets_%d.csv' % suffix
+                if os.path.exists(path):
+                    os.remove(path)
+
+    def test_convert_xlsx_with_write_sheets(self):
+        try:
+            self.assertConverted('xlsx', 'examples/sheets.xlsx', 'examples/testxlsx_noinference_converted.csv', ['--no-inference', '--sheet', 'data', '--write-sheets', "ʤ,1"])
+            with open('examples/sheets_0.csv', 'r') as f:
+                with open('examples/testxlsx_unicode_converted.csv', 'r') as g:
+                    self.assertEqual(f.read(), g.read())
+            with open('examples/sheets_1.csv', 'r') as f:
+                with open('examples/testxlsx_noinference_converted.csv', 'r') as g:
+                    self.assertEqual(f.read(), g.read())
+            self.assertFalse(os.path.exists('examples/sheets_2.csv'))
+        finally:
+            for suffix in (0, 1):
+                path = 'examples/sheets_%d.csv' % suffix
+                if os.path.exists(path):
+                    os.remove(path)