Skip to content

Commit 66e52a2

Browse files
author
James McKinney
authored
Merge pull request wireservice#783 from wireservice/writesheets
in2csv adds a --write-sheets option to write the named Excel sheets to files
2 parents cb1b95e + 7bf2fde commit 66e52a2

File tree

11 files changed, with 104 additions and 40 deletions.

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Improvements:
1616
* :doc:`/scripts/csvsql` accepts a file name for the :code:`--query` option.
1717
* :doc:`/scripts/csvstat` adds a :code:`--freq-count` option to set the maximum number of frequent values to display.
1818
* :doc:`/scripts/in2csv` adds a :code:`--names` flag to print Excel sheet names.
19+
* :doc:`/scripts/in2csv` adds a :code:`--write-sheets` option to write the named Excel sheets to files.
1920
* :doc:`/scripts/sql2csv` adds an :code:`--encoding` option to specify the encoding of the input query file.
2021

2122
Fixes:

csvkit/cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
import codecs
66
import gzip
77
import itertools
8-
import os.path
98
import sys
9+
from os.path import splitext
1010

1111
import agate
1212
import six
@@ -213,7 +213,7 @@ def _open_input_file(self, path):
213213
else:
214214
f = sys.stdin
215215
else:
216-
(_, extension) = os.path.splitext(path)
216+
extension = splitext(path)[1]
217217

218218
if extension == '.gz':
219219
f = LazyFile(gzip.open, path, mode, **kwargs)

csvkit/utilities/csvsql.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#!/usr/bin/env python
22

3-
import os
4-
from pkg_resources import iter_entry_points
3+
import os.path
54
import sys
5+
from pkg_resources import iter_entry_points
66

77
import agate
88
import agatesql # noqa
@@ -109,7 +109,7 @@ def _failsafe_main(self):
109109
table_name = "stdin"
110110
else:
111111
# Use filename as table name
112-
table_name = os.path.splitext(os.path.split(f.name)[1])[0]
112+
table_name = os.path.splitext(os.path.basename(f.name))[0]
113113

114114
table = None
115115

csvkit/utilities/csvstack.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python
22

3-
import os
3+
import os.path
44

55
import agate
66

@@ -31,7 +31,7 @@ def main(self):
3131
self.argparser.error('You must specify at least one file to stack.')
3232

3333
if self.args.group_by_filenames:
34-
groups = [os.path.split(f.name)[1] for f in self.input_files]
34+
groups = [os.path.basename(f.name) for f in self.input_files]
3535
elif self.args.groups:
3636
groups = self.args.groups.split(',')
3737

csvkit/utilities/in2csv.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22

33
import sys
4+
from os.path import splitext
45

56
import agate
67
import agatedbf # noqa
@@ -42,11 +43,28 @@ def option_parser(bytestring):
4243
help='Display sheet names from the input Excel file.')
4344
self.argparser.add_argument('--sheet', dest='sheet', type=option_parser,
4445
help='The name of the Excel sheet to operate on.')
46+
self.argparser.add_argument('--write-sheets', dest='write_sheets', type=option_parser,
47+
help='The names of the Excel sheets to write to files, or "-" to write all sheets.')
4548
self.argparser.add_argument('-y', '--snifflimit', dest='sniff_limit', type=int,
4649
help='Limit CSV dialect sniffing to the specified number of bytes. Specify "0" to disable sniffing entirely.')
4750
self.argparser.add_argument('-I', '--no-inference', dest='no_inference', action='store_true',
4851
help='Disable type inference (and --locale, --date-format, --datetime-format) when parsing CSV input.')
4952

53+
def open_excel_input_file(self, path):
54+
if not path or path == '-':
55+
if six.PY2:
56+
return six.BytesIO(sys.stdin.read())
57+
else:
58+
return six.BytesIO(sys.stdin.buffer.read())
59+
else:
60+
return open(path, 'rb')
61+
62+
def sheet_names(self, filetype):
63+
if filetype == 'xls':
64+
return xlrd.open_workbook(file_contents=self.input_file.read()).sheet_names()
65+
elif filetype == 'xlsx':
66+
return openpyxl.load_workbook(self.input_file, read_only=True, data_only=True).sheetnames
67+
5068
def main(self):
5169
path = self.args.input_path
5270

@@ -71,25 +89,15 @@ def main(self):
7189

7290
# Set the input file.
7391
if filetype in ('xls', 'xlsx'):
74-
if not path or path == '-':
75-
if six.PY2:
76-
self.input_file = six.BytesIO(sys.stdin.read())
77-
else:
78-
self.input_file = six.BytesIO(sys.stdin.buffer.read())
79-
else:
80-
self.input_file = open(path, 'rb')
92+
self.input_file = self.open_excel_input_file(path)
8193
else:
8294
self.input_file = self._open_input_file(path)
8395

8496
if self.args.names_only:
85-
sheet_names = None
86-
if filetype == 'xls':
87-
sheet_names = xlrd.open_workbook(file_contents=self.input_file.read()).sheet_names()
88-
elif filetype == 'xlsx':
89-
sheet_names = openpyxl.load_workbook(self.input_file, read_only=True, data_only=True).sheetnames
90-
if sheet_names:
91-
for name in sheet_names:
92-
self.output_file.write('%s\n' % name)
97+
sheets = self.sheet_names(filetype)
98+
if sheets:
99+
for sheet in sheets:
100+
self.output_file.write('%s\n' % sheet)
93101
else:
94102
self.argparser.error('You cannot use the -n or --names options with non-Excel files.')
95103
self.input_file.close()
@@ -103,9 +111,6 @@ def main(self):
103111
elif filetype == 'fixed':
104112
raise ValueError('schema must not be null when format is "fixed"')
105113

106-
if self.args.sheet:
107-
kwargs['sheet'] = self.args.sheet
108-
109114
if filetype == 'csv':
110115
kwargs.update(self.reader_kwargs)
111116
kwargs['sniff_limit'] = self.args.sniff_limit
@@ -133,15 +138,36 @@ def main(self):
133138
elif filetype == 'ndjson':
134139
table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
135140
elif filetype == 'xls':
136-
table = agate.Table.from_xls(self.input_file, **kwargs)
141+
table = agate.Table.from_xls(self.input_file, sheet=self.args.sheet, **kwargs)
137142
elif filetype == 'xlsx':
138-
table = agate.Table.from_xlsx(self.input_file, **kwargs)
143+
table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
139144
elif filetype == 'dbf':
140145
if not hasattr(self.input_file, 'name'):
141146
raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
142147
table = agate.Table.from_dbf(self.input_file.name, **kwargs)
143148
table.to_csv(self.output_file)
144149

150+
if self.args.write_sheets:
151+
# Close and re-open the file, as the file object has been mutated or closed.
152+
self.input_file.close()
153+
154+
self.input_file = self.open_excel_input_file(path)
155+
156+
if self.args.write_sheets == '-':
157+
sheets = self.sheet_names(filetype)
158+
else:
159+
sheets = [int(sheet) if sheet.isdigit() else sheet for sheet in self.args.write_sheets.split(',')]
160+
161+
if filetype == 'xls':
162+
tables = agate.Table.from_xls(self.input_file, sheet=sheets, **kwargs)
163+
elif filetype == 'xlsx':
164+
tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)
165+
166+
base = splitext(self.input_file.name)[0]
167+
for i, table in enumerate(tables.values()):
168+
with open('%s_%d.csv' % (base, i), 'w') as f:
169+
table.to_csv(f)
170+
145171
self.input_file.close()
146172

147173
if self.args.schema:
(Newly added file; its path was not captured in this extract.) Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
a,b,c
2+
1.0,2.0,3.0
(Newly added file; its path was not captured in this extract.) Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
text,date,integer,boolean,float,datetime,empty_column,h
2+
Chicago Reader,1971-01-01,40,True,1,1971-01-01 04:14:00,,
3+
Chicago Sun-Times,1948-01-01,63,True,1.27,1948-01-01 14:57:13,,Extra data beyond headers will be trimmed
4+
Chicago Tribune,1920-01-01,164,False,41800000.01,1920-01-01,,
5+
This row has blanks,,,,,,,
6+
Unicode! Σ,,,,,,,
(Newly added file; its path was not captured in this extract.) Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
a,b,c
2+
1,2,3

setup.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
from setuptools import setup
55

66
install_requires = [
7-
'agate>=1.5.5',
8-
'agate-excel>=0.2.0',
7+
'agate>=1.6.0',
8+
'agate-excel>=0.2.1',
99
'agate-dbf>=0.2.0',
10-
'agate-sql>=0.5.0',
10+
'agate-sql>=0.5.1',
1111
'six>=1.6.1'
1212
]
1313

tests/test_utilities/test_csvsql.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -134,15 +134,9 @@ def test_query(self):
134134
with stdin_as_string(input_file):
135135
sql = self.get_output(['--query', 'SELECT m.usda_id, avg(i.sepal_length) AS mean_sepal_length FROM iris AS i JOIN irismeta AS m ON (i.species = m.species) GROUP BY m.species', 'examples/iris.csv', 'examples/irismeta.csv'])
136136

137-
if six.PY2:
138-
self.assertTrue('usda_id,mean_sepal_length' in sql)
139-
self.assertTrue('IRSE,5.006' in sql)
140-
self.assertTrue('IRVE2,5.936' in sql)
141-
self.assertTrue('IRVI,6.588' in sql)
142-
else:
143-
self.assertTrue('usda_id,mean_sepal_length' in sql)
144-
self.assertTrue('IRSE,5.005' in sql)
145-
self.assertTrue('IRVE2,5.936' in sql)
146-
self.assertTrue('IRVI,6.587' in sql)
137+
self.assertTrue('usda_id,mean_sepal_length' in sql)
138+
self.assertTrue('IRSE,5.00' in sql)
139+
self.assertTrue('IRVE2,5.936' in sql)
140+
self.assertTrue('IRVI,6.58' in sql)
147141

148142
input_file.close()

0 commit comments

Comments
 (0)