Cache input data read during init_separators to avoid seek/rewind. #45

Merged (2 commits) on Sep 18, 2018
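
The change keeps the bytes that `init_separators` reads while auto-detecting the row separator in an in-memory `StringIO` (`@prefix_io`) and has `shift` drain that buffer before touching the real input, so the parser no longer needs `pos`/`rewind` on the underlying stream. Below is a minimal standalone sketch of that idea, assuming a simplified separator check and a hypothetical `PrefixBufferedReader` class; it is not the patched csv.rb code.

```ruby
require "stringio"

# Minimal sketch of the caching idea, not the patched csv.rb code.
# Separator detection consumes bytes from a stream that cannot seek/rewind;
# instead of seeking back, the consumed bytes are kept in a StringIO prefix
# and served before the real IO on later reads.
class PrefixBufferedReader
  attr_reader :row_sep

  def initialize(io)
    @io = io
    sample = @io.gets(nil, 1024).to_s       # bytes consumed while detecting
    @row_sep =                               # simplified detection for the sketch
      if sample.include?("\r\n") then "\r\n"
      elsif sample.include?("\r") then "\r"
      else "\n"
      end
    @prefix_io = sample.empty? ? nil : StringIO.new(sample)
  end

  # Drain the cached prefix first, then fall through to the underlying IO.
  def gets
    if @prefix_io
      line = @prefix_io.gets(@row_sep)
      if @prefix_io.eof?
        # The last cached chunk may stop mid-line; finish it from the real IO.
        line << (@io.gets(@row_sep) || "") unless line.nil? || line.end_with?(@row_sep)
        @prefix_io = nil
      end
      return line if line
    end
    @io.gets(@row_sep)
  end
end

# Usage: works for any object that responds to gets, seekable or not.
reader = PrefixBufferedReader.new(StringIO.new("a,b\r\nc,d\r\n"))
p reader.row_sep   # => "\r\n"
p reader.gets      # => "a,b\r\n"
p reader.gets      # => "c,d\r\n"
```

The key invariant, mirrored from the patch, is that when the cached prefix runs out mid-line, the rest of that line is fetched from the real IO before the buffer is discarded.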
107 changes: 49 additions & 58 deletions lib/csv.rb
@@ -912,6 +912,7 @@ def initialize(data, col_sep: ",", row_sep: :auto, quote_char: '"', field_size_l

# create the IO object we will read from
@io = data.is_a?(String) ? StringIO.new(data) : data
@prefix_io = nil # cache for input data possibly read by init_separators
@encoding = determine_encoding(encoding, internal_encoding)
#
# prepare for building safe regular expressions in the target encoding,
@@ -1204,8 +1205,14 @@ def shift

loop do
# add another read to the line
unless parse = @io.gets(@row_sep)
return nil
if @prefix_io
parse = @prefix_io.gets(@row_sep)
if @prefix_io.eof?
parse << (@io.gets(@row_sep) || "") unless parse.end_with?(@row_sep)
@prefix_io = nil # avoid having to test @prefix_io.eof? in main code path
end
else
return nil unless parse = @io.gets(@row_sep)
end

if in_extended_col
@@ -1315,7 +1322,7 @@ def shift

if in_extended_col
# if we're at eof?(), a quoted field wasn't closed...
if @io.eof?
if @io.eof? and !@prefix_io
raise MalformedCSVError.new("Unclosed quoted field",
lineno + 1)
elsif @field_size_limit and csv.last.size >= @field_size_limit
@@ -1437,68 +1444,52 @@ def init_separators(col_sep, row_sep, quote_char, force_quotes)
# (not fully encoding safe)
#
if @row_sep == :auto
if [ARGF, STDIN, STDOUT, STDERR].include?(@io) or
(defined?(Zlib) and @io.class == Zlib::GzipWriter)
@row_sep = $INPUT_RECORD_SEPARATOR
else
begin
saved_prefix = [] # sample chunks to be reprocessed later
begin
while @row_sep == :auto && @io.respond_to?(:gets)
#
# remember where we were (pos() will raise an exception if @io is pipe
# or not opened for reading)
# if we run out of data, it's probably a single line
# (ensure will set default value)
#
saved_pos = @io.pos
while @row_sep == :auto
#
# if we run out of data, it's probably a single line
# (ensure will set default value)
#
break unless sample = @io.gets(nil, 1024)

cr = encode_str("\r")
lf = encode_str("\n")
# extend sample if we're unsure of the line ending
if sample.end_with?(cr)
sample << (@io.gets(nil, 1) || "")
end
break unless sample = @io.gets(nil, 1024)

# try to find a standard separator
sample.each_char.each_cons(2) do |char, next_char|
case char
when cr
if next_char == lf
@row_sep = encode_str("\r\n")
else
@row_sep = cr
end
break
when lf
@row_sep = lf
break
end
end
cr = encode_str("\r")
lf = encode_str("\n")
# extend sample if we're unsure of the line ending
if sample.end_with?(cr)
sample << (@io.gets(nil, 1) || "")
end

# tricky seek() clone to work around GzipReader's lack of seek()
@io.rewind
# reset back to the remembered position
while saved_pos > 1024 # avoid loading a lot of data into memory
@io.read(1024)
saved_pos -= 1024
saved_prefix << sample

# try to find a standard separator
sample.each_char.each_cons(2) do |char, next_char|
case char
when cr
if next_char == lf
@row_sep = encode_str("\r\n")
else
@row_sep = cr
end
break
when lf
@row_sep = lf
break
end
end
@io.read(saved_pos) if saved_pos.nonzero?
rescue IOError # not opened for reading
# do nothing: ensure will set default
rescue NoMethodError # Zlib::GzipWriter doesn't have some IO methods
# do nothing: ensure will set default
rescue SystemCallError # pipe
# do nothing: ensure will set default
ensure
#
# set default if we failed to detect
# (stream not opened for reading, a pipe, or a single line of data)
#
@row_sep = $INPUT_RECORD_SEPARATOR if @row_sep == :auto
end
rescue IOError
# do nothing: ensure will set default
ensure
#
# set default if we failed to detect
# (stream not opened for reading or a single line of data)
#
@row_sep = $INPUT_RECORD_SEPARATOR if @row_sep == :auto

# save sampled input for later parsing (but only if there is some!)
saved_prefix = saved_prefix.join('')
@prefix_io = StringIO.new(saved_prefix) unless saved_prefix.empty?
end
end
@row_sep = @row_sep.to_s.encode(@encoding)
39 changes: 39 additions & 0 deletions test/csv/test_features.rb
@@ -375,4 +375,43 @@ def test_string_works_like_a_regexp
def test_table_nil_equality
assert_nothing_raised(NoMethodError) { CSV.parse("test", headers: true) == nil }
end

# non-seekable input stream for testing https://github.com/ruby/csv/issues/44
class DummyIO
extend Forwardable
def_delegators :@io, :gets, :read, :pos # no seek or rewind!
def initialize(data)
@io = StringIO.new(data)
end
end

def test_line_separator_autodetection_for_non_seekable_input
# simple input with LF line breaks
c = CSV.new(DummyIO.new("one,two,three\nfoo,bar,baz\n"))
assert_equal [["one", "two", "three"], ["foo", "bar", "baz"]], c.each.to_a

# simple input with CR line breaks
c = CSV.new(DummyIO.new("one,two,three\rfoo,bar,baz\r"))
assert_equal [["one", "two", "three"], ["foo", "bar", "baz"]], c.each.to_a

# simple input with CRLF line breaks
c = CSV.new(DummyIO.new("one,two,three\r\nfoo,bar,baz\r\n"))
assert_equal [["one", "two", "three"], ["foo", "bar", "baz"]], c.each.to_a

# input with lines longer than 1024 bytes
table = (1..10).map { |row| (1..200).map { |col| "row#{row}col#{col}" }.to_a }.to_a
input = table.map { |line| line.join(",") }.join("\n")
c = CSV.new(DummyIO.new(input))
assert_equal table, c.each.to_a

# same with CRLF line breaks
input = table.map { |line| line.join(",") }.join("\r\n")
c = CSV.new(DummyIO.new(input))
assert_equal table, c.each.to_a

# input with lots of CRs (to make sure no bytes are lost due to look-ahead)
c = CSV.new(DummyIO.new("foo\r" + "\r" * 9999 + "bar\r"))
assert_equal [["foo"]] + [[]] * 9999 + [["bar"]], c.each.to_a
end

end
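
For context, this is roughly the user-facing scenario the new test covers, sketched with an OS pipe instead of the `DummyIO` stub; the `cat data.csv` command and file name are placeholders, not part of the test suite.

```ruby
require "csv"

# Illustrative only: a pipe responds to gets but not to seek/rewind. Before
# this change, row-separator auto-detection relied on pos/rewind to put the
# sampled bytes back, so streams like this could lose rows or skip detection;
# with the cached prefix the sample is replayed from memory.
IO.popen(["cat", "data.csv"]) do |pipe|   # "data.csv" is a placeholder path
  CSV.new(pipe).each do |row|
    p row
  end
end
```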