Skip to content

Commit f3ac86f

Browse files
committed
Add support for detecting children moving to different parents
1 parent aa55533 commit f3ac86f

File tree

3 files changed

+140
-42
lines changed

3 files changed

+140
-42
lines changed

lib/csv-diff/algorithm.rb

Lines changed: 99 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,55 @@ class CSVDiff
33
# Implements the CSV diff algorithm.
44
module Algorithm
55

6+
# Holds the details of a single difference.
#
# Row and sibling positions are supplied as 0-based indexes and exposed
# as 1-based positions.
class Diff

  # The kind of difference; read back capitalised via self[:action].
  attr_accessor :diff_type
  # The field values recorded for this difference, looked up by name.
  attr_reader :fields
  # 1-based row number (the supplied row index plus one).
  attr_reader :row
  # 1-based sibling position: a single value, an Array of values when the
  # first and last supplied positions differ, or nil when none was given.
  attr_reader :sibling_position

  # @param diff_type the kind of difference (e.g. :add, :delete, :update,
  #   :move).
  # @param fields an object indexable by field name holding the values
  #   for this difference.
  # @param row_idx [Integer] 0-based index of the row.
  # @param pos_idx [Integer, Array<Integer, nil>] 0-based sibling index,
  #   or an array of such indexes.
  def initialize(diff_type, fields, row_idx, pos_idx)
    @diff_type = diff_type
    @fields = fields
    @row = row_idx + 1
    self.sibling_position = pos_idx
  end

  # Sets the sibling position from one or more 0-based indexes.
  #
  # An array has its nil entries ignored; if its first and last remaining
  # entries differ, every entry is converted to a 1-based position and
  # stored as an Array, otherwise the single 1-based position is stored.
  # When no index remains (nil, [] or an all-nil array) the position is
  # nil.
  #
  # @param pos_idx [Integer, Array<Integer, nil>, nil] 0-based index(es).
  def sibling_position=(pos_idx)
    # Work on a compacted copy so the caller's array is never modified.
    positions = pos_idx.is_a?(Array) ? pos_idx.compact : [pos_idx].compact
    @sibling_position =
      if positions.empty?
        nil
      elsif positions.first != positions.last
        positions.map { |pos| pos + 1 }
      else
        positions.first + 1
      end
  end

  # For backwards compatibility and access to fields with differences.
  #
  # @param key [String, Symbol] a field name, or one of :action, :row or
  #   :sibling_position.
  # @return the matching field value or attribute; nil for any other key.
  def [](key)
    case key
    when String
      @fields[key]
    when :action
      # Build a new string rather than editing the #to_s result in place:
      # for a String diff_type that result is the stored object itself,
      # and it may be frozen.
      action = diff_type.to_s
      action.empty? ? action : "#{action[0].upcase}#{action[1..-1]}"
    when :row
      @row
    when :sibling_position
      @sibling_position
    end
  end

end
53+
54+
655
# Diffs two CSVSource structures.
756
#
857
# @param left [CSVSource] A CSVSource object containing the contents of
@@ -33,74 +82,83 @@ def diff_sources(left, right, key_fields, diff_fields, options = {})
3382
right_index = right.index
3483
right_values = right.lines
3584
right_keys = right_values.keys
36-
parent_fields = left.parent_fields.length
85+
parent_field_count = left.parent_fields.length
3786

3887
include_adds = !options[:ignore_adds]
3988
include_moves = !options[:ignore_moves]
4089
include_updates = !options[:ignore_updates]
4190
include_deletes = !options[:ignore_deletes]
4291

43-
diffs = Hash.new{ |h, k| h[k] = {} }
92+
diffs = {}
93+
potential_moves = Hash.new{ |h, k| h[k] = [] }
4494

4595
# First identify deletions
4696
if include_deletes
4797
(left_keys - right_keys).each do |key|
4898
# Delete
4999
key_vals = key.split('~', -1)
50-
parent = key_vals[0...parent_fields].join('~')
100+
parent = key_vals[0...parent_field_count].join('~')
101+
child = key_vals[parent_field_count..-1].join('~')
51102
left_parent = left_index[parent]
52103
left_value = left_values[key]
53-
left_idx = left_parent.index(key)
54-
next unless left_idx
55-
id = {}
56-
id[:row] = left_keys.index(key) + 1
57-
id[:sibling_position] = left_idx + 1
58-
key_fields.each do |field_name|
59-
id[field_name] = left_value[field_name]
60-
end
61-
diffs[key].merge!(id.merge(left_values[key].merge(:action => 'Delete')))
104+
row_idx = left_keys.index(key)
105+
sib_idx = left_parent.index(key)
106+
raise "Can't locate key #{key} in parent #{parent}" unless sib_idx
107+
diffs[key] = d = Diff.new(:delete, left_value, row_idx, sib_idx)
108+
potential_moves[child] << key
62109
#puts "Delete: #{key}"
63110
end
64111
end
65112

66113
# Now identify adds/updates
67114
right_keys.each_with_index do |key, right_row_id|
68115
key_vals = key.split('~', -1)
69-
parent = key_vals[0...parent_fields].join('~')
116+
parent = key_vals[0...parent_field_count].join('~')
70117
left_parent = left_index[parent]
71118
right_parent = right_index[parent]
72119
left_value = left_values[key]
73120
right_value = right_values[key]
74121
left_idx = left_parent && left_parent.index(key)
75122
right_idx = right_parent && right_parent.index(key)
76123

77-
id = {}
78-
id[:row] = right_row_id + 1
79-
id[:sibling_position] = right_idx + 1
80-
key_fields.each do |field_name|
81-
id[field_name] = right_value[field_name]
82-
end
83124
if left_idx && right_idx
125+
if include_updates && (changes = diff_row(left_value, right_value, diff_fields, case_sensitive))
126+
id = id_fields(key_fields, right_value)
127+
diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
128+
#puts "Change: #{key}"
129+
end
84130
if include_moves
85131
left_common = left_parent & right_parent
86132
right_common = right_parent & left_parent
87133
left_pos = left_common.index(key)
88134
right_pos = right_common.index(key)
89135
if left_pos != right_pos
90136
# Move
91-
diffs[key].merge!(id.merge!(:action => 'Move',
92-
:sibling_position => [left_idx + 1, right_idx + 1]))
137+
if d = diffs[key]
138+
d.sibling_position = [left_idx, right_idx]
139+
else
140+
id = id_fields(key_fields, right_value)
141+
diffs[key] = Diff.new(:move, id, right_row_id, [left_idx, right_idx])
142+
end
93143
#puts "Move #{left_idx} -> #{right_idx}: #{key}"
94144
end
95145
end
96-
if include_updates && (changes = diff_row(left_value, right_value, diff_fields, case_sensitive))
97-
diffs[key].merge!(id.merge(changes.merge(:action => 'Update')))
98-
#puts "Change: #{key}"
99-
end
100-
elsif include_adds && right_idx
146+
elsif right_idx
101147
# Add
102-
diffs[key].merge!(id.merge(right_values[key].merge(:action => 'Add')))
103-
#puts "Add: #{key}"
148+
child = key_vals[parent_field_count..-1].join('~')
149+
if potential_moves.has_key?(child) && old_key = potential_moves[child].pop
150+
diffs.delete(old_key)
151+
if include_updates
152+
left_value = left_values[old_key]
153+
id = id_fields(right.child_fields, right_value)
154+
changes = diff_row(left_value, right_value, left.parent_fields + diff_fields, case_sensitive)
155+
diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
156+
#puts "Update Parent: #{key}"
157+
end
158+
elsif include_adds
159+
diffs[key] = Diff.new(:add, right_value, right_row_id, right_idx)
160+
#puts "Add: #{key}"
161+
end
104162
end
105163
end
106164

@@ -137,6 +195,19 @@ def diff_row(left_row, right_row, fields, case_sensitive)
137195
diffs if diffs.size > 0
138196
end
139197

198+
199+
private
200+
201+
202+
# Builds a hash holding only the identifying field values of a row.
#
# @param key_fields [Array<String>] names of the fields to copy.
# @param fields an object indexable by field name (e.g. a Hash of row
#   values).
# @return [Hash] each name in key_fields mapped to fields[name].
def id_fields(key_fields, fields)
  key_fields.each_with_object({}) do |name, subset|
    subset[name] = fields[name]
  end
end
210+
140211
end
141212

142213
end

lib/csv-diff/csv_source.rb

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,29 @@ class CSVSource
66

77
# @return [String] the path to the source file
88
attr_accessor :path
9+
910
# @return [Array<String>] The names of the fields in the source file
1011
attr_reader :field_names
1112
# @return [Array<String>] The names of the field(s) that uniquely
1213
# identify each row.
1314
attr_reader :key_fields
14-
# @return [Array<Fixnum>] The indexes of the key fields in the source
15-
# file.
16-
attr_reader :key_field_indexes
1715
# @return [Array<String>] The names of the field(s) that identify a
1816
# common parent of child records.
1917
attr_reader :parent_fields
2018
# @return [Array<String>] The names of the field(s) that distinguish a
2119
# child of a parent record.
2220
attr_reader :child_fields
21+
22+
# @return [Array<Fixnum>] The indexes of the key fields in the source
23+
# file.
24+
attr_reader :key_field_indexes
25+
# @return [Array<Fixnum>] The indexes of the parent fields in the source
26+
# file.
27+
attr_reader :parent_field_indexes
28+
# @return [Array<Fixnum>] The indexes of the child fields in the source
29+
# file.
30+
attr_reader :child_field_indexes
31+
2332
# @return [Boolean] True if the source has been indexed with case-
2433
# sensitive keys, or false if it has been indexed using upper-case key
2534
# values.
@@ -117,7 +126,9 @@ def [](key)
117126
def index_source(lines, options)
118127
@lines = {}
119128
@index = Hash.new{ |h, k| h[k] = [] }
120-
@key_field_indexes = find_field_indexes(@key_fields, @field_names) if @field_names
129+
if @field_names
130+
index_fields
131+
end
121132
@case_sensitive = options.fetch(:case_sensitive, true)
122133
@trim_whitespace = options.fetch(:trim_whitespace, false)
123134
line_num = 0
@@ -126,8 +137,7 @@ def index_source(lines, options)
126137
next if line_num == 1 && @field_names && options[:ignore_header]
127138
unless @field_names
128139
@field_names = row
129-
@key_field_indexes = find_field_indexes(@key_fields, @field_names)
130-
@key_fields = @key_field_indexes.map{ |i| @field_names[i] }
140+
index_fields
131141
next
132142
end
133143
field_vals = row
@@ -150,6 +160,16 @@ def index_source(lines, options)
150160
end
151161

152162

163+
# Resolves the key, parent and child field specifications against
# @field_names. The indexes returned by find_field_indexes are stored
# first, then each specification is replaced by the field names found
# at those indexes.
def index_fields
  @key_field_indexes = find_field_indexes(@key_fields, @field_names)
  @parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
  @child_field_indexes = find_field_indexes(@child_fields, @field_names)

  # Translate a list of indexes back into the corresponding field names.
  names_at = lambda { |positions| positions.map { |pos| @field_names[pos] } }
  @key_fields = names_at.call(@key_field_indexes)
  @parent_fields = names_at.call(@parent_field_indexes)
  @child_fields = names_at.call(@child_field_indexes)
end
171+
172+
153173
# Converts an array of field names to an array of indexes of the fields
154174
# matching those names.
155175
def find_field_indexes(key_fields, field_names)

test/test_diff.rb

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,30 +9,37 @@ class TestDiff < Test::Unit::TestCase
99
['A', 'A1', 'Account1'],
1010
['A', 'A2', 'Account 2'],
1111
['A', 'A3', 'Account 3'],
12-
['A', 'A4', 'Account 4']
12+
['A', 'A4', 'Account 4'],
13+
['A', 'A6', 'Account 6']
1314
]
1415

1516
DATA2 = [
1617
['Parent', 'Child', 'Description'],
1718
['A', 'A1', 'Account1'],
1819
['A', 'A2', 'Account2'],
1920
['A', 'a3', 'ACCOUNT 3'],
20-
['A', 'A5', 'Account 5']
21+
['A', 'A5', 'Account 5'],
22+
['B', 'A6', 'Account 6'],
23+
['C', 'A6', 'Account 6c']
2124
]
2225

2326
def test_array_diff
24-
diff = CSVDiff.new(DATA1, DATA2, key_fields: [1, 0])
25-
assert_equal(1, diff.adds.size)
27+
diff = CSVDiff.new(DATA1, DATA2, key_fields: [0, 1])
28+
#assert_equal(['Parent'], diff.left.parent_fields)
29+
#assert_equal(['Parent'], diff.right.parent_fields)
30+
#assert_equal(['Child'], diff.left.child_fields)
31+
#assert_equal(['Child'], diff.right.child_fields)
32+
assert_equal(2, diff.adds.size)
2633
assert_equal(1, diff.deletes.size)
27-
assert_equal(2, diff.updates.size)
34+
assert_equal(3, diff.updates.size)
2835
end
2936

3037

3138
def test_case_insensitive_diff
32-
diff = CSVDiff.new(DATA1, DATA2, key_fields: [1, 0], case_sensitive: false)
33-
assert_equal(1, diff.adds.size)
39+
diff = CSVDiff.new(DATA1, DATA2, key_fields: [0, 1], case_sensitive: false)
40+
assert_equal(2, diff.adds.size)
3441
assert_equal(1, diff.deletes.size)
35-
assert_equal(1, diff.updates.size)
42+
assert_equal(2, diff.updates.size)
3643
end
3744

3845

0 commit comments

Comments
 (0)