|
# This library performs diffs of flat file content that contains structured data
# in fields, with rows provided in a parent-child format.
#
# Parent-child data does not lend itself well to standard text diffs, as small
# changes in the organisation of the tree at an upper level (e.g. re-ordering of
# two ancestor nodes) can lead to big movements in the position of descendant
# records - particularly when the parent-child data is generated by a hierarchy
# traversal.
#
# Additionally, simple line-based diffs can identify that a line has changed,
# but not which field(s) in the line have changed.
#
# Data may be supplied in the form of CSV files, or as an array of arrays. The
# diff process provides a fine level of control over what to diff, and can
# optionally ignore certain types of changes (e.g. changes in order).
class CSVDiff

  # @return [CSVSource] CSVSource object containing details of the left/from
  #   input.
  attr_reader :left
  alias_method :from, :left

  # @return [CSVSource] CSVSource object containing details of the right/to
  #   input.
  attr_reader :right
  alias_method :to, :right

  # @return [Array<Hash>] The differences found by the most recent call to
  #   #diff.
  #   NOTE(review): #summary and the #adds/#deletes/#updates/#moves helpers
  #   iterate this collection as key/value pairs and read +value[:action]+,
  #   which suggests a Hash of diffs rather than an Array - confirm the real
  #   return type of diff_sources.
  attr_reader :diffs

  # Generates a diff between two hierarchical tree structures, provided
  # as +left+ and +right+, each of which consists of an array of lines in CSV
  # format.
  # An array of field indexes can also be specified as +key_fields+;
  # a minimum of one field index must be specified; the last index is the
  # child id, and the remaining fields (if any) are the parent field(s) that
  # uniquely qualify the child instance.
  #
  # The diff is run immediately, so #diffs is populated on return.
  #
  # @param left [Array<Array<String>>] An Array of lines, each of which is in
  #   turn an Array containing fields.
  # @param right [Array<Array<String>>] An Array of lines, each of which is in
  #   turn an Array containing fields.
  # @param options [Hash] A hash containing options. The same hash is passed
  #   on to both CSVSource objects and to #diff.
  # @option options [Array<String>] :field_names An Array of field names for
  #   each field in +left+ and +right+. If not provided, the first row is
  #   assumed to contain field names.
  # @option options [Boolean] :ignore_header If true, the first line of each
  #   file is ignored. This option can only be true if :field_names is
  #   specified.
  # @option options [Array] :ignore_fields The names of any fields to be
  #   ignored when performing the diff.
  # @raise [RuntimeError] if either source ends up with no field names.
  def initialize(left, right, options = {})
    @left = CSVSource.new(left, options)
    left_names = @left.field_names
    raise "No field names found in left (from) source" if !left_names || left_names.empty?

    @right = CSVSource.new(right, options)
    right_names = @right.field_names
    raise "No field names found in right (to) source" if !right_names || right_names.empty?

    # Must exist before get_diff_fields runs, since that method appends to it.
    @warnings = []
    @diff_fields = get_diff_fields(left_names, right_names, options.fetch(:ignore_fields, []))

    # Each entry of the left source's key_fields is used as a position in
    # @diff_fields to obtain the key field name.
    # NOTE(review): @diff_fields follows the right source's field order and
    # omits ignored fields, so if key_fields are positions in the left
    # source's own field list this lookup can select the wrong name -
    # confirm against CSVSource.
    @key_fields = @left.key_fields.map { |kf| @diff_fields[kf] }
    diff(options)
  end

  # Performs a diff with the specified +options+, replacing any previous
  # result held in #diffs.
  #
  # @param options [Hash] Options passed through to diff_sources.
  # @return The new value of #diffs.
  def diff(options = {})
    @diffs = diff_sources(@left, @right, @key_fields, @diff_fields, options)
  end

  # Returns a summary of the number of adds, deletes, moves, and updates.
  #
  # @return [Hash] A count of diffs per +:action+ value. The hash has a
  #   default block, so reading an action that never occurred yields 0 (and
  #   stores that key).
  def summary
    counts = Hash.new { |hash, action| hash[action] = 0 }
    @diffs.each { |_key, change| counts[change[:action]] += 1 }
    counts
  end

  # Defines #adds, #deletes, #updates and #moves. Each returns the entries of
  # #diffs whose +:action+, once lower-cased, equals the singular form of the
  # method name (e.g. #adds selects entries whose action is 'add').
  [:adds, :deletes, :updates, :moves].each do |plural|
    action = plural.to_s.chomp('s')
    define_method(plural) do
      @diffs.select { |_key, change| change[:action].downcase == action }
    end
  end

  # @return [Array<String>] an array of warning messages generated during the
  #   diff process: those from the left source, then the right source, then
  #   those raised by this object.
  def warnings
    @left.warnings + @right.warnings + @warnings
  end

  private

  # Given two sets of field names, determines the common set of fields present
  # in both, on which members can be diffed.
  #
  # The result keeps the order of +right_fields+. A warning is recorded in
  # @warnings for every right-hand field that is absent from +left_fields+
  # (even if it is also listed in +ignore_fields+); fields present only in
  # +left_fields+ are dropped without a warning.
  #
  # @param left_fields [Array<String>] Field names of the left (from) source.
  # @param right_fields [Array<String>] Field names of the right (to) source.
  # @param ignore_fields [Array<String>] Field names to exclude from the diff.
  # @return [Array<String>] The field names to diff.
  def get_diff_fields(left_fields, right_fields, ignore_fields)
    common, right_only = right_fields.partition { |fld| left_fields.include?(fld) }
    right_only.each do |fld|
      @warnings << "Field '#{fld}' is missing from the left (from) file, and won't be diffed"
    end
    common.reject { |fld| ignore_fields.include?(fld) }
  end

  # diff_sources (used by #diff) is not defined in this class; it is
  # presumably supplied by this mixin.
  include Algorithm

end
0 commit comments