Skip to content

Fix diff patch parsing #412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 19, 2016
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Make diff patch parsing more reliable
The a_path and b_path cannot reliably be read from the first diff line
as it's ambiguous.  From the git-diff manpage:

  > The a/ and b/ filenames are the same unless rename/copy is involved.
  > Especially, **even for a creation or a deletion**, /dev/null is not
  > used in place of the a/ or b/ filenames.

This patch changes the a_path and b_path detection to read it from the
more reliable locations further down the diff headers.  Two use cases
are fixed by this:

  - As the man page snippet above states, for new/deleted files the a
    or b path will now be properly None.
  - File names with spaces in it are now properly parsed.

Working on this patch, I realized the --- and +++ lines really belong to
the diff header, not the diff contents.  This means that when parsing
the patch format, the --- and +++ will now be swallowed, and not end up
anymore as part of the diff contents.  The diff contents now always
start with an @@ line.

This may be a breaking change for some users that rely on this
behaviour.  However, those users could now access that information more
reliably via the normal Diff properties a_path and b_path now.
  • Loading branch information
nvie committed Apr 19, 2016
commit e77128e5344ce7d84302facc08d17c3151037ec3
32 changes: 22 additions & 10 deletions git/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,16 +198,18 @@ class Diff(object):
# precompiled regex
re_header = re.compile(r"""
^diff[ ]--git
[ ](?:a/)?(?P<a_path>.+?)[ ](?:b/)?(?P<b_path>.+?)\n
(?:^similarity[ ]index[ ](?P<similarity_index>\d+)%\n
^rename[ ]from[ ](?P<rename_from>\S+)\n
^rename[ ]to[ ](?P<rename_to>\S+)(?:\n|$))?
[ ](?:a/)?(?P<a_path_fallback>.+?)[ ](?:b/)?(?P<b_path_fallback>.+?)\n
(?:^similarity[ ]index[ ]\d+%\n
^rename[ ]from[ ](?P<rename_from>.*)\n
^rename[ ]to[ ](?P<rename_to>.*)(?:\n|$))?
(?:^old[ ]mode[ ](?P<old_mode>\d+)\n
^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
(?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
(?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
(?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
\.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
(?:^---[ ](?:a/)?(?P<a_path>.*)(?:\n|$))?
(?:^\+\+\+[ ](?:b/)?(?P<b_path>.*)(?:\n|$))?
""".encode('ascii'), re.VERBOSE | re.MULTILINE)
# can be used for comparisons
NULL_HEX_SHA = "0" * 40
Expand All @@ -231,15 +233,14 @@ def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode,
if self.b_mode:
self.b_mode = mode_str_to_int(self.b_mode)

if a_blob_id is None:
if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA:
self.a_blob = None
else:
assert self.a_mode is not None
self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=a_path)
if b_blob_id is None:

if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA:
self.b_blob = None
else:
assert self.b_mode is not None
self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=b_path)

self.new_file = new_file
Expand Down Expand Up @@ -329,11 +330,22 @@ def _index_from_patch_format(cls, repo, stream):
index = DiffIndex()
previous_header = None
for header in cls.re_header.finditer(text):
a_path, b_path, similarity_index, rename_from, rename_to, \
a_path_fallback, b_path_fallback, \
rename_from, rename_to, \
old_mode, new_mode, new_file_mode, deleted_file_mode, \
a_blob_id, b_blob_id, b_mode = header.groups()
a_blob_id, b_blob_id, b_mode, \
a_path, b_path = header.groups()
new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode)

a_path = a_path or rename_from or a_path_fallback
b_path = b_path or rename_to or b_path_fallback

if a_path == b'/dev/null':
a_path = None

if b_path == b'/dev/null':
b_path = None

# Our only means to find the actual text is to see what has not been matched by our regex,
# and then retro-actively assin it to our index
if previous_header is not None:
Expand Down
7 changes: 7 additions & 0 deletions git/test/fixtures/diff_file_with_spaces
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
diff --git a/file with spaces b/file with spaces
new file mode 100644
index 0000000000000000000000000000000000000000..75c620d7b0d3b0100415421a97f553c979d75174
--- /dev/null
+++ b/file with spaces
@@ -0,0 +1 @@
+ohai
2 changes: 0 additions & 2 deletions git/test/fixtures/diff_initial
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,7 @@
+=======
+CHANGES
Expand Down
11 changes: 9 additions & 2 deletions git/test/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_list_from_string_new_mode(self):
self._assert_diff_format(diffs)

assert_equal(1, len(diffs))
assert_equal(10, len(diffs[0].diff.splitlines()))
assert_equal(8, len(diffs[0].diff.splitlines()))

def test_diff_with_rename(self):
output = StringProcessAdapter(fixture('diff_rename'))
Expand Down Expand Up @@ -140,7 +140,8 @@ def test_diff_initial_commit(self):

# ...and with creating a patch
diff_index = initial_commit.diff(NULL_TREE, create_patch=True)
assert diff_index[0].b_path == 'CHANGES'
assert diff_index[0].a_path is None, repr(diff_index[0].a_path)
assert diff_index[0].b_path == 'CHANGES', repr(diff_index[0].b_path)
assert diff_index[0].new_file
assert diff_index[0].diff == fixture('diff_initial')

Expand All @@ -156,6 +157,12 @@ def test_diff_patch_format(self):
Diff._index_from_patch_format(self.rorepo, diff_proc.stdout)
# END for each fixture

def test_diff_with_spaces(self):
data = StringProcessAdapter(fixture('diff_file_with_spaces'))
diff_index = Diff._index_from_patch_format(self.rorepo, data.stdout)
assert diff_index[0].a_path is None, repr(diff_index[0].a_path)
assert diff_index[0].b_path == u'file with spaces', repr(diff_index[0].b_path)

def test_diff_interface(self):
# test a few variations of the main diff routine
assertion_map = dict()
Expand Down