WIP: HashFromNode and VerifyDepTree #959
Changes from 1 commit
* Only works with directories; no longer with files.
* Takes a single argument, that of the directory to hash.
* Hashes existence and type of all file system nodes in the directory that it finds.
* When hashing file system node names, it no longer includes the prefix matching the directory name parameter.
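For orientation, here is a minimal usage sketch of the reworked entry point. It assumes only the `DigestFromDirectory(dirname string) ([]byte, error)` signature introduced in this commit; the import path and the vendored project path are placeholders, and the expected digest is whatever the lock records.

```go
package main

import (
	"bytes"
	"fmt"
	"log"
	"path/filepath"

	"github.com/golang/dep/internal/gps/pkgtree" // assumed import path; adjust to this repository's layout
)

func main() {
	// The caller now joins the vendor root with the project's import path
	// itself, because DigestFromDirectory no longer takes a prefix argument.
	dir := filepath.Join("vendor", "github.com/alice/match") // hypothetical vendored project
	got, err := pkgtree.DigestFromDirectory(dir)
	if err != nil {
		log.Fatal(err)
	}
	want := []byte{ /* digest recorded in the lock */ }
	fmt.Println("digest matches lock:", bytes.Equal(got, want))
}
```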
@@ -7,6 +7,7 @@ package pkgtree | |
import ( | ||
"bytes" | ||
"crypto/sha256" | ||
"encoding/binary" | ||
"fmt" | ||
"hash" | ||
"io" | ||
|
@@ -26,12 +27,9 @@ const ( | |
skipSpecialNodes = os.ModeDevice | os.ModeNamedPipe | os.ModeSocket | os.ModeCharDevice | ||
) | ||
|
||
// DigestFromPathname returns a deterministic hash of the specified file system | ||
// node, performing a breadth-first traversal of directories. While the | ||
// specified prefix is joined with the pathname to walk the file system, the | ||
// prefix string is eliminated from the pathname of the nodes encounted when | ||
// hashing the pathnames, so that the resultant hash is agnostic to the absolute | ||
// root directory path of the nodes being checked. | ||
// DigestFromDirectory returns a hash of the specified directory contents, which | ||
// will match the hash computed for any directory on any supported Go platform | ||
// whose contents exactly match the specified directory. | ||
// | ||
// This function ignores any file system node named `vendor`, `.bzr`, `.git`, | ||
// `.hg`, and `.svn`, as these are typically used as Version Control System | ||
|
@@ -41,60 +39,105 @@ const ( | |
// hash includes the pathname to every discovered file system node, whether it | ||
// is an empty directory, a non-empty directory, empty file, non-empty file, or | ||
// symbolic link. If a symbolic link, the referent name is included. If a | ||
// non-empty file, the file's contents are incuded. If a non-empty directory, | ||
// non-empty file, the file's contents are included. If a non-empty directory, | ||
// the contents of the directory are included. | ||
// | ||
// While filepath.Walk could have been used, that standard library function | ||
// skips symbolic links, and for now, we want the hash to include the symbolic | ||
// link referents. | ||
func DigestFromPathname(prefix, pathname string) ([]byte, error) { | ||
func DigestFromDirectory(dirname string) ([]byte, error) { | ||
dirname = filepath.Clean(dirname) | ||
|
||
// Ensure parameter is a directory | ||
fi, err := os.Stat(dirname) | ||
if err != nil { | ||
return nil, errors.Wrap(err, "cannot Stat") | ||
} | ||
if !fi.IsDir() { | ||
return nil, errors.Errorf("cannot verify non directory: %q", dirname) | ||
} | ||
|
||
// Create a single hash instance for the entire operation, rather than a new | ||
// hash for each node we encounter. | ||
h := sha256.New() | ||
|
||
// Initialize a work queue with the os-agnostic cleaned up pathname. Note | ||
// that we use `filepath.Clean` rather than `filepath.Abs`, because the hash | ||
// has pathnames which are relative to prefix, and there is no reason to | ||
// convert to absolute pathname for every invocation of this function. | ||
prefix = filepath.Clean(prefix) + pathSeparator | ||
prefixLength := len(prefix) // store length to trim off pathnames later | ||
pathnameQueue := []string{filepath.Join(prefix, pathname)} | ||
// Initialize a work queue with the empty string, which signifies the | ||
// starting directory itself. | ||
queue := []string{""} | ||
|
||
var written int64 | ||
var relative string // pathname relative to dirname | ||
var pathname string // formed by combining dirname with relative for each node | ||
var bytesWritten int64 | ||
modeBytes := make([]byte, 4) // scratch place to store encoded os.FileMode (uint32) | ||
|
||
// As we enumerate over the queue and encounter a directory, its children | ||
// will be added to the work queue. | ||
for len(pathnameQueue) > 0 { | ||
for len(queue) > 0 { | ||
// Unshift a pathname from the queue (breadth-first traversal of | ||
// hierarchy) | ||
pathname, pathnameQueue = pathnameQueue[0], pathnameQueue[1:] | ||
relative, queue = queue[0], queue[1:] | ||
pathname = filepath.Join(dirname, relative) | ||
|
||
fi, err := os.Lstat(pathname) | ||
fi, err = os.Lstat(pathname) | ||
if err != nil { | ||
return nil, errors.Wrap(err, "cannot Lstat") | ||
} | ||
mode := fi.Mode() | ||
|
||
// Skip file system nodes we are not concerned with | ||
if mode&skipSpecialNodes != 0 { | ||
continue | ||
// We could make our own enum-like data type for encoding the file type, | ||
// but Go's runtime already gives us architecture independent file | ||
// modes, as discussed in `os/types.go`: | ||
// | ||
// Go's runtime FileMode type has same definition on all systems, so | ||
// that information about files can be moved from one system to | ||
// another portably. | ||
var mt os.FileMode | ||
|
||
// We only care about the bits that identify the type of a file system | ||
// node, and can ignore append, exclusive, temporary, setuid, setgid, | ||
// permission bits, and sticky bits, which are coincident to bits which | ||
// declare type of the file system node. | ||
modeType := fi.Mode() & os.ModeType | ||
var shouldSkip bool // skip some types of file system nodes | ||
|
||
switch { | ||
case modeType&os.ModeDir > 0: | ||
mt = os.ModeDir | ||
case modeType&os.ModeSymlink > 0: | ||
mt = os.ModeSymlink | ||
case modeType&os.ModeNamedPipe > 0: | ||
mt = os.ModeNamedPipe | ||
shouldSkip = true | ||
case modeType&os.ModeSocket > 0: | ||
mt = os.ModeSocket | ||
shouldSkip = true | ||
case modeType&os.ModeDevice > 0: | ||
mt = os.ModeDevice | ||
shouldSkip = true | ||
} | ||
|
||
// Write the prefix-stripped pathname to hash because the hash is as | ||
// much a function of the relative names of the files and directories as | ||
// it is their contents. Added benefit is that even empty directories | ||
// and symbolic links will affect final hash value. Use | ||
// `filepath.ToSlash` to ensure relative pathname is os-agnostic. | ||
writeBytesWithNull(h, []byte(filepath.ToSlash(pathname[prefixLength:]))) | ||
// Write the relative pathname to hash because the hash is a function of | ||
// the node names, node types, and node contents. Added benefit is that | ||
// empty directories, named pipes, sockets, devices, and symbolic links | ||
// will affect final hash value. Use `filepath.ToSlash` to ensure | ||
// relative pathname is os-agnostic. | ||
writeBytesWithNull(h, []byte(filepath.ToSlash(relative))) | ||
|
||
binary.LittleEndian.PutUint32(modeBytes, uint32(mt)) // encode the type of mode | ||
writeBytesWithNull(h, modeBytes) // and write to hash | ||
|
||
if shouldSkip { | ||
// There is nothing more to do for some of the node types. | ||
continue | ||
} | ||
|
||
if mode&os.ModeSymlink != 0 { | ||
referent, err := os.Readlink(pathname) | ||
if mt == os.ModeSymlink { // okay to check for equivalence because we set to this value | ||
relative, err = os.Readlink(pathname) // read the symlink referent | ||
if err != nil { | ||
return nil, errors.Wrap(err, "cannot Readlink") | ||
} | ||
// Write the os-agnostic referent to the hash and proceed to the | ||
// next pathname in the queue. | ||
writeBytesWithNull(h, []byte(filepath.ToSlash(referent))) | ||
writeBytesWithNull(h, []byte(filepath.ToSlash(relative))) // and write it to hash | ||
continue | ||
} | ||
|
||
|
@@ -105,28 +148,28 @@ func DigestFromPathname(prefix, pathname string) ([]byte, error) { | |
return nil, errors.Wrap(err, "cannot Open") | ||
} | ||
|
||
if fi.IsDir() { | ||
if mt == os.ModeDir { | ||
childrenNames, err := sortedListOfDirectoryChildrenFromFileHandle(fh) | ||
if err != nil { | ||
_ = fh.Close() // already have an error reading directory; ignore Close result. | ||
_ = fh.Close() // ignore close error because we already have an error reading directory | ||
return nil, errors.Wrap(err, "cannot get list of directory children") | ||
} | ||
for _, childName := range childrenNames { | ||
switch childName { | ||
case ".", "..", "vendor", ".bzr", ".git", ".hg", ".svn": | ||
kinda more of a "note to self," but in the analogous spot in [...]
I agree with this. If [...]
i'm fine with omitting it, though i will nitpick to note that the reason you cite here isn't a factor: if something committed within a [...] if somebody manually hacked something in the [...]
||
// skip | ||
default: | ||
pathnameQueue = append(pathnameQueue, filepath.Join(pathname, childName)) | ||
queue = append(queue, filepath.Join(relative, childName)) | ||
} | ||
} | ||
} else { | ||
written, err = io.Copy(h, newLineEndingReader(fh)) // fast copy of file contents to hash | ||
err = errors.Wrap(err, "cannot Copy") // errors.Wrap only wraps non-nil, so elide guard condition | ||
writeBytesWithNull(h, []byte(strconv.FormatInt(written, 10))) // format file size as base 10 integer | ||
bytesWritten, err = io.Copy(h, newLineEndingReader(fh)) // fast copy of file contents to hash | ||
err = errors.Wrap(err, "cannot Copy") // errors.Wrap only wraps non-nil, so skip extra check | ||
writeBytesWithNull(h, []byte(strconv.FormatInt(bytesWritten, 10))) // format file size as base 10 integer | ||
} | ||
|
||
// Close the file handle to the open directory without masking possible | ||
// previous error value. | ||
// Close the file handle to the open directory or file without masking | ||
// possible previous error value. | ||
if er := fh.Close(); err == nil { | ||
err = errors.Wrap(er, "cannot Close") | ||
} | ||
|
@@ -395,7 +438,7 @@ func VerifyDepTree(vendorRoot string, wantSums map[string][]byte) (map[string]Ve | |
if expectedSum, ok := wantSums[projectNormalized]; ok { | ||
ls := EmptyDigestInLock | ||
if len(expectedSum) > 0 { | ||
projectSum, err := DigestFromPathname(vendorRoot, projectNormalized) | ||
projectSum, err := DigestFromDirectory(filepath.Join(vendorRoot, projectNormalized)) | ||
if err != nil { | ||
return nil, errors.Wrap(err, "cannot compute dependency hash") | ||
} | ||
|
@@ -78,46 +78,25 @@ func getTestdataVerifyRoot(t *testing.T) string { | |
return filepath.Join(filepath.Dir(cwd), "testdata_digest") | ||
} | ||
|
||
func TestDigestFromPathnameWithFile(t *testing.T) { | ||
relativePathname := "github.com/alice/match/match.go" | ||
want := []byte{0x43, 0xe6, 0x3, 0x53, 0xda, 0x4, 0x18, 0xf6, 0xbd, 0x98, 0xf4, 0x6c, 0x6d, 0xb8, 0xc1, 0x8d, 0xa2, 0x78, 0x16, 0x45, 0xf7, 0xca, 0xc, 0xec, 0xcf, 0x2e, 0xa1, 0x64, 0x55, 0x69, 0xbf, 0x8f} | ||
|
||
// NOTE: Create the hash using both an absolute and a relative pathname to | ||
// ensure hash ignores prefix. | ||
|
||
t.Run("AbsolutePrefix", func(t *testing.T) { | ||
prefix := getTestdataVerifyRoot(t) | ||
got, err := DigestFromPathname(prefix, relativePathname) | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
if !bytes.Equal(got, want) { | ||
t.Errorf("\n(GOT):\n\t%#v\n(WNT):\n\t%#v", got, want) | ||
} | ||
}) | ||
|
||
t.Run("RelativePrefix", func(t *testing.T) { | ||
prefix := "../testdata_digest" | ||
got, err := DigestFromPathname(prefix, relativePathname) | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
if !bytes.Equal(got, want) { | ||
t.Errorf("\n(GOT):\n\t%#v\n(WNT):\n\t%#v", got, want) | ||
} | ||
}) | ||
func TestDigestFromDirectoryBailsUnlessDirectory(t *testing.T) { | ||
prefix := getTestdataVerifyRoot(t) | ||
relativePathname := "launchpad.net/match" | ||
_, err := DigestFromDirectory(filepath.Join(prefix, relativePathname)) | ||
if got, want := err, error(nil); got != want { | ||
t.Errorf("\n(GOT): %v; (WNT): %v", got, want) | ||
} | ||
} | ||
|
||
func TestDigestFromPathnameWithDirectory(t *testing.T) { | ||
func TestDigestFromDirectory(t *testing.T) { | ||
relativePathname := "launchpad.net/match" | ||
want := []byte{0x74, 0xe, 0x17, 0x87, 0xd4, 0x8e, 0x56, 0x2e, 0x7e, 0x32, 0x4e, 0x80, 0x3a, 0x5f, 0x3a, 0x10, 0x33, 0x43, 0x2c, 0x24, 0x8e, 0xf7, 0x1a, 0x37, 0x5e, 0x76, 0xf4, 0x6, 0x2b, 0xf3, 0xfd, 0x91} | ||
want := []byte{0x7e, 0x10, 0x6, 0x2f, 0x8, 0x3, 0x3c, 0x76, 0xae, 0xbc, 0xa4, 0xc9, 0xec, 0x73, 0x67, 0x15, 0x70, 0x2b, 0x0, 0x89, 0x27, 0xbb, 0x61, 0x9d, 0xc7, 0xc3, 0x39, 0x46, 0x3, 0x91, 0xb7, 0x3b} | ||
|
||
// NOTE: Create the hash using both an absolute and a relative pathname to | ||
// ensure hash ignores prefix. | ||
|
||
t.Run("AbsolutePrefix", func(t *testing.T) { | ||
prefix := getTestdataVerifyRoot(t) | ||
got, err := DigestFromPathname(prefix, relativePathname) | ||
got, err := DigestFromDirectory(filepath.Join(prefix, relativePathname)) | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
|
@@ -128,7 +107,7 @@ func TestDigestFromPathnameWithDirectory(t *testing.T) { | |
|
||
t.Run("RelativePrefix", func(t *testing.T) { | ||
prefix := "../testdata_digest" | ||
got, err := DigestFromPathname(prefix, relativePathname) | ||
got, err := DigestFromDirectory(filepath.Join(prefix, relativePathname)) | ||
if err != nil { | ||
t.Fatal(err) | ||
} | ||
|
@@ -142,13 +121,13 @@ func TestVerifyDepTree(t *testing.T) { | |
vendorRoot := getTestdataVerifyRoot(t) | ||
|
||
wantSums := map[string][]byte{ | ||
"github.com/alice/match": []byte{0x87, 0x4, 0x53, 0x5f, 0xb8, 0xca, 0xe9, 0x52, 0x40, 0x41, 0xcd, 0x93, 0x2e, 0x42, 0xfb, 0x14, 0x77, 0x57, 0x9c, 0x3, 0x6b, 0xe1, 0x15, 0xe7, 0xfa, 0xc1, 0xf0, 0x98, 0x4b, 0x61, 0x9f, 0x48}, | ||
"github.com/alice/match": []byte{0x7e, 0x10, 0x6, 0x2f, 0x8, 0x3, 0x3c, 0x76, 0xae, 0xbc, 0xa4, 0xc9, 0xec, 0x73, 0x67, 0x15, 0x70, 0x2b, 0x0, 0x89, 0x27, 0xbb, 0x61, 0x9d, 0xc7, 0xc3, 0x39, 0x46, 0x3, 0x91, 0xb7, 0x3b}, | ||
"github.com/alice/mismatch": []byte("some non-matching digest"), | ||
"github.com/bob/emptyDigest": nil, // empty hash result | ||
"github.com/bob/match": []byte{0x6a, 0x11, 0xf9, 0x46, 0xcc, 0xf3, 0x44, 0xdb, 0x2c, 0x6b, 0xcc, 0xb4, 0x0, 0x71, 0xe5, 0xc4, 0xee, 0x9a, 0x26, 0x71, 0x9d, 0xab, 0xe2, 0x40, 0xb7, 0xbf, 0x2a, 0xd9, 0x4, 0xcf, 0xc9, 0x46}, | ||
"github.com/bob/match": []byte{0x7e, 0x10, 0x6, 0x2f, 0x8, 0x3, 0x3c, 0x76, 0xae, 0xbc, 0xa4, 0xc9, 0xec, 0x73, 0x67, 0x15, 0x70, 0x2b, 0x0, 0x89, 0x27, 0xbb, 0x61, 0x9d, 0xc7, 0xc3, 0x39, 0x46, 0x3, 0x91, 0xb7, 0x3b}, | ||
"github.com/charlie/notInTree": nil, // not in tree result ought to superseede empty digest result | ||
// matching result at seldom-found directory level | ||
"launchpad.net/match": []byte{0x74, 0xe, 0x17, 0x87, 0xd4, 0x8e, 0x56, 0x2e, 0x7e, 0x32, 0x4e, 0x80, 0x3a, 0x5f, 0x3a, 0x10, 0x33, 0x43, 0x2c, 0x24, 0x8e, 0xf7, 0x1a, 0x37, 0x5e, 0x76, 0xf4, 0x6, 0x2b, 0xf3, 0xfd, 0x91}, | ||
"launchpad.net/match": []byte{0x7e, 0x10, 0x6, 0x2f, 0x8, 0x3, 0x3c, 0x76, 0xae, 0xbc, 0xa4, 0xc9, 0xec, 0x73, 0x67, 0x15, 0x70, 0x2b, 0x0, 0x89, 0x27, 0xbb, 0x61, 0x9d, 0xc7, 0xc3, 0x39, 0x46, 0x3, 0x91, 0xb7, 0x3b}, | ||
} | ||
|
||
status, err := VerifyDepTree(vendorRoot, wantSums) | ||
|
@@ -160,7 +139,7 @@ func TestVerifyDepTree(t *testing.T) { | |
// digest keys. | ||
if false { | ||
i assume this is here for debugging purposes? why not put it down below and have the conditional be on [...]
Great idea! Updated.
||
for k, want := range wantSums { | ||
got, err := DigestFromPathname(vendorRoot, k) | ||
got, err := DigestFromDirectory(filepath.Join(vendorRoot, k)) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
|
i meant to mention this earlier, but i thought i'd just point you at #588 - we do want to get away from filepath.Walk() in ListPackages() as well, and a) maybe there's some useful things for you to look at in there and b) maybe we can eventually think about abstracting out this walker implementation a bit so that we can reuse it both here and in ListPackages()
Which walker implementation, the one created for this issue? If so, it sounds good. I'm happy to pull this out.
Please let me know whether you foresee any constraints you would like us to code for, in order to support requirements other than the obvious ones that #121 needs.
Well, I created a DirWalk function which has the exact same semantics as filepath.Walk, plugged it into a new DigestFromDirectoryDirWalk function, and ran benchmarks comparing the original and the new version. The benchmarks are not promising, and I am not completely certain why the performance has degraded so much.
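The DirWalk implementation itself isn't part of this diff, so to make the discussion concrete: a filepath.Walk-compatible walker that reports every node to the callback via os.Lstat (so symbolic links are visited but not followed) might look roughly like the sketch below. This is an illustration of the shape being discussed, not the PR's actual code, and it only approximates filepath.Walk's SkipDir semantics.

```go
package pkgtree

import (
	"io/ioutil"
	"os"
	"path/filepath"
)

// DirWalk sketch: same callback signature as filepath.Walk, every file system
// node is reported to walkFn, and symbolic links are not followed.
func DirWalk(root string, walkFn filepath.WalkFunc) error {
	fi, err := os.Lstat(root)
	if err != nil {
		return walkFn(root, nil, err)
	}
	return dirWalk(root, fi, walkFn)
}

func dirWalk(pathname string, fi os.FileInfo, walkFn filepath.WalkFunc) error {
	if err := walkFn(pathname, fi, nil); err != nil {
		if fi.IsDir() && err == filepath.SkipDir {
			return nil // caller asked to skip this directory's contents
		}
		return err
	}
	if !fi.IsDir() {
		return nil // files, symlinks, and special nodes have no children to visit
	}
	children, err := ioutil.ReadDir(pathname) // entries come back sorted by name
	if err != nil {
		return walkFn(pathname, fi, err)
	}
	for _, child := range children {
		if err := dirWalk(filepath.Join(pathname, child.Name()), child, walkFn); err != nil {
			return err
		}
	}
	return nil
}
```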
The above benchmark was performed on one of my systems that has ~1.3 GiB total of file data. I cannot imagine the performance degradation is due to the additional overhead of calling the anonymous function, which happens to close over a few variables, four of them to be exact.

Because DirWalk is really an excellent drop-in replacement for filepath.Walk for when you need to visit every file system node, it makes it very attractive to keep DirWalk around for other portions of this project which may need it.

Just for performance comparison's sake, I also benchmarked DigestFromDirectoryDirWalk against DigestFromDirectoryWalk, which invokes filepath.Walk. (I know filepath.Walk does not provide the operational functionality we need for this application, however I wanted to determine whether I was doing something completely wrong.) Interestingly, DigestFromDirectoryDirWalk and DigestFromDirectoryWalk performed nearly identically. So perhaps the extra overhead of making the anonymous function call is causing the 1.8x slowdown for my particular data set.

On the below graph, the chunk in the middle is using DirWalk. The bizarre stepped chunk on the right is using filepath.Walk, and the very short chunk on the left is using the original function with the embedded directory walk.
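The graphs themselves aren't reproduced here, but the comparison being described could be set up with the standard testing harness roughly as follows. This is a sketch: DigestFromDirectoryDirWalk and DigestFromDirectoryWalk are the experimental variants named in this thread and may not exist under those names in the final branch; the testdata path mirrors the one the package's tests already use.

```go
package pkgtree

import "testing"

// Rough benchmark harness for the three variants discussed above; run with
// `go test -bench Digest -benchmem` (or point the path at a larger tree).
func benchDigest(b *testing.B, digest func(string) ([]byte, error)) {
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		if _, err := digest("../testdata_digest/launchpad.net/match"); err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkDigestEmbeddedWalk(b *testing.B) { benchDigest(b, DigestFromDirectory) }
func BenchmarkDigestDirWalk(b *testing.B)      { benchDigest(b, DigestFromDirectoryDirWalk) }
func BenchmarkDigestFilepathWalk(b *testing.B) { benchDigest(b, DigestFromDirectoryWalk) }
```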
Unless you do extra work (stuff them all in a struct, for example), each variable shared with the closure is allocated separately. That is so that its lifetime is independent of the others, which might be important in some cases. So each closed variable is a malloc, a gc, and a pointer.
If you want tighter code you can try putting your 4 variables in a struct and closing over the struct. That will cost one malloc and gc for all four. And you can try copying the vars into locals if they are accessed in a loop, since otherwise they must be treated like globals and cannot be kept in registers across any function calls.
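To make that suggestion concrete in the context of this function, here is a sketch of closing over a single struct. The names are invented, filepath.Walk stands in for the PR's DirWalk, and the per-node hashing is elided; the point is only that the callback captures one pointer instead of four separately allocated variables.

```go
package pkgtree

import (
	"crypto/sha256"
	"hash"
	"os"
	"path/filepath"
)

// walkState groups the values the walk callback needs so the closure captures
// a single pointer (one allocation) instead of four separately heap-allocated
// variables. Field names are invented for illustration.
type walkState struct {
	h         hash.Hash
	modeBytes []byte
	written   int64
	dirname   string
}

// digestViaWalk shows the shape of the suggested change; filepath.Walk stands
// in here for DirWalk, and the per-node hashing logic is elided.
func digestViaWalk(dirname string) ([]byte, error) {
	s := &walkState{h: sha256.New(), modeBytes: make([]byte, 4), dirname: dirname}
	err := filepath.Walk(dirname, func(pathname string, fi os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// ... the per-node hashing from DigestFromDirectory goes here, reading
		// and updating fields on s rather than four free variables ...
		_ = s.modeBytes
		return nil
	})
	if err != nil {
		return nil, err
	}
	return s.h.Sum(nil), nil
}
```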
Looking at the original function with the embedded directory walk logic, the bulk of time is spent calling io.Copy. There is no observed time spent in lineEndingReader that is not accounted for by bytes.Index or os.(*File).Read. So the original function seems mostly spent waiting on I/O.
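For readers without the rest of the file in front of them: the newLineEndingReader that io.Copy drains in the diff above wraps the opened file and, as I understand it, normalizes line endings so the digest is the same regardless of CRLF versus LF checkouts. A simplified sketch of that idea follows; it is not the PR's implementation (which evidently uses bytes.Index, per the profile), and it deliberately ignores a CR/LF pair that straddles two Read calls.

```go
package pkgtree

import "io"

// lineEndingReader sketch: wraps an io.Reader and rewrites "\r\n" to "\n" so
// the digest does not depend on the line endings of a checkout. Simplified
// illustration only; a production version must handle a CR/LF pair split
// across two Read calls.
type lineEndingReader struct {
	src io.Reader
}

func newLineEndingReader(src io.Reader) io.Reader { return &lineEndingReader{src: src} }

func (r *lineEndingReader) Read(buf []byte) (int, error) {
	n, err := r.src.Read(buf)
	out := 0
	for i := 0; i < n; i++ {
		if buf[i] == '\r' && i+1 < n && buf[i+1] == '\n' {
			continue // drop the CR of a CRLF pair; the LF is kept on the next iteration
		}
		buf[out] = buf[i]
		out++
	}
	return out, err
}
```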
After some more flamegraph spelunking, it became apparent that more wallclock time is spent during reading when using DirWalk as compared to the embedded walk. However, both functions ought to be visiting the same files. Sure enough, a test case showed they return different digests, so I'm looking into whether they are checking the same file list or not.
Sure enough, that was the correct problem. The newer method using DirWalk neglected to compare filepath.Base(...) against the list of VCS directories and the vendor directory, so the DirWalk method was evaluating a lot more data than the original.

The corrected version of DigestFromDirname using DirWalk runs on par with the original version, both in performance and in benchmem statistics. The additional levels in the middle flamegraph compared to the left-hand flamegraph are due to a higher call stack when the newer method invokes the anonymous function passed to DirWalk.
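For completeness, the check that was missing amounts to roughly the following inside the walk callback. This is a sketch using filepath.WalkFunc conventions, not the PR's exact code; it mirrors the child-name switch in DigestFromDirectory.

```go
package pkgtree

import (
	"os"
	"path/filepath"
)

// Sketch of the skip logic described above: nodes named after VCS metadata
// directories or "vendor" contribute nothing to the digest, and matching
// directories are not descended into.
func skipVendorAndVCS(pathname string, fi os.FileInfo, err error) error {
	if err != nil {
		return err
	}
	switch filepath.Base(pathname) {
	case "vendor", ".bzr", ".git", ".hg", ".svn":
		if fi.IsDir() {
			return filepath.SkipDir // prune the whole subtree
		}
		return nil // skip hashing this node
	}
	// ... hash the node as DigestFromDirectory does ...
	return nil
}
```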