
Implementation based on the Dirhash Standard #1


Merged
46 commits merged on Apr 20, 2020
Changes from 1 commit

96300a6
relatively complete draft of dirhash standard
andhus Feb 20, 2019
32f217d
add DIRSUM, change property separator to null char, general additions
andhus Feb 21, 2019
9a50a92
separate content -> data, dirhash for clarity, formalize cyclic links…
andhus Feb 22, 2019
871e18b
complete and separate DIRHASH STANDARD from README
andhus Feb 23, 2019
e26203c
move cyclic links examples to appendix
andhus Feb 23, 2019
9abb336
minor fixes
andhus Feb 23, 2019
77c0035
Use double null char as dir entry separator, fix comments by Jim-Holm…
andhus Apr 11, 2020
6b36806
add general traverse logic
andhus Feb 25, 2019
0da6b06
temp, complete general traverse logic and tentative dirhash use
andhus Mar 1, 2019
32b41d2
add recursion filters, compat module, get_included_paths and tests fo…
andhus Mar 2, 2019
4b5efb2
add include empty option, first tests for traverse
andhus Mar 3, 2019
eea58a2
complete test coverage for traverse
andhus Mar 6, 2019
445951b
add allow_cyclic_links option, fix check for empty issue, more tests
andhus Mar 9, 2019
0568d14
major refactor wip: new standard, based on general traverse logic
andhus Mar 9, 2019
dd88f77
fix empty root dir handling
andhus Mar 11, 2019
cb1079c
update dirhash tests according to new API and protocol
andhus Mar 11, 2019
30d0c2d
update cli according to new API, NOTE some todos left
andhus Mar 11, 2019
c967490
use scantree external package, remove old TODOs re ignorefile
andhus Mar 30, 2019
3491869
loosen requirements for multiproc speed-up test, due to travis overhead
andhus Mar 30, 2019
03a4170
rename filter_options -> filtering and protocol_options -> protocol
andhus Apr 10, 2020
56d9120
complete test coverage + some clean up
andhus Apr 10, 2020
7fbd165
update arg in benchmark/run.py
andhus Apr 10, 2020
1968d6a
change entry descriptor separator to repeated null char, avoid collisi…
andhus Apr 10, 2020
39ea304
change order of funcs
andhus Apr 12, 2020
b787d84
fix typo in DIRHASH_STANDARD
andhus Apr 12, 2020
49a157d
rename match_patterns -> match
andhus Apr 12, 2020
abcc8b0
update docs of dirhash
andhus Apr 12, 2020
4999e5b
update docs, rename get_included_path -> included_paths, add __all__
andhus Apr 13, 2020
4bc3c71
replace on_cyclic_link -> allow_cyclic_links
andhus Apr 13, 2020
febbedb
update docs
andhus Apr 14, 2020
6104c5a
update CLI
andhus Apr 18, 2020
5e22d5b
update README
andhus Apr 18, 2020
bd76084
fix bug in default arg
andhus Apr 18, 2020
45eaa38
rename dirsum properties
andhus Apr 18, 2020
3d23002
update reference according to separation of standard and python implemen…
andhus Apr 18, 2020
a4c267e
update repo ref and bump version in setup.py
andhus Apr 18, 2020
a1ba7c1
draft of changelog and update of benchmark/run.py
andhus Apr 19, 2020
39c4785
fixes in changelog
andhus Apr 19, 2020
9806bfb
flatten args to dirhash, add dirhash_impl for passing filter_ and pro…
andhus Apr 19, 2020
53ed900
update docs according to flat args
andhus Apr 20, 2020
1b500c1
update README
andhus Apr 20, 2020
429f0b9
minor cleanup
andhus Apr 20, 2020
e15a37a
add results from new benchmark runs
andhus Apr 20, 2020
2d821b3
add back removed tests
andhus Apr 20, 2020
1f2643b
update changelog
andhus Apr 20, 2020
aa4cd7f
rename Filter arg match -> match_patterns to reflect Dirhash Standard
andhus Apr 20, 2020
temp, complete general traverse logic and tentative dirhash use
andhus committed Apr 12, 2020
commit 0da6b06753138a4e186fdf18311d50b375d452ea
1 change: 1 addition & 0 deletions setup.py
@@ -24,6 +24,7 @@
     author_email="[email protected]",
     license='MIT',
     install_requires=[
+        'attrs>=18.0.0',
         'pathspec>=0.5.9',
         'scandir>=1.9.0;python_version<"3.5"'
     ],
280 changes: 207 additions & 73 deletions src/dirhash/traverse.py
@@ -6,37 +6,170 @@

 # Use the built-in version of scandir/walk if possible (python > 3.5),
 # otherwise use the scandir module version
+from functools import partial
+from multiprocessing.pool import Pool
+
+from dirhash import _get_hasher_factory, _get_filehash
+
 try:
     from os import scandir
     from posix import DirEntry
 except ImportError:  # pragma: no cover
     from scandir import scandir, DirEntry

+import attr
+

 def identity(x):
     return x


+def mpmap(func, iterable, jobs=1):
+    if jobs == 1:
+        return [func(element) for element in iterable]
+
+    pool = Pool(jobs)
+    try:
+        results = pool.map(func, iterable)
+    finally:
+        pool.close()
+
+    return results
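
A quick illustration of the helper above (illustrative values, not part of the diff):

    mpmap(abs, [-1, 2, -3])          # -> [1, 2, 3]; runs sequentially when jobs == 1
    mpmap(abs, [-1, 2, -3], jobs=2)  # same result, mapped over a multiprocessing Pool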


+def dirhash(
+    directory,
+    algorithm,
+    chunksize=2**20,
+    jobs=1
+):
+    hasher_factory = _get_hasher_factory(algorithm)
+    realpaths = set()
+
+    def extract_realpaths(path):
+        realpaths.add(path.real)
+        return path
+
+    root_node = traverse(directory, file_apply=extract_realpaths)
+    realpaths = list(realpaths)
+
+    # hash files in parallel
+    file_hashes = mpmap(
+        partial(_get_filehash, hasher_factory=hasher_factory, chunk_size=chunksize),
+        realpaths,
+        jobs=jobs
+    )
+    # prepare the cache with precomputed file hashes
+    realpath_to_hash = dict(zip(realpaths, file_hashes))
+
+    def fetch_filehash(path):
+        return path, realpath_to_hash[path.real]
+
+    def get_dirhash(dir_node):
+        descriptors = []
+        for path, filehash in dir_node.files:
+            descriptors.append('_'.join([path.name, filehash]))
+        for path, sub_dirhash in dir_node.directories:
+            descriptors.append('_'.join([path.name, sub_dirhash]))
+        descriptor = '\n'.join(sorted(descriptors) + [''])
+        dirhash_ = hasher_factory(descriptor.encode('utf-8')).hexdigest()
+
+        return dir_node.path, dirhash_
+
+    _, root_dirhash = root_node.apply(
+        dir_apply=get_dirhash,
+        file_apply=fetch_filehash
+    )
+    return root_dirhash
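
A minimal usage sketch of this tentative dirhash draft (hypothetical directory path; 'md5' assumed to be an algorithm accepted by _get_hasher_factory):

    # one checksum for the whole tree; each unique realpath is hashed once,
    # in parallel when jobs > 1
    checksum = dirhash('path/to/dir', 'md5', jobs=2)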


+class DirhashProtocol(object):
+
+    def __init__(self, entry_properties=('data', 'name')):
+        self.entry_properties = entry_properties
+
+    def get_descriptor(self, entry_descriptors):
+        return '\n'.join(sorted(entry_descriptors) + [''])
+
+    def get_entry_descriptor(self, entry_properties):
+        entry_strings = ['{}:{}'.format(k, v) for k, v in entry_properties]
+        return '\000'.join(sorted(entry_strings))
+
+    def get_entry_properties(self, path, entry_hash):
+        pass  # TODO
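
To illustrate the descriptor format (hypothetical property values):

    protocol = DirhashProtocol()
    # entry properties are sorted and null-separated:
    protocol.get_entry_descriptor([('name', 'a.txt'), ('data', 'deadbeef')])
    # -> 'data:deadbeef\x00name:a.txt'
    # entry descriptors are sorted and newline-joined, with a trailing newline:
    protocol.get_descriptor(['data:deadbeef\x00name:a.txt'])
    # -> 'data:deadbeef\x00name:a.txt\n'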


 def traverse(
     directory,
     recursion_filter=identity,
     file_apply=identity,
     dir_apply=identity,
-    follow_symlinks=True
+    follow_symlinks=True,
+    cache_file_apply=False,
+    jobs=1
 ):
+    if jobs is None or jobs > 1:
+        return _traverse_multiprocess(**vars())
+
     path = RecursionPath.from_root(directory)
-    dir_node = _traverse_recursive(
+
+    if cache_file_apply:
+        file_apply = _cached_by_realpath(file_apply)
+
+    root_dir_node = _traverse_recursive(
         path=path,
         filter_=recursion_filter,
         file_apply=file_apply,
         dir_apply=dir_apply,
         follow_symlinks=follow_symlinks,
         parents={path.real: path},
     )
-    result = dir_apply(dir_node)
+    result = dir_apply(root_dir_node)

     return result
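
For orientation, a sketch of how the callbacks compose (hypothetical lambdas):

    # map each file's RecursionPath to its relative path; pass DirNodes through
    tree = traverse(
        'path/to/dir',
        file_apply=lambda path: path.relative,
        dir_apply=lambda dir_node: dir_node,
    )
    # file_apply is applied to every file entry, dir_apply to every DirNode
    # bottom-up, with the root DirNode last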


+def _traverse_multiprocess(**kwargs):
+    file_apply = kwargs.pop('file_apply')
+    dir_apply = kwargs.pop('dir_apply')
+    jobs = kwargs.pop('jobs')
+
+    # assume no caching by realpath
+    file_paths = []
+
+    def extract_paths(path):
+        # hacky way to support pickling
+        # (__set/getstate__ does not work for slotted attrs classes)
+        path._dir_entry = DirEntryReplacement.from_dir_entry(path._dir_entry)
+        result_idx = len(file_paths)
+        file_paths.append(path)
+        return result_idx
+
+    root_dir_node = traverse(file_apply=extract_paths, dir_apply=identity, **kwargs)
+
+    pool = Pool(jobs)
+    try:
+        file_results = pool.map(file_apply, file_paths)
+    finally:
+        pool.close()
+
+    def fetch_result(result_idx):
+        return file_results[result_idx]
+
+    return root_dir_node.apply(dir_apply=dir_apply, file_apply=fetch_result)


+def _cached_by_realpath(file_apply):
+    cache = {}
+
+    def file_apply_cached(path):
+        if path.real not in cache:
+            cache[path.real] = file_apply(path)
+        return cache[path.real]
+
+    return file_apply_cached


 def _traverse_recursive(
     path,
     filter_,
@@ -65,28 +198,32 @@ def _traverse_recursive(
         if subpath.is_dir():
            dirs.append(dir_apply(_traverse_recursive(subpath, **fwd_kwargs)))
         if subpath.is_file():
-            files.append(file_apply(FileNode(subpath)))
+            files.append(file_apply(subpath))

     if follow_symlinks:
         del parents[path.real]

     return DirNode(path=path, directories=dirs, files=files)


-class RecursionPath(
-    namedtuple(
-        'RecursionPath',
-        ['root', 'relative', 'real', 'dir_entry']
-    )
-):
-    """Track the recursion path."""
+@attr.s(slots=True)
+class RecursionPath(object):
+    root = attr.ib()
+    relative = attr.ib()
+    real = attr.ib()
+    _dir_entry = attr.ib()
+    """Track the recursion path.
+
+    So why not use pathlib.Path:
+    - keep track of real path but only do fs check on follow link
+    - use scandir/DirEntry's caching of e.g. is_dir/is_file for speedup.
+    """
     @classmethod
     def from_root(cls, directory):
         if isinstance(directory, DirEntry):
             dir_entry = directory
         else:
-            dir_entry = DirEntryReplacement(directory)
+            dir_entry = DirEntryReplacement.from_path(directory)
         return cls(
             root=dir_entry.path,
             relative='',
@@ -95,53 +232,72 @@ def from_root(cls, directory):
         )

     def scandir(self):
-        return (self.join(dir_entry) for dir_entry in scandir(self.real))
+        return (self._join(dir_entry) for dir_entry in scandir(self.real))

-    def join(self, dir_entry):
+    def _join(self, dir_entry):
         relative = os.path.join(self.relative, dir_entry.name)
         real = os.path.join(self.real, dir_entry.name)
         if dir_entry.is_symlink():
             real = os.path.realpath(real)

-        return self._replace(
-            relative=relative,
-            real=real,
-            dir_entry=dir_entry
-        )
+        return attr.evolve(self, relative=relative, real=real, dir_entry=dir_entry)

+    @property
+    def name(self):
+        return self._dir_entry.name
+
     def is_dir(self, follow_symlinks=True):
-        return self.dir_entry.is_dir(follow_symlinks=follow_symlinks)
+        return self._dir_entry.is_dir(follow_symlinks=follow_symlinks)

     def is_file(self, follow_symlinks=True):
-        return self.dir_entry.is_file(follow_symlinks=follow_symlinks)
+        return self._dir_entry.is_file(follow_symlinks=follow_symlinks)

     def is_symlink(self):
-        return self.dir_entry.is_symlink()
+        return self._dir_entry.is_symlink()

     def stat(self, follow_symlinks=True):
-        return self.dir_entry.stat(follow_symlinks=follow_symlinks)
+        return self._dir_entry.stat(follow_symlinks=follow_symlinks)

     def inode(self):
-        return self.dir_entry.inode()
+        return self._dir_entry.inode()
+
+    # TODO below has no effect when pickling?
+    # def __getstate__(self):
+    #     return (
+    #         self.root,
+    #         self.relative,
+    #         self.real,
+    #         DirEntryReplacement.from_dir_entry(self._dir_entry)
+    #     )
+    #
+    # def __setstate__(self, state):
+    #     self.root, self.relative, self.real, self._dir_entry = state
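
A short sketch of how RecursionPath is produced and consumed (hypothetical directory):

    root = RecursionPath.from_root('path/to/dir')
    for sub in root.scandir():
        # DirEntry's caching makes repeated is_dir/is_file checks cheap
        print(sub.relative, sub.real, sub.is_dir())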


+@attr.s(slots=True)
 class DirEntryReplacement(object):
+    path = attr.ib()
+    name = attr.ib()
+    _is_dir = attr.ib(default=None)
+    _is_file = attr.ib(default=None)
+    _is_symlink = attr.ib(default=None)
+    _stat_sym = attr.ib(default=None)
+    _stat_nosym = attr.ib(default=None)

-    def __init__(self, path):
+    @classmethod
+    def from_path(cls, path):
         if not os.path.exists(path):
             raise ValueError('{} does not exist'.format(path))
-        self.path = path
         basename = os.path.basename(path)
         if basename in ['', '.', '..']:
-            self.name = os.path.basename(os.path.realpath(path))
+            name = os.path.basename(os.path.realpath(path))
         else:
-            self.name = basename
-        self._is_dir = None
-        self._is_file = None
-        self._is_symlink = None
-        self._stat_sym = None
-        self._stat_nosym = None
-        self._inode = None
+            name = basename
+        return cls(path, name)
+
+    @classmethod
+    def from_dir_entry(cls, dir_entry):
+        return cls(dir_entry.path, dir_entry.name)
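
The two constructors above in brief (hypothetical path; from_dir_entry is what _traverse_multiprocess uses to swap a real DirEntry for a picklable stand-in):

    entry = DirEntryReplacement.from_path('path/to/dir')
    print(entry.path, entry.name)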

     def is_dir(self, follow_symlinks=True):
         if self._is_dir is None:
@@ -187,35 +343,17 @@ def __call__(self, paths):
         raise NotImplementedError()


-class _DirEntryInterface(object):
-
-    # alt:
-    # def __getattr__(self, item):
-    #     if hasattr(self.path.dir_entry, item):
-    #         return getattr(self.path.dir_entry, item)
-    #     else:
-    #         raise AttributeError('')
-
-    def is_dir(self, follow_symlinks=True):
-        return self.path.dir_entry.is_dir(follow_symlinks=follow_symlinks)
-
-    def is_file(self, follow_symlinks=True):
-        return self.path.dir_entry.is_file(follow_symlinks=follow_symlinks)
-
-    def is_symlink(self):
-        return self.path.dir_entry.is_symlink()
-
-    def stat(self, follow_symlinks=True):
-        return self.path.dir_entry.stat(follow_symlinks=follow_symlinks)
-
-    def inode(self):
-        return self.path.dir_entry.inode()
+class MatchPatterns(RecursionFilter):
+
+    def __init__(self, match_patterns):
+        pass


-class DirNode(
-    namedtuple('DirNode', ['path', 'directories', 'files']),
-    _DirEntryInterface
-):
+@attr.s(slots=True, frozen=True)
+class DirNode(object):
+    path = attr.ib(validator=attr.validators.instance_of(RecursionPath))
+    directories = attr.ib(converter=tuple)
+    files = attr.ib(converter=tuple)

     @property
     def empty(self):
@@ -230,16 +368,12 @@ def apply(self, dir_apply, file_apply):
         return dir_apply(dir_node)

-class FileNode(namedtuple('FileNode', ['path']), _DirEntryInterface):
-    pass
-
+@attr.s(slots=True, frozen=True)
+class LinkedDir(object):
+    path = attr.ib(validator=attr.validators.instance_of(RecursionPath))

-class LinkedDir(namedtuple('LinkedDir', ['path']), _DirEntryInterface):
-    pass
-
-
-class CyclicLinkedDir(
-    namedtuple('CyclicLinkedDir', ['path', 'target_path']),
-    _DirEntryInterface
-):
-    pass
+@attr.s(slots=True, frozen=True)
+class CyclicLinkedDir(object):
+    path = attr.ib(validator=attr.validators.instance_of(RecursionPath))
+    target_path = attr.ib(validator=attr.validators.instance_of(RecursionPath))