Skip to content

Tracking cache requests #1566

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Adding some tests and fixing out-of-range requests
  • Loading branch information
betolink committed Feb 18, 2024
commit 1a04591585e9f632672e7db7c3caa488e0d018d2
17 changes: 14 additions & 3 deletions fsspec/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def __repr__(self) -> str:
cache type : {self.__class__.__name__}
block size : {self.blocksize}
block count : {self.nblocks}
cache size : {self.size}
file size : {self.size}
cache hits : {self.hit_count}
cache misses: {self.miss_count}
total requested bytes: {self.total_requested_bytes}
Expand Down Expand Up @@ -233,12 +233,20 @@ class FirstChunkCache(BaseCache):
name = "first"

def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
    """Initialise a cache that retains only the first chunk of a file.

    Parameters
    ----------
    blocksize:
        Number of leading bytes to keep cached; clamped to ``size`` so the
        cached chunk never extends past the end of the file (a larger value
        would simply buffer the whole file).
    fetcher:
        Callable ``fetcher(start, end) -> bytes`` performing the real read.
    size:
        Total size of the underlying file, in bytes.
    """
    # min() expresses the same clamp the original wrote as an if-statement.
    super().__init__(min(blocksize, size), fetcher, size)
    # Lazily populated with the first ``blocksize`` bytes on first use.
    self.cache: bytes | None = None

def _fetch(self, start: int | None, end: int | None) -> bytes:
start = start or 0
end = end or self.size
if start > self.size:
logger.debug("FirstChunkCache: requested start > file size")
return b""

end = min(end, self.size)

if start < self.blocksize:
if self.cache is None:
self.miss_count += 1
Expand All @@ -248,12 +256,15 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
self.cache = data[: self.blocksize]
return data[start:]
self.cache = self.fetcher(0, self.blocksize)
self.total_requested_bytes += self.blocksize
part = self.cache[start:end]
if end > self.blocksize:
self.total_requested_bytes += end - self.blocksize
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment: isn't the number of bytes returned by fetcher the more important number? If we request 100bytes, we might only get 10 back; probably the calling code will request more. OTOH, the various implementations of the request code might keep making new requests until enough bytes arrive.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above, perhaps we are missing another variable to keep track of the total_bytes_returned?

part += self.fetcher(self.blocksize, end)
self.hit_count += 1
return part
else:
self.miss_count += 1
self.total_requested_bytes += end - start
return self.fetcher(start, end)

Expand Down Expand Up @@ -370,8 +381,8 @@ def _read_cache(
start_pos = start % self.blocksize
end_pos = end % self.blocksize

self.hit_count += 1
if start_block_number == end_block_number:
self.hit_count += 1
block: bytes = self._fetch_block_cached(start_block_number)
return block[start_pos:end_pos]

Expand Down
52 changes: 44 additions & 8 deletions fsspec/tests/test_caches.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,39 @@ def test_block_cache_lru():
cache._fetch(0, 2)
assert cache.cache_info().misses == 1
assert cache.cache_info().currsize == 1
assert cache.total_requested_bytes == 4
assert cache.size == 52

# hit
cache._fetch(0, 2)
assert cache.cache_info().misses == 1
assert cache.cache_info().currsize == 1
assert cache.total_requested_bytes == 4

# hit
cache._fetch(0, 2)
assert cache.cache_info().misses == 1
assert cache.cache_info().currsize == 1
# this works as a counter since all the reads are from the cache
assert cache.hit_count == 3
assert cache.miss_count == 1
# so far only 4 bytes have been read using range requests
assert cache.total_requested_bytes == 4

# miss
cache._fetch(4, 6)
assert cache.cache_info().misses == 2
assert cache.cache_info().currsize == 2
assert cache.total_requested_bytes == 8

# miss & evict
cache._fetch(12, 13)
assert cache.cache_info().misses == 3
assert cache.cache_info().currsize == 2
assert cache.hit_count == 5
assert cache.miss_count == 3
assert cache.total_requested_bytes == 12



def _fetcher(start, end):
Expand Down Expand Up @@ -73,14 +91,32 @@ def test_cache_pickleable(Cache_imp):


def test_first_cache():
    """Exercise FirstChunkCache hit/miss and requested-byte accounting."""
    # The diff view had merged the deleted pre-change body with the added
    # post-change body; this is the post-change (committed) version.
    cache = FirstChunkCache(5, letters_fetcher, len(string.ascii_letters))
    assert cache.cache is None

    # Read entirely beyond the first block: passes straight through,
    # counted as a miss, and nothing is cached.
    assert cache._fetch(12, 15) == letters_fetcher(12, 15)
    assert cache.miss_count == 1
    assert cache.hit_count == 0
    assert cache.cache is None
    assert cache.total_requested_bytes == 3

    # Overlapping the first block populates the cache (a miss: the
    # whole first chunk plus the tail must be fetched).
    assert cache._fetch(3, 10) == letters_fetcher(3, 10)
    assert cache.miss_count == 2
    assert cache.hit_count == 0
    assert cache.total_requested_bytes == 13

    # Same read again: the first 5 bytes are served from cache, so this
    # counts as a hit; only the bytes past the block are re-requested.
    assert cache._fetch(3, 10) == letters_fetcher(3, 10)
    assert cache.miss_count == 2
    assert cache.hit_count == 1
    assert cache.total_requested_bytes == 18

    # A read fully inside the cached range requests no new bytes.
    assert cache.cache == letters_fetcher(0, 5)
    assert cache._fetch(0, 4) == letters_fetcher(0, 4)
    assert cache.hit_count == 2
    assert cache.miss_count == 2
    assert cache.total_requested_bytes == 18


@pytest.mark.parametrize(
Expand Down