Commit a23810f

partially rollback batch remote checks (#10796)
1 parent 0adf667

File tree

2 files changed: +43 additions, -28 deletions

dvc/repo/data.py

Lines changed: 6 additions & 28 deletions
@@ -323,7 +323,6 @@ def _get_entries_not_in_remote(
     filter_keys: Optional[Iterable["DataIndexKey"]] = None,
     granular: bool = False,
     remote_refresh: bool = False,
-    batch_size: Optional[int] = None,
 ) -> list[str]:
     """Get entries that are not in remote storage."""
     from dvc.repo.worktree import worktree_view
@@ -337,10 +336,6 @@ def _get_entries_not_in_remote(
 
     missing_entries = []
 
-    to_check: dict[FileSystem, dict[str, list[DataIndexEntry]]] = defaultdict(
-        lambda: defaultdict(list)
-    )
-
     storage_map = view.storage_map
     with TqdmCallback(size=0, desc="Checking remote", unit="entry") as cb:
         for key, entry in view.iteritems(shallow=not granular):
@@ -358,28 +353,12 @@ def _get_entries_not_in_remote(
                 continue
 
             k = (*key, "") if entry.meta and entry.meta.isdir else key
-            if remote_refresh:
-                # on remote_refresh, collect all entries to check
-                # then check them in batches below
-                try:
-                    remote_fs, remote_path = storage_map.get_remote(entry)
-                    to_check[remote_fs][remote_path].append(entry)
-                    cb.size += 1
-                    cb.relative_update(0)  # try to update the progress bar
-                except StorageKeyError:
-                    pass
-            else:
-                try:
-                    if not storage_map.remote_exists(entry, refresh=remote_refresh):
-                        missing_entries.append(os.path.sep.join(k))
-                    cb.relative_update()  # no need to update the size
-                except StorageKeyError:
-                    pass
-        missing_entries.extend(
-            _get_missing_paths(
-                to_check, batch_size=batch_size, callback=StorageCallback(cb)
-            )
-        )
+            try:
+                if not storage_map.remote_exists(entry, refresh=remote_refresh):
+                    missing_entries.append(os.path.sep.join(k))
+                cb.relative_update()  # no need to update the size
+            except StorageKeyError:
+                pass
     return missing_entries
 
 
@@ -428,7 +407,6 @@ def status(
         filter_keys=filter_keys,
        granular=granular,
         remote_refresh=remote_refresh,
-        batch_size=batch_size,
     )
 
     try:
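For orientation (not part of the commit): with the batching rolled back, _get_entries_not_in_remote checks each entry individually through storage_map.remote_exists. A minimal usage sketch of the affected code path follows; not_in_remote and granular appear in the existing tests, while the exact dict keys printed at the end are assumptions based on the assertions below.

from dvc.repo import Repo

repo = Repo(".")
# not_in_remote=True routes through _get_entries_not_in_remote(); after this
# rollback each entry is checked one at a time via
# storage_map.remote_exists(entry, refresh=remote_refresh), with no batching.
status = repo.data_status(not_in_remote=True, granular=True)
print(status)  # dict with keys such as "committed", "not_in_cache", "git" (assumed)
repo.close()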

tests/func/test_data_status.py

Lines changed: 37 additions & 0 deletions
@@ -881,3 +881,40 @@ def test_filter_targets_not_in_cache(
     assert dvc.data_status(targets, granular=True, not_in_remote=not_in_remote) == d | {
         key: granular
     }
+
+
+def test_compat_legacy_new_cache_types(M, tmp_dir, dvc, scm):
+    tmp_dir.gen({"foo": "foo", "bar": "bar"})
+    (tmp_dir / "foo.dvc").dump(
+        {
+            "outs": [
+                {"path": "foo", "md5": "acbd18db4cc2f85cedef654fccc4a4d8", "size": 3},
+            ]
+        }
+    )
+    dvc.add(tmp_dir / "bar", no_commit=True)
+
+    assert dvc.data_status() == {
+        **EMPTY_STATUS,
+        "not_in_cache": M.unordered("foo", "bar"),
+        "committed": {"added": M.unordered("foo", "bar")},
+        "git": M.dict(),
+    }
+
+    dvc.commit("foo")
+
+    assert dvc.data_status() == {
+        **EMPTY_STATUS,
+        "not_in_cache": ["bar"],
+        "committed": {"added": M.unordered("foo", "bar")},
+        "git": M.dict(),
+    }
+
+    dvc.commit("bar")
+
+    assert dvc.data_status() == {
+        **EMPTY_STATUS,
+        "not_in_cache": [],
+        "committed": {"added": M.unordered("foo", "bar")},
+        "git": M.dict(),
+    }
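The added test mixes a hand-written .dvc file for foo (md5 entry written directly) with an output added via dvc.add(..., no_commit=True) for bar, then asserts that data_status reports not_in_cache and committed consistently as each output is committed. It can be run on its own with the standard pytest invocation (assuming DVC's test fixtures, such as M and dvc, are available from the suite):

pytest tests/func/test_data_status.py::test_compat_legacy_new_cache_types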

0 commit comments
