diff --git a/dandi/cli/cmd_download.py b/dandi/cli/cmd_download.py index 5f67d6b89..a35daa1e7 100644 --- a/dandi/cli/cmd_download.py +++ b/dandi/cli/cmd_download.py @@ -118,6 +118,17 @@ @click.option( "--sync", is_flag=True, help="Delete local assets that do not exist on the server" ) +@click.option( + "--zarr", + "zarr_filters", + multiple=True, + metavar="FILTER", + help=( + "Filter entries within Zarr assets. Format: TYPE:PATTERN where TYPE " + "is 'glob', 'path', or 'regex'. Predefined: 'metadata'. " + "Can be specified multiple times (OR logic)." + ), +) @instance_option( default=None, help=( @@ -151,6 +162,7 @@ def download( format: DownloadFormat, download_types: set[str], sync: bool, + zarr_filters: tuple[str, ...], dandi_instance: str, path_type: PathType, preserve_tree: bool, @@ -191,6 +203,7 @@ def download( get_assets="assets" in download_types or preserve_tree, preserve_tree=preserve_tree, sync=sync, + zarr_filters=zarr_filters, path_type=path_type, # develop_debug=develop_debug ) diff --git a/dandi/cli/cmd_ls.py b/dandi/cli/cmd_ls.py index 9ac9ed42f..1969236ad 100644 --- a/dandi/cli/cmd_ls.py +++ b/dandi/cli/cmd_ls.py @@ -8,7 +8,13 @@ from .base import devel_option, lgr, map_to_click_exceptions from .formatter import JSONFormatter, JSONLinesFormatter, PYOUTFormatter, YAMLFormatter from ..consts import ZARR_EXTENSIONS, metadata_all_fields -from ..dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url +from ..dandiapi import BaseRemoteZarrAsset +from ..dandiarchive import ( + AssetZarrEntryURL, + DandisetURL, + _dandi_url_parser, + parse_dandi_url, +) from ..dandiset import Dandiset from ..misctypes import Digest from ..support.pyout import PYOUT_SHORT_NAMES, PYOUT_SHORT_NAMES_rev @@ -143,6 +149,17 @@ def assets_gen(): if metadata in ("all", "assets"): rec["metadata"] = a.get_raw_metadata() yield rec + # If URL points into a zarr, also list entries + if isinstance(parsed_url, AssetZarrEntryURL) and isinstance( + a, BaseRemoteZarrAsset + ): 
+ for entry in a.iterfiles( + prefix=parsed_url.zarr_subpath + ): + yield { + "path": f"{a.path}/{entry}", + "size": entry.size, + } else: # For now we support only individual files yield path diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 1f057dc36..4cd34b9be 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -9,7 +9,7 @@ instance_option, map_to_click_exceptions, ) -from ..upload import UploadExisting, UploadValidation +from ..upload import UploadExisting, UploadValidation, ZarrMode @click.command() @@ -48,6 +48,16 @@ default="require", show_default=True, ) +@click.option( + "--zarr-mode", + type=click.Choice(list(ZarrMode)), + default="full", + help=( + "Zarr sync mode: 'full' (default) syncs completely; " + "'patch' uploads/updates without deleting remote files." + ), + show_default=True, +) @click.argument("paths", nargs=-1) # , type=click.Path(exists=True, dir_okay=False)) # & # Development options: Set DANDI_DEVEL for them to become available @@ -75,6 +85,7 @@ def upload( dandi_instance: str, existing: UploadExisting, validation: UploadValidation, + zarr_mode: ZarrMode, # Development options should come as kwargs allow_any_path: bool = False, upload_dandiset_metadata: bool = False, @@ -115,4 +126,5 @@ def upload( jobs=jobs, jobs_per_file=jobs_per_file, sync=sync, + zarr_mode=zarr_mode, ) diff --git a/dandi/cli/tests/test_download.py b/dandi/cli/tests/test_download.py index 4f10227c5..39919a354 100644 --- a/dandi/cli/tests/test_download.py +++ b/dandi/cli/tests/test_download.py @@ -25,6 +25,7 @@ def test_download_defaults(mocker): get_assets=True, preserve_tree=False, sync=False, + zarr_filters=(), path_type=PathType.EXACT, ) @@ -44,6 +45,7 @@ def test_download_all_types(mocker): get_assets=True, preserve_tree=False, sync=False, + zarr_filters=(), path_type=PathType.EXACT, ) @@ -63,6 +65,7 @@ def test_download_metadata_only(mocker): get_assets=False, preserve_tree=False, sync=False, + zarr_filters=(), 
path_type=PathType.EXACT, ) @@ -82,6 +85,7 @@ def test_download_assets_only(mocker): get_assets=True, preserve_tree=False, sync=False, + zarr_filters=(), path_type=PathType.EXACT, ) @@ -116,6 +120,7 @@ def test_download_gui_instance_in_dandiset(mocker): get_assets=True, preserve_tree=False, sync=False, + zarr_filters=(), path_type=PathType.EXACT, ) @@ -142,6 +147,7 @@ def test_download_api_instance_in_dandiset(mocker): get_assets=True, preserve_tree=False, sync=False, + zarr_filters=(), path_type=PathType.EXACT, ) @@ -168,6 +174,7 @@ def test_download_url_instance_match(mocker): get_assets=True, preserve_tree=False, sync=False, + zarr_filters=(), path_type=PathType.EXACT, ) diff --git a/dandi/dandiarchive.py b/dandi/dandiarchive.py index 2a9472463..581fe8bf5 100644 --- a/dandi/dandiarchive.py +++ b/dandi/dandiarchive.py @@ -46,6 +46,7 @@ PUBLISHED_VERSION_REGEX, RETRY_STATUSES, VERSION_REGEX, + ZARR_EXTENSIONS, DandiInstance, known_instances, ) @@ -445,6 +446,72 @@ def get_assets( ) +def split_zarr_location(location: str) -> tuple[str, str] | None: + """Split a location into ``(asset_path, zarr_subpath)`` if it crosses a zarr boundary. + + Scans path components for segments ending with a Zarr extension + (``.zarr``, ``.ngff``). If found **and** there are remaining components + after the boundary, returns the split; otherwise returns ``None``. + + Parameters + ---------- + location : str + A POSIX-style path, e.g. ``"sub-1/file.ome.zarr/0/0/0"``. + + Returns + ------- + tuple[str, str] | None + ``(asset_path, zarr_subpath)`` when a zarr boundary with a subpath + is detected, otherwise ``None``. 
+ + Examples + -------- + >>> split_zarr_location("sub-1/file.ome.zarr/0/0/0") + ('sub-1/file.ome.zarr', '0/0/0') + >>> split_zarr_location("sub-1/file.ome.zarr") # no subpath + >>> split_zarr_location("sub-1/file.nwb") # no zarr extension + """ + parts = [p for p in location.split("/") if p] + for i, part in enumerate(parts): + if any(part.endswith(ext) for ext in ZARR_EXTENSIONS): + asset_path = "/".join(parts[: i + 1]) + zarr_subpath = "/".join(parts[i + 1 :]) + return (asset_path, zarr_subpath) if zarr_subpath else None + return None + + +@dataclass +class AssetZarrEntryURL(SingleAssetURL): + """Parsed from a URL that points into entries within a Zarr asset. + + For example, ``dandi://dandi/000108/sub-1/file.ome.zarr/0/0/0`` would + produce ``asset_path="sub-1/file.ome.zarr"`` and ``zarr_subpath="0/0/0"``. + """ + + asset_path: str # e.g., "sub-1/file.ome.zarr" + zarr_subpath: str # e.g., "0/0/0" + + def get_assets( + self, client: DandiAPIClient, order: str | None = None, strict: bool = False + ) -> Iterator[BaseRemoteAsset]: + """Yield the zarr asset whose path equals ``asset_path``. + + If the asset does not exist, a `NotFoundError` is raised when + ``strict`` is true; otherwise nothing is yielded. 
+ """ + try: + dandiset = self.get_dandiset(client, lazy=not strict) + assert dandiset is not None + dandiset.version_id # Force version evaluation + except NotFoundError: + if strict: + raise + else: + return + with _maybe_strict(strict): + yield dandiset.get_asset_by_path(self.asset_path) + + @dataclass class AssetFolderURL(MultiAssetURL): """ @@ -845,12 +912,24 @@ def parse( path=location, ) else: - parsed_url = AssetItemURL( - instance=instance, - dandiset_id=dandiset_id, - version_id=version_id, - path=location, - ) + # Check if location crosses a zarr boundary + zarr_split = split_zarr_location(location) + if zarr_split is not None: + asset_path, zarr_subpath = zarr_split + parsed_url = AssetZarrEntryURL( + instance=instance, + dandiset_id=dandiset_id, + version_id=version_id, + asset_path=asset_path, + zarr_subpath=zarr_subpath, + ) + else: + parsed_url = AssetItemURL( + instance=instance, + dandiset_id=dandiset_id, + version_id=version_id, + path=location, + ) elif asset_id: if dandiset_id is None: parsed_url = BaseAssetIDURL(instance=instance, asset_id=asset_id) diff --git a/dandi/download.py b/dandi/download.py index f1131f446..d242c8791 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -42,6 +42,7 @@ from .dandiapi import AssetType, BaseRemoteZarrAsset, RemoteDandiset from .dandiarchive import ( AssetItemURL, + AssetZarrEntryURL, DandisetURL, ParsedDandiURL, SingleAssetURL, @@ -108,6 +109,7 @@ def download( get_assets: bool = True, preserve_tree: bool = False, sync: bool = False, + zarr_filters: Sequence[str] = (), path_type: PathType = PathType.EXACT, ) -> None: # TODO: unduplicate with upload. 
For now stole from that one @@ -122,6 +124,26 @@ def download( parsed_urls = [parse_dandi_url(u, glob=path_type is PathType.GLOB) for u in urls] + # Parse zarr entry filters + from .zarr_filter import ZarrFilter, make_zarr_entry_filter, parse_zarr_filter + + zarr_entry_filter: Callable[[str], bool] | None = None + all_zf: list[ZarrFilter] = [] + if zarr_filters: + for spec in zarr_filters: + all_zf.extend(parse_zarr_filter(spec)) + + # Merge URL-derived path filters from AssetZarrEntryURL + for purl in parsed_urls: + if isinstance(purl, AssetZarrEntryURL): + all_zf.append(ZarrFilter("path", purl.zarr_subpath)) + + if all_zf: + zarr_entry_filter = make_zarr_entry_filter(all_zf) + + if sync and zarr_entry_filter is not None: + raise ValueError("--sync and --zarr cannot be used together") + # dandi.cli.formatters are used in cmd_ls to provide switchable pyout_style = pyouts.get_style(hide_if_missing=False) @@ -158,6 +180,7 @@ def download( preserve_tree=preserve_tree, jobs_per_zarr=jobs_per_zarr, on_error="yield" if format is DownloadFormat.PYOUT else "raise", + zarr_entry_filter=zarr_entry_filter, **kw, ) for purl in parsed_urls @@ -244,6 +267,7 @@ class Downloader: preserve_tree: bool jobs_per_zarr: int | None on_error: Literal["raise", "yield"] + zarr_entry_filter: Callable[[str], bool] | None = None #: which will be set .gen to assets. Purpose is to make it possible to get #: summary statistics while already downloading. TODO: reimplement #: properly! 
@@ -368,6 +392,7 @@ def download_generator(self) -> Iterator[dict]: existing=self.existing, jobs=self.jobs_per_zarr, lock=lock, + zarr_entry_filter=self.zarr_entry_filter, ) def _progress_filter(gen): @@ -990,14 +1015,15 @@ def _download_zarr( existing: DownloadExisting, lock: Lock, jobs: int | None = None, + zarr_entry_filter: Callable[[str], bool] | None = None, ) -> Iterator[dict]: # Avoid heavy import by importing within function: from .support.digests import get_zarr_checksum # we will collect them all while starting the download # with the first page of entries received from the server. - entries = [] - digests = {} + entries: list = [] + digests: dict[str, str] = {} pc = ProgressCombiner(zarr_size=asset.size) def digest_callback(path: str, algoname: str, d: str) -> None: @@ -1006,21 +1032,24 @@ def digest_callback(path: str, algoname: str, d: str) -> None: def downloads_gen(): for entry in asset.iterfiles(): + entry_path = str(entry) + if zarr_entry_filter is not None and not zarr_entry_filter(entry_path): + continue entries.append(entry) etag = entry.digest assert etag.algorithm is DigestType.md5 yield pairing( - str(entry), + entry_path, _download_file( entry.get_download_file_iter(), - download_path / str(entry), + download_path / entry_path, toplevel_path=toplevel_path, size=entry.size, mtime=entry.modified, existing=existing, digests={"md5": etag.value}, lock=lock, - digest_callback=partial(digest_callback, str(entry)), + digest_callback=partial(digest_callback, entry_path), ), ) pc.file_qty = len(entries) @@ -1040,55 +1069,68 @@ def downloads_gen(): if final_out is not None: break else: + if zarr_entry_filter is not None: + # Filter matched no entries; still report completion + yield {"status": "done"} return - yield {"status": "deleting extra files"} - remote_paths = set(map(str, entries)) - zarr_basepath = Path(download_path) - dirs = deque([zarr_basepath]) - empty_dirs: deque[Path] = deque() - while dirs: - d = dirs.popleft() - is_empty = True - for 
p in list(d.iterdir()): - if exclude_from_zarr(p): - is_empty = False - elif ( - p.is_file() - and p.relative_to(zarr_basepath).as_posix() not in remote_paths - ): - try: - p.unlink() - except OSError: + if zarr_entry_filter is not None: + # Partial download: skip deleting extra local files and skip + # whole-zarr checksum verification (individual file checksums + # are still verified during download). + lgr.debug( + "%s: Partial download — skipping extra-file deletion and" + " zarr checksum verification", + download_path, + ) + else: + yield {"status": "deleting extra files"} + remote_paths = set(map(str, entries)) + zarr_basepath = Path(download_path) + dirs = deque([zarr_basepath]) + empty_dirs: deque[Path] = deque() + while dirs: + d = dirs.popleft() + is_empty = True + for p in list(d.iterdir()): + if exclude_from_zarr(p): is_empty = False - elif p.is_dir(): - dirs.append(p) - is_empty = False + elif ( + p.is_file() + and p.relative_to(zarr_basepath).as_posix() not in remote_paths + ): + try: + p.unlink() + except OSError: + is_empty = False + elif p.is_dir(): + dirs.append(p) + is_empty = False + else: + is_empty = False + if is_empty and d != zarr_basepath: + empty_dirs.append(d) + while empty_dirs: + d = empty_dirs.popleft() + d.rmdir() + if d.parent != zarr_basepath and not any(d.parent.iterdir()): + empty_dirs.append(d.parent) + + if "skipped" not in final_out["message"]: + zarr_checksum = asset.get_digest().value + local_checksum = get_zarr_checksum(zarr_basepath, known=digests) + if zarr_checksum != local_checksum: + msg = f"Zarr checksum: downloaded {local_checksum} != {zarr_checksum}" + yield {"checksum": "differs", "status": "error", "message": msg} + lgr.debug("%s is different: %s.", zarr_basepath, msg) + return else: - is_empty = False - if is_empty and d != zarr_basepath: - empty_dirs.append(d) - while empty_dirs: - d = empty_dirs.popleft() - d.rmdir() - if d.parent != zarr_basepath and not any(d.parent.iterdir()): - empty_dirs.append(d.parent) - 
- if "skipped" not in final_out["message"]: - zarr_checksum = asset.get_digest().value - local_checksum = get_zarr_checksum(zarr_basepath, known=digests) - if zarr_checksum != local_checksum: - msg = f"Zarr checksum: downloaded {local_checksum} != {zarr_checksum}" - yield {"checksum": "differs", "status": "error", "message": msg} - lgr.debug("%s is different: %s.", zarr_basepath, msg) - return - else: - yield {"checksum": "ok"} - lgr.debug( - "Verified that %s has correct Zarr checksum %s", - zarr_basepath, - zarr_checksum, - ) + yield {"checksum": "ok"} + lgr.debug( + "Verified that %s has correct Zarr checksum %s", + zarr_basepath, + zarr_checksum, + ) yield {"status": "done"} diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index efb1efd2d..23a1b8948 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -10,10 +10,13 @@ import json import os import os.path -import urllib.parse from pathlib import Path from time import sleep -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional + +if TYPE_CHECKING: + from ..upload import ZarrMode +import urllib.parse from dandischema.models import BareAsset, DigestType from pydantic import BaseModel, ConfigDict, ValidationError @@ -557,6 +560,7 @@ def iter_upload( metadata: dict[str, Any], jobs: int | None = None, replacing: RemoteAsset | None = None, + zarr_mode: ZarrMode = "full", # type: ignore[assignment] ) -> Iterator[dict]: """ Upload the Zarr directory as an asset with the given metadata to the @@ -835,19 +839,29 @@ def mkzarr() -> str: ) lgr.debug("%s: Completing upload of batch #%d", asset_path, i) lgr.debug("%s: All files uploaded", asset_path) - old_zarr_files = list(old_zarr_entries.values()) - if old_zarr_files: - lgr.debug( - "%s: Deleting %s in remote Zarr not present locally", - asset_path, - pluralize(len(old_zarr_files), "file"), - ) - yield from _rmfiles( - asset=a, - entries=old_zarr_files, - status="deleting extra remote files", - ) - changed = True + if zarr_mode == 
"full": + old_zarr_files = list(old_zarr_entries.values()) + if old_zarr_files: + lgr.debug( + "%s: Deleting %s in remote Zarr not present locally", + asset_path, + pluralize(len(old_zarr_files), "file"), + ) + yield from _rmfiles( + asset=a, + entries=old_zarr_files, + status="deleting extra remote files", + ) + changed = True + else: + n_remote_only = len(old_zarr_entries) + if n_remote_only: + lgr.info( + "%s: Skipping deletion of %s remote-only" " %s (patch mode)", + asset_path, + n_remote_only, + "file" if n_remote_only == 1 else "files", + ) if changed: lgr.debug( "%s: Waiting for server to calculate Zarr checksum", asset_path @@ -858,20 +872,28 @@ def mkzarr() -> str: sleep(2) r = client.get(f"/zarr/{zarr_id}/") if r["status"] == "Complete": - our_checksum = str(zcc.process()) - server_checksum = r["checksum"] - if our_checksum == server_checksum: - mismatched = False - else: - mismatched = True + if zarr_mode == "patch": lgr.info( - "%s: Asset checksum mismatch (local: %s;" - " server: %s); redoing upload", + "%s: Skipping local checksum verification" + " (patch mode)", asset_path, - our_checksum, - server_checksum, ) - yield {"status": "Checksum mismatch"} + mismatched = False + else: + our_checksum = str(zcc.process()) + server_checksum = r["checksum"] + if our_checksum == server_checksum: + mismatched = False + else: + mismatched = True + lgr.info( + "%s: Asset checksum mismatch (local: %s;" + " server: %s); redoing upload", + asset_path, + our_checksum, + server_checksum, + ) + yield {"status": "Checksum mismatch"} break elif mismatched and not first_run: lgr.error( diff --git a/dandi/tests/test_dandiarchive.py b/dandi/tests/test_dandiarchive.py index e17b3f099..199e88bf7 100644 --- a/dandi/tests/test_dandiarchive.py +++ b/dandi/tests/test_dandiarchive.py @@ -12,12 +12,14 @@ AssetIDURL, AssetItemURL, AssetPathPrefixURL, + AssetZarrEntryURL, BaseAssetIDURL, DandisetURL, ParsedDandiURL, follow_redirect, multiasset_target, parse_dandi_url, + 
split_zarr_location, ) from dandi.exceptions import FailedToConnectError, NotFoundError, UnknownURLError from dandi.tests.skip import mark @@ -267,6 +269,36 @@ path="path/", ), ), + # Zarr entry URLs — location crosses a zarr boundary + ( + "dandi://dandi/000108/sub-1/file.ome.zarr/0/0/0", + AssetZarrEntryURL( + instance=known_instances["dandi"], + dandiset_id="000108", + version_id=None, + asset_path="sub-1/file.ome.zarr", + zarr_subpath="0/0/0", + ), + ), + ( + "dandi://dandi/000108@draft/sub-1/file.ngff/scale0/0/0", + AssetZarrEntryURL( + instance=known_instances["dandi"], + dandiset_id="000108", + version_id="draft", + asset_path="sub-1/file.ngff", + zarr_subpath="scale0/0/0", + ), + ), + ( # plain .zarr URL without subpath should stay AssetItemURL + "dandi://dandi/000108/sub-1/file.ome.zarr", + AssetItemURL( + instance=known_instances["dandi"], + dandiset_id="000108", + version_id=None, + path="sub-1/file.ome.zarr", + ), + ), ( "https://api.dandiarchive.org/api/dandisets/000003/versions/draft" "/assets/0a748f90-d497-4a9c-822e-9c63811db412/download/", @@ -355,6 +387,28 @@ def test_parse_api_url(url: str, parsed_url: ParsedDandiURL) -> None: assert parse_dandi_url(url) == parsed_url +@pytest.mark.ai_generated +@pytest.mark.parametrize( + "location,expected", + [ + # Crosses zarr boundary + ("sub-1/file.ome.zarr/0/0/0", ("sub-1/file.ome.zarr", "0/0/0")), + ("file.zarr/scale0/data", ("file.zarr", "scale0/data")), + ("sub-1/file.ngff/0/0", ("sub-1/file.ngff", "0/0")), + # No zarr extension + ("sub-1/file.nwb", None), + ("some/path/file.txt", None), + # Zarr without subpath — no split + ("sub-1/file.ome.zarr", None), + ("file.zarr", None), + # Deeply nested subpath + ("a/b.zarr/c/d/e/f", ("a/b.zarr", "c/d/e/f")), + ], +) +def test_split_zarr_location(location: str, expected: tuple[str, str] | None) -> None: + assert split_zarr_location(location) == expected + + @pytest.mark.parametrize( "url,parsed_url", [ diff --git a/dandi/tests/test_download.py 
b/dandi/tests/test_download.py index 5f982d43b..2a95c4432 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -1293,3 +1293,121 @@ def test__check_attempts_and_sleep_retries(status_code: int) -> None: mock_sleep.assert_called_once() # and we do not sleep really assert not mock_sleep.call_args.args[0] + + +# ---------- Partial Zarr download tests ---------- + + +@pytest.mark.ai_generated +def test_download_zarr_with_glob_filter( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + """Download only .z* metadata files from a zarr asset.""" + download( + zarr_dandiset.dandiset.version_api_url, + tmp_path, + zarr_filters=("glob:**/.z*",), + ) + zarr_dir = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" + assert zarr_dir.exists() + # All downloaded files should be metadata (start with .z) + all_files = list_paths(zarr_dir) + assert len(all_files) > 0 + for f in all_files: + assert f.name.startswith(".z"), f"Non-metadata file downloaded: {f}" + + +@pytest.mark.ai_generated +def test_download_zarr_metadata_alias( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + """Test the 'metadata' alias for --zarr.""" + download( + zarr_dandiset.dandiset.version_api_url, + tmp_path, + zarr_filters=("metadata",), + ) + zarr_dir = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" + assert zarr_dir.exists() + all_files = list_paths(zarr_dir) + assert len(all_files) > 0 + for f in all_files: + assert f.name.startswith(".z") or f.name in ( + "zarr.json", + ".zmetadata", + ), f"Non-metadata file downloaded: {f}" + + +@pytest.mark.ai_generated +def test_download_zarr_filter_no_delete( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + """Verify local files outside filter scope are NOT deleted during partial download.""" + dd = tmp_path / zarr_dandiset.dandiset_id + dd.mkdir() + zarr_dir = dd / "sample.zarr" + zarr_dir.mkdir() + # Create a pre-existing "extra" file + extra = zarr_dir / "extra_local_file.txt" + 
extra.write_text("should not be deleted") + + download( + zarr_dandiset.dandiset.version_api_url, + tmp_path, + zarr_filters=("glob:**/.z*",), + existing=DownloadExisting.OVERWRITE, + ) + # The extra file should still exist (filter mode skips deletion) + assert extra.exists(), "Local file outside filter scope was deleted" + + +@pytest.mark.ai_generated +def test_download_zarr_with_path_filter( + tmp_path: Path, new_dandiset: SampleDandiset +) -> None: + """Download only a specific subtree using path: filter.""" + zf = new_dandiset.dspath / "sample.zarr" + zf.mkdir() + (zf / "a").mkdir() + (zf / "a" / "data.bin").write_text("data-a") + (zf / "b").mkdir() + (zf / "b" / "data.bin").write_text("data-b") + new_dandiset.upload(validation="skip") + + download( + new_dandiset.dandiset.version_api_url, + tmp_path, + zarr_filters=("path:a",), + ) + zarr_dir = tmp_path / new_dandiset.dandiset_id / "sample.zarr" + assert (zarr_dir / "a" / "data.bin").exists() + assert not (zarr_dir / "b").exists(), "Files outside path filter were downloaded" + + +@pytest.mark.ai_generated +def test_download_zarr_filter_nonexistent( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + """A filter matching nothing should download nothing, without error.""" + download( + zarr_dandiset.dandiset.version_api_url, + tmp_path, + zarr_filters=("path:nonexistent/path",), + ) + zarr_dir = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" + if zarr_dir.exists(): + # Should have no actual data files + data_files = [f for f in list_paths(zarr_dir) if not f.name.startswith(".")] + assert len(data_files) == 0 + + +@pytest.mark.ai_generated +def test_download_zarr_sync_conflict() -> None: + """--sync and --zarr cannot be used together.""" + with pytest.raises(ValueError, match="--sync and --zarr cannot be used together"): + download( + "https://dandiarchive.org/dandiset/000027", + "/tmp/unused", + sync=True, + zarr_filters=("metadata",), + ) diff --git a/dandi/tests/test_upload.py 
b/dandi/tests/test_upload.py index 602e75ab9..c91c376c4 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -711,3 +711,83 @@ def test_upload_rejects_dandidownload_nwb_file(new_dandiset: SampleDandiset) -> match=f"contains {DOWNLOAD_SUFFIX} path which indicates incomplete download", ): new_dandiset.upload(allow_any_path=True) + + +# ---------- Partial Zarr upload (patch mode) tests ---------- + + +@pytest.mark.ai_generated +def test_upload_zarr_patch_mode_no_delete( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + """Upload with patch mode should NOT delete remote-only files.""" + asset = zarr_dandiset.dandiset.get_asset_by_path("sample.zarr") + assert isinstance(asset, RemoteZarrAsset) + + # Delete a local file so it becomes remote-only + local_zarr = zarr_dandiset.dspath / "sample.zarr" + # Find and remove a data file (not .zgroup etc) + data_files = [ + f for f in list_paths(local_zarr) if f.is_file() and not f.name.startswith(".") + ] + assert data_files, "Expected data files in zarr" + removed_file = data_files[0] + removed_relpath = removed_file.relative_to(local_zarr).as_posix() + removed_file.unlink() + + # Upload in patch mode — remote file should be preserved + zarr_dandiset.upload(zarr_mode="patch") + + asset = zarr_dandiset.dandiset.get_asset_by_path("sample.zarr") + assert isinstance(asset, RemoteZarrAsset) + remote_entries_after = set(str(e) for e in asset.iterfiles()) + assert ( + removed_relpath in remote_entries_after + ), "Remote-only file was deleted in patch mode" + + +@pytest.mark.ai_generated +def test_upload_zarr_full_mode_delete( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + """Upload with full mode (default) SHOULD delete remote-only files.""" + asset = zarr_dandiset.dandiset.get_asset_by_path("sample.zarr") + assert isinstance(asset, RemoteZarrAsset) + + # Delete a local file + local_zarr = zarr_dandiset.dspath / "sample.zarr" + data_files = [ + f for f in list_paths(local_zarr) if 
f.is_file() and not f.name.startswith(".") + ] + assert data_files, "Expected data files in zarr" + removed_file = data_files[0] + removed_relpath = removed_file.relative_to(local_zarr).as_posix() + removed_file.unlink() + + # Upload in full mode — remote file should be deleted + zarr_dandiset.upload(zarr_mode="full") + + asset = zarr_dandiset.dandiset.get_asset_by_path("sample.zarr") + assert isinstance(asset, RemoteZarrAsset) + remote_entries_after = set(str(e) for e in asset.iterfiles()) + assert ( + removed_relpath not in remote_entries_after + ), "Remote-only file was NOT deleted in full mode" + + +@pytest.mark.ai_generated +def test_upload_zarr_patch_mode_updates_changed_file( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + """Patch mode should still upload new/modified files.""" + local_zarr = zarr_dandiset.dspath / "sample.zarr" + # Add a new file + new_file = local_zarr / "new_entry.txt" + new_file.write_text("new content") + + zarr_dandiset.upload(zarr_mode="patch", validation="skip") + + asset = zarr_dandiset.dandiset.get_asset_by_path("sample.zarr") + assert isinstance(asset, RemoteZarrAsset) + remote_entries = set(str(e) for e in asset.iterfiles()) + assert "new_entry.txt" in remote_entries, "New file was not uploaded in patch mode" diff --git a/dandi/tests/test_zarr_filter.py b/dandi/tests/test_zarr_filter.py new file mode 100644 index 000000000..5475bb354 --- /dev/null +++ b/dandi/tests/test_zarr_filter.py @@ -0,0 +1,228 @@ +"""Tests for dandi.zarr_filter module.""" + +from __future__ import annotations + +import pytest + +from dandi.zarr_filter import ( + ZARR_FILTER_ALIASES, + ZarrFilter, + make_zarr_entry_filter, + parse_zarr_filter, +) + +# Sample paths representing entries inside a Zarr asset +SAMPLE_PATHS = [ + ".zgroup", + ".zattrs", + ".zmetadata", + "zarr.json", + "0/.zarray", + "0/.zattrs", + "0/0/0", + "0/0/1", + "0/1/0", + "1/.zarray", + "1/0/0", + "1/0/1", + "scale0/.zarray", + "scale0/0/0/0", + "scale0/0/0/1", +] + + 
+class TestZarrFilterGlob: + @pytest.mark.ai_generated + @pytest.mark.parametrize( + "pattern,path,expected", + [ + # ** matches across directories + ("**/.z*", ".zgroup", True), + ("**/.z*", ".zattrs", True), + ("**/.z*", "0/.zarray", True), + ("**/.z*", "0/.zattrs", True), + ("**/.z*", "0/0/0", False), + ("**/.z*", "zarr.json", False), + # ** with specific filename + ("**/zarr.json", "zarr.json", True), + ("**/zarr.json", "sub/zarr.json", True), + ("**/zarr.json", ".zgroup", False), + # ** matches .zmetadata + ("**/.zmetadata", ".zmetadata", True), + ("**/.zmetadata", "sub/.zmetadata", True), + ("**/.zmetadata", ".zgroup", False), + # * within a single component + ("0/*", "0/.zarray", True), + ("0/*", "0/.zattrs", True), + ("0/*", "0/0/0", False), # * does not cross / + ("0/*", "1/.zarray", False), + # Exact component match + ("0/0/0", "0/0/0", True), + ("0/0/0", "0/0/1", False), + # Pattern with * in middle + ("scale*/.zarray", "scale0/.zarray", True), + ("scale*/.zarray", "scale1/.zarray", True), + ("scale*/.zarray", "0/.zarray", False), + ], + ) + def test_glob_match(self, pattern: str, path: str, expected: bool) -> None: + f = ZarrFilter("glob", pattern) + assert f.matches(path) is expected + + +class TestZarrFilterPath: + @pytest.mark.ai_generated + @pytest.mark.parametrize( + "pattern,path,expected", + [ + # Exact match + ("0/0/0", "0/0/0", True), + # Prefix match + ("0", "0/.zarray", True), + ("0", "0/0/0", True), + ("0/0", "0/0/0", True), + ("0/0", "0/0/1", True), + # Not a prefix + ("0/0", "0/1/0", False), + ("1", "0/0/0", False), + # Trailing slash in pattern + ("0/", "0/.zarray", True), + ("0/", "0/0/0", True), + # No false prefix match (0 should not match 01) + ("0", "01/data", False), + ], + ) + def test_path_match(self, pattern: str, path: str, expected: bool) -> None: + f = ZarrFilter("path", pattern) + assert f.matches(path) is expected + + +class TestZarrFilterRegex: + @pytest.mark.ai_generated + @pytest.mark.parametrize( + 
"pattern,path,expected", + [ + (r"\.z(array|group|attrs)$", ".zgroup", True), + (r"\.z(array|group|attrs)$", "0/.zarray", True), + (r"\.z(array|group|attrs)$", "0/0/0", False), + (r"^0/0/", "0/0/0", True), + (r"^0/0/", "0/0/1", True), + (r"^0/0/", "0/1/0", False), + (r"zarr\.json$", "zarr.json", True), + (r"zarr\.json$", "sub/zarr.json", True), + ], + ) + def test_regex_match(self, pattern: str, path: str, expected: bool) -> None: + f = ZarrFilter("regex", pattern) + assert f.matches(path) is expected + + +class TestParseZarrFilter: + @pytest.mark.ai_generated + def test_parse_glob(self) -> None: + filters = parse_zarr_filter("glob:**/.z*") + assert len(filters) == 1 + assert filters[0].filter_type == "glob" + assert filters[0].pattern == "**/.z*" + + @pytest.mark.ai_generated + def test_parse_path(self) -> None: + filters = parse_zarr_filter("path:0/0") + assert len(filters) == 1 + assert filters[0].filter_type == "path" + assert filters[0].pattern == "0/0" + + @pytest.mark.ai_generated + def test_parse_regex(self) -> None: + filters = parse_zarr_filter(r"regex:\.zarray$") + assert len(filters) == 1 + assert filters[0].filter_type == "regex" + assert filters[0].pattern == r"\.zarray$" + + @pytest.mark.ai_generated + def test_parse_alias_metadata(self) -> None: + filters = parse_zarr_filter("metadata") + assert len(filters) == len(ZARR_FILTER_ALIASES["metadata"]) + # Should be separate objects (not the alias list itself) + assert filters is not ZARR_FILTER_ALIASES["metadata"] + for f, expected in zip(filters, ZARR_FILTER_ALIASES["metadata"]): + assert f.filter_type == expected.filter_type + assert f.pattern == expected.pattern + + @pytest.mark.ai_generated + def test_parse_invalid_no_colon(self) -> None: + with pytest.raises(ValueError, match="Invalid zarr filter"): + parse_zarr_filter("notanalias") + + @pytest.mark.ai_generated + def test_parse_invalid_type(self) -> None: + with pytest.raises(ValueError, match="Unknown zarr filter type"): + 
parse_zarr_filter("badtype:pattern") + + @pytest.mark.ai_generated + def test_parse_pattern_with_colon(self) -> None: + """Pattern containing a colon should keep everything after the first colon.""" + filters = parse_zarr_filter("regex:a:b") + assert filters[0].pattern == "a:b" + + @pytest.mark.ai_generated + def test_parse_invalid_regex(self) -> None: + """Invalid regex patterns should raise ValueError at construction time.""" + with pytest.raises(ValueError, match="Invalid regex pattern"): + parse_zarr_filter("regex:[invalid") + + @pytest.mark.ai_generated + def test_invalid_regex_direct(self) -> None: + """Direct ZarrFilter construction with bad regex should also fail.""" + with pytest.raises(ValueError, match="Invalid regex pattern"): + ZarrFilter("regex", "(unclosed") + + +class TestMakeZarrEntryFilter: + @pytest.mark.ai_generated + def test_or_composition(self) -> None: + """Multiple filters should combine with OR semantics.""" + filters = [ + ZarrFilter("glob", "**/.z*"), + ZarrFilter("path", "0/0"), + ] + predicate = make_zarr_entry_filter(filters) + # Matches glob + assert predicate(".zgroup") is True + assert predicate("1/.zarray") is True + # Matches path prefix + assert predicate("0/0/0") is True + assert predicate("0/0/1") is True + # Matches neither + assert predicate("1/0/0") is False + + @pytest.mark.ai_generated + def test_metadata_alias_matches_all_metadata(self) -> None: + """The 'metadata' alias should match typical Zarr metadata files.""" + filters = parse_zarr_filter("metadata") + predicate = make_zarr_entry_filter(filters) + # Zarr v2 metadata + assert predicate(".zgroup") is True + assert predicate(".zattrs") is True + assert predicate(".zmetadata") is True + assert predicate("0/.zarray") is True + assert predicate("0/.zattrs") is True + # Zarr v3 metadata + assert predicate("zarr.json") is True + assert predicate("sub/zarr.json") is True + # Data chunks should NOT match + assert predicate("0/0/0") is False + assert predicate("0/0/1") is 
False + + @pytest.mark.ai_generated + def test_single_filter(self) -> None: + predicate = make_zarr_entry_filter([ZarrFilter("path", "scale0")]) + assert predicate("scale0/0/0/0") is True + assert predicate("scale0/.zarray") is True + assert predicate("0/0/0") is False + + @pytest.mark.ai_generated + def test_empty_filter_list(self) -> None: + """Empty filter list should match nothing.""" + predicate = make_zarr_entry_filter([]) + assert predicate("anything") is False diff --git a/dandi/upload.py b/dandi/upload.py index 1694aee23..dfddfb8ee 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -101,6 +101,14 @@ def __str__(self) -> str: return self.value +class ZarrMode(str, Enum): + FULL = "full" + PATCH = "patch" + + def __str__(self) -> str: + return self.value + + def upload( paths: Sequence[str | Path] | None = None, existing: UploadExisting = UploadExisting.REFRESH, @@ -112,6 +120,7 @@ def upload( jobs: int | None = None, jobs_per_file: int | None = None, sync: bool = False, + zarr_mode: ZarrMode = ZarrMode.FULL, ) -> None: if paths: paths = [Path(p).absolute() for p in paths] @@ -394,8 +403,15 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: # yield {"status": "uploading"} validating = False + upload_kwargs: dict = {} + if isinstance(dfile, ZarrAsset): + upload_kwargs["zarr_mode"] = zarr_mode for r in dfile.iter_upload( - remote_dandiset, metadata, jobs=jobs_per_file, replacing=extant + remote_dandiset, + metadata, + jobs=jobs_per_file, + replacing=extant, + **upload_kwargs, ): r.pop("asset", None) # to keep pyout from choking if r["status"] == "uploading": diff --git a/dandi/zarr_filter.py b/dandi/zarr_filter.py new file mode 100644 index 000000000..3aa51cc43 --- /dev/null +++ b/dandi/zarr_filter.py @@ -0,0 +1,167 @@ +""" +Filtering of entries within Zarr assets for partial download. + +Provides filter parsing, matching, and predefined aliases for selecting +subsets of entries within Zarr assets (e.g., only metadata files). 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from fnmatch import fnmatchcase +import re +from typing import Callable, Literal + + +@dataclass +class ZarrFilter: + """Filter for selecting entries within a Zarr asset.""" + + filter_type: Literal["glob", "path", "regex"] + pattern: str + _compiled_regex: re.Pattern[str] | None = field( + init=False, default=None, repr=False, compare=False + ) + + def __post_init__(self) -> None: + if self.filter_type == "regex": + try: + self._compiled_regex = re.compile(self.pattern) + except re.error as e: + raise ValueError(f"Invalid regex pattern {self.pattern!r}: {e}") from e + + def matches(self, entry_path: str) -> bool: + """Test if a zarr-internal path matches this filter. + + Parameters + ---------- + entry_path : str + A zarr-internal path like ``"0/0/0/.zarray"`` (no leading slash). + + Returns + ------- + bool + True if the path matches, False otherwise. + """ + if self.filter_type == "glob": + return _glob_match(self.pattern, entry_path) + elif self.filter_type == "path": + # Prefix match: the entry path equals the pattern or is under it + return entry_path == self.pattern or entry_path.startswith( + self.pattern.rstrip("/") + "/" + ) + elif self.filter_type == "regex": + assert self._compiled_regex is not None + return self._compiled_regex.search(entry_path) is not None + else: + raise ValueError(f"Unknown filter type: {self.filter_type!r}") + + +def _glob_match(pattern: str, path: str) -> bool: + """Match a glob pattern against a zarr-internal path. + + Supports ``*`` (within a single component) and ``**`` (across directories), + consistent with ``fnmatchcase`` semantics used elsewhere in the codebase + (``BasePath.match()``, ``RemoteZarrEntry.match()``). 
+ """ + patparts = [p for p in pattern.split("/") if p] + pathparts = [p for p in path.split("/") if p] + # Collapse consecutive ** into a single ** to avoid exponential backtracking + collapsed: list[str] = [] + for p in patparts: + if p == "**" and collapsed and collapsed[-1] == "**": + continue + collapsed.append(p) + return _glob_match_parts(collapsed, pathparts) + + +def _glob_match_parts(patparts: list[str], pathparts: list[str]) -> bool: + """Recursively match pattern parts against path parts.""" + pi = 0 # pattern index + si = 0 # path (string) index + while pi < len(patparts) and si < len(pathparts): + if patparts[pi] == "**": + # ** matches zero or more path components + # Try matching the rest of the pattern against every suffix + for k in range(si, len(pathparts) + 1): + if _glob_match_parts(patparts[pi + 1 :], pathparts[k:]): + return True + return False + elif fnmatchcase(pathparts[si], patparts[pi]): + pi += 1 + si += 1 + else: + return False + # Handle trailing ** which can match zero components + while pi < len(patparts) and patparts[pi] == "**": + pi += 1 + return pi == len(patparts) and si == len(pathparts) + + +ZARR_FILTER_ALIASES: dict[str, list[ZarrFilter]] = { + "metadata": [ + ZarrFilter("glob", "**/.z*"), + ZarrFilter("glob", "**/zarr.json"), + ZarrFilter("glob", "**/.zmetadata"), + ], +} + + +def parse_zarr_filter(spec: str) -> list[ZarrFilter]: + """Parse a ``--zarr`` filter spec like ``'glob:**/.z*'`` or ``'metadata'``. + + Parameters + ---------- + spec : str + Either a predefined alias (e.g., ``"metadata"``) or a + ``TYPE:PATTERN`` string where TYPE is ``glob``, ``path``, or ``regex``. + + Returns + ------- + list[ZarrFilter] + One or more filters (aliases may expand to multiple). + + Raises + ------ + ValueError + If the spec is not a valid alias or ``TYPE:PATTERN`` string, + or if a regex pattern is invalid. 
+ """ + if spec in ZARR_FILTER_ALIASES: + return list(ZARR_FILTER_ALIASES[spec]) + type_, _, pattern = spec.partition(":") + if not pattern: + raise ValueError( + f"Invalid zarr filter: {spec!r}. " + f"Expected TYPE:PATTERN or one of {list(ZARR_FILTER_ALIASES)}" + ) + if type_ not in ("glob", "path", "regex"): + raise ValueError( + f"Unknown zarr filter type: {type_!r}. Expected 'glob', 'path', or 'regex'" + ) + # type_ is validated above; narrow str -> Literal for the dataclass + return [ZarrFilter(type_, pattern)] # type: ignore[arg-type] + + +def make_zarr_entry_filter(filters: list[ZarrFilter]) -> Callable[[str], bool]: + """Return a predicate that tests whether a zarr entry path should be included. + + Multiple filters are combined with OR semantics: an entry is included + if it matches **any** of the provided filters. + + Parameters + ---------- + filters : list[ZarrFilter] + Filters to combine. An empty list produces a predicate that + rejects every path. + + Returns + ------- + Callable[[str], bool] + A predicate ``(entry_path) -> bool``; returns True to include. 
+ """ + + def predicate(entry_path: str) -> bool: + return any(f.matches(entry_path) for f in filters) + + return predicate diff --git a/doc/design/partial-zarr.md b/doc/design/partial-zarr.md new file mode 100644 index 000000000..2d1a9395d --- /dev/null +++ b/doc/design/partial-zarr.md @@ -0,0 +1,484 @@ +# Design: Partial Zarr Download and Upload + +## Status + +Draft + +## Related Issues + +- dandi-cli [#1462](https://github.com/dandi/dandi-cli/issues/1462) -- Support partial zarr upload +- dandi-cli [#1474](https://github.com/dandi/dandi-cli/issues/1474) -- Partial Zarr Directory Updates (DANDI/LINC) +- dandi-cli [#1596](https://github.com/dandi/dandi-cli/pull/1596) -- Draft PR: Support for partial download of zarrs (stale) +- dandi-archive [#1993](https://github.com/dandi/dandi-archive/issues/1993) -- WebDAV access for zarr browsing +- dandi-archive [#931](https://github.com/dandi/dandi-archive/issues/931) -- Re-design zarr checksum (closed/completed) +- dandi-archive [#1892](https://github.com/dandi/dandi-archive/pull/1892) -- Zarr versioning/publishing design via manifests (closed) +- dandi-archive [#2702](https://github.com/dandi/dandi-archive/pull/2702) -- Versioned Zarr design doc (draft) +- zarr_checksum [#50](https://github.com/dandi/zarr_checksum/issues/50) -- Generalize to folder_checksum +- zarr_checksum [#56](https://github.com/dandi/zarr_checksum/issues/56) -- Generalize checksum algorithm + +## Problem + +Zarr assets on DANDI can be very large (multi-GB). Users frequently need to +modify only metadata files (`.zarray`, `.zgroup`, `.zattrs` for Zarr v2; +`zarr.json` for Zarr v3) or work with a specific subdirectory of a Zarr archive +without downloading or uploading the entire thing. + +Currently, `dandi download` of a Zarr asset always fetches every file, and +`dandi upload` of a Zarr always performs a full bidirectional sync (uploading new +or changed files **and deleting** remote files absent locally). 
This makes even +small metadata edits prohibitively expensive for large Zarrs. + +## Primary Use Case + +From @kabilar (#1462): + +```bash +# 1. Download just the metadata files from a zarr +dandi download --zarr glob:'**/.z*' --zarr glob:'**/zarr.json' \ + dandi://dandi/001289/rawdata/.../PC.ome.zarr + +# 2. Edit .zattrs locally +vim PC.ome.zarr/.zattrs + +# 3. Upload changes without deleting remote data chunks +dandi upload --zarr-mode patch +``` + +--- + +## Part 1: Zarr Entry Filtering (`--zarr`) + +### Design Rationale + +Rather than a narrow `--zarr-metadata` flag, we provide a general-purpose +`--zarr` option that accepts filter expressions for selecting entries within +Zarr assets. This aligns with the existing `--path-type glob` mechanism that +filters *assets* by path, and extends the concept into the Zarr-internal +namespace. + +### Filter Expression Syntax + +The `--zarr` option takes a `TYPE:PATTERN` expression. Multiple `--zarr` +options combine with OR semantics (entry matches if **any** filter matches): + +| Type | Syntax | Description | +|------|--------|-------------| +| `glob:PATTERN` | `--zarr glob:'**/.z*'` | Glob matched against the zarr-internal path using `fnmatch` semantics per path component. `*` matches within a component, `**` matches across directory levels. | +| `path:PREFIX` | `--zarr path:0/1/2` | Exact prefix match -- download entries whose zarr-internal path starts with this prefix (maps to the API `prefix` parameter). | +| `regex:PATTERN` | `--zarr regex:'\.z(array|group|attrs)$'` | Full regex matched against the zarr-internal path. | + +#### Predefined Aliases + +Predefined aliases provide convenient shorthands for common filter sets: + +| Alias | Expands To | +|-------|-----------| +| `metadata` | `glob:**/.z*` + `glob:**/zarr.json` + `glob:**/.zmetadata` | + +So `--zarr metadata` is equivalent to +`--zarr glob:'**/.z*' --zarr glob:'**/zarr.json'`. + +### Why This Approach + +1. 
**Consistency with existing patterns**: The codebase already uses + `fnmatchcase` for `RemoteZarrEntry.match()` (`dandiapi.py:2098`) and + `BasePath.match()` (`misctypes.py:210`), and the server API supports + `?glob=` for asset-level filtering via `get_assets_by_glob()`. Reusing + glob syntax at the zarr-entry level is natural. + +2. **The server already supports prefix filtering**: `RemoteZarrAsset.iterfiles(prefix=...)` + (`dandiapi.py:1813`) calls `/zarr/{id}/files?prefix=...`. The `path:` type + maps directly to this, minimizing API calls. + +3. **Composability**: Users can combine glob and path filters to express complex + selections. E.g., download all `.zattrs` under a specific resolution level: + `--zarr path:0/1 --zarr glob:'**/.zattrs'` (OR semantics; could later add + AND with `+` or `--zarr-and`). + +4. **Prior art**: `rsync` uses `--include`/`--exclude` with glob patterns; + `rclone` uses `--include`/`--filter` with similar syntax; `git` uses + pathspecs. The `TYPE:PATTERN` syntax is closest to `git`'s + `:(glob)pattern` / `:(literal)pattern` pathspec magic. + +### Where Filters Apply + +- `dandi download --zarr ...`: Filters which entries within Zarr assets to + download. When `--zarr` is specified, only matching entries are downloaded. + Applies to **all** Zarr assets encountered (whether downloading a single zarr + URL or an entire dandiset). + +- `dandi upload` does **not** use `--zarr` filters for selecting what to upload + (the local filesystem determines that). Upload behavior is controlled by + `--zarr-mode` (see Part 2). + +### Implementation + +#### New module: `dandi/zarr_filter.py` + +```python +@dataclass +class ZarrFilter: + """Filter for selecting entries within a Zarr asset.""" + filter_type: Literal["glob", "path", "regex"] + pattern: str + + def matches(self, entry_path: str) -> bool: + """Test if a zarr-internal path matches this filter.""" + ... 
+ +ZARR_FILTER_ALIASES: dict[str, list[ZarrFilter]] = { + "metadata": [ + ZarrFilter("glob", "**/.z*"), + ZarrFilter("glob", "**/zarr.json"), + ZarrFilter("glob", "**/.zmetadata"), + ], +} + +def parse_zarr_filter(spec: str) -> list[ZarrFilter]: + """Parse a --zarr filter spec like 'glob:**/.z*' or 'metadata'.""" + if spec in ZARR_FILTER_ALIASES: + return list(ZARR_FILTER_ALIASES[spec]) + type_, _, pattern = spec.partition(":") + if not pattern: + raise ValueError(f"Invalid zarr filter: {spec!r}") + return [ZarrFilter(type_, pattern)] + +def make_zarr_entry_filter(filters: list[ZarrFilter]) -> Callable[[str], bool]: + """Return a predicate: entry_path -> bool (True = include).""" + def predicate(entry_path: str) -> bool: + return any(f.matches(entry_path) for f in filters) + return predicate +``` + +#### CLI changes: `dandi/cli/cmd_download.py` + +```python +@click.option( + "--zarr", + "zarr_filters", + multiple=True, + metavar="FILTER", + help=( + "Filter entries within Zarr assets. Format: TYPE:PATTERN where TYPE " + "is 'glob', 'path', or 'regex'. Predefined: 'metadata'. " + "Can be specified multiple times (OR logic)."
+ ), +) +``` + +#### Download pipeline changes: `dandi/download.py` + +`_download_zarr()` (line ~986) gains a `zarr_entry_filter: Callable | None` parameter: + +- When a filter is active: + - For `path:` filters, use `asset.iterfiles(prefix=...)` to reduce API calls + - For other filters, iterate all entries and apply `predicate(str(entry))` + - **Skip** the "deleting extra files" step (lines 1045-1075) -- we don't + want to delete local files outside the filter scope + - **Skip** zarr-level checksum verification (lines 1077-1091) -- partial + downloads won't match the whole-zarr checksum + - **Do** verify individual file checksums (MD5/etag) as they are downloaded -- + this already happens in `_download_file()` + +--- + +## Part 2: URL Parsing -- Zarr-Internal Paths + +### Problem + +A user may specify a URL like: + +``` +dandi://dandi/000108/.../sub-MITU01.ome.zarr/0/0/0/0/0 +``` + +Currently, `parse_dandi_url()` creates an `AssetItemURL` with the full path, +and `get_asset_by_path()` fails because no asset exists at that path -- the +asset path is `sub-MITU01.ome.zarr` and `0/0/0/0/0` is internal to the Zarr. + +### Approach + +Add zarr boundary detection in URL parsing (as discussed in #1462 by @jwodder), +mirroring dandidav's approach: + +1. When `parse_dandi_url()` would create an `AssetItemURL`, scan the `location` + path for segments ending with extensions in `ZARR_EXTENSIONS` (`.zarr`, + `.ngff`). + +2. If found, split into `asset_path` (up to and including the `.zarr`/`.ngff` + segment) and `zarr_subpath` (the remainder). + +3. 
Create a new `AssetZarrEntryURL` that carries both parts: + +```python +@dataclass +class AssetZarrEntryURL(SingleAssetURL): + """URL pointing to entries within a Zarr asset.""" + asset_path: str # e.g., "sub-1/file.ome.zarr" + zarr_subpath: str # e.g., "0/0/0/0/0" + + def get_assets(self, client, order=None, strict=False): + dandiset = self.get_dandiset(client, lazy=not strict) + assert dandiset is not None + yield dandiset.get_asset_by_path(self.asset_path) + + def get_zarr_filter(self) -> list[ZarrFilter]: + """Convert the subpath into a path: filter for download.""" + return [ZarrFilter("path", self.zarr_subpath)] +``` + +The download pipeline then combines any URL-derived filters with explicit +`--zarr` filters (OR semantics). + +### Validation + +Use `PurePosixPath` to check path components properly (addressing @jwodder's +review on #1596 about rejecting paths like `foo/.zarr/bar`): + +```python +from pathlib import PurePosixPath + +def split_zarr_location(location: str) -> tuple[str, str] | None: + """Split a location into (asset_path, zarr_subpath) if it crosses a zarr boundary.""" + parts = PurePosixPath(location).parts + for i, part in enumerate(parts): + if any(part.endswith(ext) and part != ext for ext in ZARR_EXTENSIONS): + asset_path = "/".join(parts[: i + 1]) + zarr_subpath = "/".join(parts[i + 1 :]) + return (asset_path, zarr_subpath) if zarr_subpath else None + return None +``` + +--- + +## Part 3: Partial Zarr Upload (`--zarr-mode`) + +### Modes + +| Mode | Behavior | +|------|----------| +| `full` (default) | Current behavior: full bidirectional sync. Upload new/changed local files, **delete** remote files not present locally. | +| `patch` | Upload new/changed local files. **Never delete** remote files absent locally. | + +**Why start with two modes**: The `partial-full` and `partial-lean` modes from +#1462 add complexity that isn't needed for the primary metadata-editing use +case. We can add them later if requested.
+ +**"patch" vs "dont-delete"**: "dont-delete" is misleading because even in `patch` +mode, if the user re-uploads a subdirectory tree, files within that tree that +were replaced by different structure should be cleaned up. See "Subtree sync" +below. "patch" better conveys "apply these changes on top of what's there." + +### Subtree Sync Semantics + +When uploading in `patch` mode, we still need to handle cases where a file at +path `a/b` locally is now a directory remotely (or vice versa). The existing +conflict-resolution logic in `iter_upload()` (lines 658-688) handles this: +if a local file's parent is a remote file, or a local file's path is a remote +directory, the conflicting remote entries are deleted. This behavior should be +preserved in `patch` mode. + +What `patch` mode changes is **only** the final cleanup step: remote files that +simply don't have a corresponding local file are **not** deleted (lines 838-850 +in `zarr.py`). + +### Checksum Handling in `patch` Mode + +After upload, the server computes a zarr-level checksum over **all** remote +files (via `POST /zarr/{id}/finalize/`). In `patch` mode, the client only has +a subset of files locally, so it cannot independently compute the matching +whole-zarr checksum. + +**Approach**: Skip local checksum comparison in `patch` mode. The server still +finalizes and computes its checksum correctly. Log a message explaining that +whole-zarr checksum verification is skipped due to partial upload mode. + +Individual file checksums (MD5) are still verified during upload -- each file's +`Content-MD5` header is checked by S3 on upload, and the local digest is +compared against the remote entry's etag during the comparison phase. + +### Implementation: `dandi/files/zarr.py` + +`iter_upload()` gains `zarr_mode: str = "full"`: + +```python +def iter_upload(self, dandiset, metadata, jobs=None, replacing=None, + zarr_mode="full"): + ... 
+ # Line ~650: Only collect to_delete in full mode + if zarr_mode == "full": + to_delete: list[RemoteZarrEntry] = [] + else: + to_delete = None # signal: skip deletion + + ... + # Line ~838: Skip deleting extra remote files in patch mode + if zarr_mode == "full": + old_zarr_files = list(old_zarr_entries.values()) + if old_zarr_files: + yield from _rmfiles(...) + + ... + # Line ~861: Skip local checksum comparison in patch mode + if zarr_mode == "patch": + lgr.info("%s: Skipping local checksum verification (patch mode)", asset_path) + mismatched = False + else: + our_checksum = str(zcc.process()) + ... +``` + +### CLI: `dandi/cli/cmd_upload.py` + +```python +@click.option( + "--zarr-mode", + type=click.Choice(["full", "patch"]), + default="full", + help="Zarr sync mode: 'full' (default) syncs completely; " + "'patch' uploads/updates without deleting remote files.", + show_default=True, +) +``` + +--- + +## Part 4: Checksums and Manifests + +### Current State + +The zarr checksum is a **tree hash** (dandi-archive #931, resolved). Each +directory's checksum is computed from its immediate children only (not full +paths), producing a hierarchical Merkle-like structure: + +``` +root_checksum = md5(json({"directories": [...], "files": [...]})) + "-N--S" +``` + +where each child entry is `{"digest": ..., "name": ..., "size": ...}`. + +The `zarr_checksum` library (`~0.4.0`) implements this via `ZarrChecksumTree` +(see `zarr_checksum/tree.py`). The tree processes nodes bottom-up: each +directory's digest is computed from sorted JSON of its immediate children's +`{name, digest, size}` entries. This is repeated upward until the root digest +is produced. + +### Per-Directory Checksums Are NOT Persisted + +Although the algorithm is hierarchical and computes per-directory checksums as +intermediate results, the archive does **not** persist these intermediate +checksums anywhere. 
Specifically: + +- The archive's `ingest_zarr_archive` Celery task (in + `dandiapi/zarr/tasks/__init__.py`) calls `compute_zarr_checksum()` which + streams all files from S3, computes the tree hash **entirely in memory**, + and stores only the final root digest in the database (`zarr.checksum`, + `zarr.file_count`, `zarr.size`). + +- An earlier design (`zarr-support-3.md`) wrote per-directory `.checksum` + files to S3 under the `zarr-checksums/` prefix (note: hyphen, not + underscore). The server code that wrote these was removed in December 2022 + (dandi-archive commits removing `ZarrChecksumFileUpdater`, PRs #1390, + #1395). However, **legacy `.checksum` files still exist on S3** for + ~3,930 out of 5,431 zarrs (~72%) — those ingested before the removal. + Newer zarrs do not have them. These legacy files are not exposed via any + API endpoint and are effectively orphaned artifacts. + +- There is no API endpoint to retrieve per-directory checksums. + +This means **subtree checksum verification is not possible today** without +recomputing the checksum from individual file ETags (or reading the legacy +`.checksum` files directly from S3 for older zarrs, which is not a viable +general solution). + +### Incorporating Checksums into Manifests + +The versioned Zarr design doc (dandi-archive #2702, #1892) proposes **manifest +files** that track each Zarr snapshot's file inventory (paths, S3 version IDs, +sizes, ETags). These manifests would enable publishing Zarr-bearing dandisets. + +**Recommendation**: Include per-directory checksums in the manifest structure. +Since the algorithm is already hierarchical (each directory checksum depends only +on its immediate children), the manifest can encode the full Merkle tree. This +would enable: + +1. **Partial checksum verification**: After a partial download, verify the + checksum of each downloaded subtree against the manifest's per-directory + checksums, rather than only verifying individual file ETags. + +2. 
**Efficient change detection**: When uploading in `patch` mode, compare + the local subtree's checksum against the manifest to detect whether changes + occurred, without comparing every file individually. + +3. **Incremental manifest updates**: When a subtree is modified, only the + checksums along the path from that subtree to the root need recomputation. + +For the initial implementation here, we rely on per-file ETags for verification +and skip subtree-level checksums. A follow-up could add subtree verification +if the archive exposes per-directory checksums via API or manifests. + +### What We Verify Now + +| Scenario | File-level (MD5/ETag) | Subtree checksum | Whole-zarr checksum | +|----------|-----------------------|------------------|---------------------| +| Full download | Yes | N/A | Yes | +| Partial download (`--zarr`) | Yes | No (future: via manifest) | No (skipped) | +| Full upload | Yes (via S3 Content-MD5) | N/A | Yes (client vs server) | +| Patch upload | Yes (via S3 Content-MD5) | No (future: via manifest) | No (skipped) | + +--- + +## Part 5: `dandi ls` for Zarr Contents + +See separate implementation task. Summary: allow `dandi ls` to list files +within a Zarr asset when given a Zarr URL, using `asset.iterfiles(prefix=...)`. +Reuses `AssetZarrEntryURL` from Part 2. + +--- + +## Implementation Order + +1. **`dandi/zarr_filter.py`** -- filter parsing and matching (no deps) +2. **URL parsing** -- `AssetZarrEntryURL` and `split_zarr_location()` in + `dandi/dandiarchive.py` +3. **Download pipeline** -- `_download_zarr()` partial support, `Downloader` + plumbing +4. **`--zarr` CLI option** on `dandi download` +5. **`--zarr-mode` CLI option** on `dandi upload` +6. **`iter_upload()` patch mode** in `dandi/files/zarr.py` +7. **Upload plumbing** in `dandi/upload.py` +8. **`dandi ls` zarr contents** (separate PR) +9. 
Tests throughout + +## Files to Modify + +| File | Changes | +|------|---------| +| `dandi/zarr_filter.py` | **New**: filter parsing, matching, aliases | +| `dandi/consts.py` | `ZARR_METADATA_FILENAMES` constants (used by `metadata` alias) | +| `dandi/dandiarchive.py` | `AssetZarrEntryURL`, `split_zarr_location()`, modify `parse_dandi_url()` | +| `dandi/cli/cmd_download.py` | `--zarr` option | +| `dandi/cli/cmd_upload.py` | `--zarr-mode` option | +| `dandi/download.py` | `_download_zarr()` filtering, `Downloader` plumbing | +| `dandi/upload.py` | Pass `zarr_mode` through | +| `dandi/files/zarr.py` | `iter_upload()` patch mode support | +| `dandi/cli/cmd_ls.py` | Zarr contents listing (separate PR) | + +## Open Questions + +1. **`--zarr` AND vs OR**: Multiple `--zarr` options use OR. Should we support + AND composition (e.g., `--zarr path:0/1 --zarr-and glob:'**/.zattrs'`)? + For now, OR suffices; AND can be added later. + +2. **Server-side glob for zarr entries**: Currently the server only supports + `prefix` filtering on zarr entries (not glob). Glob filtering therefore + happens client-side after fetching entries. For large zarrs with millions of + entries, this could be slow. A future server API enhancement for + entry-level glob would help. For now, `path:` filters should be preferred + for large zarrs to minimize data transfer. + +3. **Interaction with `--sync`**: When `dandi download --zarr ... --sync` is + used, should `--sync` only delete local zarr entries that match the filter + but aren't remote? Or should `--sync` be disallowed with `--zarr`? + Recommendation: disallow `--sync` with `--zarr` initially, as the semantics + are ambiguous.