Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ development command line options.
are `clear`, which clears the cache before use, and `ignore`, which bypasses it. Note that for
metadata cache we use only released portion of `dandi.__version__` as a token.
If handling of metadata has changed while developing, set this env var to
`clear` to have cache `clear()`ed before use.
`clear` to have cache `clear()`ed before use. This variable also controls the
API metadata cache used by `DandiAPIClient(cache=True)`.

- `DANDI_INSTANCEHOST` -- defaults to `localhost`. Point to host/IP which hosts
a local instance of dandiarchive.
Expand Down
148 changes: 148 additions & 0 deletions dandi/apicache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""Persistent sqlite3-backed cache for DANDI API metadata responses.

The cache stores metadata keyed by ``(api_url, entity_type, entity_id)`` and
validates entries against a *modified* timestamp so that stale data is
automatically discarded without extra API calls.
"""

from __future__ import annotations

import json
import os
from pathlib import Path
import sqlite3

from platformdirs import user_cache_dir

from . import get_logger

lgr = get_logger()

# DDL executed on every connection open (idempotent via IF NOT EXISTS).
# One row per entity; the composite primary key makes "INSERT OR REPLACE"
# overwrite any previous entry for the same (api_url, entity_type, entity_id),
# so a changed "modified" timestamp naturally evicts the stale row.
_SCHEMA = """\
CREATE TABLE IF NOT EXISTS metadata_cache (
api_url TEXT NOT NULL,
entity_type TEXT NOT NULL,
entity_id TEXT NOT NULL,
modified TEXT NOT NULL,
metadata TEXT NOT NULL,
PRIMARY KEY (api_url, entity_type, entity_id)
);
"""


class APIMetadataCache:
    """A lightweight, persistent metadata cache backed by sqlite3.

    Entries are keyed by ``(api_url, entity_type, entity_id)`` together with
    the entity's *modified* timestamp, so a lookup only succeeds while the
    entity is unchanged; stale rows never match and are simply overwritten.

    Honors the ``DANDI_CACHE`` environment variable: ``ignore`` turns every
    operation into a no-op, ``clear`` wipes the database once at construction.

    Parameters
    ----------
    db_path : Path or None
        Explicit path for the sqlite database. When *None* (the default) the
        database is placed under ``platformdirs.user_cache_dir("dandi")``.
    """

    def __init__(self, db_path: Path | None = None) -> None:
        if db_path is None:
            db_path = Path(user_cache_dir("dandi")) / "api_metadata_cache.sqlite"
        db_path.parent.mkdir(parents=True, exist_ok=True)
        self._db_path = db_path

        dandi_cache = os.environ.get("DANDI_CACHE", "").lower()
        if dandi_cache == "ignore":
            lgr.debug("DANDI_CACHE=ignore: API metadata cache disabled")
            # Disabled: no connection is opened; all methods short-circuit.
            self._enabled = False
            return

        self._enabled = True
        # check_same_thread=False permits use from worker threads.
        # NOTE(review): there is no explicit lock around writes — confirm the
        # sqlite3 threading mode in use serializes concurrent access safely.
        self._con = sqlite3.connect(str(db_path), check_same_thread=False)
        # WAL mode lets readers proceed concurrently with a writer.
        self._con.execute("PRAGMA journal_mode=WAL;")
        self._con.execute(_SCHEMA)
        self._con.commit()

        if dandi_cache == "clear":
            lgr.debug("DANDI_CACHE=clear: clearing API metadata cache")
            self.clear()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get(
        self,
        api_url: str,
        entity_type: str,
        entity_id: str,
        modified: str,
    ) -> dict | None:
        """Return cached metadata if *modified* matches, else ``None``.

        Parameters
        ----------
        api_url : str
            Base URL of the DANDI API server.
        entity_type : str
            ``"dandiset"`` or ``"asset"``.
        entity_id : str
            Unique identifier for the entity (asset UUID, or
            ``"<dandiset_id>/<version_id>"`` for Dandisets).
        modified : str
            ISO-8601 timestamp of the entity's last modification. A cache
            hit is only returned when this value matches the stored entry.

        Returns
        -------
        dict or None
            The cached metadata dict, or ``None`` on a cache miss.
        """
        if not self._enabled:
            return None
        row = self._con.execute(
            "SELECT metadata FROM metadata_cache "
            "WHERE api_url = ? AND entity_type = ? AND entity_id = ? AND modified = ?",
            (api_url, entity_type, entity_id, modified),
        ).fetchone()
        if row is None:
            return None
        lgr.debug("API cache hit: %s %s %s", entity_type, entity_id, modified)
        return json.loads(row[0])  # type: ignore[no-any-return]

    def set(
        self,
        api_url: str,
        entity_type: str,
        entity_id: str,
        modified: str,
        metadata: dict,
    ) -> None:
        """Insert or replace a cache entry.

        Parameters
        ----------
        api_url : str
            Base URL of the DANDI API server.
        entity_type : str
            ``"dandiset"`` or ``"asset"``.
        entity_id : str
            Unique identifier for the entity.
        modified : str
            ISO-8601 timestamp of the entity's last modification.
        metadata : dict
            The raw metadata dict to cache (must be JSON-serializable).
        """
        if not self._enabled:
            return
        # Using the connection as a context manager commits on success and
        # rolls back on error, instead of an unconditional manual commit().
        with self._con:
            self._con.execute(
                "INSERT OR REPLACE INTO metadata_cache "
                "(api_url, entity_type, entity_id, modified, metadata) "
                "VALUES (?, ?, ?, ?, ?)",
                (api_url, entity_type, entity_id, modified, json.dumps(metadata)),
            )
        lgr.debug("API cache set: %s %s %s", entity_type, entity_id, modified)

    def clear(self) -> None:
        """Delete all cached entries from the database."""
        if not self._enabled:
            return
        with self._con:
            self._con.execute("DELETE FROM metadata_cache")
        lgr.debug("API metadata cache cleared")

    def close(self) -> None:
        """Close the underlying sqlite connection; a no-op when disabled."""
        if self._enabled:
            self._con.close()
67 changes: 56 additions & 11 deletions dandi/dandiapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@
if TYPE_CHECKING:
from typing_extensions import Self

from .apicache import APIMetadataCache


lgr = get_logger()

Expand Down Expand Up @@ -426,6 +428,7 @@ def __init__(
api_url: str | None = None,
token: str | None = None,
dandi_instance: DandiInstance | None = None,
cache: bool = False,
) -> None:
"""
Construct a client instance for the given API URL or DANDI instance
Expand All @@ -439,6 +442,11 @@ def __init__(
``"https://api.sandbox.dandiarchive.org/api"``
:param str token: User API Key. Note that different instance APIs have
different keys.
:param bool cache: When ``True``, API metadata responses are cached
persistently to disk (in an sqlite3 database) and validated against
``modified`` timestamps. Controlled by the :envvar:`DANDI_CACHE`
environment variable (``"ignore"`` disables, ``"clear"`` wipes the
cache on first access).
"""
check_dandi_version()
if api_url is None:
Expand All @@ -458,6 +466,17 @@ def __init__(
self.dandi_instance: DandiInstance = dandi_instance
if token is not None:
self.authenticate(token)
if cache:
from .apicache import APIMetadataCache

self._cache: APIMetadataCache | None = APIMetadataCache()
else:
self._cache = None

@property
def cache(self) -> APIMetadataCache | None:
    """The persistent API metadata cache.

    ``None`` unless the client was constructed with ``cache=True``.
    """
    return self._cache

@classmethod
def for_dandi_instance(
Expand Down Expand Up @@ -1152,14 +1171,28 @@ def get_metadata(self) -> models.Dandiset:
def get_raw_metadata(self) -> dict[str, Any]:
    """
    Fetch the metadata for this version of the Dandiset as an unprocessed
    `dict`.

    When the client was created with ``cache=True``, results are served
    from a persistent on-disk cache whenever the version's ``modified``
    timestamp has not changed.
    """
    cache = self.client.cache
    # Dandiset versions are cached under "<dandiset_id>/<version_id>".
    entity_id = f"{self.identifier}/{self.version_id}"
    # Only consult the cache when version info is already loaded; touching
    # self.version here could itself trigger an API request.
    if cache is not None and self._version is not None:
        modified = self._version.modified.isoformat()
        cached = cache.get(self.client.api_url, "dandiset", entity_id, modified)
        if cached is not None:
            return cached
    try:
        data = self.client.get(self.version_api_path)
        assert isinstance(data, dict)
    except HTTP404Error:
        # Fixed copy-pasted message: this lookup is for a Dandiset version,
        # not an asset.
        raise NotFoundError(f"No such version: {self}")
    if cache is not None:
        # NOTE(review): self.version may issue another API request when
        # _version was not yet populated — confirm this is acceptable here.
        modified = self.version.modified.isoformat()
        cache.set(self.client.api_url, "dandiset", entity_id, modified, data)
    return data

def set_metadata(self, metadata: models.Dandiset) -> None:
"""
Expand Down Expand Up @@ -1558,16 +1591,28 @@ def get_metadata(self) -> models.Asset:
return models.Asset.model_validate(self.get_raw_metadata())

def get_raw_metadata(self) -> dict[str, Any]:
    """Fetch the metadata for the asset as an unprocessed `dict`.

    When the client was created with ``cache=True``, results are served
    from a persistent on-disk cache whenever the asset's ``modified``
    timestamp has not changed.
    """
    # Metadata already fetched (or supplied at construction) wins outright.
    if self._metadata is not None:
        return self._metadata
    cache = self.client.cache
    # NOTE(review): per-asset ``modified`` may not be a reliable cache
    # validator (see PR discussion) — confirm it changes whenever the
    # asset's metadata changes before relying on it.
    modified = self.modified.isoformat()
    if cache is not None:
        cached = cache.get(self.client.api_url, "asset", self.identifier, modified)
        if cached is not None:
            return cached
    try:
        data = self.client.get(self.api_path)
        assert isinstance(data, dict)
    except HTTP404Error:
        raise NotFoundError(f"No such asset: {self}")
    if cache is not None:
        cache.set(self.client.api_url, "asset", self.identifier, modified, data)
    return data

def get_raw_digest(self, digest_type: str | models.DigestType | None = None) -> str:
"""
Expand Down
102 changes: 102 additions & 0 deletions dandi/tests/test_apicache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Tests for the persistent API metadata cache."""

from __future__ import annotations

from pathlib import Path

import pytest

from dandi.apicache import APIMetadataCache


@pytest.mark.ai_generated
class TestAPIMetadataCache:
    API_URL = "https://api.dandiarchive.org/api"
    # Timestamps reused across tests: an original and a later "modified".
    MOD_A = "2024-01-01T00:00:00Z"
    MOD_B = "2024-06-15T12:00:00Z"

    def _open(self, tmp_path: Path) -> APIMetadataCache:
        """Create a cache backed by a database inside *tmp_path*."""
        return APIMetadataCache(db_path=tmp_path / "cache.sqlite")

    def test_cache_miss(self, tmp_path: Path) -> None:
        # Nothing stored yet -> every lookup misses.
        c = self._open(tmp_path)
        assert c.get(self.API_URL, "asset", "abc-123", self.MOD_A) is None

    def test_set_then_get(self, tmp_path: Path) -> None:
        c = self._open(tmp_path)
        payload = {"name": "test-asset", "size": 42}
        c.set(self.API_URL, "asset", "abc-123", self.MOD_A, payload)
        assert c.get(self.API_URL, "asset", "abc-123", self.MOD_A) == payload

    def test_stale_modified_returns_none(self, tmp_path: Path) -> None:
        c = self._open(tmp_path)
        payload = {"name": "test-asset", "size": 42}
        c.set(self.API_URL, "asset", "abc-123", self.MOD_A, payload)
        # A lookup with any other timestamp must miss.
        assert c.get(self.API_URL, "asset", "abc-123", self.MOD_B) is None

    def test_update_replaces_entry(self, tmp_path: Path) -> None:
        c = self._open(tmp_path)
        for stamp, version in ((self.MOD_A, 1), (self.MOD_B, 2)):
            c.set(self.API_URL, "asset", "abc-123", stamp, {"v": version})
        # The second set replaced the first, so only MOD_B matches now.
        assert c.get(self.API_URL, "asset", "abc-123", self.MOD_A) is None
        assert c.get(self.API_URL, "asset", "abc-123", self.MOD_B) == {"v": 2}

    def test_clear(self, tmp_path: Path) -> None:
        c = self._open(tmp_path)
        c.set(self.API_URL, "asset", "abc-123", self.MOD_A, {"a": 1})
        c.clear()
        assert c.get(self.API_URL, "asset", "abc-123", self.MOD_A) is None

    def test_different_entity_types(self, tmp_path: Path) -> None:
        c = self._open(tmp_path)
        # Same id under different entity types -> independent entries.
        for kind in ("asset", "dandiset"):
            c.set(self.API_URL, kind, "id1", self.MOD_A, {"type": kind})
        for kind in ("asset", "dandiset"):
            assert c.get(self.API_URL, kind, "id1", self.MOD_A) == {"type": kind}

    def test_dandi_cache_ignore(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        monkeypatch.setenv("DANDI_CACHE", "ignore")
        c = self._open(tmp_path)
        c.set(self.API_URL, "asset", "abc-123", self.MOD_A, {"a": 1})
        # The cache is disabled, so the write above was a no-op.
        assert c.get(self.API_URL, "asset", "abc-123", self.MOD_A) is None

    def test_dandi_cache_clear(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        # Populate the cache normally first.
        c1 = self._open(tmp_path)
        c1.set(self.API_URL, "asset", "abc-123", self.MOD_A, {"a": 1})
        assert c1.get(self.API_URL, "asset", "abc-123", self.MOD_A) == {"a": 1}
        # Reopening under DANDI_CACHE=clear must wipe the stored entry.
        monkeypatch.setenv("DANDI_CACHE", "clear")
        c2 = self._open(tmp_path)
        assert c2.get(self.API_URL, "asset", "abc-123", self.MOD_A) is None
Loading
Loading