From 23ab6095add56479f07d058b714e12a62d90d13e Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 19 Feb 2026 11:42:16 -0700 Subject: [PATCH 01/10] add lazy read capabilities --- README.md | 15 ++ docs/src/dlpack.md | 15 ++ src/ome_arrow/__init__.py | 2 +- src/ome_arrow/core.py | 383 +++++++++++++++++++++++++++++++------- src/ome_arrow/tensor.py | 251 ++++++++++++++++++++++++- tests/test_core.py | 42 +++++ tests/test_tensor.py | 24 ++- 7 files changed, 663 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 1a2a8bf..a9bed3c 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,21 @@ for cap in view3d.iter_tiles_3d(tile_size=(2, 64, 64), mode="numpy"): pass ``` +Lazy scan-style convention (Polars-like): + +```python +from ome_arrow import OMEArrow + +oa = OMEArrow.scan("your_image.ome.parquet") # deferred load +# First: queue lazy spatial/index slicing +lazy_crop = oa.slice_lazy(0, 512, 0, 512).slice_lazy(64, 256, 64, 256) +cropped = lazy_crop.collect() + +# Then: build lazy tensor views from a source scan +lazy_view = oa.tensor_view(t=0, z=slice(0, 4), roi=(0, 0, 512, 512)) +arr = lazy_view.to_numpy() +``` + Advanced options: - `chunk_policy="auto" | "combine" | "keep"` controls ChunkedArray handling. diff --git a/docs/src/dlpack.md b/docs/src/dlpack.md index 01947d7..aaf15b7 100644 --- a/docs/src/dlpack.md +++ b/docs/src/dlpack.md @@ -41,6 +41,21 @@ flat = torch.utils.dlpack.from_dlpack(capsule) tensor = flat.reshape(view.shape) ``` +## Lazy scan-style slicing + +```python +from ome_arrow import OMEArrow + +obj = OMEArrow.scan("example.ome.parquet") +# Prioritize lazy slice planning first. +lazy_crop = obj.slice_lazy(0, 512, 0, 512).slice_lazy(64, 256, 64, 256) +cropped = lazy_crop.collect() + +# Then execute lazy tensor selections. +lazy_view = obj.tensor_view(t=0, z=slice(0, 8), roi=(128, 128, 256, 256)) +arr = lazy_view.to_numpy() +``` + ## JAX ```python diff --git a/src/ome_arrow/__init__.py b/src/ome_arrow/__init__.py index 509201f..a446e30 100644 --- a/src/ome_arrow/__init__.py +++ b/src/ome_arrow/__init__.py @@ -20,7 +20,7 @@ to_ome_arrow, ) from ome_arrow.meta import OME_ARROW_STRUCT, OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION -from ome_arrow.tensor import TensorView +from ome_arrow.tensor import LazyTensorView, TensorView from ome_arrow.utils import describe_ome_arrow, verify_ome_arrow from ome_arrow.view import view_matplotlib, view_pyvista diff --git a/src/ome_arrow/core.py b/src/ome_arrow/core.py index ad975cc..e7f40c1 100644 --- a/src/ome_arrow/core.py +++ b/src/ome_arrow/core.py @@ -5,6 +5,7 @@ from __future__ import annotations import pathlib +from dataclasses import dataclass from typing import ( TYPE_CHECKING, Any, @@ -36,7 +37,7 @@ from_tiff, ) from ome_arrow.meta import OME_ARROW_STRUCT -from ome_arrow.tensor import TensorView +from ome_arrow.tensor import LazyTensorView, TensorView from ome_arrow.transform import slice_ome_arrow from ome_arrow.utils import describe_ome_arrow from ome_arrow.view import view_matplotlib, view_pyvista @@ -46,6 +47,30 @@ import pyvista +@dataclass(frozen=True) +class _LazySourceSpec: + """Deferred source description for lazy OMEArrow loading.""" + + data: str + column_name: str + row_index: int + image_type: str | None + + +@dataclass(frozen=True) +class _LazySliceSpec: + """Deferred spatial/index slice specification.""" + + x_min: int + x_max: int + y_min: int + y_max: int + t_indices: tuple[int, ...] | None + c_indices: tuple[int, ...] | None + z_indices: tuple[int, ...] | None + fill_missing: bool + + class OMEArrow: """ Small convenience toolkit for working with ome-arrow data. @@ -70,6 +95,7 @@ def __init__( column_name: str = "ome_arrow", row_index: int = 0, image_type: str | None = None, + lazy: bool = False, ) -> None: """ Construct an OMEArrow from: @@ -83,11 +109,26 @@ def __init__( - a dict already matching the OME-Arrow schema - a pa.StructScalar already typed to OME_ARROW_STRUCT - optionally override/set image_type metadata on ingest + - optionally defer source-file ingestion with lazy=True """ # set the tcz for viewing self.tcz = tcz + self._data: pa.StructScalar | None = None self._struct_array: pa.StructArray | None = None + self._lazy_source: _LazySourceSpec | None = None + self._lazy_slices: list[_LazySliceSpec] = [] + + if lazy: + if not isinstance(data, str): + raise TypeError("lazy=True currently supports only string file inputs.") + self._lazy_source = _LazySourceSpec( + data=data, + column_name=column_name, + row_index=row_index, + image_type=image_type, + ) + return # --- 1) Stack pattern (Bio-Formats-style) -------------------------------- if isinstance(data, str) and any(c in data for c in "<>*"): @@ -101,69 +142,12 @@ def __init__( # --- 2) String path/URL: OME-Zarr / OME-Parquet / OME-TIFF --------------- elif isinstance(data, str): - s = data.strip() - path = pathlib.Path(s) - - # Zarr detection - if ( - s.lower().endswith(".zarr") - or s.lower().endswith(".ome.zarr") - or ".zarr/" in s.lower() - or (path.exists() and path.is_dir() and path.suffix.lower() == ".zarr") - ): - self.data = from_ome_zarr(s) - if image_type is not None: - self.data = self._wrap_with_image_type(self.data, image_type) - - # OME-Parquet - elif s.lower().endswith((".parquet", ".pq")) or path.suffix.lower() in { - ".parquet", - ".pq", - }: - parquet_result = from_ome_parquet( - s, - column_name=column_name, - row_index=row_index, - return_array=True, - ) - self.data, self._struct_array = parquet_result - if image_type is not None: - self.data = self._wrap_with_image_type(self.data, image_type) - - # Vortex - elif s.lower().endswith(".vortex") or path.suffix.lower() == ".vortex": - vortex_result = from_ome_vortex( - s, - column_name=column_name, - row_index=row_index, - return_array=True, - ) - self.data, self._struct_array = vortex_result - if image_type is not None: - self.data = self._wrap_with_image_type(self.data, image_type) - - # TIFF - elif path.suffix.lower() in {".tif", ".tiff"} or s.lower().endswith( - (".tif", ".tiff") - ): - self.data = from_tiff(s) - if image_type is not None: - self.data = self._wrap_with_image_type(self.data, image_type) - - elif path.exists() and path.is_dir(): - raise ValueError( - f"Directory '{s}' exists but does not look like an OME-Zarr store " - "(expected suffix '.zarr' or '.ome.zarr')." - ) - else: - raise ValueError( - "String input must be one of:\n" - " • Bio-Formats pattern string (contains '<', '>' or '*')\n" - " • OME-Zarr path/URL ending with '.zarr' or '.ome.zarr'\n" - " • OME-Parquet file ending with '.parquet' or '.pq'\n" - " • Vortex file ending with '.vortex'\n" - " • OME-TIFF path/URL ending with '.tif' or '.tiff'" - ) + self.data, self._struct_array = self._load_from_string_source( + data, + column_name=column_name, + row_index=row_index, + image_type=image_type, + ) # --- 3) NumPy ndarray ---------------------------------------------------- elif isinstance(data, np.ndarray): @@ -191,6 +175,178 @@ def __init__( "input data must be str, dict, pa.StructScalar, or numpy.ndarray" ) + @classmethod + def scan( + cls, + data: str, + *, + tcz: Tuple[int, int, int] = (0, 0, 0), + column_name: str = "ome_arrow", + row_index: int = 0, + image_type: str | None = None, + ) -> "OMEArrow": + """Create a lazily-loaded OMEArrow, similar to Polars scan semantics. + + Args: + data: Input source path/URL. + tcz: Default `(t, c, z)` indices used for view helpers. + column_name: OME-Arrow column name for tabular sources. + row_index: Row index for tabular sources. + image_type: Optional image type override. + + Returns: + OMEArrow: Lazily planned OMEArrow instance. + """ + return cls( + data=data, + tcz=tcz, + column_name=column_name, + row_index=row_index, + image_type=image_type, + lazy=True, + ) + + @property + def is_lazy(self) -> bool: + """Return whether this instance still has deferred work.""" + return self._lazy_source is not None or bool(self._lazy_slices) + + @property + def data(self) -> pa.StructScalar: + """Return the materialized OME-Arrow StructScalar. + + Returns: + pa.StructScalar: Materialized OME-Arrow record. + + Raises: + RuntimeError: If the record could not be initialized. + """ + self._ensure_materialized() + if self._data is None: + raise RuntimeError("OMEArrow data is not initialized.") + return self._data + + @data.setter + def data(self, value: pa.StructScalar) -> None: + self._data = value + + def collect(self) -> "OMEArrow": + """Materialize deferred source data and return ``self``. + + Returns: + OMEArrow: The same instance after materialization. + """ + self._ensure_materialized() + return self + + @staticmethod + def _load_from_string_source( + data: str, + *, + column_name: str, + row_index: int, + image_type: str | None, + ) -> tuple[pa.StructScalar, pa.StructArray | None]: + s = data.strip() + path = pathlib.Path(s) + struct_array: pa.StructArray | None = None + + if ( + s.lower().endswith(".zarr") + or s.lower().endswith(".ome.zarr") + or ".zarr/" in s.lower() + or (path.exists() and path.is_dir() and path.suffix.lower() == ".zarr") + ): + scalar = from_ome_zarr(s) + if image_type is not None: + scalar = OMEArrow._wrap_with_image_type(scalar, image_type) + return scalar, None + + if s.lower().endswith((".parquet", ".pq")) or path.suffix.lower() in { + ".parquet", + ".pq", + }: + parquet_result = from_ome_parquet( + s, + column_name=column_name, + row_index=row_index, + return_array=True, + ) + scalar, struct_array = parquet_result + if image_type is not None: + scalar = OMEArrow._wrap_with_image_type(scalar, image_type) + return scalar, struct_array + + if s.lower().endswith(".vortex") or path.suffix.lower() == ".vortex": + vortex_result = from_ome_vortex( + s, + column_name=column_name, + row_index=row_index, + return_array=True, + ) + scalar, struct_array = vortex_result + if image_type is not None: + scalar = OMEArrow._wrap_with_image_type(scalar, image_type) + return scalar, struct_array + + if path.suffix.lower() in {".tif", ".tiff"} or s.lower().endswith( + (".tif", ".tiff") + ): + scalar = from_tiff(s) + if image_type is not None: + scalar = OMEArrow._wrap_with_image_type(scalar, image_type) + return scalar, None + + if path.exists() and path.is_dir(): + raise ValueError( + f"Directory '{s}' exists but does not look like an OME-Zarr store " + "(expected suffix '.zarr' or '.ome.zarr')." + ) + + raise ValueError( + "String input must be one of:\n" + " • Bio-Formats pattern string (contains '<', '>' or '*')\n" + " • OME-Zarr path/URL ending with '.zarr' or '.ome.zarr'\n" + " • OME-Parquet file ending with '.parquet' or '.pq'\n" + " • Vortex file ending with '.vortex'\n" + " • OME-TIFF path/URL ending with '.tif' or '.tiff'" + ) + + def _ensure_materialized(self) -> None: + if self._lazy_source is None: + return + lazy_source = self._lazy_source + scalar, struct_array = self._load_from_string_source( + lazy_source.data, + column_name=lazy_source.column_name, + row_index=lazy_source.row_index, + image_type=lazy_source.image_type, + ) + if self._lazy_slices: + data = scalar + for spec in self._lazy_slices: + data = slice_ome_arrow( + data=data, + x_min=spec.x_min, + x_max=spec.x_max, + y_min=spec.y_min, + y_max=spec.y_max, + t_indices=spec.t_indices, + c_indices=spec.c_indices, + z_indices=spec.z_indices, + fill_missing=spec.fill_missing, + ) + self.data = data + self._struct_array = None + else: + self.data, self._struct_array = scalar, struct_array + self._lazy_source = None + self._lazy_slices = [] + + def _tensor_source(self) -> pa.StructScalar | pa.StructArray: + self._ensure_materialized() + return self._struct_array if self._struct_array is not None else self.data + @staticmethod def _wrap_with_image_type( data: pa.StructScalar, image_type: str @@ -284,6 +440,8 @@ def export( # noqa: PLR0911 ValueError: Unknown 'how' or missing required params. """ + self._ensure_materialized() + # existing modes if how == "numpy": return to_numpy(self.data, dtype=dtype, strict=strict, clamp=clamp) @@ -364,6 +522,7 @@ def info(self) -> Dict[str, Any]: - type: classification string - summary: human-readable text """ + self._ensure_materialized() return describe_ome_arrow(self.data) def view( @@ -455,6 +614,8 @@ def view( >>> plotter = obj.view(how="pyvista", clim=(100, 2000), show_axes=False) """ + self._ensure_materialized() + if how == "matplotlib": return view_matplotlib( self.data, @@ -529,7 +690,7 @@ def tensor_view( dtype: np.dtype | None = None, chunk_policy: Literal["auto", "combine", "keep"] = "auto", channel_policy: Literal["error", "first"] = "error", - ) -> TensorView: + ) -> TensorView | LazyTensorView: """Create a TensorView of the pixel data. Args: @@ -551,7 +712,9 @@ def tensor_view( "first" keeps the first channel. Returns: - TensorView: Tensor view over the selected pixels. + TensorView | LazyTensorView: Tensor view over selected pixels. + In lazy mode, this returns a deferred ``LazyTensorView`` that + materializes on first execution call (for example ``to_numpy()``). Raises: ValueError: If an unsupported scene is requested. @@ -560,6 +723,21 @@ def tensor_view( if scene not in (None, 0): raise ValueError("Only scene=0 is supported for single-image records.") + if self._lazy_source is not None: + return LazyTensorView( + loader=self._tensor_source, + t=t, + z=z, + c=c, + roi=roi, + roi3d=roi3d, + tile=tile, + layout=layout, + dtype=dtype, + chunk_policy=chunk_policy, + channel_policy=channel_policy, + ) + # TensorView uses an internal canonical axis basis (TZCHW) for shape/stride # math, then applies the requested layout permutation for output. return TensorView( @@ -608,6 +786,7 @@ def slice( OMEArrow object New OME-Arrow record with updated sizes and planes. """ + self._ensure_materialized() return OMEArrow( data=slice_ome_arrow( @@ -623,11 +802,83 @@ def slice( ) ) + def slice_lazy( + self, + x_min: int, + x_max: int, + y_min: int, + y_max: int, + t_indices: Optional[Iterable[int]] = None, + c_indices: Optional[Iterable[int]] = None, + z_indices: Optional[Iterable[int]] = None, + fill_missing: bool = True, + ) -> OMEArrow: + """Return a lazily planned slice, collected on first execution. + + For lazy sources created with ``OMEArrow.scan(...)``, this queues a + deferred slice operation and returns a new lazy OMEArrow plan. + For already materialized sources, this falls back to eager ``slice()``. + + Args: + x_min: Inclusive minimum X index for the crop. + x_max: Exclusive maximum X index for the crop. + y_min: Inclusive minimum Y index for the crop. + y_max: Exclusive maximum Y index for the crop. + t_indices: Optional time indices to retain. + c_indices: Optional channel indices to retain. + z_indices: Optional depth indices to retain. + fill_missing: Whether to zero-fill missing `(t, c, z)` planes. + + Returns: + OMEArrow: Lazy plan when source is lazy; eager slice result otherwise. + """ + if self._lazy_source is None: + return self.slice( + x_min=x_min, + x_max=x_max, + y_min=y_min, + y_max=y_max, + t_indices=t_indices, + c_indices=c_indices, + z_indices=z_indices, + fill_missing=fill_missing, + ) + + lazy_source = self._lazy_source + planned = OMEArrow.scan( + lazy_source.data, + tcz=self.tcz, + column_name=lazy_source.column_name, + row_index=lazy_source.row_index, + image_type=lazy_source.image_type, + ) + planned._lazy_slices = [ + *self._lazy_slices, + _LazySliceSpec( + x_min=int(x_min), + x_max=int(x_max), + y_min=int(y_min), + y_max=int(y_max), + t_indices=( + None if t_indices is None else tuple(int(i) for i in t_indices) + ), + c_indices=( + None if c_indices is None else tuple(int(i) for i in c_indices) + ), + z_indices=( + None if z_indices is None else tuple(int(i) for i in z_indices) + ), + fill_missing=bool(fill_missing), + ), + ] + return planned + def _repr_html_(self) -> str: """ Auto-render a plane as inline PNG in Jupyter. """ try: + self._ensure_materialized() view_matplotlib( data=self.data, tcz=self.tcz, diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index a1d13f4..699d901 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -5,7 +5,7 @@ import random import warnings from dataclasses import dataclass -from typing import Any, Iterator, List, Literal, Sequence +from typing import Any, Callable, Iterator, List, Literal, Sequence import numpy as np import pyarrow as pa @@ -38,6 +38,255 @@ class _Selection: roi: tuple[int, int, int, int] +class LazyTensorView: + """Deferred TensorView plan with Polars-style collect semantics.""" + + def __init__( + self, + *, + loader: Callable[ + [], + dict[str, Any] | pa.StructScalar | pa.StructArray | pa.ChunkedArray, + ], + t: int | slice | Sequence[int] | None = None, + z: int | slice | Sequence[int] | None = None, + c: int | slice | Sequence[int] | None = None, + roi: tuple[int, int, int, int] | None = None, + roi3d: tuple[int, int, int, int, int, int] | None = None, + tile: tuple[int, int] | None = None, + layout: str | None = None, + dtype: np.dtype | None = None, + chunk_policy: Literal["auto", "combine", "keep"] = "auto", + channel_policy: Literal["error", "first"] = "error", + ) -> None: + """Initialize a deferred TensorView plan. + + Args: + loader: Callable that returns concrete OME-Arrow pixel data. + t: Time index selection. + z: Depth index selection. + c: Channel index selection. + roi: Spatial crop as ``(x, y, w, h)``. + roi3d: Spatial + depth crop as ``(x, y, z, w, h, d)``. + tile: Tile index as ``(tile_y, tile_x)``. + layout: Requested output layout (TZCHW letters). + dtype: Output dtype override. + chunk_policy: Chunk handling strategy for ChunkedArray inputs. + channel_policy: Behavior when dropping ``C`` from layout. + """ + self._loader = loader + self._kwargs: dict[str, Any] = { + "t": t, + "z": z, + "c": c, + "roi": roi, + "roi3d": roi3d, + "tile": tile, + "layout": layout, + "dtype": dtype, + "chunk_policy": chunk_policy, + "channel_policy": channel_policy, + } + self._resolved: TensorView | None = None + + def _spawn(self, **updates: Any) -> "LazyTensorView": + kwargs = dict(self._kwargs) + kwargs.update(updates) + return LazyTensorView(loader=self._loader, **kwargs) + + def collect(self) -> "TensorView": + """Materialize this lazy plan into a concrete TensorView.""" + if self._resolved is None: + self._resolved = TensorView(self._loader(), **self._kwargs) + return self._resolved + + def with_layout(self, layout: str) -> "LazyTensorView": + """Return a new lazy view with an updated layout.""" + return self._spawn(layout=layout) + + def select( + self, + *, + t: int | slice | Sequence[int] | None = None, + z: int | slice | Sequence[int] | None = None, + c: int | slice | Sequence[int] | None = None, + roi: tuple[int, int, int, int] | None = None, + roi3d: tuple[int, int, int, int, int, int] | None = None, + tile: tuple[int, int] | None = None, + ) -> "LazyTensorView": + """Return a new lazy plan with updated index/ROI selections.""" + return self._spawn(t=t, z=z, c=c, roi=roi, roi3d=roi3d, tile=tile) + + @property + def dtype(self) -> np.dtype: + """Return the tensor dtype.""" + return self.collect().dtype + + @property + def device(self) -> str: + """Return the tensor storage device.""" + return self.collect().device + + @property + def layout(self) -> str: + """Return the effective tensor layout.""" + return self.collect().layout + + @property + def shape(self) -> tuple[int, ...]: + """Return the tensor shape.""" + return self.collect().shape + + @property + def strides(self) -> tuple[int, ...]: + """Return tensor strides in bytes.""" + return self.collect().strides + + def to_numpy(self, *, contiguous: bool = False) -> np.ndarray: + """Materialize as a NumPy array. + + Args: + contiguous: When True, return a contiguous array copy. + + Returns: + np.ndarray: Materialized array. + """ + return self.collect().to_numpy(contiguous=contiguous) + + def to_dlpack( + self, + *, + device: str = "cpu", + contiguous: bool = True, + mode: str = "arrow", + ) -> Any: + """Export the planned view as a DLPack object. + + Args: + device: Target device (``"cpu"`` or ``"cuda"``). + contiguous: When True, materialize contiguous data when needed. + mode: Export mode (``"arrow"`` or ``"numpy"``). + + Returns: + Any: DLPack-compatible object. + """ + return self.collect().to_dlpack(device=device, contiguous=contiguous, mode=mode) + + def to_torch( + self, + *, + device: str = "cpu", + contiguous: bool = True, + mode: str = "arrow", + ) -> Any: + """Convert the planned view to a torch tensor. + + Args: + device: Target device (``"cpu"`` or ``"cuda"``). + contiguous: When True, materialize contiguous data when needed. + mode: Export mode (``"arrow"`` or ``"numpy"``). + + Returns: + Any: ``torch.Tensor`` when torch is installed. + """ + return self.collect().to_torch(device=device, contiguous=contiguous, mode=mode) + + def to_jax( + self, + *, + device: str = "cpu", + contiguous: bool = True, + mode: str = "arrow", + ) -> Any: + """Convert the planned view to a JAX array. + + Args: + device: Target device (``"cpu"`` or ``"cuda"``). + contiguous: When True, materialize contiguous data when needed. + mode: Export mode (``"arrow"`` or ``"numpy"``). + + Returns: + Any: JAX array when JAX is installed. + """ + return self.collect().to_jax(device=device, contiguous=contiguous, mode=mode) + + def iter_dlpack( + self, + *, + batch_size: int | None = None, + tile_size: tuple[int, int] | None = None, + tiles: tuple[int, int] | None = None, + shuffle: bool = False, + seed: int | None = None, + prefetch: int = 0, + device: str = "cpu", + contiguous: bool = True, + mode: str = "arrow", + ) -> Iterator[Any]: + """Iterate DLPack outputs in batches or 2D tiles. + + Args: + batch_size: Number of time indices per batch. + tile_size: Optional tile size as ``(tile_h, tile_w)``. + tiles: Deprecated alias for ``tile_size``. + shuffle: Whether to shuffle iteration order. + seed: Optional random seed for deterministic shuffling. + prefetch: Placeholder prefetch count. + device: Target device (``"cpu"`` or ``"cuda"``). + contiguous: When True, materialize contiguous data when needed. + mode: Export mode (``"arrow"`` or ``"numpy"``). + + Returns: + Iterator[Any]: Iterator of DLPack-compatible objects. + """ + return self.collect().iter_dlpack( + batch_size=batch_size, + tile_size=tile_size, + tiles=tiles, + shuffle=shuffle, + seed=seed, + prefetch=prefetch, + device=device, + contiguous=contiguous, + mode=mode, + ) + + def iter_tiles_3d( + self, + *, + tile_size: tuple[int, int, int], + shuffle: bool = False, + seed: int | None = None, + prefetch: int = 0, + device: str = "cpu", + contiguous: bool = True, + mode: str = "numpy", + ) -> Iterator[Any]: + """Iterate DLPack outputs in 3D tiles. + + Args: + tile_size: Tile shape as ``(tile_z, tile_h, tile_w)``. + shuffle: Whether to shuffle iteration order. + seed: Optional random seed for deterministic shuffling. + prefetch: Placeholder prefetch count. + device: Target device (``"cpu"`` or ``"cuda"``). + contiguous: When True, materialize contiguous data when needed. + mode: Export mode (currently ``"numpy"`` only). + + Returns: + Iterator[Any]: Iterator of DLPack-compatible objects. + """ + return self.collect().iter_tiles_3d( + tile_size=tile_size, + shuffle=shuffle, + seed=seed, + prefetch=prefetch, + device=device, + contiguous=contiguous, + mode=mode, + ) + + class TensorView: """View OME-Arrow pixel data as a tensor-like object. diff --git a/tests/test_core.py b/tests/test_core.py index 27f17a1..56515c0 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -408,3 +408,45 @@ def test_vortex_custom_column_name(tmp_path: pathlib.Path) -> None: reloaded = OMEArrow(str(out), column_name="custom_ome_arrow") assert reloaded.info() == oa.info() + + +def test_scan_collect_roundtrip() -> None: + """Materialize a lazily scanned parquet source via collect().""" + oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") + assert oa.is_lazy + + with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): + oa.collect() + assert not oa.is_lazy + assert oa.info()["shape"] == (1, 1, 1, 72, 84) + + +def test_slice_lazy_scan_collect() -> None: + """Queue a lazy slice and materialize it via collect().""" + oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") + sliced = oa.slice_lazy(0, 10, 0, 8) + + assert sliced.is_lazy + with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): + sliced.collect() + assert sliced.info()["shape"] == (1, 1, 1, 8, 10) + + +def test_slice_lazy_chain_scan_collect() -> None: + """Allow chaining lazy slices before materialization.""" + oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") + sliced = oa.slice_lazy(0, 20, 0, 20).slice_lazy(5, 15, 2, 12) + + with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): + shape = sliced.collect().info()["shape"] + assert shape == (1, 1, 1, 10, 10) + + +def test_slice_lazy_on_materialized_falls_back_to_eager() -> None: + """Use eager slice behavior when source is already materialized.""" + arr = np.arange(1 * 1 * 1 * 6 * 7, dtype=np.uint16).reshape(1, 1, 1, 6, 7) + oa = OMEArrow(arr) + out = oa.slice_lazy(1, 5, 1, 4) + + assert not out.is_lazy + assert out.info()["shape"] == (1, 1, 1, 3, 4) diff --git a/tests/test_tensor.py b/tests/test_tensor.py index 0fb870c..d787ccc 100644 --- a/tests/test_tensor.py +++ b/tests/test_tensor.py @@ -10,7 +10,7 @@ from ome_arrow import OMEArrow from ome_arrow.export import to_ome_parquet from ome_arrow.meta import OME_ARROW_STRUCT -from ome_arrow.tensor import TensorView +from ome_arrow.tensor import LazyTensorView, TensorView def _from_dlpack_capsule(capsule: object) -> np.ndarray: @@ -87,6 +87,28 @@ def test_tensor_view_chunk_policy_invalid(example_correct_data: dict) -> None: oa.tensor_view(chunk_policy="invalid") +def test_lazy_tensor_view_collects_on_execution() -> None: + """Defer source loading until lazy tensor view execution.""" + oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") + assert oa.is_lazy + + view = oa.tensor_view(t=0, z=0, c=0, layout="HW") + assert isinstance(view, LazyTensorView) + assert oa.is_lazy + + with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): + arr = view.to_numpy(contiguous=True) + assert arr.shape == (72, 84) + assert not oa.is_lazy + + +def test_lazy_reader_requires_string_source() -> None: + """Reject lazy mode for non-file inputs.""" + arr = np.zeros((1, 1, 1, 2, 2), dtype=np.uint16) + with pytest.raises(TypeError, match="lazy=True currently supports only string"): + OMEArrow(arr, lazy=True) + + def test_dlpack_roundtrip_torch(example_correct_data: dict) -> None: """Round-trip DLPack export/import through torch on CPU.""" torch = pytest.importorskip("torch") From e0609b09f018b3ac638d1ce17f253e17d9662767 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 19 Feb 2026 12:25:43 -0700 Subject: [PATCH 02/10] address copilot and coderabbit reviews --- README.md | 5 +++-- docs/src/dlpack.md | 4 ++-- src/ome_arrow/core.py | 30 +++++++++++++++++++++++++++++- src/ome_arrow/tensor.py | 28 +++++++++++++++++++++------- tests/test_core.py | 9 +++++++++ tests/test_tensor.py | 37 +++++++++++++++++++++++++++++++++++++ 6 files changed, 101 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a9bed3c..f7e8f7f 100644 --- a/README.md +++ b/README.md @@ -107,8 +107,9 @@ oa = OMEArrow.scan("your_image.ome.parquet") # deferred load lazy_crop = oa.slice_lazy(0, 512, 0, 512).slice_lazy(64, 256, 64, 256) cropped = lazy_crop.collect() -# Then: build lazy tensor views from a source scan -lazy_view = oa.tensor_view(t=0, z=slice(0, 4), roi=(0, 0, 512, 512)) +# slice_lazy returns a new OMEArrow plan; collect does not mutate `oa`. +# Build tensor_view from the returned sliced object to reuse that plan. +lazy_view = cropped.tensor_view(t=0, z=slice(0, 4), roi=(0, 0, 512, 512)) arr = lazy_view.to_numpy() ``` diff --git a/docs/src/dlpack.md b/docs/src/dlpack.md index aaf15b7..fbb1784 100644 --- a/docs/src/dlpack.md +++ b/docs/src/dlpack.md @@ -51,8 +51,8 @@ obj = OMEArrow.scan("example.ome.parquet") lazy_crop = obj.slice_lazy(0, 512, 0, 512).slice_lazy(64, 256, 64, 256) cropped = lazy_crop.collect() -# Then execute lazy tensor selections. -lazy_view = obj.tensor_view(t=0, z=slice(0, 8), roi=(128, 128, 256, 256)) +# Then execute tensor selections on the sliced result. +lazy_view = cropped.tensor_view(t=0, z=slice(0, 8), roi=(128, 128, 256, 256)) arr = lazy_view.to_numpy() ``` diff --git a/src/ome_arrow/core.py b/src/ome_arrow/core.py index e7f40c1..30eef38 100644 --- a/src/ome_arrow/core.py +++ b/src/ome_arrow/core.py @@ -122,6 +122,12 @@ def __init__( if lazy: if not isinstance(data, str): raise TypeError("lazy=True currently supports only string file inputs.") + if any(c in data for c in "<>*"): + raise TypeError( + "lazy=True does not support Bio-Formats pattern strings. " + "Use OMEArrow(..., lazy=False) for pattern ingestion via " + "from_stack_pattern_path." + ) self._lazy_source = _LazySourceSpec( data=data, column_name=column_name, @@ -316,6 +322,9 @@ def _ensure_materialized(self) -> None: if self._lazy_source is None: return lazy_source = self._lazy_source + # Intentionally do not clear `_lazy_source` / `_lazy_slices` before load. + # If `_load_from_string_source(...)` raises, lazy state is preserved so + # callers can inspect/retry without losing the deferred plan. scalar, struct_array = self._load_from_string_source( lazy_source.data, column_name=lazy_source.column_name, @@ -336,10 +345,16 @@ def _ensure_materialized(self) -> None: z_indices=spec.z_indices, fill_missing=spec.fill_missing, ) + # Applying lazy slices via `slice_ome_arrow` materializes through a + # StructScalar path, so we intentionally drop `_struct_array` here. + # Consequence: Arrow-backed zero-copy tensor paths + # (for example `tensor_view(...).to_dlpack(mode="arrow")`) are not + # available after lazy slicing. self.data = data self._struct_array = None else: self.data, self._struct_array = scalar, struct_array + # Lazy state is cleared only after a successful materialization. self._lazy_source = None self._lazy_slices = [] @@ -816,8 +831,21 @@ def slice_lazy( """Return a lazily planned slice, collected on first execution. For lazy sources created with ``OMEArrow.scan(...)``, this queues a - deferred slice operation and returns a new lazy OMEArrow plan. + deferred slice operation and returns a new lazy OMEArrow plan produced + from ``OMEArrow.scan(...)``. For already materialized sources, this falls back to eager ``slice()``. + This method does not mutate ``self``. + + Notes: + ``slice_lazy`` always returns a new plan object. Internally, the + returned plan gets a fresh ``_lazy_slices`` list + (``[*self._lazy_slices, new_slice]``), so chained plans do not + share mutable slice state with the original ``OMEArrow``. + A common footgun is: + ``oa.slice_lazy(...).collect()`` followed by ``oa.tensor_view(...)``. + Those calls can load/materialize the same source twice because + ``oa`` remains the original plan. For a single-load workflow, keep + working from the value returned by ``slice_lazy`` / ``collect``. Args: x_min: Inclusive minimum X index for the crop. diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index 699d901..0aaf46c 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -17,6 +17,7 @@ _TZCHW = "TZCHW" _ALLOWED_DIMS = set(_TZCHW) _ALLOWED_MODES = {"arrow", "numpy"} +_UNSET = object() @dataclass(frozen=True) @@ -107,15 +108,28 @@ def with_layout(self, layout: str) -> "LazyTensorView": def select( self, *, - t: int | slice | Sequence[int] | None = None, - z: int | slice | Sequence[int] | None = None, - c: int | slice | Sequence[int] | None = None, - roi: tuple[int, int, int, int] | None = None, - roi3d: tuple[int, int, int, int, int, int] | None = None, - tile: tuple[int, int] | None = None, + t: Any = _UNSET, + z: Any = _UNSET, + c: Any = _UNSET, + roi: Any = _UNSET, + roi3d: Any = _UNSET, + tile: Any = _UNSET, ) -> "LazyTensorView": """Return a new lazy plan with updated index/ROI selections.""" - return self._spawn(t=t, z=z, c=c, roi=roi, roi3d=roi3d, tile=tile) + updates = {} + if t is not _UNSET: + updates["t"] = t + if z is not _UNSET: + updates["z"] = z + if c is not _UNSET: + updates["c"] = c + if roi is not _UNSET: + updates["roi"] = roi + if roi3d is not _UNSET: + updates["roi3d"] = roi3d + if tile is not _UNSET: + updates["tile"] = tile + return self._spawn(**updates) @property def dtype(self) -> np.dtype: diff --git a/tests/test_core.py b/tests/test_core.py index 56515c0..fa0e999 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -450,3 +450,12 @@ def test_slice_lazy_on_materialized_falls_back_to_eager() -> None: assert not out.is_lazy assert out.info()["shape"] == (1, 1, 1, 3, 4) + + +def test_scan_stack_pattern_rejected_in_lazy_mode() -> None: + """Reject Bio-Formats stack pattern strings when lazy scan is requested.""" + pattern = "tests/data/nviz-artificial-4d-dataset/E99_C<111,222>_ZS<000-021>.tif" + with pytest.raises( + TypeError, match="lazy=True does not support Bio-Formats pattern strings" + ): + OMEArrow.scan(pattern) diff --git a/tests/test_tensor.py b/tests/test_tensor.py index d787ccc..ece02fc 100644 --- a/tests/test_tensor.py +++ b/tests/test_tensor.py @@ -109,6 +109,43 @@ def test_lazy_reader_requires_string_source() -> None: OMEArrow(arr, lazy=True) +def test_lazy_tensor_view_select_preserves_existing_dims() -> None: + """Preserve existing lazy selections when select() updates one axis.""" + oa = OMEArrow.scan( + "tests/data/ome-artificial-5d-datasets/multi-channel-time-series.ome.tiff" + ) + assert oa.is_lazy + + view = oa.tensor_view(t=2, c=1) + assert isinstance(view, LazyTensorView) + + view_z = view.select(z=0) + assert isinstance(view_z, LazyTensorView) + assert oa.is_lazy + + arr = view_z.to_numpy(contiguous=True) + assert arr.shape == (1, 167, 439) + assert not oa.is_lazy + + +def test_lazy_tensor_view_with_layout_defers_materialization() -> None: + """Update layout lazily and materialize only on execution.""" + oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") + assert oa.is_lazy + + view = oa.tensor_view(t=0, z=0, c=0) + view_hw = view.with_layout("HW") + + assert isinstance(view_hw, LazyTensorView) + assert view_hw._kwargs["layout"] == "HW" + assert oa.is_lazy + + with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): + arr = view_hw.to_numpy(contiguous=True) + assert arr.shape == (72, 84) + assert not oa.is_lazy + + def test_dlpack_roundtrip_torch(example_correct_data: dict) -> None: """Round-trip DLPack export/import through torch on CPU.""" torch = pytest.importorskip("torch") From d214c7e251ac92b6972a57ff77246f53d067d3af Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 19 Feb 2026 12:29:31 -0700 Subject: [PATCH 03/10] ignore warnings --- tests/test_tensor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_tensor.py b/tests/test_tensor.py index ece02fc..7010da0 100644 --- a/tests/test_tensor.py +++ b/tests/test_tensor.py @@ -109,6 +109,9 @@ def test_lazy_reader_requires_string_source() -> None: OMEArrow(arr, lazy=True) +@pytest.mark.filterwarnings( + "ignore:As of version 0.4.0, the parser argument is ignored.*:DeprecationWarning" +) def test_lazy_tensor_view_select_preserves_existing_dims() -> None: """Preserve existing lazy selections when select() updates one axis.""" oa = OMEArrow.scan( From df6f1b9832ad08141c2e0af37e34f7163e8ea77f Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 20 Feb 2026 08:15:40 -0700 Subject: [PATCH 04/10] address coderabbit review comments --- .pre-commit-config.yaml | 2 +- README.md | 4 ++-- docs/src/dlpack.md | 4 ++-- src/ome_arrow/core.py | 6 +++++- src/ome_arrow/tensor.py | 35 ++++++++++++++++++++++++++++++----- tests/test_tensor.py | 16 +++++++++++----- 6 files changed, 51 insertions(+), 16 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 34f8f7b..5d15765 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: - id: yamllint exclude: pre-commit-config.yaml - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.15.1" + rev: "v0.15.2" hooks: - id: ruff-format - id: ruff-check diff --git a/README.md b/README.md index f7e8f7f..b504f4f 100644 --- a/README.md +++ b/README.md @@ -109,8 +109,8 @@ cropped = lazy_crop.collect() # slice_lazy returns a new OMEArrow plan; collect does not mutate `oa`. # Build tensor_view from the returned sliced object to reuse that plan. -lazy_view = cropped.tensor_view(t=0, z=slice(0, 4), roi=(0, 0, 512, 512)) -arr = lazy_view.to_numpy() +tensor_view_result = cropped.tensor_view(t=0, z=slice(0, 4), roi=(0, 0, 192, 192)) +arr = tensor_view_result.to_numpy() ``` Advanced options: diff --git a/docs/src/dlpack.md b/docs/src/dlpack.md index fbb1784..f5e191e 100644 --- a/docs/src/dlpack.md +++ b/docs/src/dlpack.md @@ -52,8 +52,8 @@ lazy_crop = obj.slice_lazy(0, 512, 0, 512).slice_lazy(64, 256, 64, 256) cropped = lazy_crop.collect() # Then execute tensor selections on the sliced result. -lazy_view = cropped.tensor_view(t=0, z=slice(0, 8), roi=(128, 128, 256, 256)) -arr = lazy_view.to_numpy() +tensor_view = cropped.tensor_view(t=0, z=slice(0, 8), roi=(64, 64, 128, 128)) +arr = tensor_view.to_numpy() ``` ## JAX diff --git a/src/ome_arrow/core.py b/src/ome_arrow/core.py index 30eef38..a9163ea 100644 --- a/src/ome_arrow/core.py +++ b/src/ome_arrow/core.py @@ -360,7 +360,11 @@ def _ensure_materialized(self) -> None: def _tensor_source(self) -> pa.StructScalar | pa.StructArray: self._ensure_materialized() - return self._struct_array if self._struct_array is not None else self.data + if self._struct_array is not None: + return self._struct_array + if self._data is None: + raise RuntimeError("OMEArrow data is not initialized.") + return self._data @staticmethod def _wrap_with_image_type( diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index 0aaf46c..b8e48f4 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -133,27 +133,52 @@ def select( @property def dtype(self) -> np.dtype: - """Return the tensor dtype.""" + """Return the tensor dtype. + + Note: + Accessing this property calls ``collect()`` and may materialize data + from source files (for example Parquet/TIFF), which can be expensive. + """ return self.collect().dtype @property def device(self) -> str: - """Return the tensor storage device.""" + """Return the tensor storage device. + + Note: + Accessing this property calls ``collect()`` and may materialize data + from source files (for example Parquet/TIFF), which can be expensive. + """ return self.collect().device @property def layout(self) -> str: - """Return the effective tensor layout.""" + """Return the effective tensor layout. + + Note: + Accessing this property calls ``collect()`` and may materialize data + from source files (for example Parquet/TIFF), which can be expensive. + """ return self.collect().layout @property def shape(self) -> tuple[int, ...]: - """Return the tensor shape.""" + """Return the tensor shape. + + Note: + Accessing this property calls ``collect()`` and may materialize data + from source files (for example Parquet/TIFF), which can be expensive. + """ return self.collect().shape @property def strides(self) -> tuple[int, ...]: - """Return tensor strides in bytes.""" + """Return tensor strides in bytes. + + Note: + Accessing this property calls ``collect()`` and may materialize data + from source files (for example Parquet/TIFF), which can be expensive. + """ return self.collect().strides def to_numpy(self, *, contiguous: bool = False) -> np.ndarray: diff --git a/tests/test_tensor.py b/tests/test_tensor.py index 7010da0..bdb8c2f 100644 --- a/tests/test_tensor.py +++ b/tests/test_tensor.py @@ -87,7 +87,9 @@ def test_tensor_view_chunk_policy_invalid(example_correct_data: dict) -> None: oa.tensor_view(chunk_policy="invalid") -def test_lazy_tensor_view_collects_on_execution() -> None: +def test_lazy_tensor_view_collects_on_execution( + recwarn: pytest.WarningsRecorder, +) -> None: """Defer source loading until lazy tensor view execution.""" oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") assert oa.is_lazy @@ -96,10 +98,13 @@ def test_lazy_tensor_view_collects_on_execution() -> None: assert isinstance(view, LazyTensorView) assert oa.is_lazy - with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): - arr = view.to_numpy(contiguous=True) + arr = view.to_numpy(contiguous=True) assert arr.shape == (72, 84) assert not oa.is_lazy + if recwarn: + assert any( + "Requested column 'ome_arrow'" in str(w.message) for w in recwarn.list + ) def test_lazy_reader_requires_string_source() -> None: @@ -140,11 +145,12 @@ def test_lazy_tensor_view_with_layout_defers_materialization() -> None: view_hw = view.with_layout("HW") assert isinstance(view_hw, LazyTensorView) - assert view_hw._kwargs["layout"] == "HW" assert oa.is_lazy with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): - arr = view_hw.to_numpy(contiguous=True) + concrete = view_hw.collect() + assert concrete.layout == "HW" + arr = concrete.to_numpy(contiguous=True) assert arr.shape == (72, 84) assert not oa.is_lazy From 66f090572746f2065603cc9fd857453b60942708 Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 20 Feb 2026 08:25:58 -0700 Subject: [PATCH 05/10] address coderabbit review comment --- src/ome_arrow/tensor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index b8e48f4..4bb8be6 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -159,6 +159,9 @@ def layout(self) -> str: Accessing this property calls ``collect()`` and may materialize data from source files (for example Parquet/TIFF), which can be expensive. """ + layout = self._kwargs.get("layout") + if layout is not None: + return layout return self.collect().layout @property From 57f2941765b8c0d5d5a5a55c160e237eee1fa7bb Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 20 Feb 2026 08:46:19 -0700 Subject: [PATCH 06/10] address coderabbit comments --- src/ome_arrow/tensor.py | 43 ++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index 4bb8be6..6da9911 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -3,6 +3,7 @@ from __future__ import annotations import random +import threading import warnings from dataclasses import dataclass from typing import Any, Callable, Iterator, List, Literal, Sequence @@ -17,7 +18,13 @@ _TZCHW = "TZCHW" _ALLOWED_DIMS = set(_TZCHW) _ALLOWED_MODES = {"arrow", "numpy"} -_UNSET = object() + + +class _Unset: + """Typed sentinel for arguments that were not provided.""" + + +_UNSET: _Unset = _Unset() @dataclass(frozen=True) @@ -89,6 +96,7 @@ def __init__( "channel_policy": channel_policy, } self._resolved: TensorView | None = None + self._collect_lock = threading.Lock() def _spawn(self, **updates: Any) -> "LazyTensorView": kwargs = dict(self._kwargs) @@ -97,9 +105,16 @@ def _spawn(self, **updates: Any) -> "LazyTensorView": def collect(self) -> "TensorView": """Materialize this lazy plan into a concrete TensorView.""" - if self._resolved is None: - self._resolved = TensorView(self._loader(), **self._kwargs) - return self._resolved + resolved = self._resolved + if resolved is not None: + return resolved + + with self._collect_lock: + resolved = self._resolved + if resolved is None: + resolved = TensorView(self._loader(), **self._kwargs) + self._resolved = resolved + return resolved def with_layout(self, layout: str) -> "LazyTensorView": """Return a new lazy view with an updated layout.""" @@ -108,12 +123,12 @@ def with_layout(self, layout: str) -> "LazyTensorView": def select( self, *, - t: Any = _UNSET, - z: Any = _UNSET, - c: Any = _UNSET, - roi: Any = _UNSET, - roi3d: Any = _UNSET, - tile: Any = _UNSET, + t: int | slice | Sequence[int] | None | _Unset = _UNSET, + z: int | slice | Sequence[int] | None | _Unset = _UNSET, + c: int | slice | Sequence[int] | None | _Unset = _UNSET, + roi: tuple[int, int, int, int] | None | _Unset = _UNSET, + roi3d: tuple[int, int, int, int, int, int] | None | _Unset = _UNSET, + tile: tuple[int, int] | None | _Unset = _UNSET, ) -> "LazyTensorView": """Return a new lazy plan with updated index/ROI selections.""" updates = {} @@ -146,10 +161,12 @@ def device(self) -> str: """Return the tensor storage device. Note: - Accessing this property calls ``collect()`` and may materialize data - from source files (for example Parquet/TIFF), which can be expensive. + For unresolved lazy plans, this returns ``"cpu"`` without calling + ``collect()``. """ - return self.collect().device + if self._resolved is None: + return "cpu" + return self._resolved.device @property def layout(self) -> str: From b1401be08960e9f900d51f8538d75b14a7fa8478 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 25 Feb 2026 09:11:13 -0700 Subject: [PATCH 07/10] address mikes review comments Co-Authored-By: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- README.md | 6 +- docs/src/dlpack.md | 23 +++---- src/ome_arrow/core.py | 15 ++++- src/ome_arrow/tensor.py | 141 +++++++++++++++++++++++++++++++++++++--- tests/test_core.py | 32 +++++++++ tests/test_tensor.py | 98 +++++++++++++++++++++------- 7 files changed, 269 insertions(+), 48 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5d15765..1b3e714 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: - id: check-yaml - id: detect-private-key - repo: https://github.com/tox-dev/pyproject-fmt - rev: "v2.16.1" + rev: "v2.16.2" hooks: - id: pyproject-fmt - repo: https://github.com/citation-file-format/cffconvert diff --git a/README.md b/README.md index b504f4f..d03c40e 100644 --- a/README.md +++ b/README.md @@ -86,11 +86,11 @@ from ome_arrow import OMEArrow oa = OMEArrow("your_image.ome.parquet") -# Spatial ROI per plane -view = oa.tensor_view(t=0, z=0, roi=(32, 32, 128, 128), layout="CHW") +# Spatial ROI per plane (YX convention) +view = oa.tensor_view(t=0, z=0, roi=(32, 32, 128, 128), layout="CYX") # Convenience 3D ROI (x, y, z, w, h, d) -view3d = oa.tensor_view(roi3d=(32, 32, 2, 128, 128, 4), layout="TZCHW") +view3d = oa.tensor_view(roi3d=(32, 32, 2, 128, 128, 4), layout="TZCYX") # 3D tiled iteration over (z, y, x) for cap in view3d.iter_tiles_3d(tile_size=(2, 64, 64), mode="numpy"): diff --git a/docs/src/dlpack.md b/docs/src/dlpack.md index f5e191e..86f19ed 100644 --- a/docs/src/dlpack.md +++ b/docs/src/dlpack.md @@ -7,23 +7,24 @@ and (optionally) GPU. Key defaults: - OME-Arrow tensor layouts always include channels (`C`) as a tensor axis. -- Default layout is `CHW` when both `T` and `Z` are singleton in the source. -- Otherwise, default layout is `TZCHW` (with singleton `T`/`Z` retained unless you override layout). -- You can override with any valid TZCHW permutation/subset, for example `HWC`, `ZCHW`, or `CHW`. +- Default layout is `CHW` (equivalent to `CYX`) when both `T` and `Z` are singleton in the source. +- Otherwise, default layout is `TZCHW` (equivalent to `TZCYX`, with singleton `T`/`Z` retained unless you override layout). +- You can override with any valid TZCHW/TZCYX permutation/subset, for example `YXC`, `ZCYX`, or `CYX`. Layout nomenclature: - `T`: time index - `Z`: z/depth index - `C`: channel index -- `H`: image height (Y axis) -- `W`: image width (X axis) +- `Y`: image row axis (height) +- `X`: image column axis (width) + (`H/W` aliases are also accepted for compatibility). Practical mapping: -- 2D image content (`YX`) is typically exposed as `CHW`. -- 3D z-stack content (`ZYX`) is typically exposed as `ZCHW` or `TZCHW` (with `T=1`). -- Time-lapse and volumetric content use `TZCHW` by default. +- 2D image content (`YX`) is typically exposed as `CYX`. +- 3D z-stack content (`ZYX`) is typically exposed as `ZCYX` or `TZCYX` (with `T=1`). +- Time-lapse and volumetric content use `TZCYX`/`TZCHW` by default. ## PyTorch @@ -62,7 +63,7 @@ arr = tensor_view.to_numpy() from ome_arrow import OMEArrow obj = OMEArrow("example.ome.parquet") -view = obj.tensor_view(t=0, z=0, c=0, layout="CHW") +view = obj.tensor_view(t=0, z=0, c=0, layout="CYX") import jax.numpy as jnp @@ -83,7 +84,7 @@ view = obj.tensor_view() # Batch over time (T) dimension. for cap in view.iter_dlpack(batch_size=2, shuffle=False, mode="numpy"): batch = np.from_dlpack(cap) - # batch shape: (batch, Z, C, H, W) in TZCHW layout + # batch shape: (batch, Z, C, Y, X) in TZCYX layout ``` ```python @@ -98,7 +99,7 @@ for cap in view.iter_dlpack( tile_size=(256, 256), shuffle=True, seed=123, mode="numpy" ): tile = np.from_dlpack(cap) - # tile shape: (C, H, W) in CHW layout + # tile shape: (C, Y, X) in CYX layout ``` ## Ownership and lifetime diff --git a/src/ome_arrow/core.py b/src/ome_arrow/core.py index a9163ea..207b708 100644 --- a/src/ome_arrow/core.py +++ b/src/ome_arrow/core.py @@ -704,6 +704,8 @@ def tensor_view( c: int | slice | Sequence[int] | None = None, roi: tuple[int, int, int, int] | None = None, roi3d: tuple[int, int, int, int, int, int] | None = None, + roi_nd: tuple[int, ...] | None = None, + roi_type: Literal["2d", "2d_timelapse", "3d", "4d"] | None = None, tile: tuple[int, int] | None = None, layout: str | None = None, dtype: np.dtype | None = None, @@ -721,9 +723,13 @@ def tensor_view( roi3d: Spatial + depth crop (x, y, z, w, h, d) in pixels/planes. This is a convenience alias for ``roi=(x, y, w, h)`` and ``z=slice(z, z + d)``. + roi_nd: General ROI tuple with min/max bounds. + roi_type: ROI interpretation mode for ``roi_nd``. Supported values: + ``"2d"``, ``"2d_timelapse"``, ``"3d"``, and ``"4d"``. tile: Tile index (tile_y, tile_x) based on chunk grid. - layout: Desired layout string using TZCHW letters where - T=time, Z=depth, C=channel, H=height (Y), W=width (X). + layout: Desired layout string using `TZCYX` letters where + T=time, Z=depth, C=channel, Y=row axis, X=column axis. + `TZCHW` aliases are also accepted for compatibility. dtype: Output dtype override. chunk_policy: Handling for ``pyarrow.ChunkedArray`` inputs. channel_policy: Behavior when dropping `C` from layout while @@ -750,6 +756,8 @@ def tensor_view( c=c, roi=roi, roi3d=roi3d, + roi_nd=roi_nd, + roi_type=roi_type, tile=tile, layout=layout, dtype=dtype, @@ -759,6 +767,7 @@ def tensor_view( # TensorView uses an internal canonical axis basis (TZCHW) for shape/stride # math, then applies the requested layout permutation for output. + # Public layout examples prefer TZCYX (Y/X), with H/W accepted as aliases. return TensorView( self._struct_array if self._struct_array is not None else self.data, t=t, @@ -766,6 +775,8 @@ def tensor_view( c=c, roi=roi, roi3d=roi3d, + roi_nd=roi_nd, + roi_type=roi_type, tile=tile, layout=layout, dtype=dtype, diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index 6da9911..b52f489 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -18,6 +18,8 @@ _TZCHW = "TZCHW" _ALLOWED_DIMS = set(_TZCHW) _ALLOWED_MODES = {"arrow", "numpy"} +_ROI_TYPES = {"2d", "2d_timelapse", "3d", "4d"} +_LAYOUT_ALIASES = str.maketrans({"Y": "H", "X": "W"}) class _Unset: @@ -61,6 +63,8 @@ def __init__( c: int | slice | Sequence[int] | None = None, roi: tuple[int, int, int, int] | None = None, roi3d: tuple[int, int, int, int, int, int] | None = None, + roi_nd: tuple[int, ...] | None = None, + roi_type: Literal["2d", "2d_timelapse", "3d", "4d"] | None = None, tile: tuple[int, int] | None = None, layout: str | None = None, dtype: np.dtype | None = None, @@ -76,8 +80,11 @@ def __init__( c: Channel index selection. roi: Spatial crop as ``(x, y, w, h)``. roi3d: Spatial + depth crop as ``(x, y, z, w, h, d)``. + roi_nd: General ROI tuple with min/max bounds. + roi_type: ROI interpretation mode for ``roi_nd``. tile: Tile index as ``(tile_y, tile_x)``. - layout: Requested output layout (TZCHW letters). + layout: Requested output layout (`TZCYX` preferred, `TZCHW` + aliases accepted). dtype: Output dtype override. chunk_policy: Chunk handling strategy for ChunkedArray inputs. channel_policy: Behavior when dropping ``C`` from layout. @@ -89,6 +96,8 @@ def __init__( "c": c, "roi": roi, "roi3d": roi3d, + "roi_nd": roi_nd, + "roi_type": roi_type, "tile": tile, "layout": layout, "dtype": dtype, @@ -128,6 +137,8 @@ def select( c: int | slice | Sequence[int] | None | _Unset = _UNSET, roi: tuple[int, int, int, int] | None | _Unset = _UNSET, roi3d: tuple[int, int, int, int, int, int] | None | _Unset = _UNSET, + roi_nd: tuple[int, ...] | None | _Unset = _UNSET, + roi_type: Literal["2d", "2d_timelapse", "3d", "4d"] | None | _Unset = _UNSET, tile: tuple[int, int] | None | _Unset = _UNSET, ) -> "LazyTensorView": """Return a new lazy plan with updated index/ROI selections.""" @@ -142,6 +153,10 @@ def select( updates["roi"] = roi if roi3d is not _UNSET: updates["roi3d"] = roi3d + if roi_nd is not _UNSET: + updates["roi_nd"] = roi_nd + if roi_type is not _UNSET: + updates["roi_type"] = roi_type if tile is not _UNSET: updates["tile"] = tile return self._spawn(**updates) @@ -358,9 +373,19 @@ class TensorView: roi3d: Spatial + depth crop (x, y, z, w, h, d). This is a convenience alias for ``roi=(x, y, w, h)`` and ``z=slice(z, z + d)``. + roi_nd: General ROI tuple with min/max bounds, interpreted by + ``roi_type``. + roi_type: ROI interpretation mode for ``roi_nd``. Supported values: + ``"2d"`` = ``(ymin, xmin, ymax, xmax)``; + ``"2d_timelapse"`` = + ``(tmin, ymin, xmin, tmax, ymax, xmax)``; + ``"3d"`` = ``(zmin, ymin, xmin, zmax, ymax, xmax)``; + ``"4d"`` = + ``(tmin, zmin, ymin, xmin, tmax, zmax, ymax, xmax)``. tile: Tile index (tile_y, tile_x) based on chunk grid. - layout: Desired layout string using TZCHW letters where - T=time, Z=depth, C=channel, H=height (Y), W=width (X). + layout: Desired layout string using `TZCYX` letters where + T=time, Z=depth, C=channel, Y=row axis, X=column axis. + `TZCHW` aliases are also accepted for compatibility. dtype: Output dtype override. Defaults to pixels_meta.type when valid. chunk_policy: Handling for ``pyarrow.ChunkedArray`` inputs. "auto" keeps multi-chunk arrays and unwraps single-chunk arrays. @@ -380,6 +405,8 @@ def __init__( c: int | slice | Sequence[int] | None = None, roi: tuple[int, int, int, int] | None = None, roi3d: tuple[int, int, int, int, int, int] | None = None, + roi_nd: tuple[int, ...] | None = None, + roi_type: Literal["2d", "2d_timelapse", "3d", "4d"] | None = None, tile: tuple[int, int] | None = None, layout: str | None = None, dtype: np.dtype | None = None, @@ -397,9 +424,13 @@ def __init__( roi3d: Spatial + depth crop (x, y, z, w, h, d). This is a convenience alias for ``roi=(x, y, w, h)`` and ``z=slice(z, z + d)``. + roi_nd: General ROI tuple with min/max bounds, interpreted by + ``roi_type``. + roi_type: ROI interpretation mode for ``roi_nd``. tile: Tile index (tile_y, tile_x) derived from chunk_grid. - layout: Desired layout string using TZCHW letters where - T=time, Z=depth, C=channel, H=height (Y), W=width (X). + layout: Desired layout string using `TZCYX` letters where + T=time, Z=depth, C=channel, Y=row axis, X=column axis. + `TZCHW` aliases are also accepted for compatibility. dtype: Output dtype override. chunk_policy: Handling for ``pyarrow.ChunkedArray`` inputs. channel_policy: Behavior when dropping `C` from layout while @@ -448,7 +479,14 @@ def __init__( self._dtype = np.dtype(dtype) self._selection = self._normalize_selection( - t=t, z=z, c=c, roi=roi, roi3d=roi3d, tile=tile + t=t, + z=z, + c=c, + roi=roi, + roi3d=roi3d, + roi_nd=roi_nd, + roi_type=roi_type, + tile=tile, ) self._array: np.ndarray | None = None self._array_layout: str | None = None @@ -492,8 +530,9 @@ def with_layout(self, layout: str) -> "TensorView": """Return a new TensorView with a layout override. Args: - layout: Desired layout string using TZCHW letters where - T=time, Z=depth, C=channel, H=height (Y), W=width (X). + layout: Desired layout string using `TZCYX` letters where + T=time, Z=depth, C=channel, Y=row axis, X=column axis. + `TZCHW` aliases are also accepted for compatibility. Returns: TensorView: New view with the requested layout. @@ -910,8 +949,29 @@ def _normalize_selection( c: int | slice | Sequence[int] | None, roi: tuple[int, int, int, int] | None, roi3d: tuple[int, int, int, int, int, int] | None, + roi_nd: tuple[int, ...] | None, + roi_type: Literal["2d", "2d_timelapse", "3d", "4d"] | None, tile: tuple[int, int] | None, ) -> _Selection: + if roi_nd is not None: + if roi is not None or roi3d is not None or tile is not None: + raise ValueError("Provide only one of roi_nd, roi3d, roi, or tile.") + roi, t_from_roi, z_from_roi = self._parse_roi_nd(roi_nd, roi_type) + if t_from_roi is not None: + if t is not None: + raise ValueError( + "Provide either t or roi_nd time bounds, not both." + ) + t = t_from_roi + if z_from_roi is not None: + if z is not None: + raise ValueError( + "Provide either z or roi_nd depth bounds, not both." + ) + z = z_from_roi + elif roi_type is not None: + raise ValueError("roi_type requires roi_nd.") + if roi3d is not None: if roi is not None or tile is not None: raise ValueError("Provide only one of roi3d, roi, or tile.") @@ -951,6 +1011,65 @@ def _normalize_selection( return _Selection(t=t_idx, z=z_idx, c=c_idx, roi=roi) + def _parse_roi_nd( + self, + roi_nd: tuple[int, ...], + roi_type: Literal["2d", "2d_timelapse", "3d", "4d"] | None, + ) -> tuple[tuple[int, int, int, int], slice | None, slice | None]: + vals = tuple(int(v) for v in roi_nd) + if roi_type is None: + if len(vals) == 4: + roi_type = "2d" + elif len(vals) == 8: + roi_type = "4d" + elif len(vals) == 6: + raise ValueError( + "roi_nd with 6 values is ambiguous; provide roi_type " + "('2d_timelapse' or '3d')." + ) + else: + raise ValueError("roi_nd must have length 4, 6, or 8.") + if roi_type not in _ROI_TYPES: + raise ValueError(f"Unsupported roi_type: {roi_type!r}.") + + t_sel: slice | None = None + z_sel: slice | None = None + + if roi_type == "2d": + if len(vals) != 4: + raise ValueError("roi_type='2d' requires 4 values.") + ymin, xmin, ymax, xmax = vals + elif roi_type == "2d_timelapse": + if len(vals) != 6: + raise ValueError("roi_type='2d_timelapse' requires 6 values.") + tmin, ymin, xmin, tmax, ymax, xmax = vals + if tmin < 0 or tmax > self._size_t or tmax <= tmin: + raise ValueError("roi_nd time bounds are out of range.") + t_sel = slice(tmin, tmax) + elif roi_type == "3d": + if len(vals) != 6: + raise ValueError("roi_type='3d' requires 6 values.") + zmin, ymin, xmin, zmax, ymax, xmax = vals + if zmin < 0 or zmax > self._size_z or zmax <= zmin: + raise ValueError("roi_nd depth bounds are out of range.") + z_sel = slice(zmin, zmax) + else: + if len(vals) != 8: + raise ValueError("roi_type='4d' requires 8 values.") + tmin, zmin, ymin, xmin, tmax, zmax, ymax, xmax = vals + if tmin < 0 or tmax > self._size_t or tmax <= tmin: + raise ValueError("roi_nd time bounds are out of range.") + if zmin < 0 or zmax > self._size_z or zmax <= zmin: + raise ValueError("roi_nd depth bounds are out of range.") + t_sel = slice(tmin, tmax) + z_sel = slice(zmin, zmax) + + x, y = int(xmin), int(ymin) + w, h = int(xmax - xmin), int(ymax - ymin) + if w <= 0 or h <= 0: + raise ValueError("roi_nd spatial bounds must satisfy max > min.") + return (x, y, w, h), t_sel, z_sel + def _data_py_dict(self) -> dict[str, Any]: """Return the backing record as a Python dict. @@ -1117,8 +1236,12 @@ def _normalize_layout(layout: str | None) -> str | None: layout = layout.strip().upper() if not layout: raise ValueError("layout must be non-empty") + layout = layout.translate(_LAYOUT_ALIASES) if any(dim not in _ALLOWED_DIMS for dim in layout): - raise ValueError("layout must use only TZCHW letters") + raise ValueError( + "layout must use only TZC + (Y/X or H/W) letters " + "(for example CYX, YXC, TZCYX)." + ) if len(set(layout)) != len(layout): raise ValueError("layout cannot repeat dimensions") return layout diff --git a/tests/test_core.py b/tests/test_core.py index fa0e999..f85f5c0 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -421,6 +421,38 @@ def test_scan_collect_roundtrip() -> None: assert oa.info()["shape"] == (1, 1, 1, 72, 84) +@pytest.mark.parametrize( + ("input_data", "expected_shape"), + [ + ( + "tests/data/ome-artificial-5d-datasets/single-channel.ome.tiff", + (1, 1, 1, 167, 439), + ), # 2D + ( + "tests/data/ome-artificial-5d-datasets/time-series.ome.tif", + (7, 1, 1, 167, 439), + ), # 2D timelapse + ( + "tests/data/ome-artificial-5d-datasets/z-series.ome.tiff", + (1, 1, 5, 167, 439), + ), # 3D + ], +) +@pytest.mark.filterwarnings( + "ignore:As of version 0.4.0, the parser argument is ignored.*:DeprecationWarning" +) +def test_scan_collect_roundtrip_non4d( + input_data: str, expected_shape: tuple[int, int, int, int, int] +) -> None: + """Materialize lazy scans for non-4D sources and preserve shapes.""" + oa = OMEArrow.scan(input_data) + assert oa.is_lazy + + info = oa.collect().info() + assert not oa.is_lazy + assert info["shape"] == expected_shape + + def test_slice_lazy_scan_collect() -> None: """Queue a lazy slice and materialize it via collect().""" oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") diff --git a/tests/test_tensor.py b/tests/test_tensor.py index bdb8c2f..4f7686c 100644 --- a/tests/test_tensor.py +++ b/tests/test_tensor.py @@ -53,15 +53,15 @@ def test_tensor_view_layout_and_values(example_correct_data: dict) -> None: ) np.testing.assert_array_equal(arr, expected) - view_hwc = oa.tensor_view(t=0, z=0, layout="HWC") - arr_hwc = view_hwc.to_numpy(contiguous=False) - assert arr_hwc.shape == (3, 4, 2) - assert arr_hwc[0, 0, 0] == expected[0, 0, 0] - assert arr_hwc[0, 0, 1] == expected[1, 0, 0] - assert not arr_hwc.flags["C_CONTIGUOUS"] + view_yxc = oa.tensor_view(t=0, z=0, layout="YXC") + arr_yxc = view_yxc.to_numpy(contiguous=False) + assert arr_yxc.shape == (3, 4, 2) + assert arr_yxc[0, 0, 0] == expected[0, 0, 0] + assert arr_yxc[0, 0, 1] == expected[1, 0, 0] + assert not arr_yxc.flags["C_CONTIGUOUS"] - arr_hwc_contig = view_hwc.to_numpy(contiguous=True) - assert arr_hwc_contig.flags["C_CONTIGUOUS"] + arr_yxc_contig = view_yxc.to_numpy(contiguous=True) + assert arr_yxc_contig.flags["C_CONTIGUOUS"] def test_tensor_view_chunk_policy_modes(example_correct_data: dict) -> None: @@ -94,7 +94,7 @@ def test_lazy_tensor_view_collects_on_execution( oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") assert oa.is_lazy - view = oa.tensor_view(t=0, z=0, c=0, layout="HW") + view = oa.tensor_view(t=0, z=0, c=0, layout="YX") assert isinstance(view, LazyTensorView) assert oa.is_lazy @@ -142,14 +142,15 @@ def test_lazy_tensor_view_with_layout_defers_materialization() -> None: assert oa.is_lazy view = oa.tensor_view(t=0, z=0, c=0) - view_hw = view.with_layout("HW") + view_yx = view.with_layout("YX") - assert isinstance(view_hw, LazyTensorView) + assert isinstance(view_yx, LazyTensorView) + assert view_yx.layout == "YX" assert oa.is_lazy with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): - concrete = view_hw.collect() - assert concrete.layout == "HW" + concrete = view_yx.collect() + assert concrete.layout in {"YX", "HW"} arr = concrete.to_numpy(contiguous=True) assert arr.shape == (72, 84) assert not oa.is_lazy @@ -180,18 +181,18 @@ def test_dlpack_roundtrip_torch(example_correct_data: dict) -> None: assert tensor.data_ptr() == arr.__array_interface__["data"][0] -def test_tensor_view_layout_hw_with_first_channel_policy( +def test_tensor_view_layout_yx_with_first_channel_policy( example_correct_data: dict, ) -> None: - """Allow HW layout by selecting the first channel when requested.""" + """Allow YX layout by selecting the first channel when requested.""" oa = OMEArrow(example_correct_data) - view_hw = oa.tensor_view(t=0, z=0, layout="HW", channel_policy="first") - arr_hw = view_hw.to_numpy(contiguous=False) + view_yx = oa.tensor_view(t=0, z=0, layout="YX", channel_policy="first") + arr_yx = view_yx.to_numpy(contiguous=False) - expected_chw = oa.tensor_view(t=0, z=0, layout="CHW").to_numpy(contiguous=True) - np.testing.assert_array_equal(arr_hw, expected_chw[0]) - assert arr_hw.shape == (3, 4) + expected_cyx = oa.tensor_view(t=0, z=0, layout="CYX").to_numpy(contiguous=True) + np.testing.assert_array_equal(arr_yx, expected_cyx[0]) + assert arr_yx.shape == (3, 4) def test_tensor_view_roi3d_selects_z_and_roi() -> None: @@ -199,7 +200,7 @@ def test_tensor_view_roi3d_selects_z_and_roi() -> None: arr = np.arange(1 * 1 * 3 * 4 * 5, dtype=np.uint16).reshape(1, 1, 3, 4, 5) oa = OMEArrow(arr) - view = oa.tensor_view(roi3d=(1, 1, 1, 3, 2, 2), layout="TZCHW") + view = oa.tensor_view(roi3d=(1, 1, 1, 3, 2, 2), layout="TZCYX") out = view.to_numpy(contiguous=True) expected = arr[:, :, 1:3, 1:3, 1:4] @@ -217,6 +218,59 @@ def test_tensor_view_roi3d_conflicts_with_z() -> None: oa.tensor_view(z=0, roi3d=(0, 0, 0, 2, 2, 1)) +def test_tensor_view_roi_nd_3d_selects_z_and_roi() -> None: + """Support roi_nd 3D bounds with explicit roi_type.""" + arr = np.arange(1 * 1 * 3 * 4 * 5, dtype=np.uint16).reshape(1, 1, 3, 4, 5) + oa = OMEArrow(arr) + + view = oa.tensor_view(roi_nd=(1, 1, 1, 3, 3, 4), roi_type="3d", layout="TZCYX") + out = view.to_numpy(contiguous=True) + + expected = arr[:, :, 1:3, 1:3, 1:4] + expected = np.transpose(expected, (0, 2, 1, 3, 4)) + np.testing.assert_array_equal(out, expected) + assert out.shape == (1, 2, 1, 2, 3) + + +def test_tensor_view_roi_nd_2d_timelapse_selects_t_and_roi() -> None: + """Support roi_nd timelapse bounds with explicit roi_type.""" + arr = np.arange(3 * 1 * 1 * 4 * 5, dtype=np.uint16).reshape(3, 1, 1, 4, 5) + oa = OMEArrow(arr) + + view = oa.tensor_view( + roi_nd=(1, 1, 1, 3, 3, 4), roi_type="2d_timelapse", layout="TZCYX" + ) + out = view.to_numpy(contiguous=True) + + expected = arr[1:3, :, :, 1:3, 1:4] + expected = np.transpose(expected, (0, 2, 1, 3, 4)) + np.testing.assert_array_equal(out, expected) + assert out.shape == (2, 1, 1, 2, 3) + + +def test_tensor_view_roi_nd_4d_selects_t_z_and_roi() -> None: + """Support roi_nd 4D bounds with implicit roi_type by tuple length.""" + arr = np.arange(3 * 1 * 4 * 5 * 6, dtype=np.uint16).reshape(3, 1, 4, 5, 6) + oa = OMEArrow(arr) + + view = oa.tensor_view(roi_nd=(1, 1, 1, 2, 3, 3, 4, 5), layout="TZCYX") + out = view.to_numpy(contiguous=True) + + expected = arr[1:3, :, 1:3, 1:4, 2:5] + expected = np.transpose(expected, (0, 2, 1, 3, 4)) + np.testing.assert_array_equal(out, expected) + assert out.shape == (2, 2, 1, 3, 3) + + +def test_tensor_view_roi_nd_len6_requires_roi_type() -> None: + """Reject ambiguous roi_nd tuples with 6 values unless roi_type is set.""" + arr = np.arange(1 * 1 * 3 * 4 * 5, dtype=np.uint16).reshape(1, 1, 3, 4, 5) + oa = OMEArrow(arr) + + with pytest.raises(ValueError, match="roi_nd with 6 values is ambiguous"): + oa.tensor_view(roi_nd=(0, 0, 0, 1, 2, 3)) + + def test_dlpack_roundtrip_jax(example_correct_data: dict) -> None: """Round-trip DLPack export/import through JAX on CPU.""" jnp = pytest.importorskip("jax.numpy") @@ -263,7 +317,7 @@ def test_layout_drop_non_singleton_errors() -> None: """Reject layout drops when the omitted axis is non-singleton.""" arr = np.zeros((2, 1, 1, 2, 2), dtype=np.uint16) oa = OMEArrow(arr) - view = oa.tensor_view(layout="CHW") + view = oa.tensor_view(layout="CYX") with pytest.raises(ValueError, match="drops non-singleton"): view.to_numpy() From 1d4703c77ad5554915c2034f2dd81b7afca96642 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 25 Feb 2026 10:40:52 -0700 Subject: [PATCH 08/10] make ome-arrow internals lazy; benchmark --- .github/workflows/run-tests.yml | 84 ++++++++ README.md | 26 +++ benchmarks/benchmark_lazy_tensor.py | 322 ++++++++++++++++++++++++++++ benchmarks/ci-baseline.json | 7 + docs/src/dlpack.md | 38 ++++ src/ome_arrow/core.py | 51 ++++- src/ome_arrow/ingest.py | 112 +++++++++- src/ome_arrow/tensor.py | 106 ++++++++- tests/test_core.py | 14 ++ tests/test_tensor.py | 78 ++++++- 10 files changed, 829 insertions(+), 9 deletions(-) create mode 100644 benchmarks/benchmark_lazy_tensor.py create mode 100644 benchmarks/ci-baseline.json diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 54d6745..5edf855 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -46,3 +46,87 @@ jobs: run: uv sync --frozen --extra viz --extra dlpack - name: Run pytest run: uv run --frozen pytest + + benchmark_canary: + runs-on: ubuntu-24.04 + env: + BENCH_ENFORCE: "0" + steps: + - name: Checkout + uses: actions/checkout@v6 + - name: Python setup + uses: actions/setup-python@v6 + with: + python-version: "3.11" + - name: Install the latest version of uv + uses: astral-sh/setup-uv@v7 + - name: Sync dependencies + run: uv sync --frozen + - name: Run lazy tensor canary benchmark + run: | + extra_args="" + if [ "${BENCH_ENFORCE}" = "1" ]; then + extra_args="--fail-on-regression" + fi + uv run --frozen python benchmarks/benchmark_lazy_tensor.py \ + --repeats 7 \ + --warmup 2 \ + --baseline-json benchmarks/ci-baseline.json \ + --regression-factor 1.5 \ + --absolute-slack-ms 5.0 \ + --json-out benchmark-results.json \ + ${extra_args} + - name: Write benchmark summary + run: | + python - <<'PY' + import json + from pathlib import Path + + payload = json.loads(Path("benchmark-results.json").read_text()) + lines = [ + "## Lazy Tensor Benchmark Canary", + "", + f"- repeats: `{payload['repeats']}`", + f"- warmup: `{payload['warmup']}`", + "", + "| case | median (ms) | min (ms) | max (ms) |", + "|---|---:|---:|---:|", + ] + for r in payload["results"]: + lines.append( + "| " + f"{r['name']} | {r['median_ms']:.2f} " + f"| {r['min_ms']:.2f} | {r['max_ms']:.2f} |" + ) + lines.extend( + [ + "", + "| case | baseline (ms) | threshold (ms) | status |", + "|---|---:|---:|---|", + ] + ) + for c in payload["regression_checks"]: + baseline = ( + "-" + if c["baseline_ms"] is None + else f"{c['baseline_ms']:.2f}" + ) + threshold = ( + "-" + if c["threshold_ms"] is None + else f"{c['threshold_ms']:.2f}" + ) + status = "regressed" if c["regressed"] else "ok" + lines.append( + f"| {c['name']} | {baseline} | " + f"{threshold} | {status} |" + ) + + summary_path = Path(__import__("os").environ["GITHUB_STEP_SUMMARY"]) + summary_path.write_text("\n".join(lines) + "\n") + PY + - name: Upload benchmark artifact + uses: actions/upload-artifact@v4 + with: + name: lazy-tensor-benchmark-${{ github.run_id }} + path: benchmark-results.json diff --git a/README.md b/README.md index d03c40e..ac7e1fc 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,32 @@ Advanced options: See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md) +## Benchmarking lazy reads + +Use the lightweight benchmark utility in `benchmarks/` to compare lazy tensor +read paths (TIFF source-backed, Parquet planes, Parquet chunks): + +```bash +uv run python benchmarks/benchmark_lazy_tensor.py --repeats 5 --warmup 1 +``` + +Notes: + +- This benchmark is for local iteration and relative comparisons. +- It is not part of CI pass/fail checks. +- CI also runs this benchmark in a dedicated `benchmark_canary` job and + uploads `benchmark-results.json` as a workflow artifact. + +Recalibrating `benchmarks/ci-baseline.json`: + +1. Run the benchmark on `main` a few times (for example 3-5 runs): + `uv run python benchmarks/benchmark_lazy_tensor.py --repeats 7 --warmup 2 --json-out benchmark-results.json` +1. For each case, collect the observed `median_ms` values. +1. Update `benchmarks/ci-baseline.json` with stable medians from those runs + (prefer a conservative value near the slower side, not the fastest sample). +1. Keep CI canary tolerance (`regression_factor` + `absolute_slack_ms`) unchanged + unless you have repeated false positives. + ## Contributing, Development, and Testing Please see our [contributing documentation](https://github.com/wayscience/ome-arrow/tree/main/CONTRIBUTING.md) for more details on contributions, development, and testing. diff --git a/benchmarks/benchmark_lazy_tensor.py b/benchmarks/benchmark_lazy_tensor.py new file mode 100644 index 0000000..12aee96 --- /dev/null +++ b/benchmarks/benchmark_lazy_tensor.py @@ -0,0 +1,322 @@ +"""Lightweight benchmark for lazy tensor read paths. + +This script compares `OMEArrow.scan(...).tensor_view(...).to_numpy()` across: +- TIFF source-backed lazy plane loading +- OME-Parquet (planes payload) +- OME-Parquet (chunked payload) + +It is intended as a quick local signal, not a rigorous microbenchmark. +""" + +from __future__ import annotations + +import argparse +import json +import statistics +import sys +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Callable + +import numpy as np + +from ome_arrow import OMEArrow +from ome_arrow.export import to_ome_parquet +from ome_arrow.ingest import from_numpy + + +@dataclass(frozen=True) +class BenchmarkResult: + """Summary stats for one benchmark case.""" + + name: str + median_ms: float + min_ms: float + max_ms: float + shape: tuple[int, ...] + + +@dataclass(frozen=True) +class RegressionCheck: + """Regression check output for one benchmark case.""" + + name: str + baseline_ms: float | None + threshold_ms: float | None + regressed: bool + + +def _time_case( + name: str, + fn: Callable[[], np.ndarray], + *, + repeats: int, + warmup: int, +) -> BenchmarkResult: + """Run one benchmark case and return timing stats.""" + out: np.ndarray | None = None + for _ in range(warmup): + out = fn() + + times_ms: list[float] = [] + for _ in range(repeats): + start = time.perf_counter() + out = fn() + end = time.perf_counter() + times_ms.append((end - start) * 1000.0) + + if out is None: + raise RuntimeError("Benchmark case did not produce output.") + return BenchmarkResult( + name=name, + median_ms=statistics.median(times_ms), + min_ms=min(times_ms), + max_ms=max(times_ms), + shape=tuple(out.shape), + ) + + +def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path]: + """Create small planes/chunks parquet fixtures for local benchmarking.""" + arr = np.arange(1 * 2 * 3 * 256 * 256, dtype=np.uint16).reshape(1, 2, 3, 256, 256) + + planes_scalar = from_numpy(arr, build_chunks=False, image_id="bench-planes") + chunks_scalar = from_numpy( + arr, + build_chunks=True, + chunk_shape=(1, 64, 64), + image_id="bench-chunks", + ) + + planes_path = workdir / "bench_planes.ome.parquet" + chunks_path = workdir / "bench_chunks.ome.parquet" + to_ome_parquet(planes_scalar, out_path=str(planes_path), column_name="ome_arrow") + to_ome_parquet(chunks_scalar, out_path=str(chunks_path), column_name="ome_arrow") + return planes_path, chunks_path + + +def _print_results(results: list[BenchmarkResult]) -> None: + """Print benchmark results in a compact table.""" + print("") + print("Lazy tensor benchmark (ms)") + print(f"{'case':38} {'median':>10} {'min':>10} {'max':>10} {'shape':>16}") + print("-" * 92) + for r in results: + print( + f"{r.name:38} {r.median_ms:10.2f} {r.min_ms:10.2f} {r.max_ms:10.2f} {str(r.shape):>16}" + ) + + +def _load_baseline(path: Path | None) -> dict[str, float]: + """Load baseline medians from JSON, if provided.""" + if path is None or not path.exists(): + return {} + payload = json.loads(path.read_text()) + cases = payload.get("cases", {}) + return {str(k): float(v) for k, v in cases.items()} + + +def _check_regressions( + results: list[BenchmarkResult], + *, + baseline: dict[str, float], + regression_factor: float, + absolute_slack_ms: float, +) -> list[RegressionCheck]: + """Compare benchmark medians against baseline thresholds.""" + checks: list[RegressionCheck] = [] + for r in results: + baseline_ms = baseline.get(r.name) + if baseline_ms is None: + checks.append( + RegressionCheck( + name=r.name, + baseline_ms=None, + threshold_ms=None, + regressed=False, + ) + ) + continue + + threshold_ms = baseline_ms * regression_factor + absolute_slack_ms + checks.append( + RegressionCheck( + name=r.name, + baseline_ms=baseline_ms, + threshold_ms=threshold_ms, + regressed=r.median_ms > threshold_ms, + ) + ) + return checks + + +def _print_regressions(checks: list[RegressionCheck]) -> None: + """Print regression-comparison details.""" + with_baseline = [c for c in checks if c.baseline_ms is not None] + if not with_baseline: + print("\nNo baseline cases configured; skipping regression checks.") + return + print("\nCanary comparison") + print(f"{'case':38} {'baseline':>10} {'threshold':>10} {'status':>10}") + print("-" * 74) + for c in checks: + if c.baseline_ms is None or c.threshold_ms is None: + status = "no-baseline" + baseline = "-" + threshold = "-" + else: + status = "regressed" if c.regressed else "ok" + baseline = f"{c.baseline_ms:.2f}" + threshold = f"{c.threshold_ms:.2f}" + print(f"{c.name:38} {baseline:>10} {threshold:>10} {status:>10}") + + +def _write_json_report( + path: Path, + *, + results: list[BenchmarkResult], + checks: list[RegressionCheck], + repeats: int, + warmup: int, +) -> None: + """Write a machine-readable benchmark report for CI artifacts.""" + payload = { + "repeats": repeats, + "warmup": warmup, + "results": [ + { + "name": r.name, + "median_ms": r.median_ms, + "min_ms": r.min_ms, + "max_ms": r.max_ms, + "shape": list(r.shape), + } + for r in results + ], + "regression_checks": [ + { + "name": c.name, + "baseline_ms": c.baseline_ms, + "threshold_ms": c.threshold_ms, + "regressed": c.regressed, + } + for c in checks + ], + } + path.write_text(json.dumps(payload, indent=2)) + + +def main() -> None: + """Run lightweight lazy tensor benchmarks.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--repeats", type=int, default=5, help="Timed iterations.") + parser.add_argument("--warmup", type=int, default=1, help="Warmup iterations.") + parser.add_argument( + "--json-out", + type=Path, + default=None, + help="Optional JSON output path for CI/reporting.", + ) + parser.add_argument( + "--baseline-json", + type=Path, + default=None, + help="Optional baseline JSON with {'cases': {'case-name': median_ms}}.", + ) + parser.add_argument( + "--regression-factor", + type=float, + default=1.25, + help="Allowed multiplicative slowdown vs baseline.", + ) + parser.add_argument( + "--absolute-slack-ms", + type=float, + default=2.0, + help="Allowed absolute slowdown slack in ms.", + ) + parser.add_argument( + "--fail-on-regression", + action="store_true", + help="Exit non-zero when any case exceeds regression threshold.", + ) + parser.add_argument( + "--tiff-path", + type=Path, + default=Path("tests/data/ome-artificial-5d-datasets/single-channel.ome.tiff"), + help="TIFF file used for the source-backed lazy case.", + ) + args = parser.parse_args() + + if args.repeats <= 0: + raise ValueError("--repeats must be > 0.") + if args.warmup < 0: + raise ValueError("--warmup must be >= 0.") + if args.regression_factor <= 0: + raise ValueError("--regression-factor must be > 0.") + if args.absolute_slack_ms < 0: + raise ValueError("--absolute-slack-ms must be >= 0.") + + with tempfile.TemporaryDirectory(prefix="ome_arrow_lazy_bench_") as tmp: + tmpdir = Path(tmp) + planes_path, chunks_path = _build_parquet_fixtures(tmpdir) + + cases: list[tuple[str, Callable[[], np.ndarray]]] = [] + if args.tiff_path.exists(): + cases.append( + ( + "scan+tiff -> tensor_view(YX)", + lambda: OMEArrow.scan(str(args.tiff_path)) + .tensor_view(t=0, z=0, c=0, layout="YX") + .to_numpy(contiguous=True), + ) + ) + else: + print(f"Skipping TIFF case; file not found: {args.tiff_path}") + + cases.extend( + [ + ( + "scan+parquet(planes) -> tensor_view(YX)", + lambda: OMEArrow.scan(str(planes_path)) + .tensor_view(t=0, z=1, c=1, layout="YX") + .to_numpy(contiguous=True), + ), + ( + "scan+parquet(chunks) -> tensor_view(YX)", + lambda: OMEArrow.scan(str(chunks_path)) + .tensor_view(t=0, z=1, c=1, layout="YX") + .to_numpy(contiguous=True), + ), + ] + ) + + results = [ + _time_case(name, fn, repeats=args.repeats, warmup=args.warmup) + for name, fn in cases + ] + _print_results(results) + baseline = _load_baseline(args.baseline_json) + checks = _check_regressions( + results, + baseline=baseline, + regression_factor=args.regression_factor, + absolute_slack_ms=args.absolute_slack_ms, + ) + _print_regressions(checks) + if args.json_out is not None: + _write_json_report( + args.json_out, + results=results, + checks=checks, + repeats=args.repeats, + warmup=args.warmup, + ) + if args.fail_on_regression and any(c.regressed for c in checks): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/ci-baseline.json b/benchmarks/ci-baseline.json new file mode 100644 index 0000000..c720277 --- /dev/null +++ b/benchmarks/ci-baseline.json @@ -0,0 +1,7 @@ +{ + "cases": { + "scan+tiff -> tensor_view(YX)": 20.0, + "scan+parquet(planes) -> tensor_view(YX)": 12.0, + "scan+parquet(chunks) -> tensor_view(YX)": 12.0 + } +} diff --git a/docs/src/dlpack.md b/docs/src/dlpack.md index 86f19ed..0b99cf9 100644 --- a/docs/src/dlpack.md +++ b/docs/src/dlpack.md @@ -55,6 +55,10 @@ cropped = lazy_crop.collect() # Then execute tensor selections on the sliced result. tensor_view = cropped.tensor_view(t=0, z=slice(0, 8), roi=(64, 64, 128, 128)) arr = tensor_view.to_numpy() + +# Note: executing a LazyTensorView from OMEArrow.scan(...) does not +# materialize the original OMEArrow object itself. +# Call obj.collect() explicitly if you need to materialize `obj`. ``` ## JAX @@ -132,3 +136,37 @@ pip install "ome-arrow[dlpack-torch]" # torch only pip install "ome-arrow[dlpack-jax]" # jax only pip install "ome-arrow[dlpack]" # both ``` + +## Benchmarking lazy reads + +To quickly compare lazy tensor read paths (TIFF source-backed execution, +Parquet planes, Parquet chunks), run: + +```bash +uv run python benchmarks/benchmark_lazy_tensor.py --repeats 5 --warmup 1 +``` + +This is a lightweight local benchmark intended for directional performance +checks during development. + +In CI, the `tests` workflow runs a `benchmark_canary` job that executes the +same script and uploads a JSON report artifact. + +### Recalibrating `ci-baseline.json` + +When performance changes are intentional (or runner behavior shifts), update +`benchmarks/ci-baseline.json` as follows: + +1. Check out the latest `main`. +1. Run the benchmark multiple times: + `uv run python benchmarks/benchmark_lazy_tensor.py --repeats 7 --warmup 2 --json-out benchmark-results.json` +1. Record `median_ms` per case across runs. +1. Set each baseline value to a stable, slightly conservative median. +1. Open a PR that updates baseline values only, with benchmark evidence. + +Expected variability: + +- Small fluctuations are normal on GitHub-hosted runners. +- Relative ordering of cases is usually stable. +- Typical drift should be modest, but occasional jumps can happen due to + runner image or dependency changes. diff --git a/src/ome_arrow/core.py b/src/ome_arrow/core.py index 207b708..6a3b5ea 100644 --- a/src/ome_arrow/core.py +++ b/src/ome_arrow/core.py @@ -35,6 +35,7 @@ from_ome_zarr, from_stack_pattern_path, from_tiff, + open_lazy_plane_source, ) from ome_arrow.meta import OME_ARROW_STRUCT from ome_arrow.tensor import LazyTensorView, TensorView @@ -366,6 +367,51 @@ def _tensor_source(self) -> pa.StructScalar | pa.StructArray: raise RuntimeError("OMEArrow data is not initialized.") return self._data + def _resolve_lazy_tensor_view(self, view_kwargs: dict[str, Any]) -> TensorView: + """Resolve a lazy tensor view plan without mutating this OMEArrow state. + + Args: + view_kwargs: TensorView constructor kwargs captured by LazyTensorView. + + Returns: + TensorView: Concrete tensor view for the planned selection. + """ + # Deferred slice plans rely on slice_ome_arrow over a materialized scalar; + # keep the existing behavior for those plans. + if self._lazy_slices: + self._ensure_materialized() + return TensorView(self._tensor_source(), **view_kwargs) + + if self._lazy_source is None: + return TensorView(self._tensor_source(), **view_kwargs) + + lazy_source = self._lazy_source + lazy_plane_source = open_lazy_plane_source(lazy_source.data) + if lazy_plane_source is not None: + pixels_meta, plane_loader = lazy_plane_source + lazy_record = { + "id": None, + "name": None, + "image_type": lazy_source.image_type, + "acquisition_datetime": None, + "pixels_meta": pixels_meta, + "channels": [], + "planes": [], + "masks": [], + "chunk_grid": None, + "chunks": [], + } + return TensorView(lazy_record, plane_loader=plane_loader, **view_kwargs) + + scalar, struct_array = self._load_from_string_source( + lazy_source.data, + column_name=lazy_source.column_name, + row_index=lazy_source.row_index, + image_type=lazy_source.image_type, + ) + source = struct_array if struct_array is not None else scalar + return TensorView(source, **view_kwargs) + @staticmethod def _wrap_with_image_type( data: pa.StructScalar, image_type: str @@ -739,7 +785,9 @@ def tensor_view( Returns: TensorView | LazyTensorView: Tensor view over selected pixels. In lazy mode, this returns a deferred ``LazyTensorView`` that - materializes on first execution call (for example ``to_numpy()``). + resolves on first execution call (for example ``to_numpy()``) + without forcing ``self`` to materialize unless deferred + ``slice_lazy`` operations are queued. Raises: ValueError: If an unsupported scene is requested. @@ -751,6 +799,7 @@ def tensor_view( if self._lazy_source is not None: return LazyTensorView( loader=self._tensor_source, + resolver=self._resolve_lazy_tensor_view, t=t, z=z, c=c, diff --git a/src/ome_arrow/ingest.py b/src/ome_arrow/ingest.py index 93d5d72..38643f8 100644 --- a/src/ome_arrow/ingest.py +++ b/src/ome_arrow/ingest.py @@ -8,7 +8,7 @@ import warnings from datetime import datetime, timezone from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple import bioio_ome_tiff import bioio_tifffile @@ -200,6 +200,75 @@ def _read_physical_pixel_sizes( return psize_x, psize_y, psize_z, unit, True +def open_lazy_plane_source( + source: str, +) -> tuple[dict[str, Any], Callable[[int, int, int], np.ndarray]] | None: + """Open a source-backed per-plane loader for lazy tensor execution. + + Args: + source: Input path/URL string for TIFF or OME-Zarr sources. + + Returns: + A tuple of ``(pixels_meta, plane_loader)`` when source-backed lazy plane + loading is supported for ``source``; otherwise ``None``. + """ + s = source.strip() + path = Path(s) + lower = s.lower() + + if path.suffix.lower() in {".tif", ".tiff"} or lower.endswith((".tif", ".tiff")): + img = BioImage( + image=str(path), + reader=( + bioio_ome_tiff.Reader + if str(path).lower().endswith(("ome.tif", "ome.tiff")) + else bioio_tifffile.Reader + ), + ) + elif ( + lower.endswith(".zarr") + or lower.endswith(".ome.zarr") + or ".zarr/" in lower + or (path.exists() and path.is_dir() and path.suffix.lower() == ".zarr") + ): + img = BioImage(image=str(path), reader=OMEZarrReader) + else: + return None + + dims = img.dims + size_t = int(dims.T or 1) + size_c = int(dims.C or 1) + size_z = int(dims.Z or 1) + size_y = int(dims.Y or 0) + size_x = int(dims.X or 0) + if size_x <= 0 or size_y <= 0: + sample = np.asarray(img.get_image_data("YX", T=0, C=0, Z=0)) + size_y, size_x = int(sample.shape[-2]), int(sample.shape[-1]) + + dim_order = "XYCT" if size_z == 1 else "XYZCT" + pixels_meta = { + "dimension_order": dim_order, + "type": "uint16", + "size_x": size_x, + "size_y": size_y, + "size_z": size_z, + "size_c": size_c, + "size_t": size_t, + "physical_size_x": None, + "physical_size_y": None, + "physical_size_z": None, + "physical_size_unit": None, + } + + def _plane_loader(t: int, z: int, c: int) -> np.ndarray: + plane = np.asarray(img.get_image_data("YX", T=t, C=c, Z=z)) + if plane.dtype != np.uint16: + plane = np.clip(plane, 0, 65535).astype(np.uint16) + return plane + + return pixels_meta, _plane_loader + + def _load_zarr_attrs(zarr_path: Path) -> dict: zarr_json = zarr_path / "zarr.json" if zarr_json.exists(): @@ -1306,16 +1375,53 @@ def from_ome_parquet( Raises: FileNotFoundError: If the Parquet path does not exist. ValueError: If the row index is out of range or no suitable column exists. + + Notes: + This reader targets the row group containing ``row_index`` and requests + only ``column_name`` when provided, avoiding eager full-table reads. """ p = Path(parquet_path) if not p.exists(): raise FileNotFoundError(f"No such file: {p}") - table = pq.read_table(p) + parquet_file = pq.ParquetFile(p) + metadata = parquet_file.metadata + if metadata is None or metadata.num_rows == 0: + raise ValueError("Table contains 0 rows; expected at least 1.") + if not (0 <= row_index < metadata.num_rows): + raise ValueError( + f"row_index {row_index} out of range [0, {metadata.num_rows})." + ) + + row_group_index = 0 + row_index_in_group = row_index + for i in range(metadata.num_row_groups): + group_rows = metadata.row_group(i).num_rows + if row_index_in_group < group_rows: + row_group_index = i + break + row_index_in_group -= group_rows + + requested_columns = [column_name] if column_name is not None else None + try: + table = parquet_file.read_row_group(row_group_index, columns=requested_columns) + except (KeyError, ValueError, pa.ArrowInvalid): + if requested_columns is None: + raise + # If the requested column is unavailable in the row group read path, fall + # back to all columns so downstream auto-detection/warnings remain intact. + table = parquet_file.read_row_group(row_group_index) + else: + if requested_columns is not None and column_name not in table.column_names: + # Some parquet backends return an empty projected table when a column + # is missing rather than raising. Retry with full row-group columns so + # _ome_arrow_from_table can auto-detect and emit the usual warning. + table = parquet_file.read_row_group(row_group_index) + return _ome_arrow_from_table( table, column_name=column_name, - row_index=row_index, + row_index=row_index_in_group, strict_schema=strict_schema, return_array=return_array, ) diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index b52f489..2c7813e 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -58,6 +58,7 @@ def __init__( [], dict[str, Any] | pa.StructScalar | pa.StructArray | pa.ChunkedArray, ], + resolver: Callable[[dict[str, Any]], "TensorView"] | None = None, t: int | slice | Sequence[int] | None = None, z: int | slice | Sequence[int] | None = None, c: int | slice | Sequence[int] | None = None, @@ -75,6 +76,8 @@ def __init__( Args: loader: Callable that returns concrete OME-Arrow pixel data. + resolver: Optional callable that resolves this lazy plan directly to + a concrete ``TensorView`` using current selection kwargs. t: Time index selection. z: Depth index selection. c: Channel index selection. @@ -90,6 +93,7 @@ def __init__( channel_policy: Behavior when dropping ``C`` from layout. """ self._loader = loader + self._resolver = resolver self._kwargs: dict[str, Any] = { "t": t, "z": z, @@ -110,7 +114,7 @@ def __init__( def _spawn(self, **updates: Any) -> "LazyTensorView": kwargs = dict(self._kwargs) kwargs.update(updates) - return LazyTensorView(loader=self._loader, **kwargs) + return LazyTensorView(loader=self._loader, resolver=self._resolver, **kwargs) def collect(self) -> "TensorView": """Materialize this lazy plan into a concrete TensorView.""" @@ -121,7 +125,10 @@ def collect(self) -> "TensorView": with self._collect_lock: resolved = self._resolved if resolved is None: - resolved = TensorView(self._loader(), **self._kwargs) + if self._resolver is not None: + resolved = self._resolver(dict(self._kwargs)) + else: + resolved = TensorView(self._loader(), **self._kwargs) self._resolved = resolved return resolved @@ -400,6 +407,7 @@ def __init__( self, data: dict[str, Any] | pa.StructScalar | pa.StructArray | pa.ChunkedArray, *, + plane_loader: Callable[[int, int, int], np.ndarray] | None = None, t: int | slice | Sequence[int] | None = None, z: int | slice | Sequence[int] | None = None, c: int | slice | Sequence[int] | None = None, @@ -417,6 +425,9 @@ def __init__( Args: data: OME-Arrow record as dict/StructScalar/StructArray/ChunkedArray. + plane_loader: Optional callback that returns one YX plane for + ``(t, z, c)``. When provided, per-plane reads use this loader + instead of ``planes``/``chunks`` payloads in ``data``. t: Time index selection (int, slice, or sequence). Default: all. z: Z index selection (int, slice, or sequence). Default: all. c: Channel index selection (int, slice, or sequence). Default: all. @@ -464,6 +475,7 @@ def __init__( # Keep normalized backing data so child TensorViews do not repeatedly # combine chunked Arrow arrays during iteration. self._data = data + self._plane_loader = plane_loader self._layout_override = _normalize_layout(layout) if layout else None self._channel_policy = _normalize_channel_policy(channel_policy) @@ -540,6 +552,7 @@ def with_layout(self, layout: str) -> "TensorView": return TensorView( self._data, + plane_loader=self._plane_loader, t=self._selection.t, z=self._selection.z, c=self._selection.c, @@ -792,6 +805,7 @@ def iter_tiles_3d( t, z_batch, x, y, w, h = batch[0] view = TensorView( self._data, + plane_loader=self._plane_loader, t=[t], z=list(z_batch), c=self._selection.c, @@ -828,6 +842,7 @@ def _iter_batches( for batch in _batched(t_indices, batch_size, prefetch=prefetch): view = TensorView( self._data, + plane_loader=self._plane_loader, t=batch, z=self._selection.z, c=self._selection.c, @@ -870,6 +885,7 @@ def _iter_tiles( x, y, w, h = batch[0] view = TensorView( self._data, + plane_loader=self._plane_loader, t=self._selection.t, z=self._selection.z, c=self._selection.c, @@ -912,6 +928,9 @@ def _build_tzchw(self) -> np.ndarray: def _plane_map(self) -> dict[tuple[int, int, int], list[Any]]: if self._has_chunks(): return {} + if self._struct_array is not None: + # Arrow-backed plane reads use _select_plane_values in _read_plane. + return {} plane_map = {} data_py = self._data_py_dict() for plane in data_py.get("planes", []): @@ -927,6 +946,30 @@ def _read_plane( z: int, c: int, ) -> np.ndarray: + if self._plane_loader is not None: + arr = np.asarray(self._plane_loader(t, z, c), dtype=self._dtype) + return arr.reshape(self._size_y, self._size_x) + + if self._struct_array is not None and not self._has_chunks(): + values = _select_plane_values(self._struct_array, t=t, z=z, c=c) + arr = values.to_numpy(zero_copy_only=False) + return np.asarray(arr, dtype=self._dtype).reshape( + self._size_y, self._size_x + ) + + if self._struct_array is not None and self._has_chunks(): + chunk_order = str(self._chunk_grid().get("chunk_order") or "ZYX").upper() + return _plane_from_chunks_arrow( + self._struct_array, + t=t, + z=z, + c=c, + size_x=self._size_x, + size_y=self._size_y, + dtype=self._dtype, + chunk_order=chunk_order, + ) + if self._has_chunks(): data_py = self._data_py_dict() return plane_from_chunks(data_py, t=t, z=z, c=c, dtype=self._dtype) @@ -1553,3 +1596,62 @@ def _select_chunk_values( pixels_list = selected.field("pixels")[0] return pixels_list.values + + +def _plane_from_chunks_arrow( + struct_arr: pa.StructArray, + *, + t: int, + z: int, + c: int, + size_x: int, + size_y: int, + dtype: np.dtype, + chunk_order: str = "ZYX", +) -> np.ndarray: + """Reconstruct one YX plane from Arrow chunk rows without Python dict casts.""" + if chunk_order != "ZYX": + raise ValueError("Only chunk_order='ZYX' is supported for now.") + + chunks_arr = struct_arr.field("chunks") + if len(chunks_arr) == 0 or chunks_arr.is_null().to_pylist()[0]: + return np.zeros((size_y, size_x), dtype=dtype) + + chunks = chunks_arr[0].values + z_field = chunks.field("z") + sz_field = chunks.field("shape_z") + z_end = pc.add(z_field, sz_field) + mask = pc.and_( + pc.equal(chunks.field("t"), t), + pc.and_( + pc.equal(chunks.field("c"), c), + pc.and_(pc.less_equal(z_field, z), pc.greater(z_end, z)), + ), + ) + selected = pc.filter(chunks, mask) + + out = np.zeros((size_y, size_x), dtype=dtype) + for i in range(len(selected)): + z0 = int(selected.field("z")[i].as_py()) + y0 = int(selected.field("y")[i].as_py()) + x0 = int(selected.field("x")[i].as_py()) + sz = int(selected.field("shape_z")[i].as_py()) + sy = int(selected.field("shape_y")[i].as_py()) + sx = int(selected.field("shape_x")[i].as_py()) + if y0 < 0 or x0 < 0 or sz <= 0 or sy <= 0 or sx <= 0: + raise ValueError("Chunk has invalid shape or origin.") + if z0 + sz <= z or z0 > z: + continue + if y0 + sy > size_y or x0 + sx > size_x: + raise ValueError("Chunk extent out of range.") + + pixels_values = selected.field("pixels")[i].values + pix = np.asarray(pixels_values.to_numpy(zero_copy_only=False), dtype=dtype) + expected_len = sz * sy * sx + if pix.size != expected_len: + raise ValueError( + f"Chunk pixels length {pix.size} != expected {expected_len}." + ) + arr3d = pix.reshape(sz, sy, sx) + out[y0 : y0 + sy, x0 : x0 + sx] = arr3d[z - z0] + return out diff --git a/tests/test_core.py b/tests/test_core.py index f85f5c0..bb15a43 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from ome_arrow import ingest from ome_arrow.core import OMEArrow @@ -421,6 +422,19 @@ def test_scan_collect_roundtrip() -> None: assert oa.info()["shape"] == (1, 1, 1, 72, 84) +def test_parquet_read_avoids_full_table_scan(monkeypatch: pytest.MonkeyPatch) -> None: + """Use row-group reads instead of eager pq.read_table for parquet ingest.""" + path = "tests/data/JUMP-BR00117006/BR00117006.ome.parquet" + + def _fail_read_table(*_args: object, **_kwargs: object) -> None: + raise AssertionError("pq.read_table should not be used for from_ome_parquet") + + monkeypatch.setattr(ingest.pq, "read_table", _fail_read_table) + with pytest.warns(UserWarning, match="Requested column 'ome_arrow'"): + oa = OMEArrow(path) + assert oa.info()["shape"] == (1, 1, 1, 72, 84) + + @pytest.mark.parametrize( ("input_data", "expected_shape"), [ diff --git a/tests/test_tensor.py b/tests/test_tensor.py index 4f7686c..cbde9e4 100644 --- a/tests/test_tensor.py +++ b/tests/test_tensor.py @@ -7,6 +7,7 @@ import pyarrow.compute as pc import pytest +import ome_arrow.core as core_module from ome_arrow import OMEArrow from ome_arrow.export import to_ome_parquet from ome_arrow.meta import OME_ARROW_STRUCT @@ -100,7 +101,7 @@ def test_lazy_tensor_view_collects_on_execution( arr = view.to_numpy(contiguous=True) assert arr.shape == (72, 84) - assert not oa.is_lazy + assert oa.is_lazy if recwarn: assert any( "Requested column 'ome_arrow'" in str(w.message) for w in recwarn.list @@ -133,7 +134,7 @@ def test_lazy_tensor_view_select_preserves_existing_dims() -> None: arr = view_z.to_numpy(contiguous=True) assert arr.shape == (1, 167, 439) - assert not oa.is_lazy + assert oa.is_lazy def test_lazy_tensor_view_with_layout_defers_materialization() -> None: @@ -153,7 +154,7 @@ def test_lazy_tensor_view_with_layout_defers_materialization() -> None: assert concrete.layout in {"YX", "HW"} arr = concrete.to_numpy(contiguous=True) assert arr.shape == (72, 84) - assert not oa.is_lazy + assert oa.is_lazy def test_dlpack_roundtrip_torch(example_correct_data: dict) -> None: @@ -446,6 +447,77 @@ def test_arrow_mode_zero_copy_parquet_jax( pytest.skip("JAX did not preserve zero-copy for pyarrow DLPack.") +def test_scan_tensor_view_to_numpy_avoids_python_record_materialization( + tmp_path: pathlib.Path, + example_correct_data: dict, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Read selected planes from Arrow without building a full Python record.""" + out = tmp_path / "lazy_arrow_plane.parquet" + record = dict(example_correct_data) + record["chunk_grid"] = None + record["chunks"] = [] + to_ome_parquet(record, out_path=str(out), column_name="ome_arrow") + + oa = OMEArrow.scan(str(out)) + + def _fail_data_py_dict(self: TensorView) -> dict: + raise AssertionError("_data_py_dict should not be used for Arrow plane reads") + + monkeypatch.setattr(TensorView, "_data_py_dict", _fail_data_py_dict) + arr = oa.tensor_view(t=0, z=0, c=0, layout="YX").to_numpy(contiguous=True) + + assert arr.shape == (3, 4) + assert oa.is_lazy + + +def test_scan_chunked_parquet_tensor_view_avoids_python_record_materialization( + tmp_path: pathlib.Path, + example_correct_data: dict, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Read chunked parquet planes through Arrow path without _data_py_dict.""" + out = tmp_path / "lazy_chunked.parquet" + to_ome_parquet(example_correct_data, out_path=str(out), column_name="ome_arrow") + + oa = OMEArrow.scan(str(out)) + + def _fail_data_py_dict(self: TensorView) -> dict: + raise AssertionError( + "_data_py_dict should not be used for Arrow chunked plane reads" + ) + + monkeypatch.setattr(TensorView, "_data_py_dict", _fail_data_py_dict) + arr = oa.tensor_view(t=0, z=0, c=1, layout="YX").to_numpy(contiguous=True) + + expected = np.array( + [[100, 101, 102, 103], [110, 111, 112, 113], [120, 121, 122, 123]], + dtype=np.uint16, + ) + np.testing.assert_array_equal(arr, expected) + assert oa.is_lazy + + +@pytest.mark.filterwarnings( + "ignore:As of version 0.4.0, the parser argument is ignored.*:DeprecationWarning" +) +def test_scan_tiff_tensor_view_uses_source_plane_loader( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Execute lazy TIFF tensor views without calling eager from_tiff ingestion.""" + path = "tests/data/ome-artificial-5d-datasets/single-channel.ome.tiff" + oa = OMEArrow.scan(path) + + def _fail_from_tiff(*_args: object, **_kwargs: object) -> None: + raise AssertionError("from_tiff should not be used for lazy tensor execution") + + monkeypatch.setattr(core_module, "from_tiff", _fail_from_tiff) + arr = oa.tensor_view(t=0, z=0, c=0, layout="YX").to_numpy(contiguous=True) + + assert arr.shape == (167, 439) + assert oa.is_lazy + + def _jax_buffer_ptr(arr: object) -> int: """Return a best-effort device pointer for a JAX array. From beff939f405c8f2db9d5ce115327bf04b6464bf6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci-lite[bot]" <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 17:42:39 +0000 Subject: [PATCH 09/10] [pre-commit.ci lite] apply automatic fixes --- benchmarks/benchmark_lazy_tensor.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/benchmarks/benchmark_lazy_tensor.py b/benchmarks/benchmark_lazy_tensor.py index 12aee96..bce17cb 100644 --- a/benchmarks/benchmark_lazy_tensor.py +++ b/benchmarks/benchmark_lazy_tensor.py @@ -105,7 +105,7 @@ def _print_results(results: list[BenchmarkResult]) -> None: print("-" * 92) for r in results: print( - f"{r.name:38} {r.median_ms:10.2f} {r.min_ms:10.2f} {r.max_ms:10.2f} {str(r.shape):>16}" + f"{r.name:38} {r.median_ms:10.2f} {r.min_ms:10.2f} {r.max_ms:10.2f} {r.shape!s:>16}" ) @@ -268,9 +268,11 @@ def main() -> None: cases.append( ( "scan+tiff -> tensor_view(YX)", - lambda: OMEArrow.scan(str(args.tiff_path)) - .tensor_view(t=0, z=0, c=0, layout="YX") - .to_numpy(contiguous=True), + lambda: ( + OMEArrow.scan(str(args.tiff_path)) + .tensor_view(t=0, z=0, c=0, layout="YX") + .to_numpy(contiguous=True) + ), ) ) else: @@ -280,15 +282,19 @@ def main() -> None: [ ( "scan+parquet(planes) -> tensor_view(YX)", - lambda: OMEArrow.scan(str(planes_path)) - .tensor_view(t=0, z=1, c=1, layout="YX") - .to_numpy(contiguous=True), + lambda: ( + OMEArrow.scan(str(planes_path)) + .tensor_view(t=0, z=1, c=1, layout="YX") + .to_numpy(contiguous=True) + ), ), ( "scan+parquet(chunks) -> tensor_view(YX)", - lambda: OMEArrow.scan(str(chunks_path)) - .tensor_view(t=0, z=1, c=1, layout="YX") - .to_numpy(contiguous=True), + lambda: ( + OMEArrow.scan(str(chunks_path)) + .tensor_view(t=0, z=1, c=1, layout="YX") + .to_numpy(contiguous=True) + ), ), ] ) From 0d0596ccde02dec02b8bfa8b582a496743ebaedb Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 25 Feb 2026 10:55:10 -0700 Subject: [PATCH 10/10] lint --- benchmarks/benchmark_lazy_tensor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lazy_tensor.py b/benchmarks/benchmark_lazy_tensor.py index bce17cb..7abcc72 100644 --- a/benchmarks/benchmark_lazy_tensor.py +++ b/benchmarks/benchmark_lazy_tensor.py @@ -105,7 +105,8 @@ def _print_results(results: list[BenchmarkResult]) -> None: print("-" * 92) for r in results: print( - f"{r.name:38} {r.median_ms:10.2f} {r.min_ms:10.2f} {r.max_ms:10.2f} {r.shape!s:>16}" + f"{r.name:38} {r.median_ms:10.2f} " + f"{r.min_ms:10.2f} {r.max_ms:10.2f} {r.shape!s:>16}" )