Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions xrspatial/geotiff/_gpu_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,29 +1189,41 @@ class _NvcompDeflateDecompOpts(ctypes.Structure):
# LZW/Deflate concat-then-upload pattern below (~L1714-1722).
# Per-tile cupy.asarray was measured at 256x64KB -> 6.07 ms vs 3.65 ms
# for the batched form (~1.66x speedup, scales worse with more tiles).
comp_sizes_list = [len(t) for t in raw_tiles]
comp_offsets_h = np.zeros(n_tiles, dtype=np.int64)
for i in range(1, n_tiles):
comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1]
total_comp = sum(comp_sizes_list)
#
# ``np.cumsum(..., out=offsets[1:])`` vectorises the prefix-sum
# so the per-tile offsets land in one C-level pass instead of a
# Python ``for`` loop. Aligns with the sibling
# ``_batched_d2h_to_bytes`` helper (~L924) and the
# ``_nvcomp_batch_compress`` post-decompress prefix sum (~L2572).
# Microbench (1024 tiles): 84us Python loop -> 21us cumsum
# (~3.9x). See issue #1950.
# Allocate as uint64 up front: nvcomp consumes uint64 pointer/size
# arrays, so skipping the intermediate int64 -> uint64 .astype copies
# at the cupy.asarray sites avoids a redundant host-side allocation.
comp_sizes_arr = np.fromiter(
(len(t) for t in raw_tiles),
dtype=np.uint64, count=n_tiles,
)
comp_offsets_h = np.zeros(n_tiles, dtype=np.uint64)
if n_tiles > 1:
np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])
total_comp = int(comp_sizes_arr.sum())

comp_buf_host = np.empty(total_comp, dtype=np.uint8)
for i, tile in enumerate(raw_tiles):
comp_buf_host[comp_offsets_h[i]:comp_offsets_h[i] + comp_sizes_list[i]] = \
comp_buf_host[comp_offsets_h[i]:comp_offsets_h[i] + comp_sizes_arr[i]] = \
np.frombuffer(tile, dtype=np.uint8)

d_comp = cupy.asarray(comp_buf_host)
d_decomp = cupy.empty(n_tiles * tile_bytes, dtype=cupy.uint8)

base_comp_ptr = int(d_comp.data.ptr)
base_decomp_ptr = int(d_decomp.data.ptr)
d_comp_ptrs = cupy.asarray(
base_comp_ptr + comp_offsets_h.astype(np.uint64))
d_comp_ptrs = cupy.asarray(base_comp_ptr + comp_offsets_h)
decomp_offsets_h = (np.arange(n_tiles, dtype=np.uint64)
* np.uint64(tile_bytes))
d_decomp_ptrs = cupy.asarray(base_decomp_ptr + decomp_offsets_h)
d_comp_sizes = cupy.asarray(
np.array(comp_sizes_list, dtype=np.uint64))
d_comp_sizes = cupy.asarray(comp_sizes_arr)
d_buf_sizes = cupy.full(n_tiles, tile_bytes, dtype=cupy.uint64)
d_actual = cupy.empty(n_tiles, dtype=cupy.uint64)

Expand Down
154 changes: 154 additions & 0 deletions xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
"""Regression tests for issue #1950.

``_try_nvcomp_batch_decompress`` used to compute its per-tile host
prefix-sum offsets via a Python ``for`` loop:

```
comp_sizes_list = [len(t) for t in raw_tiles]
comp_offsets_h = np.zeros(n_tiles, dtype=np.int64)
for i in range(1, n_tiles):
comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1]
```

The sibling batched-D2H helper ``_batched_d2h_to_bytes`` at ~L924 and
the compress-side prefix sum in ``_nvcomp_batch_compress`` at ~L2572
both use ``np.cumsum(sizes, out=offsets[1:])``. Aligning the
decompress side keeps the codebase consistent and trims interpreter
overhead.

Three guards here:

1. Structural -- the source uses ``np.cumsum`` (not a Python
   ``range(1, n_tiles)`` loop) for the prefix sum.
2. Equivalence -- the vectorised ``np.cumsum`` form matches the prior
   Python-loop prefix sum numerically.
3. Correctness -- when cupy (and nvCOMP) are available, a tiny
   synthetic round-trip still decodes every tile correctly.
"""
from __future__ import annotations

import importlib.util
import os
import re
import tempfile

import numpy as np
import pytest


def test_nvcomp_decompress_uses_cumsum_for_offsets_1950():
    """Source-level guard against reintroducing the Python for loop.

    Scans ``_gpu_decode.py`` and asserts the decompress-side prefix
    sum is the vectorised ``np.cumsum`` form rather than the legacy
    per-tile ``for`` loop, keeping it aligned with
    ``_batched_d2h_to_bytes`` / ``_nvcomp_batch_compress``.
    """
    import pathlib

    module_file = pathlib.Path(__file__).parent.parent / "_gpu_decode.py"
    source_text = module_file.read_text()

    # Anchoring on the exact cumsum call is tighter than a fixed
    # character window, so unrelated edits to the module don't trip
    # this test.
    cumsum_pattern = (
        r"np\.cumsum\(\s*comp_sizes_arr\[:-1\]\s*,\s*"
        r"out\s*=\s*comp_offsets_h\[1:\]\s*\)"
    )
    assert re.search(cumsum_pattern, source_text), (
        "decompress upload block should use "
        "``np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])`` for "
        "prefix-sum offsets, aligning with _batched_d2h_to_bytes "
        "(issue #1950)."
    )

    # The pre-fix code wrote ``comp_offsets_h[i] = comp_offsets_h[i - 1]
    # + ...`` inside a ``for i in range(1, n_tiles):`` block -- make
    # sure that shape is gone.
    loop_pattern = (
        r"for\s+i\s+in\s+range\(\s*1\s*,\s*n_tiles\s*\)\s*:\s*\n"
        r"\s*comp_offsets_h\[i\]"
    )
    assert not re.search(loop_pattern, source_text), (
        "decompress upload block should no longer compute prefix-sum "
        "offsets with a Python for loop (issue #1950)."
    )


def test_cumsum_matches_loop_prefix_sum_1950():
    """Numeric guard: the vectorised cumsum equals the old loop.

    The two forms agree by construction, but this runtime check keeps
    the ``np.cumsum``-into-a-slice form pinned to the prior explicit
    prefix-sum semantics across numpy versions.
    """
    rng = np.random.RandomState(1950)
    count = 1024
    tile_sizes = rng.randint(100, 100_000, size=count).astype(np.int64)

    # Vectorised form, exactly as written in the fix: offsets[0] stays
    # zero, offsets[1:] receives the cumulative sum of all but the
    # last size.
    vectorised = np.zeros(count, dtype=np.int64)
    if count > 1:
        np.cumsum(tile_sizes[:-1], out=vectorised[1:])

    # Reference: an explicit running total, equivalent to the legacy
    # per-index Python loop.
    reference = np.zeros(count, dtype=np.int64)
    running = 0
    for idx, size in enumerate(tile_sizes[:-1], start=1):
        running += int(size)
        reference[idx] = running

    np.testing.assert_array_equal(vectorised, reference)


@pytest.mark.skipif(
    importlib.util.find_spec("cupy") is None,
    reason="cupy required for nvCOMP path",
)
def test_nvcomp_batch_decompress_roundtrip_1950():
    """End-to-end check: a deflate-tiled raster still decodes correctly.

    Exercises the GPU decode path on a real file via the public
    ``open_geotiff(path, gpu=True)`` entry point. If the prefix-sum
    refactor mis-stages a tile, the decoded buffer would not match
    the source, surfacing as a numerical regression here.

    Gated on ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` so the run-time check
    only fires in environments that actually carry nvCOMP. Without the
    flag the GPU read path silently falls back to a CPU codec when
    nvCOMP is missing, which bypasses the prefix-sum site entirely; a
    pass under that fallback would be misleading, so we skip instead.
    """
    # Opt-in gate: skipping (not passing) keeps a CPU fallback from
    # masquerading as GPU-path coverage.
    if os.environ.get("XRSPATIAL_GEOTIFF_STRICT_GPU") != "1":
        pytest.skip(
            "set XRSPATIAL_GEOTIFF_STRICT_GPU=1 to exercise the nvCOMP "
            "prefix-sum site; without it the GPU path may fall back to "
            "a CPU codec and bypass this regression."
        )
    # The module-level skipif already checked find_spec; this guards
    # against a present-but-broken cupy install, then a missing device.
    try:
        import cupy
    except ImportError:
        pytest.skip("cupy not importable")
    if not cupy.cuda.is_available():
        pytest.skip("CUDA device not available")

    import xarray as xr
    from xrspatial.geotiff import open_geotiff, to_geotiff

    # Fixed seed so the fixture raster is reproducible run to run.
    rng = np.random.RandomState(1950)
    height, width = 1024, 1024
    arr = rng.rand(height, width).astype(np.float32)
    da = xr.DataArray(
        arr, dims=["y", "x"],
        coords={"y": np.arange(height), "x": np.arange(width)},
        attrs={"crs": 4326},
    )

    with tempfile.TemporaryDirectory() as td:
        path = os.path.join(td, "tmp_1950_deflate.tif")
        # 1024x1024 with tile_size=256 yields a 4x4 grid of tiles, so
        # the batched per-tile offset staging is actually exercised.
        to_geotiff(da, path, compression="deflate", tile_size=256)

        # Read back through the GPU pipeline.
        result = open_geotiff(path, gpu=True)
        assert result.shape == (height, width)
        # cupy arrays expose ``.get()``; copy device data to host
        # before comparing, otherwise treat the data as host-side.
        decoded = cupy.asnumpy(result.data) if hasattr(
            result.data, "get") else np.asarray(result.data)

        # Deflate is lossless, so the round-trip must be bit-exact:
        # zero absolute and relative tolerance.
        np.testing.assert_allclose(decoded, arr, atol=0, rtol=0)
Loading