diff --git a/xrspatial/geotiff/_writers/gpu.py b/xrspatial/geotiff/_writers/gpu.py index 9a356838..e666d512 100644 --- a/xrspatial/geotiff/_writers/gpu.py +++ b/xrspatial/geotiff/_writers/gpu.py @@ -479,8 +479,31 @@ def _gpu_compress_to_part(gpu_arr, w, h, spp): # all-sentinel cell NaN back to the sentinel after each level # so the on-disk overview tiles still carry the sentinel value # external readers expect. + # + # The sentinel rewrite uses ``cupy.putmask`` rather than + # ``current.copy(); current[nan_mask] = sentinel`` because + # ``make_overview_gpu`` always returns a freshly allocated + # buffer (``_block_reduce_2d_gpu`` ends in ``cupy.nan*`` / + # ``cupy.around(...).astype(...)`` / ``cropped[::2, ::2].copy()``, + # and the 3-D path ends in ``cupy.stack``) so nothing else + # aliases the buffer between the return and the rewrite. + # Dropping the redundant ``copy()`` skips one chunk-sized GPU + # allocation per overview level. Mirrors the in-place sentinel + # rewrite ``_apply_nodata_mask_gpu`` adopted in #1934. See + # issue #1948. current = arr cumulative_factor = 1 + # ``make_overview_gpu`` preserves dtype, so the sentinel cast is + # loop-invariant. Hoist it (and the float/finite gate) out of the + # inner ``while`` to skip redundant per-level scalar work. + rewrite_nodata = ( + nodata is not None + and np_dtype.kind == 'f' + and not np.isnan(float(nodata)) + ) + sentinel_scalar = ( + np_dtype.type(nodata) if rewrite_nodata else None + ) for target_factor in overview_levels: # Halve repeatedly until the cumulative decimation matches # the requested factor. Validation has already established @@ -490,14 +513,10 @@ def _gpu_compress_to_part(gpu_arr, w, h, spp): current = make_overview_gpu(current, method=overview_resampling, nodata=nodata) cumulative_factor *= 2 - if (nodata is not None - and np.dtype(str(current.dtype)).kind == 'f' - and not np.isnan(float(nodata))): + if rewrite_nodata: nan_mask = cupy.isnan(current) if bool(nan_mask.any().item()): - current = current.copy() - current[nan_mask] = np.dtype( - str(current.dtype)).type(nodata) + cupy.putmask(current, nan_mask, sentinel_scalar) oh, ow = current.shape[:2] parts.append(_gpu_compress_to_part(current, ow, oh, samples)) diff --git a/xrspatial/geotiff/tests/test_gpu_writer_overview_inplace_1948.py b/xrspatial/geotiff/tests/test_gpu_writer_overview_inplace_1948.py new file mode 100644 index 00000000..acf74c54 --- /dev/null +++ b/xrspatial/geotiff/tests/test_gpu_writer_overview_inplace_1948.py @@ -0,0 +1,188 @@ +"""Regression tests for issue #1948. + +The COG overview generation loop inside ``write_geotiff_gpu`` used to +replace the in-place NaN-to-sentinel rewrite with +``current = current.copy(); current[nan_mask] = sentinel``. Every call +to ``make_overview_gpu`` returns a freshly allocated cupy buffer +(``_block_reduce_2d_gpu`` ends in ``cupy.nan*`` / ``cupy.around(...).astype(...)`` / +``cropped[::2, ::2].copy()``, the 3-D path ends in ``cupy.stack``), so +nothing else aliases the buffer between the return and the rewrite. The +``.copy()`` allocated a second chunk-sized GPU buffer per overview +level for no semantic gain. + +The fix swaps the rewrite to ``cupy.putmask(current, nan_mask, sentinel)`` +which mutates the existing buffer in place. This drops one chunk-sized +device allocation per overview level and mirrors the in-place sentinel +rewrite ``_apply_nodata_mask_gpu`` adopted in #1934. + +Two guards here: + +1. Correctness -- a COG write with a sentinel-bearing float raster + round-trips through the GPU writer and back to the CPU reader with + the sentinel preserved in every overview level. +2. Structural -- the writer source uses ``cupy.putmask`` and no longer + carries the redundant ``current = current.copy()`` line in the + overview loop. +""" +from __future__ import annotations + +import os +import tempfile + +import numpy as np +import xarray as xr + +from xrspatial.geotiff.tests.conftest import requires_gpu as _gpu_only + + +def _make_float_raster_with_nodata(height: int, width: int, sentinel: float): + """Build a float32 raster whose sentinel pixels would poison overviews. + + Reserve a contiguous all-sentinel sub-rectangle so that every 2x + decimation step still contains at least one fully-sentinel 2x2 + block. ``_block_reduce_2d_gpu`` masks the sentinel back to NaN + before reducing, so a partially-sentinel block returns a valid mean + and the overview pixel never becomes NaN; only a fully-sentinel + block triggers the NaN->sentinel rewrite branch the fix touches. + """ + import cupy + + arr = np.random.RandomState(1948).rand(height, width).astype(np.float32) + # Bottom-right quadrant becomes all-sentinel so every 2x reduction + # of that region stays all-sentinel and the NaN rewrite branch + # fires at every overview level. + arr[height // 2:, width // 2:] = sentinel + return cupy.asarray(arr) + + +def test_gpu_writer_overview_loop_uses_putmask_1948(): + """Source-level guard against the redundant ``current.copy()``. + + The fix replaces the ``current = current.copy(); current[nan_mask] = ...`` + pair with ``cupy.putmask(current, nan_mask, ...)`` so the buffer + ``make_overview_gpu`` returned is mutated in place. This guard + fires if anyone reintroduces the redundant copy. + """ + import pathlib + + src_path = pathlib.Path(__file__).parent.parent / "_writers" / "gpu.py" + src = src_path.read_text() + # The overview loop should call cupy.putmask, not current.copy() + indexed write. + assert "cupy.putmask(" in src, ( + "write_geotiff_gpu overview loop should use cupy.putmask " + "for the in-place NaN->sentinel rewrite (issue #1948)." + ) + # Confirm the in-place pattern is the one inside the overview branch, + # not somewhere unrelated. The pattern's location follows the + # ``make_overview_gpu`` call and the ``nan_mask = cupy.isnan(current)`` + # gate; assert the order is preserved. + idx_overview = src.find("make_overview_gpu(current") + idx_putmask = src.find("cupy.putmask(", idx_overview) + assert idx_overview != -1 and idx_putmask != -1, ( + "could not locate the overview-loop in-place rewrite" + ) + # The legacy two-line pattern would have ``current = current.copy()`` + # right before the indexed write. Ensure the overview branch no + # longer contains that exact line. Anchor the slice on the next + # statement after the inner ``while`` (``parts.append(...)``) so + # the window tracks the real loop body instead of a fixed + # character count that drifts as surrounding code changes. + idx_parts_append = src.find("parts.append(", idx_overview) + assert idx_parts_append != -1, ( + "could not locate the ``parts.append(`` sentinel that closes " + "the overview-loop body" + ) + overview_branch = src[idx_overview:idx_parts_append] + assert "current = current.copy()" not in overview_branch, ( + "overview loop should no longer copy the cupy buffer before " + "the in-place sentinel rewrite (issue #1948)." + ) + + +@_gpu_only +def test_gpu_writer_cog_overview_sentinel_roundtrip_1948(): + """COG write -> CPU read preserves the sentinel through every overview. + + Regression check on the correctness of the in-place rewrite. If + ``cupy.putmask`` ever drifts away from the ``current.copy()`` + + indexed-write semantics (e.g. broadcasts the sentinel as the wrong + dtype) the round-trip would surface NaN where the sentinel should + be, breaking external-reader masking. The test is gated on a GPU + because the overview loop only runs on the GPU writer code path. + """ + from xrspatial.geotiff import open_geotiff, write_geotiff_gpu + + sentinel = np.float32(-9999.0) + height, width = 1024, 1024 + arr_gpu = _make_float_raster_with_nodata(height, width, float(sentinel)) + + da = xr.DataArray( + arr_gpu, dims=["y", "x"], + coords={"y": np.arange(height), "x": np.arange(width)}, + attrs={"crs": 4326, "nodata": float(sentinel)}, + ) + + with tempfile.TemporaryDirectory() as td: + path = os.path.join(td, "tmp_1948_cog.tif") + write_geotiff_gpu( + da, path, compression="deflate", tile_size=256, + cog=True, overview_levels=[2, 4], + ) + + # Read full-resolution and the two overview levels. + full = open_geotiff(path) + ov1 = open_geotiff(path, overview_level=1) + ov2 = open_geotiff(path, overview_level=2) + + # Full-resolution: sentinel pixels survive as NaN (the read path + # masks the sentinel back to NaN since attrs['nodata'] is set). + assert np.isnan(full.values).any(), ( + "full-resolution overview lost its sentinel pixels" + ) + assert ov1.shape == (height // 2, width // 2), ( + f"overview level 1 shape {ov1.shape}, expected " + f"({height // 2}, {width // 2})" + ) + assert ov2.shape == (height // 4, width // 4), ( + f"overview level 2 shape {ov2.shape}, expected " + f"({height // 4}, {width // 4})" + ) + # Each overview must still mask the sentinel. With the prior + # ``current.copy()`` pattern the test passed because the buffer was + # copied; with ``cupy.putmask`` the test still passes because the + # buffer is mutated in place. This guard fires only if the rewrite + # is dropped entirely or sentinel dtype/promotion semantics change. + assert np.isnan(ov1.values).any(), ( + "overview level 1 lost its sentinel pixels after the in-place rewrite" + ) + assert np.isnan(ov2.values).any(), ( + "overview level 2 lost its sentinel pixels after the in-place rewrite" + ) + + +@_gpu_only +def test_gpu_writer_overview_uses_make_overview_gpu_fresh_buffer_1948(): + """``make_overview_gpu`` returns a freshly allocated cupy buffer. + + The in-place rewrite contract relies on ``make_overview_gpu`` + handing back a buffer that no caller-visible state aliases. This + guard exercises every overview-method path inside + ``_block_reduce_2d_gpu`` and confirms the returned buffer is a + different cupy.ndarray than the input. + """ + import cupy + + from xrspatial.geotiff._gpu_decode import make_overview_gpu + + rng = np.random.RandomState(1948) + src_np = rng.rand(64, 64).astype(np.float32) + src_gpu = cupy.asarray(src_np) + + for method in ("mean", "nearest", "min", "max", "median"): + out = make_overview_gpu(src_gpu, method=method, nodata=None) + assert int(out.data.ptr) != int(src_gpu.data.ptr), ( + f"make_overview_gpu(method={method!r}) returned an aliased buffer" + ) + assert out.shape == (32, 32), ( + f"make_overview_gpu(method={method!r}) returned shape {out.shape}" + )