From 26e5153574da7b3d3a4e1749c0d960ca7547e9a6 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Thu, 14 May 2026 20:32:04 -0700
Subject: [PATCH] geotiff: extract GPU helpers to _backends/_gpu_helpers.py
 (#1884)

Step 6 of #1813's multi-PR refactor of __init__.py. Pure code motion;
no public API change. First PR to create the _backends/ subpackage.

Created xrspatial/geotiff/_backends/__init__.py (empty placeholder)
and moved into _backends/_gpu_helpers.py:

- _is_gpu_data: cupy-backed array / DataArray detection.
- _apply_nodata_mask_gpu: cupy mirror of the CPU sentinel-to-NaN mask
  (handles float and integer arrays, with the same out-of-range /
  fractional / non-finite skips as _resolve_masked_fill in
  _reader.py).
- _gpu_decode_single_band_tiles: two-stage GPU decode (GDS then
  CPU-extracted bytes) for the PlanarConfiguration=2 path.
- _apply_orientation_gpu: cupy mirror of the CPU orientation-flip
  helper for the eight TIFF Orientation tag values.
- _apply_orientation_geo_info: GeoTransform update after an
  orientation flip, with the #1765 refusal for georeferenced
  orientations 5-8.
- _gpu_apply_window_band: post-decode window + band slice on device.

Every helper lazy-imports cupy at call time so the module imports
cleanly on numpy-only environments. __init__.py re-imports the
helpers from _backends._gpu_helpers so existing callers
(read_geotiff_gpu, to_geotiff's GPU branch, write_geotiff_gpu) keep
their identical signatures.

Leading-underscore module name (_gpu_helpers.py) keeps _backends/gpu.py
free for the read_geotiff_gpu entry-point body in step 7.

Net __init__.py change: 4271 -> 3969 lines (-302). _gpu_helpers.py is
335 lines.

Verification: pixel-parity matrix from #1889 (192 cells, all GPU
cells included), runtime identity (9), GPU byteswap (#1508), overview
nodata inheritance (#1739) -- 230/230 pass.

Refs #1813.
---
 xrspatial/geotiff/__init__.py               | 318 +------------------
 xrspatial/geotiff/_backends/__init__.py     |   8 +
 xrspatial/geotiff/_backends/_gpu_helpers.py | 335 ++++++++++++++++++++
 3 files changed, 351 insertions(+), 310 deletions(-)
 create mode 100644 xrspatial/geotiff/_backends/__init__.py
 create mode 100644 xrspatial/geotiff/_backends/_gpu_helpers.py

diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index 2d3fa45f..f0b4cde8 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -68,6 +68,14 @@
     _populate_attrs_from_geo_info,
     _resolve_nodata_attr,
 )
+from ._backends._gpu_helpers import (
+    _apply_nodata_mask_gpu,
+    _apply_orientation_geo_info,
+    _apply_orientation_gpu,
+    _gpu_apply_window_band,
+    _gpu_decode_single_band_tiles,
+    _is_gpu_data,
+)
 from ._crs import _resolve_crs_to_wkt, _wkt_to_epsg
 from ._reader import read_to_array as _read_to_array
 from ._runtime import (
@@ -503,76 +511,6 @@ def open_geotiff(source: str | BinaryIO, *,
     return da
 
 
-def _is_gpu_data(data) -> bool:
-    """Check if data is CuPy-backed (raw array or DataArray)."""
-    try:
-        import cupy
-        _cupy_type = cupy.ndarray
-    except ImportError:
-        return False
-
-    if isinstance(data, xr.DataArray):
-        raw = data.data
-        if hasattr(raw, 'compute'):
-            meta = getattr(raw, '_meta', None)
-            return isinstance(meta, _cupy_type)
-        return isinstance(raw, _cupy_type)
-    return isinstance(data, _cupy_type)
-
-
-def _apply_nodata_mask_gpu(arr_gpu, nodata):
-    """Replace nodata sentinel values with NaN on a CuPy array.
-
-    Mirrors the CPU eager path in :func:`open_geotiff` (lines around the
-    ``# Apply nodata mask`` comment). Float arrays get NaN substituted in
-    place of the sentinel; integer arrays are promoted to float64 with NaN
-    where the sentinel was. NaN nodata on a float array is a no-op (the
-    sentinel already matches NaN-aware code).
-
-    Returns the (possibly promoted, possibly nodata-masked) CuPy array.
-    The caller is responsible for setting ``attrs['nodata']`` so the
-    sentinel is still discoverable downstream.
-    """
-    import cupy
-
-    if nodata is None:
-        return arr_gpu
-    arr_dtype = np.dtype(str(arr_gpu.dtype))
-    if arr_dtype.kind == 'f':
-        if not np.isnan(nodata):
-            sentinel = arr_dtype.type(nodata)
-            arr_gpu = cupy.where(arr_gpu == sentinel,
-                                 arr_dtype.type('nan'), arr_gpu)
-        return arr_gpu
-    if arr_dtype.kind in ('u', 'i'):
-        # Out-of-range sentinels (e.g. uint16 + GDAL_NODATA="-9999") cannot
-        # match any decoded pixel; skip the cast that would otherwise raise
-        # OverflowError. A non-finite sentinel ("NaN" / "Inf" GDAL_NODATA
-        # strings) also cannot match an integer pixel and would raise
-        # ValueError on ``int(nodata)``; gate on ``np.isfinite`` first to
-        # mirror ``_resolve_masked_fill`` in ``_reader.py`` (#1774). A
-        # fractional sentinel (e.g. ``"3.5"`` on a ``uint16`` file) also
-        # cannot match an integer pixel and ``int(3.5)`` would truncate
-        # to 3, silently masking a real pixel value; gate on
-        # ``float(nodata).is_integer()`` as well (mirrors the
-        # ``_writer.py`` / ``_vrt.py`` pattern used for #1564 / #1616).
-        # attrs['nodata'] is still set by the caller so the original
-        # sentinel survives a write round-trip.
-        if not (np.isfinite(nodata) and float(nodata).is_integer()):
-            return arr_gpu
-        nodata_int = int(nodata)
-        info = np.iinfo(arr_dtype)
-        if not (info.min <= nodata_int <= info.max):
-            return arr_gpu
-        sentinel = arr_dtype.type(nodata_int)
-        mask = arr_gpu == sentinel
-        if bool(mask.any().item()):
-            arr_gpu = arr_gpu.astype(cupy.float64)
-            arr_gpu = cupy.where(mask, cupy.float64('nan'), arr_gpu)
-        return arr_gpu
-    return arr_gpu
-
-
 def to_geotiff(data: xr.DataArray | np.ndarray,
                path: str | BinaryIO, *,
                crs: int | str | None = None,
@@ -1849,246 +1787,6 @@ def _read(http_meta):
     return _read(http_meta_key)
 
 
-def _gpu_decode_single_band_tiles(
-    source, lazy_data, offsets, byte_counts,
-    tw, th, width, height,
-    compression, predictor, file_dtype,
-    *,
-    byte_order: str,
-    gpu: str,
-):
-    """Decode one band's tile sequence into a 2-D ``(H, W)`` cupy array.
-
-    Helper for the ``PlanarConfiguration=2`` GPU path: the existing
-    ``gpu_decode_tiles`` / ``gpu_decode_tiles_from_file`` kernels assume
-    a single chunky tile sequence with ``bytes_per_pixel = itemsize *
-    samples``. For planar=2 each band has its own list of tiles, so we
-    invoke those kernels once per band with ``samples=1`` and stack the
-    resulting 2-D arrays into ``(H, W, samples)`` afterwards.
-
-    Mirrors the two-stage GPU pipeline in ``read_geotiff_gpu`` -- GDS
-    first, then CPU-extracted-tiles GPU decode. ``lazy_data`` is a
-    zero-arg callable that returns the full file bytes; it caches its
-    result so the first band that needs the stage-2 fallback pays the
-    ``read_all()``, and subsequent bands reuse the same buffer. When
-    every band's GDS path succeeds the file is never read at all.
-    Sparse tiles are not expected here; the caller routes sparse files
-    to the CPU path.
-
-    Auto-mode semantics: a stage-1 GDS failure warns and falls through
-    to stage 2; a stage-2 failure warns and returns ``None`` so the
-    caller can trigger a whole-image CPU fallback (a per-band CPU
-    decode would require a single-band CPU path keyed off planar
-    config, which doesn't exist). Strict mode re-raises from either
-    stage.
-    """
-    from ._gpu_decode import gpu_decode_tiles, gpu_decode_tiles_from_file
-
-    arr_gpu = None
-    try:
-        arr_gpu = gpu_decode_tiles_from_file(
-            source, offsets, byte_counts,
-            tw, th, width, height,
-            compression, predictor, file_dtype, 1,
-            byte_order=byte_order,
-        )
-    except Exception as e:
-        if gpu == 'strict' or _geotiff_strict_mode():
-            raise
-        warnings.warn(
-            f"read_geotiff_gpu: GPU decode failed "
-            f"({type(e).__name__}: {e}); falling back to CPU.",
-            RuntimeWarning,
-            stacklevel=3,
-        )
-        arr_gpu = None
-
-    if arr_gpu is None:
-        shared_data = lazy_data()
-        compressed_tiles = [
-            bytes(shared_data[offsets[i]:offsets[i] + byte_counts[i]])
-            for i in range(len(offsets))
-        ]
-        try:
-            arr_gpu = gpu_decode_tiles(
-                compressed_tiles,
-                tw, th, width, height,
-                compression, predictor, file_dtype, 1,
-                byte_order=byte_order,
-            )
-        except Exception as e:
-            if gpu == 'strict' or _geotiff_strict_mode():
-                raise
-            warnings.warn(
-                f"read_geotiff_gpu: GPU decode failed "
-                f"({type(e).__name__}: {e}); falling back to CPU.",
-                RuntimeWarning,
-                stacklevel=3,
-            )
-            return None
-
-    if arr_gpu.ndim != 2 or arr_gpu.shape != (height, width):
-        raise RuntimeError(
-            f"single-band GPU tile decode produced shape "
-            f"{arr_gpu.shape}, expected ({height}, {width})"
-        )
-    return arr_gpu
-
-
-def _apply_orientation_gpu(arr_gpu, orientation: int):
-    """cupy-side mirror of :func:`._reader._apply_orientation`.
-
-    The CPU reader applies the TIFF Orientation tag (274) post-decode so
-    pixel (0, 0) is always the visual top-left. The GPU read path used
-    to skip this remap, so reads of any file with orientation != 1
-    returned different pixel buffers than the CPU reader (#1540).
-
-    Same eight orientations the CPU helper handles. Operates on a cupy
-    ndarray and returns a cupy ndarray; ``cupy.ascontiguousarray`` is
-    applied so downstream views (DataArray.data) work without surprise
-    re-strides on the GPU.
-    """
-    import cupy
-    if orientation == 1:
-        return arr_gpu
-    if orientation == 2:
-        return cupy.ascontiguousarray(arr_gpu[:, ::-1])
-    if orientation == 3:
-        return cupy.ascontiguousarray(arr_gpu[::-1, ::-1])
-    if orientation == 4:
-        return cupy.ascontiguousarray(arr_gpu[::-1, :])
-    if arr_gpu.ndim == 3:
-        if orientation == 5:
-            return cupy.ascontiguousarray(arr_gpu.transpose(1, 0, 2))
-        if orientation == 6:
-            return cupy.ascontiguousarray(arr_gpu.transpose(1, 0, 2)[:, ::-1])
-        if orientation == 7:
-            return cupy.ascontiguousarray(
-                arr_gpu.transpose(1, 0, 2)[::-1, ::-1])
-        if orientation == 8:
-            return cupy.ascontiguousarray(arr_gpu.transpose(1, 0, 2)[::-1, :])
-    else:
-        if orientation == 5:
-            return cupy.ascontiguousarray(arr_gpu.T)
-        if orientation == 6:
-            return cupy.ascontiguousarray(arr_gpu.T[:, ::-1])
-        if orientation == 7:
-            return cupy.ascontiguousarray(arr_gpu.T[::-1, ::-1])
-        if orientation == 8:
-            return cupy.ascontiguousarray(arr_gpu.T[::-1, :])
-    raise ValueError(
-        f"Invalid TIFF Orientation tag value: {orientation} "
-        f"(must be 1-8 per TIFF 6.0)"
-    )
-
-
-def _apply_orientation_geo_info(geo_info, orientation: int,
-                                file_h: int, file_w: int):
-    """Mirror the transform updates `_reader.read_to_array` does post-flip.
-
-    Centralised so both ``read_to_array`` (CPU) and ``read_geotiff_gpu``
-    (this module) update the GeoTransform consistently. Operates only
-    on ``geo_info.transform``; the rest of the GeoInfo struct stays as
-    parsed.
-    """
-    if orientation == 1 or geo_info is None or geo_info.transform is None:
-        return geo_info
-    t = geo_info.transform
-    if orientation in (2, 3, 4):
-        if geo_info.raster_type == RASTER_PIXEL_IS_POINT:
-            x_shift = file_w - 1
-            y_shift = file_h - 1
-        else:
-            x_shift = file_w
-            y_shift = file_h
-        new_origin_x = t.origin_x
-        new_origin_y = t.origin_y
-        new_px_w = t.pixel_width
-        new_px_h = t.pixel_height
-        if orientation in (2, 3):
-            new_origin_x = t.origin_x + x_shift * t.pixel_width
-            new_px_w = -t.pixel_width
-        if orientation in (3, 4):
-            new_origin_y = t.origin_y + y_shift * t.pixel_height
-            new_px_h = -t.pixel_height
-        geo_info.transform = GeoTransform(
-            origin_x=new_origin_x,
-            origin_y=new_origin_y,
-            pixel_width=new_px_w,
-            pixel_height=new_px_h,
-        )
-    elif orientation in (5, 6, 7, 8):
-        # Match the CPU reader's #1765 refusal: a pixel-size swap alone
-        # cannot express the per-orientation origin shift plus rotation
-        # these orientations require, so the x/y coords would be wrong.
-        # ``has_georef`` is True for any file carrying ModelTransformation,
-        # ModelPixelScale, or ModelTiepoint, with or without a CRS tag, so
-        # gate on that flag rather than CRS presence.
-        if getattr(geo_info, 'has_georef', False):
-            raise NotImplementedError(
-                f"TIFF Orientation {orientation} on a georeferenced file "
-                f"requires a per-orientation origin shift plus a rotation "
-                f"that the axis-aligned GeoTransform used here cannot "
-                f"represent, so the returned x/y coords would be wrong. "
-                f"Reproject the file with another tool (e.g. GDAL) or "
-                f"strip the Orientation tag before reading. See issue "
-                f"#1765."
-            )
-        # Non-georeferenced file: swap pixel sizes to match the
-        # transposed array shape. No geographic claim to violate.
-        geo_info.transform = GeoTransform(
-            origin_x=t.origin_x,
-            origin_y=t.origin_y,
-            pixel_width=t.pixel_height,
-            pixel_height=t.pixel_width,
-        )
-    return geo_info
-
-
-def _gpu_apply_window_band(arr_gpu, geo_info, *, window, band):
-    """Slice a fully-decoded GPU array down to a window and/or band.
-
-    Used by ``read_geotiff_gpu`` to keep the public surface in line with
-    ``open_geotiff`` and ``read_geotiff_dask``: callers can pass ``window``
-    and ``band``, and the returned DataArray covers exactly that subset.
-
-    The current implementation slices on device after the full-image GPU
-    decode is complete. That preserves correctness but does no I/O
-    savings -- a future PR can short-circuit tile decode for partial
-    windows. For ``band`` selection, the savings are also post-decode
-    because the planar=1 (chunky) tile assembly returns all bands in a
-    single GPU buffer.
-
-    Returns ``(arr_gpu, coords)`` where ``coords`` is a dict with
-    ``y`` / ``x`` numpy arrays sized to the output array. The caller is
-    responsible for setting ``attrs['transform']`` to the windowed origin
-    via ``_populate_attrs_from_geo_info(..., window=window)`` so the array
-    and the transform agree.
-    """
-    if window is not None:
-        r0, c0, r1, c1 = window
-        arr_gpu = arr_gpu[r0:r1, c0:c1]
-        out_h = r1 - r0
-        out_w = c1 - c0
-        # Mirror the eager-numpy windowed coord computation in
-        # open_geotiff so the GPU-windowed coords carry the same
-        # absolute pixel-center values as the CPU path. For files
-        # with no GeoTIFF tags (``has_georef=False``), fall back to
-        # integer pixel coords matching ``_geo_to_coords`` (#1710).
-        coords = _coords_from_geo_info(
-            geo_info, out_h, out_w, window=window,
-        )
-    else:
-        out_h = arr_gpu.shape[0]
-        out_w = arr_gpu.shape[1]
-        coords = _geo_to_coords(geo_info, out_h, out_w)
-
-    if band is not None and arr_gpu.ndim == 3:
-        arr_gpu = arr_gpu[:, :, band]
-
-    return arr_gpu, coords
-
-
 def read_geotiff_gpu(source: str, *,
                      dtype: str | np.dtype | None = None,
                      overview_level: int | None = None,
diff --git a/xrspatial/geotiff/_backends/__init__.py b/xrspatial/geotiff/_backends/__init__.py
new file mode 100644
index 00000000..a26615d5
--- /dev/null
+++ b/xrspatial/geotiff/_backends/__init__.py
@@ -0,0 +1,8 @@
+"""Per-backend implementations for the geotiff read entry points.
+
+Step 6 of issue #1813 creates this subpackage and adds
+``_gpu_helpers.py``. Later steps add ``gpu.py``, ``dask.py``, and
+``vrt.py`` for the backend entry-point bodies. The package
+``__init__`` stays empty so nothing leaks into ``xrspatial.geotiff``
+through implicit re-exports.
+"""
diff --git a/xrspatial/geotiff/_backends/_gpu_helpers.py b/xrspatial/geotiff/_backends/_gpu_helpers.py
new file mode 100644
index 00000000..49399c40
--- /dev/null
+++ b/xrspatial/geotiff/_backends/_gpu_helpers.py
@@ -0,0 +1,335 @@
+"""GPU-only helpers consumed by ``read_geotiff_gpu`` and the GPU write path.
+
+Lazy-imports cupy at call time so the module imports cleanly on
+numpy-only environments. Every public entry point that touches GPU
+data routes through one of these helpers, so a single canonical copy
+keeps nodata-mask, orientation-flip, and window/band-slice semantics
+in lockstep with the CPU reader.
+
+Extracted in step 6 of issue #1813 so the next PR (step 7) can move
+``read_geotiff_gpu`` itself into ``_backends/gpu.py`` as a near-
+mechanical lift.
+"""
+from __future__ import annotations
+
+import warnings
+
+import numpy as np
+import xarray as xr
+
+from .._coords import (
+    coords_from_geo_info as _coords_from_geo_info,
+    geo_to_coords as _geo_to_coords,
+)
+from .._geotags import GeoTransform, RASTER_PIXEL_IS_POINT
+from .._runtime import _geotiff_strict_mode
+
+
+def _is_gpu_data(data) -> bool:
+    """Check if data is CuPy-backed (raw array or DataArray)."""
+    try:
+        import cupy
+        _cupy_type = cupy.ndarray
+    except ImportError:
+        return False
+
+    if isinstance(data, xr.DataArray):
+        raw = data.data
+        if hasattr(raw, 'compute'):
+            meta = getattr(raw, '_meta', None)
+            return isinstance(meta, _cupy_type)
+        return isinstance(raw, _cupy_type)
+    return isinstance(data, _cupy_type)
+
+
+def _apply_nodata_mask_gpu(arr_gpu, nodata):
+    """Replace nodata sentinel values with NaN on a CuPy array.
+
+    Mirrors the CPU eager path in :func:`open_geotiff` (lines around the
+    ``# Apply nodata mask`` comment). Float arrays get NaN substituted in
+    place of the sentinel; integer arrays are promoted to float64 with NaN
+    where the sentinel was. NaN nodata on a float array is a no-op (the
+    sentinel already matches NaN-aware code).
+
+    Returns the (possibly promoted, possibly nodata-masked) CuPy array.
+    The caller is responsible for setting ``attrs['nodata']`` so the
+    sentinel is still discoverable downstream.
+    """
+    import cupy
+
+    if nodata is None:
+        return arr_gpu
+    arr_dtype = np.dtype(str(arr_gpu.dtype))
+    if arr_dtype.kind == 'f':
+        if not np.isnan(nodata):
+            sentinel = arr_dtype.type(nodata)
+            arr_gpu = cupy.where(arr_gpu == sentinel,
+                                 arr_dtype.type('nan'), arr_gpu)
+        return arr_gpu
+    if arr_dtype.kind in ('u', 'i'):
+        # Out-of-range sentinels (e.g. uint16 + GDAL_NODATA="-9999") cannot
+        # match any decoded pixel; skip the cast that would otherwise raise
+        # OverflowError. A non-finite sentinel ("NaN" / "Inf" GDAL_NODATA
+        # strings) also cannot match an integer pixel and would raise
+        # ValueError on ``int(nodata)``; gate on ``np.isfinite`` first to
+        # mirror ``_resolve_masked_fill`` in ``_reader.py`` (#1774). A
+        # fractional sentinel (e.g. ``"3.5"`` on a ``uint16`` file) also
+        # cannot match an integer pixel and ``int(3.5)`` would truncate
+        # to 3, silently masking a real pixel value; gate on
+        # ``float(nodata).is_integer()`` as well (mirrors the
+        # ``_writer.py`` / ``_vrt.py`` pattern used for #1564 / #1616).
+        # attrs['nodata'] is still set by the caller so the original
+        # sentinel survives a write round-trip.
+        if not (np.isfinite(nodata) and float(nodata).is_integer()):
+            return arr_gpu
+        nodata_int = int(nodata)
+        info = np.iinfo(arr_dtype)
+        if not (info.min <= nodata_int <= info.max):
+            return arr_gpu
+        sentinel = arr_dtype.type(nodata_int)
+        mask = arr_gpu == sentinel
+        if bool(mask.any().item()):
+            arr_gpu = arr_gpu.astype(cupy.float64)
+            arr_gpu = cupy.where(mask, cupy.float64('nan'), arr_gpu)
+        return arr_gpu
+    return arr_gpu
+
+
+def _gpu_decode_single_band_tiles(
+    source, lazy_data, offsets, byte_counts,
+    tw, th, width, height,
+    compression, predictor, file_dtype,
+    *,
+    byte_order: str,
+    gpu: str,
+):
+    """Decode one band's tile sequence into a 2-D ``(H, W)`` cupy array.
+
+    Helper for the ``PlanarConfiguration=2`` GPU path: the existing
+    ``gpu_decode_tiles`` / ``gpu_decode_tiles_from_file`` kernels assume
+    a single chunky tile sequence with ``bytes_per_pixel = itemsize *
+    samples``. For planar=2 each band has its own list of tiles, so we
+    invoke those kernels once per band with ``samples=1`` and stack the
+    resulting 2-D arrays into ``(H, W, samples)`` afterwards.
+
+    Mirrors the two-stage GPU pipeline in ``read_geotiff_gpu`` -- GDS
+    first, then CPU-extracted-tiles GPU decode. ``lazy_data`` is a
+    zero-arg callable that returns the full file bytes; it caches its
+    result so the first band that needs the stage-2 fallback pays the
+    ``read_all()``, and subsequent bands reuse the same buffer. When
+    every band's GDS path succeeds the file is never read at all.
+    Sparse tiles are not expected here; the caller routes sparse files
+    to the CPU path.
+
+    Auto-mode semantics: a stage-1 GDS failure warns and falls through
+    to stage 2; a stage-2 failure warns and returns ``None`` so the
+    caller can trigger a whole-image CPU fallback (a per-band CPU
+    decode would require a single-band CPU path keyed off planar
+    config, which doesn't exist). Strict mode re-raises from either
+    stage.
+    """
+    from .._gpu_decode import gpu_decode_tiles, gpu_decode_tiles_from_file
+
+    arr_gpu = None
+    try:
+        arr_gpu = gpu_decode_tiles_from_file(
+            source, offsets, byte_counts,
+            tw, th, width, height,
+            compression, predictor, file_dtype, 1,
+            byte_order=byte_order,
+        )
+    except Exception as e:
+        if gpu == 'strict' or _geotiff_strict_mode():
+            raise
+        warnings.warn(
+            f"read_geotiff_gpu: GPU decode failed "
+            f"({type(e).__name__}: {e}); falling back to CPU.",
+            RuntimeWarning,
+            stacklevel=3,
+        )
+        arr_gpu = None
+
+    if arr_gpu is None:
+        shared_data = lazy_data()
+        compressed_tiles = [
+            bytes(shared_data[offsets[i]:offsets[i] + byte_counts[i]])
+            for i in range(len(offsets))
+        ]
+        try:
+            arr_gpu = gpu_decode_tiles(
+                compressed_tiles,
+                tw, th, width, height,
+                compression, predictor, file_dtype, 1,
+                byte_order=byte_order,
+            )
+        except Exception as e:
+            if gpu == 'strict' or _geotiff_strict_mode():
+                raise
+            warnings.warn(
+                f"read_geotiff_gpu: GPU decode failed "
+                f"({type(e).__name__}: {e}); falling back to CPU.",
+                RuntimeWarning,
+                stacklevel=3,
+            )
+            return None
+
+    if arr_gpu.ndim != 2 or arr_gpu.shape != (height, width):
+        raise RuntimeError(
+            f"single-band GPU tile decode produced shape "
+            f"{arr_gpu.shape}, expected ({height}, {width})"
+        )
+    return arr_gpu
+
+
+def _apply_orientation_gpu(arr_gpu, orientation: int):
+    """cupy-side mirror of :func:`._reader._apply_orientation`.
+
+    The CPU reader applies the TIFF Orientation tag (274) post-decode so
+    pixel (0, 0) is always the visual top-left. The GPU read path used
+    to skip this remap, so reads of any file with orientation != 1
+    returned different pixel buffers than the CPU reader (#1540).
+
+    Same eight orientations the CPU helper handles. Operates on a cupy
+    ndarray and returns a cupy ndarray; ``cupy.ascontiguousarray`` is
+    applied so downstream views (DataArray.data) work without surprise
+    re-strides on the GPU.
+    """
+    import cupy
+    if orientation == 1:
+        return arr_gpu
+    if orientation == 2:
+        return cupy.ascontiguousarray(arr_gpu[:, ::-1])
+    if orientation == 3:
+        return cupy.ascontiguousarray(arr_gpu[::-1, ::-1])
+    if orientation == 4:
+        return cupy.ascontiguousarray(arr_gpu[::-1, :])
+    if arr_gpu.ndim == 3:
+        if orientation == 5:
+            return cupy.ascontiguousarray(arr_gpu.transpose(1, 0, 2))
+        if orientation == 6:
+            return cupy.ascontiguousarray(arr_gpu.transpose(1, 0, 2)[:, ::-1])
+        if orientation == 7:
+            return cupy.ascontiguousarray(
+                arr_gpu.transpose(1, 0, 2)[::-1, ::-1])
+        if orientation == 8:
+            return cupy.ascontiguousarray(arr_gpu.transpose(1, 0, 2)[::-1, :])
+    else:
+        if orientation == 5:
+            return cupy.ascontiguousarray(arr_gpu.T)
+        if orientation == 6:
+            return cupy.ascontiguousarray(arr_gpu.T[:, ::-1])
+        if orientation == 7:
+            return cupy.ascontiguousarray(arr_gpu.T[::-1, ::-1])
+        if orientation == 8:
+            return cupy.ascontiguousarray(arr_gpu.T[::-1, :])
+    raise ValueError(
+        f"Invalid TIFF Orientation tag value: {orientation} "
+        f"(must be 1-8 per TIFF 6.0)"
+    )
+
+
+def _apply_orientation_geo_info(geo_info, orientation: int,
+                                file_h: int, file_w: int):
+    """Mirror the transform updates `_reader.read_to_array` does post-flip.
+
+    Centralised so both ``read_to_array`` (CPU) and ``read_geotiff_gpu``
+    (this module) update the GeoTransform consistently. Operates only
+    on ``geo_info.transform``; the rest of the GeoInfo struct stays as
+    parsed.
+    """
+    if orientation == 1 or geo_info is None or geo_info.transform is None:
+        return geo_info
+    t = geo_info.transform
+    if orientation in (2, 3, 4):
+        if geo_info.raster_type == RASTER_PIXEL_IS_POINT:
+            x_shift = file_w - 1
+            y_shift = file_h - 1
+        else:
+            x_shift = file_w
+            y_shift = file_h
+        new_origin_x = t.origin_x
+        new_origin_y = t.origin_y
+        new_px_w = t.pixel_width
+        new_px_h = t.pixel_height
+        if orientation in (2, 3):
+            new_origin_x = t.origin_x + x_shift * t.pixel_width
+            new_px_w = -t.pixel_width
+        if orientation in (3, 4):
+            new_origin_y = t.origin_y + y_shift * t.pixel_height
+            new_px_h = -t.pixel_height
+        geo_info.transform = GeoTransform(
+            origin_x=new_origin_x,
+            origin_y=new_origin_y,
+            pixel_width=new_px_w,
+            pixel_height=new_px_h,
+        )
+    elif orientation in (5, 6, 7, 8):
+        # Match the CPU reader's #1765 refusal: a pixel-size swap alone
+        # cannot express the per-orientation origin shift plus rotation
+        # these orientations require, so the x/y coords would be wrong.
+        # ``has_georef`` is True for any file carrying ModelTransformation,
+        # ModelPixelScale, or ModelTiepoint, with or without a CRS tag, so
+        # gate on that flag rather than CRS presence.
+        if getattr(geo_info, 'has_georef', False):
+            raise NotImplementedError(
+                f"TIFF Orientation {orientation} on a georeferenced file "
+                f"requires a per-orientation origin shift plus a rotation "
+                f"that the axis-aligned GeoTransform used here cannot "
+                f"represent, so the returned x/y coords would be wrong. "
+                f"Reproject the file with another tool (e.g. GDAL) or "
+                f"strip the Orientation tag before reading. See issue "
+                f"#1765."
+            )
+        # Non-georeferenced file: swap pixel sizes to match the
+        # transposed array shape. No geographic claim to violate.
+        geo_info.transform = GeoTransform(
+            origin_x=t.origin_x,
+            origin_y=t.origin_y,
+            pixel_width=t.pixel_height,
+            pixel_height=t.pixel_width,
+        )
+    return geo_info
+
+
+def _gpu_apply_window_band(arr_gpu, geo_info, *, window, band):
+    """Slice a fully-decoded GPU array down to a window and/or band.
+
+    Used by ``read_geotiff_gpu`` to keep the public surface in line with
+    ``open_geotiff`` and ``read_geotiff_dask``: callers can pass ``window``
+    and ``band``, and the returned DataArray covers exactly that subset.
+
+    The current implementation slices on device after the full-image GPU
+    decode is complete. That preserves correctness but does no I/O
+    savings -- a future PR can short-circuit tile decode for partial
+    windows. For ``band`` selection, the savings are also post-decode
+    because the planar=1 (chunky) tile assembly returns all bands in a
+    single GPU buffer.
+
+    Returns ``(arr_gpu, coords)`` where ``coords`` is a dict with
+    ``y`` / ``x`` numpy arrays sized to the output array. The caller is
+    responsible for setting ``attrs['transform']`` to the windowed origin
+    via ``_populate_attrs_from_geo_info(..., window=window)`` so the array
+    and the transform agree.
+    """
+    if window is not None:
+        r0, c0, r1, c1 = window
+        arr_gpu = arr_gpu[r0:r1, c0:c1]
+        out_h = r1 - r0
+        out_w = c1 - c0
+        # Mirror the eager-numpy windowed coord computation in
+        # open_geotiff so the GPU-windowed coords carry the same
+        # absolute pixel-center values as the CPU path. For files
+        # with no GeoTIFF tags (``has_georef=False``), fall back to
+        # integer pixel coords matching ``_geo_to_coords`` (#1710).
+        coords = _coords_from_geo_info(
+            geo_info, out_h, out_w, window=window,
+        )
+    else:
+        out_h = arr_gpu.shape[0]
+        out_w = arr_gpu.shape[1]
+        coords = _geo_to_coords(geo_info, out_h, out_w)
+
+    if band is not None and arr_gpu.ndim == 3:
+        arr_gpu = arr_gpu[:, :, band]
+
+    return arr_gpu, coords