Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions xrspatial/geotiff/_backends/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,32 @@
from .dask import read_geotiff_dask


def _preflight_cuda_runtime(cupy) -> None:
    """Fail fast when CuPy imports but the CUDA runtime cannot be used.

    Importing ``cupy`` can succeed even though the driver is older than
    the build expects, has been uninstalled, or belongs to a suspended
    VM. Without this probe the problem shows up later as a
    ``cudaErrorInsufficientDriver`` raised by a ``cupy.asarray(...)``
    call deep in the CPU-fallback path (issue #1903).

    Parameters
    ----------
    cupy : module
        The imported ``cupy`` module (or a stand-in exposing
        ``cuda.runtime.getDeviceCount``).

    Raises
    ------
    RuntimeError
        If ``getDeviceCount()`` raises (the original exception is
        chained as ``__cause__`` and named in the message), or if it
        reports zero devices.
    """
    try:
        n_devices = cupy.cuda.runtime.getDeviceCount()
    except Exception as exc:
        # Deliberately broad: whatever the probe raises is re-raised as
        # one predictable exception type that names the original error.
        reason = f"{type(exc).__name__}: {exc}"
        raise RuntimeError(
            "read_geotiff_gpu: CUDA runtime is not usable "
            f"({reason}). Check the GPU driver matches "
            "the installed cupy build, or pass gpu=False."
        ) from exc
    else:
        if n_devices == 0:
            raise RuntimeError(
                "read_geotiff_gpu: cupy reports 0 CUDA devices. Check "
                "the GPU driver and CUDA_VISIBLE_DEVICES, or pass gpu=False."
            )


def read_geotiff_gpu(source: str, *,
dtype: str | np.dtype | None = None,
overview_level: int | None = None,
Expand Down Expand Up @@ -123,8 +149,12 @@ def read_geotiff_gpu(source: str, *,

Stripped layouts and sparse-tile files route directly to the CPU
reader before either GPU decode stage runs, so the ``on_gpu_failure``
kwarg does not affect them. A failure inside the subsequent
``cupy.asarray(...)`` upload propagates unchanged in both modes.
kwarg does not affect them. The function preflights the CUDA
runtime via ``cupy.cuda.runtime.getDeviceCount()`` immediately
after importing cupy and raises ``RuntimeError`` if the driver
is unusable (#1903); transient errors inside a later
``cupy.asarray(...)`` upload (e.g. device OOM) still propagate
unchanged in both modes.
gpu : str, optional
Deprecated alias for ``on_gpu_failure``. Emits ``DeprecationWarning``
when used. Passing both ``gpu`` and ``on_gpu_failure`` raises
Expand Down Expand Up @@ -177,6 +207,12 @@ def read_geotiff_gpu(source: str, *,
"cupy is required for GPU reads. "
"Install it with: pip install cupy-cuda12x")

# Preflight CUDA. ``cupy`` can import on machines whose driver is
# older than the build expects or whose GPU is offline; the error
# otherwise surfaces as a low-level CUDA failure from
# ``cupy.asarray(...)`` deep in the CPU-fallback path (#1903).
_preflight_cuda_runtime(cupy)

# When ``chunks=`` is set, bound peak GPU memory to chunk size by
# building a Dask+CuPy graph that decodes one chunk at a time. The
# CPU dask path already lays out a window-per-chunk delayed graph
Expand Down
134 changes: 134 additions & 0 deletions xrspatial/geotiff/tests/test_gpu_cuda_preflight_1903.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""CUDA preflight in ``read_geotiff_gpu``.

Regression for issue #1903. When CuPy imports but the CUDA driver is
unusable (older driver than the build expects, suspended VM, etc.),
the failure used to surface as ``cudaErrorInsufficientDriver`` from a
``cupy.asarray(...)`` call deep in the CPU-fallback path. The fix
preflights the runtime via ``cupy.cuda.runtime.getDeviceCount()``
right after the cupy import and raises a clean ``RuntimeError``.

These tests stub ``cupy.cuda.runtime.getDeviceCount`` so they exercise
the preflight branch without requiring a real GPU. Most of them call
``_preflight_cuda_runtime`` directly to skip the file-source setup; one
end-to-end test goes through ``read_geotiff_gpu`` on a real TIFF.
"""
from __future__ import annotations

import importlib.util
import sys
import types

import pytest


# True when a ``cupy`` distribution can be located. ``find_spec`` raises
# ``ValueError`` (rather than returning ``None``) when ``sys.modules``
# already holds a ``cupy`` entry whose ``__spec__`` is ``None`` -- which is
# exactly what a bare ``types.ModuleType("cupy")`` stub looks like. Treat
# that case as "real cupy not available" instead of failing at import time.
try:
    _CUPY_AVAILABLE = importlib.util.find_spec("cupy") is not None
except ValueError:
    _CUPY_AVAILABLE = False


def _install_cupy_stub(monkeypatch, *, get_device_count):
    """Register a throwaway ``cupy`` package in ``sys.modules``.

    Three linked stub modules are created -- ``cupy``, ``cupy.cuda`` and
    ``cupy.cuda.runtime`` -- with ``get_device_count`` exposed as
    ``cupy.cuda.runtime.getDeviceCount``. Registration goes through
    ``monkeypatch.setitem``, so the entries are handled by the
    ``monkeypatch`` object passed in. This lets the preflight failure
    path run on CPU-only CI where cupy is not installed.
    """
    runtime_stub = types.ModuleType("cupy.cuda.runtime")
    runtime_stub.getDeviceCount = get_device_count

    cuda_stub = types.ModuleType("cupy.cuda")
    cuda_stub.runtime = runtime_stub

    cupy_stub = types.ModuleType("cupy")
    cupy_stub.cuda = cuda_stub

    # Each stub's ``__name__`` is its dotted import path; register the
    # package first, then its submodules.
    for stub in (cupy_stub, cuda_stub, runtime_stub):
        monkeypatch.setitem(sys.modules, stub.__name__, stub)


def test_preflight_raises_on_runtime_error(monkeypatch):
    """An exception from ``getDeviceCount`` is re-raised as RuntimeError.

    The stubbed probe raises an error carrying the
    ``cudaErrorInsufficientDriver`` text; the preflight must surface it
    with the "CUDA runtime is not usable" message.
    """
    from xrspatial.geotiff._backends import gpu as gpu_mod

    class FakeCudaError(RuntimeError):
        """Stand-in for the error a broken CUDA driver would raise."""

    def _broken_probe(*_args, **_kwargs):
        raise FakeCudaError("cudaErrorInsufficientDriver")

    _install_cupy_stub(monkeypatch, get_device_count=_broken_probe)
    # Resolves to the stub that was just placed in ``sys.modules``.
    import cupy as stub_cupy

    with pytest.raises(RuntimeError, match="CUDA runtime is not usable"):
        gpu_mod._preflight_cuda_runtime(stub_cupy)


def test_preflight_raises_on_zero_devices(monkeypatch):
    """A device count of 0 from ``getDeviceCount()`` is also rejected."""
    from xrspatial.geotiff._backends import gpu as gpu_mod

    def _no_devices():
        return 0

    _install_cupy_stub(monkeypatch, get_device_count=_no_devices)
    # Resolves to the stub that was just placed in ``sys.modules``.
    import cupy as stub_cupy

    with pytest.raises(RuntimeError, match="reports 0 CUDA devices"):
        gpu_mod._preflight_cuda_runtime(stub_cupy)


def test_preflight_returns_silently_when_device_present(monkeypatch):
    """With one device reported, the preflight completes without raising."""
    from xrspatial.geotiff._backends import gpu as gpu_mod

    def _one_device():
        return 1

    _install_cupy_stub(monkeypatch, get_device_count=_one_device)
    # Resolves to the stub that was just placed in ``sys.modules``.
    import cupy as stub_cupy

    # No explicit assertion: any exception raised here fails the test.
    gpu_mod._preflight_cuda_runtime(stub_cupy)


def test_read_geotiff_gpu_preflight_surface(monkeypatch, tmp_path):
    """End-to-end: the preflight error surfaces from ``read_geotiff_gpu``.

    A small real GeoTIFF is written first so the public entry point has
    a valid file to open; the stubbed ``getDeviceCount`` then fails and
    the test checks that the clean preflight ``RuntimeError`` is what
    the caller sees, rather than a low-level ``cupy.asarray()`` failure.
    """
    import numpy as np
    import xarray as xr
    from xrspatial.geotiff import to_geotiff
    from xrspatial.geotiff._backends.gpu import read_geotiff_gpu

    # 4x4 float32 raster with pixel-centre coordinates and a CRS.
    pixels = np.arange(16, dtype=np.float32).reshape(4, 4)
    y_centers = np.array([0.5, 1.5, 2.5, 3.5])
    x_centers = np.array([0.5, 1.5, 2.5, 3.5])
    raster = xr.DataArray(
        pixels,
        dims=["y", "x"],
        coords={"y": y_centers, "x": x_centers},
        attrs={"crs": 4326},
    )
    tif_path = str(tmp_path / "preflight_1903.tif")
    to_geotiff(raster, tif_path, tile_size=16)

    class FakeCudaError(RuntimeError):
        """Stand-in for the error a broken CUDA driver would raise."""

    def _broken_probe(*_args, **_kwargs):
        raise FakeCudaError("cudaErrorInsufficientDriver")

    # Install the stub only after the file exists, so writing the
    # fixture is unaffected by the fake ``cupy``.
    _install_cupy_stub(monkeypatch, get_device_count=_broken_probe)

    with pytest.raises(RuntimeError, match="CUDA runtime is not usable"):
        read_geotiff_gpu(tif_path)


@pytest.mark.skipif(
    not _CUPY_AVAILABLE,
    reason="cupy required to verify monkeypatch composes with a real import",
)
def test_preflight_when_real_cupy_present(monkeypatch):
    """Patching ``getDeviceCount`` on an installed cupy trips the preflight.

    Instead of swapping in a stub package, this patches the attribute on
    the real ``cupy.cuda.runtime`` module and checks that the preflight
    helper turns the simulated failure into the same ``RuntimeError``.
    """
    import cupy
    from xrspatial.geotiff._backends import gpu as gpu_mod

    class FakeCudaError(RuntimeError):
        """Stand-in for the error a broken CUDA driver would raise."""

    def _broken_probe(*_args, **_kwargs):
        raise FakeCudaError("cudaErrorInsufficientDriver")

    runtime = cupy.cuda.runtime
    monkeypatch.setattr(runtime, "getDeviceCount", _broken_probe)

    with pytest.raises(RuntimeError, match="CUDA runtime is not usable"):
        gpu_mod._preflight_cuda_runtime(cupy)
21 changes: 19 additions & 2 deletions xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
# Unique marker meaning "nothing saved for this module name". The
# ``_cupy*_saved`` slots below hold whatever ``sys.modules`` contained
# for ``cupy``, ``cupy.cuda`` and ``cupy.cuda.runtime`` before the stub
# was installed; a slot still equal to the sentinel tells
# ``_restore_cupy`` to drop the entry instead of putting a module back.
_CUPY_ORIG_SENTINEL = object()
_cupy_saved = _CUPY_ORIG_SENTINEL
_cupy_cuda_saved = _CUPY_ORIG_SENTINEL
_cupy_cuda_runtime_saved = _CUPY_ORIG_SENTINEL


def _cuda_actually_available() -> bool:
Expand Down Expand Up @@ -63,40 +64,56 @@ def _ensure_cupy_stub() -> bool:
installed but CUDA isn't available. The original module (if any) is
saved so :func:`_restore_cupy` can put it back.
"""
global _cupy_saved, _cupy_cuda_saved
global _cupy_saved, _cupy_cuda_saved, _cupy_cuda_runtime_saved

if _cuda_actually_available():
return False

_cupy_saved = sys.modules.get('cupy', _CUPY_ORIG_SENTINEL)
_cupy_cuda_saved = sys.modules.get('cupy.cuda', _CUPY_ORIG_SENTINEL)
_cupy_cuda_runtime_saved = sys.modules.get(
'cupy.cuda.runtime', _CUPY_ORIG_SENTINEL)

stub = types.ModuleType('cupy')
stub.ndarray = np.ndarray
stub.asarray = np.asarray

cuda_mod = types.ModuleType('cupy.cuda')
cuda_mod.is_available = lambda: False

# Pre-flight check in ``read_geotiff_gpu`` (added in #1903) calls
# ``cupy.cuda.runtime.getDeviceCount()`` to surface a clean
# ``RuntimeError`` for broken-driver setups. Tests in this file want
# to exercise the downstream simulated-failure paths, so the stubbed
# runtime reports one device and the preflight lets execution
# through. The real preflight tests live in
# ``test_gpu_cuda_preflight_1903.py``.
runtime_mod = types.ModuleType('cupy.cuda.runtime')
runtime_mod.getDeviceCount = lambda: 1
cuda_mod.runtime = runtime_mod
stub.cuda = cuda_mod

sys.modules['cupy'] = stub
sys.modules['cupy.cuda'] = cuda_mod
sys.modules['cupy.cuda.runtime'] = runtime_mod
return True


def _restore_cupy() -> None:
    """Undo :func:`_ensure_cupy_stub`.

    For each of ``cupy``, ``cupy.cuda`` and ``cupy.cuda.runtime``, put
    the saved module back into ``sys.modules``, or remove the entry when
    the saved slot still holds the sentinel. The saved slots are then
    reset to the sentinel and the import caches are invalidated.
    """
    global _cupy_saved, _cupy_cuda_saved, _cupy_cuda_runtime_saved

    snapshot = {
        'cupy': _cupy_saved,
        'cupy.cuda': _cupy_cuda_saved,
        'cupy.cuda.runtime': _cupy_cuda_runtime_saved,
    }
    for module_name, previous in snapshot.items():
        if previous is not _CUPY_ORIG_SENTINEL:
            sys.modules[module_name] = previous
        else:
            # Nothing was registered under this name before the stub.
            sys.modules.pop(module_name, None)

    _cupy_saved = _cupy_cuda_saved = _CUPY_ORIG_SENTINEL
    _cupy_cuda_runtime_saved = _CUPY_ORIG_SENTINEL
    importlib.invalidate_caches()


Expand Down
Loading