diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 3ee5607c..d28d8a80 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -41,6 +41,32 @@ from .dask import read_geotiff_dask +def _preflight_cuda_runtime(cupy) -> None: + """Verify the CUDA runtime is usable before the GPU pipeline runs. + + ``cupy`` can import successfully on a machine whose driver is older + than the build expects, was uninstalled, or belongs to a suspended + VM. Without this check the failure surfaces as a + ``cudaErrorInsufficientDriver`` from a deep ``cupy.asarray(...)`` + call in the CPU-fallback path (issue #1903). Raises a clean + ``RuntimeError`` that names the underlying CUDA error so the user + can fix the driver, switch CuPy builds, or pass ``gpu=False``. + """ + try: + device_count = cupy.cuda.runtime.getDeviceCount() + except Exception as e: + raise RuntimeError( + f"read_geotiff_gpu: CUDA runtime is not usable " + f"({type(e).__name__}: {e}). Check the GPU driver matches " + f"the installed cupy build, or pass gpu=False." + ) from e + if device_count == 0: + raise RuntimeError( + "read_geotiff_gpu: cupy reports 0 CUDA devices. Check " + "the GPU driver and CUDA_VISIBLE_DEVICES, or pass gpu=False." + ) + + def read_geotiff_gpu(source: str, *, dtype: str | np.dtype | None = None, overview_level: int | None = None, @@ -123,8 +149,12 @@ def read_geotiff_gpu(source: str, *, Stripped layouts and sparse-tile files route directly to the CPU reader before either GPU decode stage runs, so the ``on_gpu_failure`` - kwarg does not affect them. A failure inside the subsequent - ``cupy.asarray(...)`` upload propagates unchanged in both modes. + kwarg does not affect them. The function preflights the CUDA + runtime via ``cupy.cuda.runtime.getDeviceCount()`` immediately + after importing cupy and raises ``RuntimeError`` if the driver + is unusable (#1903); transient errors inside a later + ``cupy.asarray(...)`` upload (e.g. device OOM) still propagate + unchanged in both modes. gpu : str, optional Deprecated alias for ``on_gpu_failure``. Emits ``DeprecationWarning`` when used. Passing both ``gpu`` and ``on_gpu_failure`` raises @@ -177,6 +207,12 @@ def read_geotiff_gpu(source: str, *, "cupy is required for GPU reads. " "Install it with: pip install cupy-cuda12x") + # Preflight CUDA. ``cupy`` can import on machines whose driver is + # older than the build expects or whose GPU is offline; the error + # otherwise surfaces as a low-level CUDA failure from + # ``cupy.asarray(...)`` deep in the CPU-fallback path (#1903). + _preflight_cuda_runtime(cupy) + # When ``chunks=`` is set, bound peak GPU memory to chunk size by # building a Dask+CuPy graph that decodes one chunk at a time. The # CPU dask path already lays out a window-per-chunk delayed graph diff --git a/xrspatial/geotiff/tests/test_gpu_cuda_preflight_1903.py b/xrspatial/geotiff/tests/test_gpu_cuda_preflight_1903.py new file mode 100644 index 00000000..30c4b3c7 --- /dev/null +++ b/xrspatial/geotiff/tests/test_gpu_cuda_preflight_1903.py @@ -0,0 +1,134 @@ +"""CUDA preflight in ``read_geotiff_gpu``. + +Regression for issue #1903. When CuPy imports but the CUDA driver is +unusable (older driver than the build expects, suspended VM, etc.), +the failure used to surface as ``cudaErrorInsufficientDriver`` from a +``cupy.asarray(...)`` call deep in the CPU-fallback path. The fix +preflights the runtime via ``cupy.cuda.runtime.getDeviceCount()`` +right after the cupy import and raises a clean ``RuntimeError``. + +These tests stub ``cupy.cuda.runtime.getDeviceCount`` so they exercise +the preflight branch without requiring a real GPU. The function under +test is called directly to skip the file-source setup. +""" +from __future__ import annotations + +import importlib.util +import sys +import types + +import pytest + + +_CUPY_AVAILABLE = importlib.util.find_spec("cupy") is not None + + +def _install_cupy_stub(monkeypatch, *, get_device_count): + """Install a minimal stub ``cupy`` module so the preflight runs. + + Used on machines without cupy installed; lets us exercise the + preflight failure path on CPU-only CI. + """ + cupy_mod = types.ModuleType("cupy") + cuda_mod = types.ModuleType("cupy.cuda") + runtime_mod = types.ModuleType("cupy.cuda.runtime") + runtime_mod.getDeviceCount = get_device_count + cuda_mod.runtime = runtime_mod + cupy_mod.cuda = cuda_mod + monkeypatch.setitem(sys.modules, "cupy", cupy_mod) + monkeypatch.setitem(sys.modules, "cupy.cuda", cuda_mod) + monkeypatch.setitem(sys.modules, "cupy.cuda.runtime", runtime_mod) + + +def test_preflight_raises_on_runtime_error(monkeypatch): + """A simulated cudaErrorInsufficientDriver becomes a clean RuntimeError.""" + from xrspatial.geotiff._backends import gpu as gpu_mod + + class FakeCudaError(RuntimeError): + pass + + def _raise(*_a, **_kw): + raise FakeCudaError("cudaErrorInsufficientDriver") + + _install_cupy_stub(monkeypatch, get_device_count=_raise) + import cupy + with pytest.raises(RuntimeError, match="CUDA runtime is not usable"): + gpu_mod._preflight_cuda_runtime(cupy) + + +def test_preflight_raises_on_zero_devices(monkeypatch): + """``getDeviceCount()`` returning 0 also raises.""" + from xrspatial.geotiff._backends import gpu as gpu_mod + + _install_cupy_stub(monkeypatch, get_device_count=lambda: 0) + import cupy + with pytest.raises(RuntimeError, match="reports 0 CUDA devices"): + gpu_mod._preflight_cuda_runtime(cupy) + + +def test_preflight_returns_silently_when_device_present(monkeypatch): + """A normal positive device count must not raise.""" + from xrspatial.geotiff._backends import gpu as gpu_mod + + _install_cupy_stub(monkeypatch, get_device_count=lambda: 1) + import cupy + # Should not raise. + gpu_mod._preflight_cuda_runtime(cupy) + + +def test_read_geotiff_gpu_preflight_surface(monkeypatch, tmp_path): + """End-to-end: read_geotiff_gpu raises before touching any IFDs. + + Build a real TIFF so the function gets past the file-source setup, + then verify the CUDA preflight RuntimeError surfaces from the + public entry point rather than from a deep cupy.asarray() call. + """ + import numpy as np + import xarray as xr + from xrspatial.geotiff import to_geotiff + from xrspatial.geotiff._backends.gpu import read_geotiff_gpu + + da = xr.DataArray( + np.arange(16, dtype=np.float32).reshape(4, 4), + dims=["y", "x"], + coords={ + "y": np.array([0.5, 1.5, 2.5, 3.5]), + "x": np.array([0.5, 1.5, 2.5, 3.5]), + }, + attrs={"crs": 4326}, + ) + path = str(tmp_path / "preflight_1903.tif") + to_geotiff(da, path, tile_size=16) + + class FakeCudaError(RuntimeError): + pass + + def _raise(*_a, **_kw): + raise FakeCudaError("cudaErrorInsufficientDriver") + + _install_cupy_stub(monkeypatch, get_device_count=_raise) + + with pytest.raises(RuntimeError, match="CUDA runtime is not usable"): + read_geotiff_gpu(path) + + +@pytest.mark.skipif( + not _CUPY_AVAILABLE, + reason="cupy required to verify monkeypatch composes with a real import", +) +def test_preflight_when_real_cupy_present(monkeypatch): + """When cupy is really installed, monkeypatching the runtime symbol + works the same way -- the import in read_geotiff_gpu finds the + patched attribute.""" + import cupy + from xrspatial.geotiff._backends import gpu as gpu_mod + + class FakeCudaError(RuntimeError): + pass + + def _raise(*_a, **_kw): + raise FakeCudaError("cudaErrorInsufficientDriver") + + monkeypatch.setattr(cupy.cuda.runtime, "getDeviceCount", _raise) + with pytest.raises(RuntimeError, match="CUDA runtime is not usable"): + gpu_mod._preflight_cuda_runtime(cupy) diff --git a/xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py b/xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py index ed547a5b..6ba2114d 100644 --- a/xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py +++ b/xrspatial/geotiff/tests/test_gpu_strict_fallback_1516.py @@ -36,6 +36,7 @@ _CUPY_ORIG_SENTINEL = object() _cupy_saved = _CUPY_ORIG_SENTINEL _cupy_cuda_saved = _CUPY_ORIG_SENTINEL +_cupy_cuda_runtime_saved = _CUPY_ORIG_SENTINEL def _cuda_actually_available() -> bool: @@ -63,13 +64,15 @@ def _ensure_cupy_stub() -> bool: installed but CUDA isn't available. The original module (if any) is saved so :func:`_restore_cupy` can put it back. """ - global _cupy_saved, _cupy_cuda_saved + global _cupy_saved, _cupy_cuda_saved, _cupy_cuda_runtime_saved if _cuda_actually_available(): return False _cupy_saved = sys.modules.get('cupy', _CUPY_ORIG_SENTINEL) _cupy_cuda_saved = sys.modules.get('cupy.cuda', _CUPY_ORIG_SENTINEL) + _cupy_cuda_runtime_saved = sys.modules.get( + 'cupy.cuda.runtime', _CUPY_ORIG_SENTINEL) stub = types.ModuleType('cupy') stub.ndarray = np.ndarray @@ -77,19 +80,32 @@ def _ensure_cupy_stub() -> bool: cuda_mod = types.ModuleType('cupy.cuda') cuda_mod.is_available = lambda: False + + # Pre-flight check in ``read_geotiff_gpu`` (added in #1903) calls + # ``cupy.cuda.runtime.getDeviceCount()`` to surface a clean + # ``RuntimeError`` for broken-driver setups. Tests in this file want + # to exercise the downstream simulated-failure paths, so the stubbed + # runtime reports one device and the preflight lets execution + # through. The real preflight tests live in + # ``test_gpu_cuda_preflight_1903.py``. + runtime_mod = types.ModuleType('cupy.cuda.runtime') + runtime_mod.getDeviceCount = lambda: 1 + cuda_mod.runtime = runtime_mod stub.cuda = cuda_mod sys.modules['cupy'] = stub sys.modules['cupy.cuda'] = cuda_mod + sys.modules['cupy.cuda.runtime'] = runtime_mod return True def _restore_cupy() -> None: """Undo :func:`_ensure_cupy_stub`.""" - global _cupy_saved, _cupy_cuda_saved + global _cupy_saved, _cupy_cuda_saved, _cupy_cuda_runtime_saved for name, saved in ( ('cupy', _cupy_saved), ('cupy.cuda', _cupy_cuda_saved), + ('cupy.cuda.runtime', _cupy_cuda_runtime_saved), ): if saved is _CUPY_ORIG_SENTINEL: sys.modules.pop(name, None) @@ -97,6 +113,7 @@ def _restore_cupy() -> None: sys.modules[name] = saved _cupy_saved = _CUPY_ORIG_SENTINEL _cupy_cuda_saved = _CUPY_ORIG_SENTINEL + _cupy_cuda_runtime_saved = _CUPY_ORIG_SENTINEL importlib.invalidate_caches()