From 2056fec96ef6b6dd0b040fd895547f87f05d148a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 9 Mar 2026 17:11:40 +0100 Subject: [PATCH 1/7] inject docstrings into stubs --- python/CMakeLists.txt | 28 +++++++ python/scripts/update_stub_docstrings.py | 95 ++++++++++++++++++------ 2 files changed, 101 insertions(+), 22 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0630e0cff7cb..d7e0b70939fa 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1025,3 +1025,31 @@ if(PYARROW_BUILD_PARQUET) target_link_libraries(_parquet_encryption PRIVATE arrow_python_parquet_encryption) endif() endif() + +# +# Type stubs with docstring injection +# +set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_SOURCE_DIR}/pyarrow-stubs/pyarrow") +if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") + install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/" + DESTINATION "." + FILES_MATCHING + PATTERN "*.pyi") + + if(DEFINED SKBUILD_STATE + AND SKBUILD_STATE STREQUAL "wheel" + AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + install(CODE " + execute_process( + COMMAND \"${Python3_EXECUTABLE}\" + \"${CMAKE_SOURCE_DIR}/scripts/update_stub_docstrings.py\" + \"${CMAKE_INSTALL_PREFIX}\" + \"${CMAKE_SOURCE_DIR}\" + RESULT_VARIABLE _pyarrow_stub_docstrings_result + ) + if(NOT _pyarrow_stub_docstrings_result EQUAL 0) + message(WARNING \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + endif() + ") + endif() +endif() diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py index 5fd24014a024..a405b052a371 100644 --- a/python/scripts/update_stub_docstrings.py +++ b/python/scripts/update_stub_docstrings.py @@ -18,14 +18,18 @@ """ Extract docstrings from pyarrow runtime and insert them into stub files. -Usage (from python/ directory with pyarrow built): - python scripts/update_stub_docstrings.py pyarrow-stubs +Usage: + python scripts/update_stub_docstrings.py """ import argparse import importlib import inspect +import os +import shutil import sys +import sysconfig +import tempfile from pathlib import Path from textwrap import indent @@ -186,7 +190,8 @@ def add_docstrings_to_stubs(stubs_dir): if module_name in LIB_MODULES: namespace = "lib" elif stub_file.parent.name in ("parquet", "interchange"): - namespace = f"{stub_file.parent.name}.{module_name}" + namespace = (stub_file.parent.name if module_name == "__init__" + else f"{stub_file.parent.name}.{module_name}") elif module_name == "__init__": namespace = "" else: @@ -198,31 +203,77 @@ def add_docstrings_to_stubs(stubs_dir): stub_file.write_text(modified.code) -def add_docstrings_from_build(stubs_dir, build_lib): - """ - Entry point for setup.py: update docstrings using pyarrow from build directory. +def _link_or_copy(source, destination): + if sys.platform != "win32": + try: + os.symlink(source, destination) + return + except OSError: + pass + + if source.is_dir(): + shutil.copytree(source, destination, symlinks=(sys.platform != "win32")) + else: + shutil.copy2(source, destination) + - During the build process, pyarrow is not installed in the system Python. - We need to temporarily add the build directory to sys.path so we can - import pyarrow and extract docstrings from it. +def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir): """ - stubs_dir, build_lib = Path(stubs_dir), Path(build_lib) + Populate pyarrow_pkg with source Python modules and installed binary artifacts + so that pyarrow can be imported from the parent directory of pyarrow_pkg. + """ + ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ".so" + source_pyarrow = source_dir / "pyarrow" + if not source_pyarrow.exists(): + raise FileNotFoundError(f"PyArrow source package not found: {source_pyarrow}") + + for source_path in source_pyarrow.iterdir(): + if source_path.suffix == ".py": + _link_or_copy(source_path, pyarrow_pkg / source_path.name) + elif source_path.is_dir() and not source_path.name.startswith((".", "__")): + _link_or_copy(source_path, pyarrow_pkg / source_path.name) + + for artifact in install_pyarrow_dir.iterdir(): + if not artifact.is_file(): + continue - sys.path.insert(0, str(build_lib)) - try: - add_docstrings_to_stubs(stubs_dir) - finally: - sys.path.pop(0) + destination = pyarrow_pkg / artifact.name + if destination.exists(): + continue + + is_extension = ext_suffix in artifact.name or artifact.suffix == ".pyd" + is_shared_library = ( + ".so" in artifact.name or artifact.suffix in (".dylib", ".dll") + ) + if is_extension or is_shared_library: + _link_or_copy(artifact, destination) if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("stubs_dir", type=Path, help="Path to pyarrow-stubs folder") + parser.add_argument("install_prefix", type=Path, + help="CMAKE_INSTALL_PREFIX used by wheel build") + parser.add_argument("source_dir", type=Path, + help="PyArrow source directory") args = parser.parse_args() - # Add the directory containing this script's parent (python/) to sys.path - # so pyarrow can be imported when running from the python/ directory - script_dir = Path(__file__).resolve().parent - python_dir = script_dir.parent - sys.path.insert(0, str(python_dir)) - add_docstrings_to_stubs(args.stubs_dir.resolve()) + install_prefix = args.install_prefix.resolve() + source_dir = args.source_dir.resolve() + install_pyarrow_dir = install_prefix / "pyarrow" + if not install_pyarrow_dir.exists(): + install_pyarrow_dir = install_prefix + + if not any(install_pyarrow_dir.rglob("*.pyi")): + print("No .pyi files found in install tree, skipping docstring injection") + sys.exit(0) + + with tempfile.TemporaryDirectory() as tmpdir: + pyarrow_pkg = Path(tmpdir) / "pyarrow" + pyarrow_pkg.mkdir() + _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir) + + sys.path.insert(0, tmpdir) + try: + add_docstrings_to_stubs(install_pyarrow_dir) + finally: + sys.path.pop(0) From f885111a427b36c803b25f37316ec912e864e893 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 10 Mar 2026 01:32:33 +0100 Subject: [PATCH 2/7] check stubs are included at wheel build time --- ci/scripts/python_test_type_annotations.sh | 4 +- ci/scripts/python_wheel_validate_contents.py | 41 ++++++++++++++++++++ python/CMakeLists.txt | 31 +++++++++++++-- python/pyproject.toml | 11 ++++-- 4 files changed, 77 insertions(+), 10 deletions(-) diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index c1a051b1e56d..092bedf3f5ea 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -34,5 +34,5 @@ pip install mypy pyright ty # Run type checkers cd "${pyarrow_dir}" mypy -pyright -ty check +pyright --stats +ty check --verbose --output-format concise diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 153a70eb4069..493811c1258b 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -27,6 +27,7 @@ def validate_wheel(path): error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})" assert len(wheels) == 1, error_msg f = zipfile.ZipFile(wheels[0]) + outliers = [ info.filename for info in f.filelist if not re.match( r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)', info.filename @@ -37,6 +38,46 @@ def validate_wheel(path): assert any(info.filename.split("/")[-1] == filename for info in f.filelist), \ f"{filename} is missing from the wheel." + + assert any(info.filename == "pyarrow/py.typed" for info in f.filelist), \ + "pyarrow/py.typed is missing from the wheel." + + source_root = Path(__file__).resolve().parents[2] + stubs_dir = source_root / "python" / "pyarrow-stubs" / "pyarrow" + assert stubs_dir.exists(), f"Stub source directory not found: {stubs_dir}" + + expected_stub_files = { + f"pyarrow/{stub_file.relative_to(stubs_dir).as_posix()}" + for stub_file in stubs_dir.rglob("*.pyi") + } + + wheel_stub_files = { + info.filename + for info in f.filelist + if info.filename.startswith("pyarrow/") and info.filename.endswith(".pyi") + } + + assert wheel_stub_files == expected_stub_files, ( + "Wheel .pyi files differ from python/pyarrow-stubs/pyarrow.\n" + f"Missing in wheel: {sorted(expected_stub_files - wheel_stub_files)}\n" + f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}" + ) + + docstring_injected_stub_files = [] + for wheel_stub_file in wheel_stub_files: + stub_relpath = Path(wheel_stub_file).relative_to("pyarrow") + source_stub_file = stubs_dir / stub_relpath + source_content = source_stub_file.read_text(encoding="utf-8") + wheel_content = f.read(wheel_stub_file).decode("utf-8") + if wheel_content.count('"""') > source_content.count('"""'): + docstring_injected_stub_files.append(wheel_stub_file) + + assert docstring_injected_stub_files, ( + "No injected docstrings were detected in wheel stub files. " + "Expected at least one .pyi file in the wheel to contain more " + "triple-quoted docstrings than its source stub counterpart." + ) + print(f"The wheel: {wheels[0]} seems valid.") # TODO(GH-32609): Validate some docstrings were generated and added. diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index d7e0b70939fa..c9d9b7aa82c1 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1029,7 +1029,18 @@ endif() # # Type stubs with docstring injection # -set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_SOURCE_DIR}/pyarrow-stubs/pyarrow") +# Stubs live in pyarrow-stubs/pyarrow/ during development but are installed +# alongside the package so type checkers can find them (PEP 561). +set(PYARROW_REQUIRE_STUB_DOCSTRINGS OFF) +if(DEFINED SKBUILD_STATE + AND SKBUILD_STATE STREQUAL "wheel" + AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" + AND DEFINED ENV{CI} + AND NOT "$ENV{CI}" STREQUAL "") + set(PYARROW_REQUIRE_STUB_DOCSTRINGS ON) +endif() + +set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pyarrow-stubs/pyarrow") if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/" DESTINATION "." @@ -1042,14 +1053,26 @@ if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") install(CODE " execute_process( COMMAND \"${Python3_EXECUTABLE}\" - \"${CMAKE_SOURCE_DIR}/scripts/update_stub_docstrings.py\" + \"${CMAKE_CURRENT_SOURCE_DIR}/scripts/update_stub_docstrings.py\" \"${CMAKE_INSTALL_PREFIX}\" - \"${CMAKE_SOURCE_DIR}\" + \"${CMAKE_CURRENT_SOURCE_DIR}\" RESULT_VARIABLE _pyarrow_stub_docstrings_result ) if(NOT _pyarrow_stub_docstrings_result EQUAL 0) - message(WARNING \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + if(${PYARROW_REQUIRE_STUB_DOCSTRINGS}) + message(FATAL_ERROR \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + else() + message(WARNING \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + endif() endif() ") endif() +else() + if(PYARROW_REQUIRE_STUB_DOCSTRINGS) + message(FATAL_ERROR "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; " + "cannot build CI wheel without .pyi files.") + else() + message(WARNING "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; " + "wheel will be built without .pyi files.") + endif() endif() diff --git a/python/pyproject.toml b/python/pyproject.toml index 14aa37ed0453..7ed2dce51a92 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -85,7 +85,7 @@ exclude = [ [tool.scikit-build] cmake.build-type = "Release" metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" -sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/"] +sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/", "pyarrow-stubs/"] wheel.packages = ["pyarrow"] wheel.install-dir = "pyarrow" @@ -102,7 +102,7 @@ version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' -# TODO: Enable type checking once stubs are merged +# TODO: Enable more type checks as more stubs are merged [tool.mypy] files = ["pyarrow-stubs"] mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" @@ -113,7 +113,7 @@ exclude = [ "^scripts/", ] -# TODO: Enable type checking once stubs are merged +# TODO: Enable more type checks as more stubs are merged [tool.pyright] pythonPlatform = "All" pythonVersion = "3.10" @@ -128,7 +128,10 @@ exclude = [ stubPath = "pyarrow-stubs" typeCheckingMode = "basic" -# TODO: Enable type checking once stubs are merged +# TODO: Enable more type checks as more stubs are merged +[tool.ty.environment] +extra-paths = ["pyarrow-stubs"] + [tool.ty.src] include = ["pyarrow-stubs"] exclude = [ From e4903ebbed2c3f5839d2e901b4ebf1dd2e1d7683 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 10 Mar 2026 13:42:47 +0100 Subject: [PATCH 3/7] we don't have docstrings yet --- ci/scripts/python_wheel_validate_contents.py | 36 ++++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 493811c1258b..52753eb51ded 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -16,11 +16,27 @@ # under the License. import argparse +import ast from pathlib import Path import re import zipfile +def _count_docstrings(source): + """Count docstrings in module, function, and class bodies.""" + tree = ast.parse(source) + count = 0 + for node in ast.walk(tree): + if isinstance(node, (ast.Module, ast.FunctionDef, + ast.AsyncFunctionDef, ast.ClassDef)): + if (node.body + and isinstance(node.body[0], ast.Expr) + and isinstance(node.body[0].value, ast.Constant) + and isinstance(node.body[0].value.value, str)): + count += 1 + return count + + def validate_wheel(path): p = Path(path) wheels = list(p.glob('*.whl')) @@ -63,23 +79,15 @@ def validate_wheel(path): f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}" ) - docstring_injected_stub_files = [] - for wheel_stub_file in wheel_stub_files: - stub_relpath = Path(wheel_stub_file).relative_to("pyarrow") - source_stub_file = stubs_dir / stub_relpath - source_content = source_stub_file.read_text(encoding="utf-8") - wheel_content = f.read(wheel_stub_file).decode("utf-8") - if wheel_content.count('"""') > source_content.count('"""'): - docstring_injected_stub_files.append(wheel_stub_file) - - assert docstring_injected_stub_files, ( - "No injected docstrings were detected in wheel stub files. " - "Expected at least one .pyi file in the wheel to contain more " - "triple-quoted docstrings than its source stub counterpart." + wheel_docstring_count = sum( + _count_docstrings(f.read(wsf).decode("utf-8")) + for wsf in wheel_stub_files ) + print(f"Found {wheel_docstring_count} docstring(s) in wheel stubs.") + assert wheel_docstring_count, "No docstrings found in wheel stub files." + print(f"The wheel: {wheels[0]} seems valid.") - # TODO(GH-32609): Validate some docstrings were generated and added. def main(): parser = argparse.ArgumentParser() From f3869c7b2a8d402e010e8ed82f97085a947a8c0f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Mar 2026 18:48:23 +0100 Subject: [PATCH 4/7] Apply suggestion from @raulcd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Raúl Cumplido --- python/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c9d9b7aa82c1..b9c977d1a77e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1047,9 +1047,7 @@ if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") FILES_MATCHING PATTERN "*.pyi") - if(DEFINED SKBUILD_STATE - AND SKBUILD_STATE STREQUAL "wheel" - AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + if(PYARROW_REQUIRE_STUB_DOCSTRINGS) install(CODE " execute_process( COMMAND \"${Python3_EXECUTABLE}\" From b34e6e1caa34d737eb42ca846a29e323e215e981 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Mar 2026 19:00:57 +0100 Subject: [PATCH 5/7] review feedback --- ci/scripts/python_wheel_macos_build.sh | 1 + ci/scripts/python_wheel_windows_build.bat | 1 + ci/scripts/python_wheel_xlinux_build.sh | 1 + python/CMakeLists.txt | 15 +-------------- python/pyproject.toml | 1 + python/scripts/update_stub_docstrings.py | 10 ++++++++-- 6 files changed, 13 insertions(+), 16 deletions(-) diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 1571cd57f258..31395e26c23a 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -147,6 +147,7 @@ popd echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUNDLE_ARROW_CPP=ON +export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 14e3e5a62971..e094d82861df 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -116,6 +116,7 @@ popd echo "=== (%PYTHON%) Building wheel ===" set PYARROW_BUNDLE_ARROW_CPP=ON +set PYARROW_REQUIRE_STUB_DOCSTRINGS=ON set PYARROW_WITH_ACERO=%ARROW_ACERO% set PYARROW_WITH_AZURE=%ARROW_AZURE% set PYARROW_WITH_DATASET=%ARROW_DATASET% diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index 960fe5bad6d9..223bd0b1cbae 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -155,6 +155,7 @@ check_arrow_visibility echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUNDLE_ARROW_CPP=ON +export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b9c977d1a77e..c7c0915608a1 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1031,15 +1031,6 @@ endif() # # Stubs live in pyarrow-stubs/pyarrow/ during development but are installed # alongside the package so type checkers can find them (PEP 561). -set(PYARROW_REQUIRE_STUB_DOCSTRINGS OFF) -if(DEFINED SKBUILD_STATE - AND SKBUILD_STATE STREQUAL "wheel" - AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" - AND DEFINED ENV{CI} - AND NOT "$ENV{CI}" STREQUAL "") - set(PYARROW_REQUIRE_STUB_DOCSTRINGS ON) -endif() - set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pyarrow-stubs/pyarrow") if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/" @@ -1057,11 +1048,7 @@ if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") RESULT_VARIABLE _pyarrow_stub_docstrings_result ) if(NOT _pyarrow_stub_docstrings_result EQUAL 0) - if(${PYARROW_REQUIRE_STUB_DOCSTRINGS}) - message(FATAL_ERROR \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") - else() - message(WARNING \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") - endif() + message(${_stub_error_level} \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") endif() ") endif() diff --git a/python/pyproject.toml b/python/pyproject.toml index 7ed2dce51a92..7ba528a5b0d6 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -94,6 +94,7 @@ PYARROW_BUNDLE_ARROW_CPP = {env = "PYARROW_BUNDLE_ARROW_CPP", default = "OFF"} PYARROW_BUNDLE_CYTHON_CPP = {env = "PYARROW_BUNDLE_CYTHON_CPP", default = "OFF"} PYARROW_GENERATE_COVERAGE = {env = "PYARROW_GENERATE_COVERAGE", default = "OFF"} PYARROW_CXXFLAGS = {env = "PYARROW_CXXFLAGS", default = ""} +PYARROW_REQUIRE_STUB_DOCSTRINGS = {env = "PYARROW_REQUIRE_STUB_DOCSTRINGS", default = "OFF"} [tool.setuptools_scm] root = '..' diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py index a405b052a371..edc730bb8b3b 100644 --- a/python/scripts/update_stub_docstrings.py +++ b/python/scripts/update_stub_docstrings.py @@ -219,9 +219,15 @@ def _link_or_copy(source, destination): def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir): """ - Populate pyarrow_pkg with source Python modules and installed binary artifacts - so that pyarrow can be imported from the parent directory of pyarrow_pkg. + Assemble an importable pyarrow package inside a temporary directory. + + During wheel builds the .py sources and compiled extensions (.so/.pyd/.dylib) + live in separate trees (source checkout vs CMake install prefix). This + function symlinks (or copies) both into *pyarrow_pkg* folder so that a plain + ``import pyarrow`` works and docstrings can be extracted at build time. """ + # Platform-specific suffix for Python extension modules + # (e.g. ".cpython-313-x86_64-linux-gnu.so") ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ".so" source_pyarrow = source_dir / "pyarrow" if not source_pyarrow.exists(): From e7e51db5170e8634259c86d29b3b84e04a3853ab Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Mar 2026 22:40:45 +0100 Subject: [PATCH 6/7] review feedback --- python/CMakeLists.txt | 2 +- python/scripts/update_stub_docstrings.py | 15 ++------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c7c0915608a1..ace0620dd4e3 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1048,7 +1048,7 @@ if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") RESULT_VARIABLE _pyarrow_stub_docstrings_result ) if(NOT _pyarrow_stub_docstrings_result EQUAL 0) - message(${_stub_error_level} \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + message(FATAL_ERROR \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") endif() ") endif() diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py index edc730bb8b3b..2a1eb0843471 100644 --- a/python/scripts/update_stub_docstrings.py +++ b/python/scripts/update_stub_docstrings.py @@ -28,7 +28,6 @@ import os import shutil import sys -import sysconfig import tempfile from pathlib import Path from textwrap import indent @@ -226,9 +225,6 @@ def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir): function symlinks (or copies) both into *pyarrow_pkg* folder so that a plain ``import pyarrow`` works and docstrings can be extracted at build time. """ - # Platform-specific suffix for Python extension modules - # (e.g. ".cpython-313-x86_64-linux-gnu.so") - ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ".so" source_pyarrow = source_dir / "pyarrow" if not source_pyarrow.exists(): raise FileNotFoundError(f"PyArrow source package not found: {source_pyarrow}") @@ -240,18 +236,11 @@ def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir): _link_or_copy(source_path, pyarrow_pkg / source_path.name) for artifact in install_pyarrow_dir.iterdir(): - if not artifact.is_file(): + if not artifact.is_file() or artifact.suffix == ".pyi": continue destination = pyarrow_pkg / artifact.name - if destination.exists(): - continue - - is_extension = ext_suffix in artifact.name or artifact.suffix == ".pyd" - is_shared_library = ( - ".so" in artifact.name or artifact.suffix in (".dylib", ".dll") - ) - if is_extension or is_shared_library: + if not destination.exists(): _link_or_copy(artifact, destination) From a1b43d8b23a571abcce3331a4d0090e0c4ecb199 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 12 Mar 2026 22:51:41 +0100 Subject: [PATCH 7/7] reduce the PR somewhat --- ci/scripts/python_wheel_validate_contents.py | 85 ++++++++++---------- python/CMakeLists.txt | 2 +- python/pyproject.toml | 6 +- python/scripts/update_stub_docstrings.py | 18 +++-- 4 files changed, 57 insertions(+), 54 deletions(-) diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 52753eb51ded..8388f6ebf391 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -42,53 +42,54 @@ def validate_wheel(path): wheels = list(p.glob('*.whl')) error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})" assert len(wheels) == 1, error_msg - f = zipfile.ZipFile(wheels[0]) + with zipfile.ZipFile(wheels[0]) as wheel_zip: + outliers = [ + info.filename for info in wheel_zip.filelist if not re.match( + r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)', info.filename + ) + ] + assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" + for filename in ('LICENSE.txt', 'NOTICE.txt'): + assert any( + info.filename.split("/")[-1] == filename for info in wheel_zip.filelist + ), f"{filename} is missing from the wheel." + + assert any( + info.filename == "pyarrow/py.typed" for info in wheel_zip.filelist + ), "pyarrow/py.typed is missing from the wheel." + + source_root = Path(__file__).resolve().parents[2] + stubs_dir = source_root / "python" / "pyarrow-stubs" / "pyarrow" + assert stubs_dir.exists(), f"Stub source directory not found: {stubs_dir}" + + expected_stub_files = { + f"pyarrow/{stub_file.relative_to(stubs_dir).as_posix()}" + for stub_file in stubs_dir.rglob("*.pyi") + } + + wheel_stub_files = { + info.filename + for info in wheel_zip.filelist + if info.filename.startswith("pyarrow/") and info.filename.endswith(".pyi") + } + + assert wheel_stub_files == expected_stub_files, ( + "Wheel .pyi files differ from python/pyarrow-stubs/pyarrow.\n" + f"Missing in wheel: {sorted(expected_stub_files - wheel_stub_files)}\n" + f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}" + ) - outliers = [ - info.filename for info in f.filelist if not re.match( - r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)', info.filename + wheel_docstring_count = sum( + _count_docstrings(wheel_zip.read(wsf).decode("utf-8")) + for wsf in wheel_stub_files ) - ] - assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" - for filename in ('LICENSE.txt', 'NOTICE.txt'): - assert any(info.filename.split("/")[-1] == filename - for info in f.filelist), \ - f"{filename} is missing from the wheel." - - assert any(info.filename == "pyarrow/py.typed" for info in f.filelist), \ - "pyarrow/py.typed is missing from the wheel." - - source_root = Path(__file__).resolve().parents[2] - stubs_dir = source_root / "python" / "pyarrow-stubs" / "pyarrow" - assert stubs_dir.exists(), f"Stub source directory not found: {stubs_dir}" - - expected_stub_files = { - f"pyarrow/{stub_file.relative_to(stubs_dir).as_posix()}" - for stub_file in stubs_dir.rglob("*.pyi") - } - - wheel_stub_files = { - info.filename - for info in f.filelist - if info.filename.startswith("pyarrow/") and info.filename.endswith(".pyi") - } - - assert wheel_stub_files == expected_stub_files, ( - "Wheel .pyi files differ from python/pyarrow-stubs/pyarrow.\n" - f"Missing in wheel: {sorted(expected_stub_files - wheel_stub_files)}\n" - f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}" - ) - - wheel_docstring_count = sum( - _count_docstrings(f.read(wsf).decode("utf-8")) - for wsf in wheel_stub_files - ) - - print(f"Found {wheel_docstring_count} docstring(s) in wheel stubs.") - assert wheel_docstring_count, "No docstrings found in wheel stub files." + + print(f"Found {wheel_docstring_count} docstring(s) in wheel stubs.") + assert wheel_docstring_count, "No docstrings found in wheel stub files." print(f"The wheel: {wheels[0]} seems valid.") + def main(): parser = argparse.ArgumentParser() parser.add_argument("--path", type=str, required=True, diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ace0620dd4e3..c7dbc661a060 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1055,7 +1055,7 @@ if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") else() if(PYARROW_REQUIRE_STUB_DOCSTRINGS) message(FATAL_ERROR "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; " - "cannot build CI wheel without .pyi files.") + "cannot build wheel without .pyi files.") else() message(WARNING "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; " "wheel will be built without .pyi files.") diff --git a/python/pyproject.toml b/python/pyproject.toml index 7ba528a5b0d6..a6bba335b8e0 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -103,7 +103,7 @@ version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' -# TODO: Enable more type checks as more stubs are merged +# TODO: Enable type checking once stubs are merged [tool.mypy] files = ["pyarrow-stubs"] mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" @@ -114,7 +114,7 @@ exclude = [ "^scripts/", ] -# TODO: Enable more type checks as more stubs are merged +# TODO: Enable type checking once stubs are merged [tool.pyright] pythonPlatform = "All" pythonVersion = "3.10" @@ -129,7 +129,7 @@ exclude = [ stubPath = "pyarrow-stubs" typeCheckingMode = "basic" -# TODO: Enable more type checks as more stubs are merged +# TODO: Enable type checking once stubs are merged [tool.ty.environment] extra-paths = ["pyarrow-stubs"] diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py index 2a1eb0843471..44bd19bfdc8c 100644 --- a/python/scripts/update_stub_docstrings.py +++ b/python/scripts/update_stub_docstrings.py @@ -181,7 +181,7 @@ def add_docstrings_to_stubs(stubs_dir): pyarrow = importlib.import_module("pyarrow") - for stub_file in stubs_dir.rglob('*.pyi'): + for stub_file in sorted(stubs_dir.rglob('*.pyi')): if stub_file.name == "_stubs_typing.pyi": continue @@ -197,12 +197,14 @@ def add_docstrings_to_stubs(stubs_dir): namespace = module_name print(f" {stub_file.name} -> {namespace or '(root)'}") - tree = libcst.parse_module(stub_file.read_text()) + tree = libcst.parse_module(stub_file.read_text(encoding="utf-8")) modified = tree.visit(DocstringInserter(pyarrow, namespace)) - stub_file.write_text(modified.code) + stub_file.write_text(modified.code, encoding="utf-8") def _link_or_copy(source, destination): + # Prefer symlinks (faster, no disk use) but fall back to copying when the + # filesystem doesn't support them (e.g. Docker volumes, network mounts). if sys.platform != "win32": try: os.symlink(source, destination) @@ -220,22 +222,22 @@ def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir): """ Assemble an importable pyarrow package inside a temporary directory. - During wheel builds the .py sources and compiled extensions (.so/.pyd/.dylib) - live in separate trees (source checkout vs CMake install prefix). This - function symlinks (or copies) both into *pyarrow_pkg* folder so that a plain + During wheel builds the .py sources and compiled binary artifacts live in + separate trees (source checkout vs CMake install prefix). This function + symlinks (or copies) both into pyarrow_pkg folder so that a plain ``import pyarrow`` works and docstrings can be extracted at build time. """ source_pyarrow = source_dir / "pyarrow" if not source_pyarrow.exists(): raise FileNotFoundError(f"PyArrow source package not found: {source_pyarrow}") - for source_path in source_pyarrow.iterdir(): + for source_path in sorted(source_pyarrow.iterdir()): if source_path.suffix == ".py": _link_or_copy(source_path, pyarrow_pkg / source_path.name) elif source_path.is_dir() and not source_path.name.startswith((".", "__")): _link_or_copy(source_path, pyarrow_pkg / source_path.name) - for artifact in install_pyarrow_dir.iterdir(): + for artifact in sorted(install_pyarrow_dir.iterdir()): if not artifact.is_file() or artifact.suffix == ".pyi": continue