diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index c1a051b1e56..092bedf3f5e 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -34,5 +34,5 @@ pip install mypy pyright ty # Run type checkers cd "${pyarrow_dir}" mypy -pyright -ty check +pyright --stats +ty check --verbose --output-format concise diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 1571cd57f25..31395e26c23 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -147,6 +147,7 @@ popd echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUNDLE_ARROW_CPP=ON +export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 153a70eb406..8388f6ebf39 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -16,29 +16,79 @@ # under the License. 
import argparse
import ast
from pathlib import Path
import re
import zipfile


# Top-level locations a pyarrow wheel is allowed to contain.
_ALLOWED_WHEEL_PATHS = re.compile(
    r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)'
)

# AST node types whose body may start with a docstring.
_DOCSTRING_OWNERS = (
    ast.Module, ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef
)


def _require(condition, message):
    """Raise ``AssertionError(message)`` unless ``condition`` is truthy.

    An explicit raise is used instead of the ``assert`` statement so that
    validation is not silently disabled when Python runs with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def _count_docstrings(source, filename="<unknown>"):
    """Count docstrings in module, function, and class bodies.

    Parameters
    ----------
    source : str
        Python (or stub) source code.
    filename : str, optional
        Name reported in the ``SyntaxError`` if ``source`` does not parse.

    Returns
    -------
    int
        Number of modules, (async) functions and classes whose body starts
        with a string literal. Empty docstrings are counted too.
    """
    tree = ast.parse(source, filename=filename)
    return sum(
        1
        for node in ast.walk(tree)
        if isinstance(node, _DOCSTRING_OWNERS)
        and ast.get_docstring(node, clean=False) is not None
    )


def validate_wheel(path, stubs_dir=None):
    """Validate the contents of the single pyarrow wheel found in ``path``.

    The wheel must contain only pyarrow paths, ship ``LICENSE.txt``,
    ``NOTICE.txt`` and ``pyarrow/py.typed``, carry exactly the ``.pyi`` files
    present in the stub source directory, and have at least one docstring in
    those stubs.

    Parameters
    ----------
    path : str or Path
        Directory expected to contain exactly one ``*.whl`` file.
    stubs_dir : str or Path, optional
        Directory holding the reference ``.pyi`` files. Defaults to
        ``python/pyarrow-stubs/pyarrow`` under the source root, which is
        located two directories above this script.

    Raises
    ------
    AssertionError
        If any of the checks fails; the message names the failed check.
    """
    wheels = list(Path(path).glob('*.whl'))
    _require(len(wheels) == 1,
             f"{len(wheels)} wheels found but only 1 expected ({wheels})")

    with zipfile.ZipFile(wheels[0]) as wheel_zip:
        # Read the archive listing once and reuse it for every check.
        names = wheel_zip.namelist()

        outliers = [
            name for name in names if not _ALLOWED_WHEEL_PATHS.match(name)
        ]
        _require(not outliers,
                 f"Unexpected contents in wheel: {sorted(outliers)}")

        # License files may live in any directory of the wheel.
        basenames = {name.split("/")[-1] for name in names}
        for filename in ('LICENSE.txt', 'NOTICE.txt'):
            _require(filename in basenames,
                     f"{filename} is missing from the wheel.")

        _require("pyarrow/py.typed" in names,
                 "pyarrow/py.typed is missing from the wheel.")

        if stubs_dir is None:
            source_root = Path(__file__).resolve().parents[2]
            stubs_dir = source_root / "python" / "pyarrow-stubs" / "pyarrow"
        else:
            stubs_dir = Path(stubs_dir)
        _require(stubs_dir.exists(),
                 f"Stub source directory not found: {stubs_dir}")

        expected_stub_files = {
            f"pyarrow/{stub_file.relative_to(stubs_dir).as_posix()}"
            for stub_file in stubs_dir.rglob("*.pyi")
        }
        wheel_stub_files = {
            name for name in names
            if name.startswith("pyarrow/") and name.endswith(".pyi")
        }
        _require(wheel_stub_files == expected_stub_files, (
            "Wheel .pyi files differ from python/pyarrow-stubs/pyarrow.\n"
            f"Missing in wheel: {sorted(expected_stub_files - wheel_stub_files)}\n"
            f"Unexpected in wheel: {sorted(wheel_stub_files - expected_stub_files)}"
        ))

        # Sorted so that a parse failure is reported for a deterministic file.
        wheel_docstring_count = sum(
            _count_docstrings(wheel_zip.read(name).decode("utf-8"),
                              filename=name)
            for name in sorted(wheel_stub_files)
        )

    print(f"Found {wheel_docstring_count} docstring(s) in wheel stubs.")
    _require(wheel_docstring_count,
             "No docstrings found in wheel stub files.")

    print(f"The wheel: {wheels[0]} seems valid.")
+ def main(): parser = argparse.ArgumentParser() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 14e3e5a6297..e094d82861d 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -116,6 +116,7 @@ popd echo "=== (%PYTHON%) Building wheel ===" set PYARROW_BUNDLE_ARROW_CPP=ON +set PYARROW_REQUIRE_STUB_DOCSTRINGS=ON set PYARROW_WITH_ACERO=%ARROW_ACERO% set PYARROW_WITH_AZURE=%ARROW_AZURE% set PYARROW_WITH_DATASET=%ARROW_DATASET% diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index 960fe5bad6d..223bd0b1cba 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -155,6 +155,7 @@ check_arrow_visibility echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUNDLE_ARROW_CPP=ON +export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0630e0cff7c..6395b3e1e7a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1025,3 +1025,36 @@ if(PYARROW_BUILD_PARQUET) target_link_libraries(_parquet_encryption PRIVATE arrow_python_parquet_encryption) endif() endif() + +# +# Type stubs with docstring injection +# +# Stubs live in pyarrow-stubs/pyarrow/ during development but are installed +# alongside the package so type checkers can find them (PEP 561). +set(PYARROW_STUBS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pyarrow-stubs/pyarrow") +if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}") + install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/" + DESTINATION "." 
+ FILES_MATCHING + PATTERN "*.pyi") + + if(PYARROW_REQUIRE_STUB_DOCSTRINGS) + install(CODE " + execute_process( + COMMAND \"${Python3_EXECUTABLE}\" + \"${CMAKE_CURRENT_SOURCE_DIR}/scripts/update_stub_docstrings.py\" + \"${CMAKE_INSTALL_PREFIX}\" + \"${CMAKE_CURRENT_SOURCE_DIR}\" + RESULT_VARIABLE _pyarrow_stub_docstrings_result + ) + if(NOT _pyarrow_stub_docstrings_result EQUAL 0) + message(FATAL_ERROR \"Stub docstring injection failed (exit code: \${_pyarrow_stub_docstrings_result})\") + endif() + ") + endif() +else() + if(PYARROW_REQUIRE_STUB_DOCSTRINGS) + message(FATAL_ERROR "PyArrow stub source directory not found at ${PYARROW_STUBS_SOURCE_DIR}; " + "cannot build wheel without .pyi files.") + endif() +endif() diff --git a/python/pyproject.toml b/python/pyproject.toml index 14aa37ed045..a6bba335b8e 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -85,7 +85,7 @@ exclude = [ [tool.scikit-build] cmake.build-type = "Release" metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" -sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/"] +sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/", "pyarrow-stubs/"] wheel.packages = ["pyarrow"] wheel.install-dir = "pyarrow" @@ -94,6 +94,7 @@ PYARROW_BUNDLE_ARROW_CPP = {env = "PYARROW_BUNDLE_ARROW_CPP", default = "OFF"} PYARROW_BUNDLE_CYTHON_CPP = {env = "PYARROW_BUNDLE_CYTHON_CPP", default = "OFF"} PYARROW_GENERATE_COVERAGE = {env = "PYARROW_GENERATE_COVERAGE", default = "OFF"} PYARROW_CXXFLAGS = {env = "PYARROW_CXXFLAGS", default = ""} +PYARROW_REQUIRE_STUB_DOCSTRINGS = {env = "PYARROW_REQUIRE_STUB_DOCSTRINGS", default = "OFF"} [tool.setuptools_scm] root = '..' 
@@ -129,6 +130,9 @@ stubPath = "pyarrow-stubs" typeCheckingMode = "basic" # TODO: Enable type checking once stubs are merged +[tool.ty.environment] +extra-paths = ["pyarrow-stubs"] + [tool.ty.src] include = ["pyarrow-stubs"] exclude = [ diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py index 5fd24014a02..44bd19bfdc8 100644 --- a/python/scripts/update_stub_docstrings.py +++ b/python/scripts/update_stub_docstrings.py @@ -18,14 +18,17 @@ """ Extract docstrings from pyarrow runtime and insert them into stub files. -Usage (from python/ directory with pyarrow built): - python scripts/update_stub_docstrings.py pyarrow-stubs +Usage: + python scripts/update_stub_docstrings.py """ import argparse import importlib import inspect +import os +import shutil import sys +import tempfile from pathlib import Path from textwrap import indent @@ -178,7 +181,7 @@ def add_docstrings_to_stubs(stubs_dir): pyarrow = importlib.import_module("pyarrow") - for stub_file in stubs_dir.rglob('*.pyi'): + for stub_file in sorted(stubs_dir.rglob('*.pyi')): if stub_file.name == "_stubs_typing.pyi": continue @@ -186,43 +189,88 @@ def add_docstrings_to_stubs(stubs_dir): if module_name in LIB_MODULES: namespace = "lib" elif stub_file.parent.name in ("parquet", "interchange"): - namespace = f"{stub_file.parent.name}.{module_name}" + namespace = (stub_file.parent.name if module_name == "__init__" + else f"{stub_file.parent.name}.{module_name}") elif module_name == "__init__": namespace = "" else: namespace = module_name print(f" {stub_file.name} -> {namespace or '(root)'}") - tree = libcst.parse_module(stub_file.read_text()) + tree = libcst.parse_module(stub_file.read_text(encoding="utf-8")) modified = tree.visit(DocstringInserter(pyarrow, namespace)) - stub_file.write_text(modified.code) + stub_file.write_text(modified.code, encoding="utf-8") -def add_docstrings_from_build(stubs_dir, build_lib): +def _link_or_copy(source, destination): + # Prefer symlinks 
def _link_or_copy(source, destination):
    """Make ``source`` available at ``destination`` by symlink or copy.

    Symlinks are preferred (faster, no disk use). Copying is the fallback when
    the filesystem doesn't support them (e.g. Docker volumes, network mounts)
    and is always used on Windows.

    Parameters
    ----------
    source : Path
        Existing file or directory.
    destination : Path
        Location to create; it must not exist yet.

    Raises
    ------
    FileExistsError
        If ``destination`` already exists.
    """
    copy_links_as_links = sys.platform != "win32"
    if copy_links_as_links:
        try:
            os.symlink(source, destination)
            return
        except FileExistsError:
            # An existing destination is a caller error, not a sign that
            # symlinks are unsupported; do not fall back to copying over it.
            raise
        except OSError:
            # Symlink creation failed for another reason: copy instead.
            pass

    if source.is_dir():
        shutil.copytree(source, destination, symlinks=copy_links_as_links)
    else:
        shutil.copy2(source, destination)


def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir):
    """
    Assemble an importable pyarrow package inside a temporary directory.

    During wheel builds the .py sources and compiled binary artifacts live in
    separate trees (source checkout vs CMake install prefix). This function
    symlinks (or copies) both into pyarrow_pkg folder so that a plain
    ``import pyarrow`` works and docstrings can be extracted at build time.

    Parameters
    ----------
    pyarrow_pkg : Path
        Existing, empty directory that becomes the ``pyarrow`` package.
    source_dir : Path
        Directory containing the ``pyarrow`` source package.
    install_pyarrow_dir : Path
        Directory whose top-level files (except ``.pyi`` stubs) are added
        next to the sources.

    Raises
    ------
    FileNotFoundError
        If ``source_dir / "pyarrow"`` does not exist.
    """
    source_pyarrow = source_dir / "pyarrow"
    if not source_pyarrow.exists():
        raise FileNotFoundError(f"PyArrow source package not found: {source_pyarrow}")

    for source_path in sorted(source_pyarrow.iterdir()):
        is_module = source_path.suffix == ".py"
        # Hidden and dunder directories (e.g. __pycache__) are left out.
        is_subpackage = (source_path.is_dir()
                         and not source_path.name.startswith((".", "__")))
        if is_module or is_subpackage:
            _link_or_copy(source_path, pyarrow_pkg / source_path.name)

    for artifact in sorted(install_pyarrow_dir.iterdir()):
        if not artifact.is_file() or artifact.suffix == ".pyi":
            continue

        # Entries linked from the source tree take precedence.
        destination = pyarrow_pkg / artifact.name
        if not destination.exists():
            _link_or_copy(artifact, destination)


def main(argv=None):
    """Inject runtime docstrings into the stubs of an install tree.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments; defaults to ``sys.argv[1:]``.

    Exits with status 0 without doing anything when the install tree holds no
    ``.pyi`` files.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("install_prefix", type=Path,
                        help="CMAKE_INSTALL_PREFIX used by wheel build")
    parser.add_argument("source_dir", type=Path,
                        help="PyArrow source directory")
    args = parser.parse_args(argv)

    install_prefix = args.install_prefix.resolve()
    source_dir = args.source_dir.resolve()
    # Use <prefix>/pyarrow when it exists, otherwise the prefix itself.
    install_pyarrow_dir = install_prefix / "pyarrow"
    if not install_pyarrow_dir.exists():
        install_pyarrow_dir = install_prefix

    if not any(install_pyarrow_dir.rglob("*.pyi")):
        print("No .pyi files found in install tree, skipping docstring injection")
        sys.exit(0)

    tmpdir = tempfile.mkdtemp()
    try:
        pyarrow_pkg = Path(tmpdir) / "pyarrow"
        pyarrow_pkg.mkdir()
        _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir)

        sys.path.insert(0, tmpdir)
        try:
            add_docstrings_to_stubs(install_pyarrow_dir)
        finally:
            # Remove our own entry by value: the import may have changed
            # sys.path, so index 0 is not guaranteed to still be tmpdir.
            if tmpdir in sys.path:
                sys.path.remove(tmpdir)
    finally:
        # Best-effort cleanup: modules imported from tmpdir may still be
        # loaded (and, presumably, locked on Windows). A leftover temporary
        # directory must not fail the build once the stubs are written.
        shutil.rmtree(tmpdir, ignore_errors=True)


if __name__ == "__main__":
    main()