Skip to content
Draft
82 changes: 49 additions & 33 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,52 +38,68 @@ jobs:
pytorch:
  # Installs this repository from source with NVTE_FRAMEWORK=pytorch and then
  # runs an import sanity check. The job executes directly inside an
  # Ubuntu 24.04 container as root; the CUDA toolchain is installed from
  # NVIDIA's pip wheels instead of coming from a CUDA base image.
  name: 'PyTorch'
  runs-on: ubuntu-latest
  container:
    image: ubuntu:24.04
    options: --user root
  steps:
    # Disabled disk-space reclamation step, kept for reference.
    # - name: Maximize build space
    #   uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
    #   with:
    #     root-reserve-mb: 5120
    #     temp-reserve-mb: 32
    #     swap-size-mb: 10240
    #     remove-dotnet: 'true'
    #     remove-android: 'true'
    #     remove-haskell: 'true'
    #     remove-codeql: 'true'

    # The bare ubuntu image ships without git/python/pip/ccache; git must be
    # present before the checkout step so submodules can be fetched.
    - name: 'Container dependencies'
      run: apt-get update && apt-get install -y git python3 python3-pip ccache

    - name: 'Checkout'
      uses: actions/checkout@v3
      with:
        submodules: recursive

    - name: "Disk space check before dependencies"
      run: df -lh

    # Installs the Python build requirements, a cu130 build of torch and the
    # CUDA compiler/headers from NVIDIA's pip wheels (pinned below 13.1).
    # PIP_BREAK_SYSTEM_PACKAGES lets pip install into the distro-managed
    # system Python. The symlinks add unversioned libcudart/libcublas names
    # and a lib64 alias inside the wheel directory -- presumably so the build
    # can link against them (TODO confirm against the build scripts).
    # Note: variables exported here do not carry over to later steps, which
    # is why the build step computes CUDA_PATH again.
    - name: 'Dependencies'
      run: |
        export PIP_BREAK_SYSTEM_PACKAGES=1
        pip install cmake ninja pybind11 numpy packaging
        pip install torch --index-url https://download.pytorch.org/whl/cu130
        pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1"
        export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')")
        echo $CUDA_PATH
        ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so
        ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so
        ln -s $CUDA_PATH/lib $CUDA_PATH/lib64

    - name: "Disk space check after dependencies"
      run: df -lh

    # Restores the compiler cache; the path matches CCACHE_DIR in the build
    # step. Falls back to the newest cache from any ref on the same OS.
    - uses: actions/cache@v4
      with:
        path: /root/.ccache
        key: ccache-${{ runner.os }}-${{ github.ref }}
        restore-keys: ccache-${{ runner.os }}-

    - name: "Disk space check after dependencies and ccache"
      run: df -lh

    # Builds and installs the package against the pip-provided CUDA/cuDNN.
    # The wheel's lib directory is prepended to LD_LIBRARY_PATH/LIBRARY_PATH;
    # the ${VAR:+:$VAR} form appends the previous value only when it is
    # non-empty, so no empty (= current directory) entry ends up in the path.
    - name: 'Build'
      run: |
        export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')")
        export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')")
        export PATH=$CUDA_PATH/bin:$PATH
        export LD_LIBRARY_PATH="$CUDA_PATH/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
        export LIBRARY_PATH="$CUDA_PATH/lib${LIBRARY_PATH:+:$LIBRARY_PATH}"
        export NVTE_BUILD_USE_NVIDIA_WHEELS=1
        export PIP_BREAK_SYSTEM_PACKAGES=1
        NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
      env:
        NVTE_FRAMEWORK: pytorch
        MAX_JOBS: 2
        CCACHE_DIR: /root/.ccache
        CCACHE_MAXSIZE: 2G

    - name: 'Sanity check'
      run: python3 tests/pytorch/test_sanity_import.py
jax:
name: 'JAX'
runs-on: ubuntu-latest
Expand Down
7 changes: 7 additions & 0 deletions build_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,13 @@ def cuda_version() -> Tuple[int, ...]:
version_str = get_version("nvidia-cuda-runtime-cu12")
version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit())
return version_tuple
except:
pass

try:
version_str = get_version("nvidia-cuda-runtime")
version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit())
return version_tuple
except importlib.metadata.PackageNotFoundError:
raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.")

Expand Down
4 changes: 2 additions & 2 deletions transformer_engine/common/util/logging.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
#include <cudnn.h>
#include <nvrtc.h>

#include "nccl.h"

#ifdef NVTE_WITH_CUBLASMP
#include <cublasmp.h>

#include "nccl.h"
#endif // NVTE_WITH_CUBLASMP

#include <iostream>
Expand Down
Loading