Skip to content

Commit 103bb90

Browse files
committed
fixes for sharp
1 parent eae332b commit 103bb90

File tree

3 files changed

+10
-10
lines changed

3 files changed

+10
-10
lines changed

.github/workflows/build_nccl_image_dev.yaml

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -47,9 +47,9 @@ jobs:
 with:
 images: ghcr.io/azure/ai-infrastructure-on-azure/nccl-test
 tags: |
-type=raw,value=dev2503
-type=raw,value=dev2503-{{sha}}
-type=raw,value=dev2503-pytorch-{{date 'YYYYMMDD-hhmmss' tz='UTC'}}
+type=raw,value=dev
+type=raw,value=dev-{{sha}}
+type=raw,value=dev-pytorch-{{date 'YYYYMMDD-hhmmss' tz='UTC'}}
 
 - name: Build and push Dev Image
 uses: docker/build-push-action@v6
@@ -67,14 +67,14 @@ jobs:
 echo "## Dev NCCL Test Image Built Successfully! 🚀" >> $GITHUB_STEP_SUMMARY
 echo "" >> $GITHUB_STEP_SUMMARY
 echo "**Image Tags:**" >> $GITHUB_STEP_SUMMARY
-echo "- \`ghcr.io/azure/ai-infrastructure-on-azure/nccl-test:dev2503\`" >> $GITHUB_STEP_SUMMARY
-echo "- \`ghcr.io/azure/ai-infrastructure-on-azure/nccl-test:dev2503-${{ github.sha }}\`" >> $GITHUB_STEP_SUMMARY
+echo "- \`ghcr.io/azure/ai-infrastructure-on-azure/nccl-test:dev\`" >> $GITHUB_STEP_SUMMARY
+echo "- \`ghcr.io/azure/ai-infrastructure-on-azure/nccl-test:dev-${{ github.sha }}\`" >> $GITHUB_STEP_SUMMARY
 echo "" >> $GITHUB_STEP_SUMMARY
 echo "**Changes:**" >> $GITHUB_STEP_SUMMARY
-echo "- Updated to use PyTorch base image: \`nvcr.io/nvidia/pytorch:25.03-py3\`" >> $GITHUB_STEP_SUMMARY
+echo "- Updated to use PyTorch base image: \`nvcr.io/nvidia/pytorch:25.06-py3\`" >> $GITHUB_STEP_SUMMARY
 echo "- Includes NCCL tests for distributed training validation" >> $GITHUB_STEP_SUMMARY
 echo "" >> $GITHUB_STEP_SUMMARY
 echo "**Usage:**" >> $GITHUB_STEP_SUMMARY
 echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
-echo "docker pull ghcr.io/azure/ai-infrastructure-on-azure/nccl-test:dev2503" >> $GITHUB_STEP_SUMMARY
+echo "docker pull ghcr.io/azure/ai-infrastructure-on-azure/nccl-test:dev" >> $GITHUB_STEP_SUMMARY
 echo "\`\`\`" >> $GITHUB_STEP_SUMMARY

infrastructure_validations/aks/NCCL/docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
-ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:25.03-py3
+ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:25.06-py3
 FROM ${PYTORCH_IMAGE}
 
 ENV DEBIAN_FRONTEND=noninteractive

infrastructure_validations/aks/NCCL/helm/nccl-test/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,11 +47,11 @@ ncclTest:
 
 # UCX settings
 UCX_TLS: "rc"
-UCX_NET_DEVICES: "mlx5_ib0:1"
+UCX_NET_DEVICES: "mlx5_0:1"
 
 # SHARP settings
 NCCL_COLLNET_ENABLE: "1" # set to "0" to disable SHARP
-SHARP_SMX_UCX_INTERFACE: "mlx5_ib0:1"
+SHARP_SMX_UCX_INTERFACE: "mlx5_0:1"
 SHARP_COLL_ENABLE_SAT: "1"
 SHARP_COLL_LOG_LEVEL: "3"
 SHARP_COLL_ENABLE_PCI_RELAXED_ORDERING: "1"

0 commit comments

Comments
 (0)