Commit b4ad546

added cuBLASMp backend option to JAX unit tests for CollectiveGEMM
Signed-off-by: Alp Dener <[email protected]>
1 parent 69cf235 commit b4ad546

7 files changed: 68 additions & 59 deletions


examples/jax/collective_gemm/common.py

Lines changed: 7 additions & 0 deletions
@@ -154,6 +154,7 @@ def _initialize_distributed(args):
         num_devices_per_process=devices_per_process,
         process_id=args.process_id,
         tensor_parallel_size=args.tensor_parallel_size,
+        use_cublasmp=args.use_cublasmp,
     )
 
 
@@ -241,5 +242,11 @@ def cgemm_parser(description="Collective GEMM test on multi-GPU with tensor para
     parser.add_argument(
         "--enable-result-check", action="store_true", default=True, help="Enable result checking"
     )
+    parser.add_argument(
+        "--use-cublasmp",
+        action="store_true",
+        default=False,
+        help="Use the cuBLASMp backend for overlapping collective operations with GEMM computation",
+    )
 
     return parser
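
For reference, a minimal self-contained sketch of how the new store_true flag behaves once parsed. It reproduces only the argparse pattern added above and does not import the TE example module, so the parser object here is illustrative.

import argparse

# Standalone illustration of the flag added to cgemm_parser() above.
parser = argparse.ArgumentParser(description="Collective GEMM test (illustrative)")
parser.add_argument(
    "--use-cublasmp",
    action="store_true",
    default=False,
    help="Use the cuBLASMp backend for overlapping collective operations with GEMM computation",
)

args = parser.parse_args(["--use-cublasmp"])
assert args.use_cublasmp is True                      # flag present -> True
assert parser.parse_args([]).use_cublasmp is False    # flag omitted -> default False
# _initialize_distributed() then forwards args.use_cublasmp to the bootstrap call,
# as shown in the first hunk of this file.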

examples/jax/collective_gemm/run_test_cgemm.sh

Lines changed: 46 additions & 38 deletions
@@ -65,50 +65,58 @@ for TEST_FILE in "${TEST_FILES[@]}"; do
     # Clear PIDs array for this test file
     PIDS=()
 
-    for i in $(seq 0 $(($NUM_GPUS - 1))); do
-        # Define output file for logs
-        LOG_FILE="${TEST_FILE}_gpu_${i}.log"
-
-        if [ $i -eq 0 ]; then
-            # For process 0: show live output AND save to log file using tee
-            echo "=== Live output from process 0 ==="
-            pytest -s -c "$TE_PATH/tests/jax/pytest.ini" \
-                -vs --junitxml=$XML_LOG_DIR/collective_gemm_${TEST_FILE}.xml \
-                "$TE_PATH/examples/jax/collective_gemm/$TEST_FILE" \
-                --num-processes=$NUM_GPUS \
-                --process-id=$i 2>&1 | tee "$LOG_FILE" &
-            PID=$!
-            PIDS+=($PID)
-        else
-            # For other processes: redirect to log files only
-            pytest -s -c "$TE_PATH/tests/jax/pytest.ini" \
-                -vs "$TE_PATH/examples/jax/collective_gemm/$TEST_FILE" \
-                --num-processes=$NUM_GPUS \
-                --process-id=$i > "$LOG_FILE" 2>&1 &
-            PID=$!
-            PIDS+=($PID)
-        fi
+    PYTEST_ARGS=(
+        "-vs"
+        "-c $TE_PATH/tests/jax/pytest.ini"
+        "$TE_PATH/examples/jax/collective_gemm/$TEST_FILE"
+        "--num-processes=$NUM_GPUS"
+    )
+
+    BACKENDS=("cublasmp" "userbuffers")
+    for backend in "${BACKENDS[@]}"; do
+        for i in $(seq 0 $(($NUM_GPUS - 1))); do
+            # Define output file for logs
+            LOG_FILE="${TEST_FILE}_gpu_${i}_${backend}.log"
+
+            if [ $i -eq 0 ]; then
+                # For process 0: show live output AND save to log file using tee
+                echo "=== Live output from process 0 with ${backend} ==="
+                pytest --junitxml=$XML_LOG_DIR/collective_gemm_${TEST_FILE}.xml \
+                    "${PYTEST_ARGS[@]}" \
+                    --process-id=$i 2>&1 | tee "$LOG_FILE" &
+                PID=$!
+                PIDS+=($PID)
+            else
+                # For other processes: redirect to log files only
+                pytest "${PYTEST_ARGS[@]}" \
+                    --process-id=$i > "$LOG_FILE" 2>&1 &
+                PID=$!
+                PIDS+=($PID)
+            fi
+        done
     done
 
     # Wait for all processes to finish
     wait
 
     # Check and print the log content from process 0 (now has log file thanks to tee)
-    if grep -q "SKIPPED" "${TEST_FILE}_gpu_0.log"; then
-        echo "... $TEST_FILE SKIPPED"
-    elif grep -q "FAILED" "${TEST_FILE}_gpu_0.log"; then
-        echo "... $TEST_FILE FAILED"
-        HAS_FAILURE=1
-    elif grep -q "PASSED" "${TEST_FILE}_gpu_0.log"; then
-        echo "... $TEST_FILE PASSED"
-    else
-        echo "... $TEST_FILE INVALID"
-        HAS_FAILURE=1
-    fi
-
-    # Remove the log files after processing them
-    wait
-    rm ${TEST_FILE}_gpu_*.log
+    for backend in "${BACKENDS[@]}"; do
+        if grep -q "SKIPPED" "${TEST_FILE}_gpu_0_${backend}.log"; then
+            echo "... $TEST_FILE SKIPPED for ${backend} backend"
+        elif grep -q "FAILED" "${TEST_FILE}_gpu_0_${backend}.log"; then
+            echo "... $TEST_FILE FAILED for ${backend} backend"
+            HAS_FAILURE=1
+        elif grep -q "PASSED" "${TEST_FILE}_gpu_0_${backend}.log"; then
+            echo "... $TEST_FILE PASSED for ${backend} backend"
+        else
+            echo "... $TEST_FILE INVALID for ${backend} backend"
+            HAS_FAILURE=1
+        fi
+
+        # Remove the log files after processing them
+        wait
+        rm ${TEST_FILE}_gpu_*.log
+    done
 done
 
 wait

transformer_engine/common/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -270,7 +270,7 @@ if (NVTE_WITH_CUBLASMP)
   target_compile_definitions(transformer_engine PRIVATE NVTE_WITH_CUBLASMP)
   target_include_directories(transformer_engine PRIVATE ${CUBLASMP_DIR}/include ${NVSHMEM_DIR}/include)
   find_library(CUBLASMP_LIB
-               NAMES cublasmp libcublasmp
+               NAMES cublasmp libcublasmp.so.0
               PATHS ${CUBLASMP_DIR}
               PATH_SUFFIXES lib
               REQUIRED)

transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@ CommOverlapCore::CommOverlapCore(int myrank, int numranks, int mylocal, int numl
 
 CommOverlapCore::CommOverlapCore(int64_t nccl_comm_ptr, int tp_rank, int tp_size,
                                  int num_comm_sm, bool is_p2p, bool atomic_gemm) {
+  NVTE_CHECK(nvte_built_with_cublasmp(),
+             "Comm+GEMM overlap with cuBLASMp backend requires TE to be built with NVTE_WITH_CUBLASMP=1.");
   _with_cublasmp = true;
 
   nvte_comm_gemm_ctx_create(reinterpret_cast<ncclComm_t>(nccl_comm_ptr), tp_size, tp_rank);

transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 
 #define NVTE_COMM_OVERLAP_MAX_STREAMS 3
 
-/* \brief Check if TE is built with cuBlasMp.
+/* \brief Check if TE is built with cuBLASMp.
  *
  * \return True if TE is built with cuBlasMp.
  */

transformer_engine/jax/cpp_extensions/gemm.py

Lines changed: 8 additions & 18 deletions
@@ -70,6 +70,7 @@
 
 
 num_cublas_streams = get_num_compute_streams()
+collective_gemm_with_cublasmp = False
 
 
 def get_cublas_workspace_size_bytes() -> None:
@@ -198,6 +199,7 @@ def collective_gemm_bootstrap(
     num_sm_for_communication=2,
     use_ce=True,
     aggregate_all_gather=False,
+    use_cublasmp=False,
 ):
     """Initialize NCCL communicators for Collective GEMM operations.
@@ -281,6 +283,8 @@ def collective_gemm_bootstrap(
         f" num_devices_per_process={num_devices_per_process}"
     )
     assert 0 <= process_id < num_total_devices, f"Invalid process_id={process_id}"
+    global collective_gemm_with_cublasmp
+    collective_gemm_with_cublasmp = use_cublasmp
     initialize_cgemm_communicator(
         num_total_devices,
         num_devices_per_process,
@@ -292,6 +296,7 @@ def collective_gemm_bootstrap(
         num_sm_for_communication,
         use_ce,
         aggregate_all_gather,
+        use_cublasmp,
     )
 
 
@@ -386,7 +391,7 @@ class GemmPrimitive(BasePrimitive):
 
     name = "te_gemm_ffi"
    multiple_results = True
-    impl_static_args = (8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
+    impl_static_args = (8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
     inner_primitive = None
     outer_primitive = None
 
@@ -411,7 +416,6 @@ def abstract(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
     ):
         del use_split_accumulator, transpose_batch_sequence
 
@@ -539,7 +543,7 @@ def _dims_are_consecutive(dims):
         if scaling_mode.is_nvfp4_scaling:
             workspace_size += lhs_scale_inv.size + rhs_scale_inv.size
         if not collective_op.is_none:
-            if use_cublasmp:
+            if collective_gemm_with_cublasmp:
                 # cuBlasMp manages its own cuBlasLt workspaces per stream
                 workspace_size = 0
             else:
@@ -578,7 +582,6 @@ def lowering(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
     ):
         del out_dtype, transpose_batch_sequence, sequence_dim, is_outer
 
@@ -623,7 +626,7 @@ def lowering(
             "grad": grad,
             "use_split_accumulator": use_split_accumulator,
             "collective_op": int(collective_op.value),
-            "use_cublasmp": use_cublasmp,
+            "use_cublasmp": collective_gemm_with_cublasmp,
         }
 
         operand_output_aliases = {}
@@ -658,7 +661,6 @@ def impl(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
     ):
         if scaling_mode.is_1d_block_scaling():
             lhs_cdims, rhs_cdims = map(sanitize_dims, (lhs.ndim, rhs.ndim), contracting_dims)
@@ -726,7 +728,6 @@ def impl(
             transpose_batch_sequence=transpose_batch_sequence,
             sequence_dim=sequence_dim,
             is_outer=is_outer,
-            use_cublasmp=use_cublasmp,
         )
         # Alter output blocks for CGEMM AG
         if (
@@ -778,7 +779,6 @@ def outer_impl(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
     ):
         return GemmPrimitive.impl(
             lhs,
@@ -800,7 +800,6 @@ def outer_impl(
             sequence_dim,
             is_outer,
             collective_op,
-            use_cublasmp,
         )
 
     @staticmethod
@@ -818,7 +817,6 @@ def batcher(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
     ):
         del transpose_batch_sequence, sequence_dim, is_outer
         assert GemmPrimitive.outer_primitive is not None
@@ -852,7 +850,6 @@ def batcher(
                 transpose_batch_sequence=transpose_batch_sequence,
                 sequence_dim=sequence_dim,
                 is_outer=is_outer,
-                use_cublasmp=use_cublasmp,
             ),
             (out_bdims, bias_bdims, pre_gelu_bdims),
         )
@@ -1015,7 +1012,6 @@ def infer_sharding_from_operands(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
         mesh,
         arg_infos,
         result_infos,
@@ -1027,7 +1023,6 @@ def infer_sharding_from_operands(
             result_infos,
             is_outer,
             sequence_dim,
-            use_cublasmp,
         )
 
         (_, (out_specs, dbias_specs, pre_gelu_specs), *_) = (
@@ -1062,7 +1057,6 @@ def partition(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
         mesh,
         arg_infos,
         result_infos,
@@ -1141,7 +1135,6 @@ def _sharded_impl(lhs, lhs_scale_inv, rhs, rhs_scale_inv, bias, gelu_input, alph
                 sequence_dim=inferred_sequence_dim,
                 is_outer=False,
                 collective_op=collective_op,
-                use_cublasmp=use_cublasmp,
             )
 
             if reduce_spec is not None:
@@ -1173,7 +1166,6 @@ def shardy_sharding_rule(
         sequence_dim,
         is_outer,
         collective_op,
-        use_cublasmp,
         mesh,
         operand_types,
         result_types,
@@ -1268,7 +1260,6 @@ def _te_gemm(
     use_split_accumulator: bool = None,
     transpose_batch_sequence: bool = False,
     collective_op: CollectiveOp = CollectiveOp.NONE,
-    use_cublasmp: bool = False,
 ) -> Tuple[jax.Array, ...]:
 
     if grad or fuse_gelu:
@@ -1372,7 +1363,6 @@ def _te_gemm(
         sequence_dim=-1, # Dummy value and will be set in the primitive
         is_outer=True,
         collective_op=collective_op,
-        use_cublasmp=use_cublasmp,
     )
 
 
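Taken together, these gemm.py hunks stop threading use_cublasmp through GemmPrimitive as a static argument: the choice is recorded once by collective_gemm_bootstrap() in the module-level collective_gemm_with_cublasmp flag and read wherever the backend matters (workspace sizing, FFI lowering). Below is a standalone sketch of that pattern; the function names and bodies are simplified illustrations, not the actual TE implementation.

# Illustrative sketch only: mirrors the "set once at bootstrap, read later" pattern.
collective_gemm_with_cublasmp = False


def bootstrap_sketch(use_cublasmp=False):
    """Record the backend choice once, as collective_gemm_bootstrap() now does."""
    global collective_gemm_with_cublasmp
    collective_gemm_with_cublasmp = use_cublasmp


def workspace_size_sketch(base_workspace_size):
    """Mimic the abstract() rule above: cuBLASMp manages its own cuBLASLt workspaces."""
    return 0 if collective_gemm_with_cublasmp else base_workspace_size


bootstrap_sketch(use_cublasmp=True)
print(workspace_size_sketch(32 * 1024 * 1024))  # prints 0 on the cuBLASMp path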

transformer_engine/pytorch/csrc/extensions.h

Lines changed: 3 additions & 1 deletion
@@ -516,7 +516,9 @@ class CommOverlapHelper : torch::CustomClassHolder {
   void ub_barrier(ExtComm comm);
 
   int64_t get_nccl_comm_ptr(std::string comm_name) {
-    NVTE_CHECK(backend_is_nccl, "Cannot get nccComm_t ptr if backend is not NCCL.");
+    NVTE_CHECK(backend_is_nccl,
+               "Comm+GEMM overlap with cuBLASMp backend requires a tensor-parallel process ",
+               "group with NCCL backend.");
     return reinterpret_cast<c10d::ProcessGroupNCCL *>(pgs[comm_name])->getCommPtr();
   }
 };
