From e184a21b01c1795852d55ecb992ab64284d3b3b3 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Thu, 11 Jun 2026 21:50:48 -0700
Subject: [PATCH] Fix stale Int4Tensor assertions in gemma4_31b CUDA pipeline
 tests

Packing converts Int4Tensor weights to CudaCoalescedInt4Tensor because the
CUDA int4 kernel is registered only on the coalesced type, which is
intentionally not a subclass of Int4Tensor, so the existing
assertIsInstance(..., Int4Tensor) checks no longer hold. Update the
assertions and docstrings in test_int4_weights_preserved and
test_load_converts_weights to expect CudaCoalescedInt4Tensor instead.
---
 .../gemma4_31b/tests/test_cuda_pipeline.py    | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
index 1f66652bb2b..0e31a50f37b 100644
--- a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
+++ b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
@@ -190,11 +190,13 @@ def _forward(self):
             return self.model(tok, pos, temp)
 
     def test_int4_weights_preserved(self):
-        """Packing passes Int4Tensor through without conversion."""
-        from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
+        """Packing converts Int4Tensor to CudaCoalescedInt4Tensor."""
+        from executorch.backends.cuda.coalesced_int4_tensor import (
+            CudaCoalescedInt4Tensor,
+        )
 
         w = self.model.layers[0].mlp.gate_proj.weight.data
-        self.assertIsInstance(w, Int4Tensor)
+        self.assertIsInstance(w, CudaCoalescedInt4Tensor)
 
     def test_inference_produces_valid_output(self):
         out = self._forward()
@@ -243,14 +245,19 @@ def _load(self, tmp):
         return load_gguf_model(path, backend="cuda", config=GGUF_CONFIG)
 
     def test_load_converts_weights(self):
-        """GGUF -> CUDA: Q4_K -> Int4Tensor, Q6_K -> IntxUnpacked, embedding bf16."""
+        """GGUF -> CUDA: Q4_K -> CudaCoalescedInt4Tensor, Q6_K -> IntxUnpacked,
+        embedding bf16."""
+        from executorch.backends.cuda.coalesced_int4_tensor import (
+            CudaCoalescedInt4Tensor,
+        )
         from torchao.quantization import IntxUnpackedToInt8Tensor
-        from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
 
         with tempfile.TemporaryDirectory() as tmp:
             model, _ = self._load(tmp)
 
-        self.assertIsInstance(model.layers[0].self_attn.q_proj.weight.data, Int4Tensor)
+        self.assertIsInstance(
+            model.layers[0].self_attn.q_proj.weight.data, CudaCoalescedInt4Tensor
+        )
         self.assertIsInstance(
             model.layers[0].mlp.down_proj.weight.data, IntxUnpackedToInt8Tensor
         )