From e184a21b01c1795852d55ecb992ab64284d3b3b3 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 11 Jun 2026 21:50:48 -0700
Subject: [PATCH] Fix stale Int4Tensor assertions in gemma4_31b CUDA pipeline tests

Packing converts Int4Tensor weights to CudaCoalescedInt4Tensor because
the CUDA int4 kernel is registered only on the coalesced type, which is
intentionally not a subclass of Int4Tensor. Update
test_int4_weights_preserved and test_load_converts_weights to assert
CudaCoalescedInt4Tensor.
---
 .../gemma4_31b/tests/test_cuda_pipeline.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
index 1f66652bb2b..0e31a50f37b 100644
--- a/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
+++ b/examples/models/gemma4_31b/tests/test_cuda_pipeline.py
@@ -190,11 +190,13 @@ def _forward(self):
         return self.model(tok, pos, temp)
 
     def test_int4_weights_preserved(self):
-        """Packing passes Int4Tensor through without conversion."""
-        from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
+        """Packing converts Int4Tensor to CudaCoalescedInt4Tensor."""
+        from executorch.backends.cuda.coalesced_int4_tensor import (
+            CudaCoalescedInt4Tensor,
+        )
 
         w = self.model.layers[0].mlp.gate_proj.weight.data
-        self.assertIsInstance(w, Int4Tensor)
+        self.assertIsInstance(w, CudaCoalescedInt4Tensor)
 
     def test_inference_produces_valid_output(self):
         out = self._forward()
@@ -243,14 +245,19 @@ def _load(self, tmp):
         return load_gguf_model(path, backend="cuda", config=GGUF_CONFIG)
 
     def test_load_converts_weights(self):
-        """GGUF -> CUDA: Q4_K -> Int4Tensor, Q6_K -> IntxUnpacked, embedding bf16."""
+        """GGUF -> CUDA: Q4_K -> CudaCoalescedInt4Tensor, Q6_K -> IntxUnpacked,
+        embedding bf16."""
+        from executorch.backends.cuda.coalesced_int4_tensor import (
+            CudaCoalescedInt4Tensor,
+        )
         from torchao.quantization import IntxUnpackedToInt8Tensor
-        from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
 
         with tempfile.TemporaryDirectory() as tmp:
             model, _ = self._load(tmp)
 
-        self.assertIsInstance(model.layers[0].self_attn.q_proj.weight.data, Int4Tensor)
+        self.assertIsInstance(
+            model.layers[0].self_attn.q_proj.weight.data, CudaCoalescedInt4Tensor
+        )
         self.assertIsInstance(
             model.layers[0].mlp.down_proj.weight.data, IntxUnpackedToInt8Tensor
         )