From 85203be2d6e9008d9457177b82d0c44b1e9f48d7 Mon Sep 17 00:00:00 2001
From: yuyazhua
Date: Thu, 2 Jul 2026 16:26:42 +0800
Subject: [PATCH] Qualcomm AI Engine Direct - Update mix-precision analyzer unit test

- Update test_analyzer_to_file_generation to the new
  analyze(decoder_inference, text_dataloader) signature
- Replace SimpleModel with SimpleLLMDecoder so the SQNR analyzer captures a
  realistic per-layer structure; add it to tests/models.py
- Reject a dim that is not divisible by n_heads in
  SimpleLLMDecoder.ConvAttention instead of silently truncating head_dim
---
 backends/qualcomm/tests/models.py            | 75 ++++++++++++++++++++
 backends/qualcomm/tests/test_qnn_delegate.py | 42 +++++++++---
 2 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index 0201edb6dee..8ab850854ea 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -2386,6 +2386,81 @@ def forward(self, x, y):
         return z5
 
 
+class SimpleLLMDecoder(torch.nn.Module):
+    """
+    Minimal transformer decoder mirroring how QNN LLM decoders are built:
+    a token embedding feeds a stack of decoder blocks whose linear projections
+    are expressed as 1x1 conv2d (see static_llama.py), grouped under a
+    ``layers.N`` ModuleList. Takes token ids and an additive attention mask.
+    """
+
+    class ConvAttention(torch.nn.Module):
+        def __init__(self, dim, n_heads):
+            super().__init__()
+            if dim % n_heads != 0:
+                raise ValueError(
+                    f"dim ({dim}) must be divisible by n_heads ({n_heads})"
+                )
+            self.n_heads = n_heads
+            self.head_dim = dim // n_heads
+            self.scale = self.head_dim**-0.5
+            self.wq_conv = torch.nn.Conv2d(dim, dim, 1, bias=False)
+            self.wk_conv = torch.nn.Conv2d(dim, dim, 1, bias=False)
+            self.wv_conv = torch.nn.Conv2d(dim, dim, 1, bias=False)
+            self.wo_conv = torch.nn.Conv2d(dim, dim, 1, bias=False)
+
+        def forward(self, x, atten_mask):  # x: (b, dim, 1, seq)
+            b, dim, _, seq = x.shape
+            q = self.wq_conv(x).view(b, self.n_heads, self.head_dim, seq)
+            k = self.wk_conv(x).view(b, self.n_heads, self.head_dim, seq)
+            v = self.wv_conv(x).view(b, self.n_heads, self.head_dim, seq)
+            attn = torch.matmul(q.transpose(-2, -1), k) * self.scale
+            attn = torch.softmax(attn + atten_mask, dim=-1)
+            ctx = torch.matmul(v, attn.transpose(-2, -1))
+            ctx = ctx.reshape(b, dim, 1, seq)
+            return self.wo_conv(ctx)
+
+    class ConvFeedForward(torch.nn.Module):
+        def __init__(self, dim, hidden_dim):
+            super().__init__()
+            self.w1_conv = torch.nn.Conv2d(dim, hidden_dim, 1, bias=False)
+            self.w2_conv = torch.nn.Conv2d(hidden_dim, dim, 1, bias=False)
+            self.w3_conv = torch.nn.Conv2d(dim, hidden_dim, 1, bias=False)
+            self.act_fn = torch.nn.SiLU()
+
+        def forward(self, x):
+            return self.w2_conv(self.act_fn(self.w1_conv(x)) * self.w3_conv(x))
+
+    class DecoderLayer(torch.nn.Module):
+        def __init__(self, dim, hidden_dim, n_heads):
+            super().__init__()
+            self.attention = SimpleLLMDecoder.ConvAttention(dim, n_heads)
+            self.feed_forward = SimpleLLMDecoder.ConvFeedForward(dim, hidden_dim)
+
+        def forward(self, x, atten_mask):
+            x = x + self.attention(x, atten_mask)
+            x = x + self.feed_forward(x)
+            return x
+
+    def __init__(self, vocab_size=128, dim=32, hidden_dim=64, n_heads=4, n_layers=1):
+        super().__init__()
+        self.tok_embeddings = torch.nn.Embedding(vocab_size, dim)
+        self.layers = torch.nn.ModuleList(
+            [self.DecoderLayer(dim, hidden_dim, n_heads) for _ in range(n_layers)]
+        )
+        self.output_conv = torch.nn.Conv2d(dim, dim, 1, bias=False)
+        self.eval()
+
+    def forward(self, input_ids, atten_mask):  # input_ids: (b, seq)
+        x = self.tok_embeddings(input_ids)  # (b, seq, dim)
+        b, seq, dim = x.shape
+        x = x.reshape(b, seq, 1, dim).transpose(1, 3)  # (b, dim, 1, seq)
+        for layer in self.layers:
+            x = layer(x, atten_mask)
+        x = self.output_conv(x)
+        return x.transpose(1, 3).reshape(b, seq, dim)
+
+
 class SkipBackToBack(torch.nn.Module):
 
     def __init__(self):
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index fcb365292ee..0ba1f16f949 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -10812,19 +10812,40 @@ def test_analyzer_to_file_generation(self):
             save_suggest_recipes,
         )
 
-        module = SimpleModel()  # noqa: F405
-        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        torch.manual_seed(8)
+        n_layers = 20
+        vocab_size, seq_len, n_heads = 128, 8, 4
+        module = SimpleLLMDecoder(  # noqa: F405
+            vocab_size=vocab_size, n_heads=n_heads, n_layers=n_layers
+        )
+        input_ids = torch.randint(0, vocab_size, (1, seq_len), dtype=torch.int32)
+        atten_mask = torch.triu(
+            torch.full((1, 1, seq_len, seq_len), float("-inf")), diagonal=1
+        )
+        sample_input = (input_ids, atten_mask)
         fp32_gm = torch.export.export(module, sample_input, strict=True).module()
         qdq_gm = self.get_qdq_module(
             module, sample_input, quant_dtype=QuantDtype.use_8a4w
         )
 
+        class DecoderInference:
+            def get_inputs(self, input_ids, attn_mask):
+                return (input_ids, attn_mask)
+
+        text_dataloader = [
+            {
+                "input_ids": input_ids,
+                "attention_mask": atten_mask,
+            }
+        ]
+
+        num_sharding = 5
         report = PerLayerSqnrAnalyzer(
-            model_name="simple_conv",
-            num_layers=4,
+            model_name="simple_llm_decoder",
+            num_layers=n_layers,
             fp32_gm=fp32_gm,
             qdq_gm=qdq_gm,
-        ).analyze([sample_input], num_sharding=4)
+        ).analyze(DecoderInference(), text_dataloader, num_sharding=num_sharding)
 
         overrides = report.suggest_recipe_overrides(sqnr_threshold=22.0)
 
@@ -10833,10 +10854,13 @@ def test_analyzer_to_file_generation(self):
             save_suggest_recipes(report, overrides, output_dir=tmp_dir)
 
             # --- save_analysis_summary csv file ---
-            with open(f"{tmp_dir}/simple_conv_quantization_error.csv") as f:
+            with open(f"{tmp_dir}/simple_llm_decoder_quantization_error.csv") as f:
                 csv_content = f.read()
             rows = list(csv.reader(csv_content.splitlines()))
-            self.assertEqual(len(rows), 5)  # 1 header + 4 group rows
+            # 1 header + per-shard conv groups (7 projections each: wq/wk/wv/wo,
+            # w1/w2/w3) + the model-level output_conv. Layers are bucketed into
+            # num_sharding contiguous shards (n_layers >= num_sharding).
+            self.assertEqual(len(rows), 1 + num_sharding * 7 + 1)
             self.assertEqual(
                 rows[0],
                 [
@@ -10852,11 +10876,11 @@ def test_analyzer_to_file_generation(self):
 
             # --- save_suggest_recipes .py file (only written when sensitive layers exist) ---
             if overrides:
-                with open(f"{tmp_dir}/simple_conv_suggest_recipe.py") as f:
+                with open(f"{tmp_dir}/simple_llm_decoder_suggest_recipe.py") as f:
                     py_content = f.read()
                 # generated file must be valid Python
                 try:
-                    compile(py_content, "simple_conv_suggest_recipe.py", "exec")
+                    compile(py_content, "simple_llm_decoder_suggest_recipe.py", "exec")
                 except SyntaxError as e:
                     self.fail(
                         f"Generated recipe file has syntax error: {e}\n{py_content}"