Commit a44df86

Authored by unamedkr and claude
Fix Qwen3 garbage output: apply RMSNorm +1 to all Qwen-family models (#23)
Qwen's RMSNorm computes `output = norm(x) * (1 + weight)`, not `norm(x) * weight`. The +1 weight adjustment was only applied when `delta_n_heads > 0` (DeltaNet/Qwen3.5-hybrid) or `model_type == 1` (Gemma). Plain Qwen3 (and Qwen2/2.5) models have `delta_n_heads=0` and `model_type=0`, so the adjustment was skipped entirely. Without it, RMSNorm produces wrong scales and activations explode by layer 2 (values reaching 6000+), generating garbage tokens.

Fix: detect any Qwen-family model via `strstr(gguf->arch, "qwen")` in addition to the existing DeltaNet check. This covers qwen2, qwen2moe, qwen3, and qwen3_5; all use the same (1+w) RMSNorm.

Applied to tq_model.c (library) and quant.h (single-header/WASM). The WASM binary was rebuilt to include the fix.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Parent: c717832

3 files changed: 20 additions & 8 deletions

quant.h (10 additions & 4 deletions)

@@ -9982,7 +9982,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
 
     free(tensors);
 
-    /* Qwen3.5 RMSNorm adjustment: Qwen3_5RMSNorm computes
+    /* Qwen RMSNorm adjustment: Qwen's RMSNorm computes
      * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
      * We bake the "+1" into the weight so tq_rmsnorm can stay as
      * out = x * rsqrt * weight.
@@ -9992,8 +9992,14 @@ static tq_model_t* tq_load_safetensors(const char* path) {
      * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
      * uses plain weight without +1).
      *
-     * We detect Qwen3.5 by the presence of DeltaNet layers. */
-    if (model->config.delta_n_heads > 0) {
+     * Applies to all Qwen-family models (qwen2, qwen3, qwen3_5, etc.)
+     * Detected by arch string or DeltaNet presence. */
+    int is_qwen_family = (model->config.delta_n_heads > 0);
+    if (model->gguf_ctx) {
+        const tq_gguf_ctx_t* gctx = (const tq_gguf_ctx_t*)model->gguf_ctx;
+        if (strstr(gctx->arch, "qwen") != NULL) is_qwen_family = 1;
+    }
+    if (is_qwen_family) {
         int dim_h = model->config.hidden_dim;
         int head_dim_h = model->config.head_dim;
@@ -10022,7 +10028,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
             for (int i = 0; i < dim_h; i++)
                 model->output_norm[i] += 1.0f;
         }
-        fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
+        fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
     }
 
     /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */

src/engine/tq_model.c (10 additions & 4 deletions)

@@ -1517,7 +1517,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
 
     free(tensors);
 
-    /* Qwen3.5 RMSNorm adjustment: Qwen3_5RMSNorm computes
+    /* Qwen RMSNorm adjustment: Qwen's RMSNorm computes
      * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
      * We bake the "+1" into the weight so tq_rmsnorm can stay as
      * out = x * rsqrt * weight.
@@ -1527,8 +1527,14 @@ static tq_model_t* tq_load_safetensors(const char* path) {
      * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
      * uses plain weight without +1).
      *
-     * We detect Qwen3.5 by the presence of DeltaNet layers. */
-    if (model->config.delta_n_heads > 0) {
+     * Applies to all Qwen-family models (qwen2, qwen3, qwen3_5, etc.)
+     * Detected by arch string or DeltaNet presence. */
+    int is_qwen_family = (model->config.delta_n_heads > 0);
+    if (model->gguf_ctx) {
+        const tq_gguf_ctx_t* gctx = (const tq_gguf_ctx_t*)model->gguf_ctx;
+        if (strstr(gctx->arch, "qwen") != NULL) is_qwen_family = 1;
+    }
+    if (is_qwen_family) {
         int dim_h = model->config.hidden_dim;
         int head_dim_h = model->config.head_dim;
@@ -1557,7 +1563,7 @@ static tq_model_t* tq_load_safetensors(const char* path) {
             for (int i = 0; i < dim_h; i++)
                 model->output_norm[i] += 1.0f;
         }
-        fprintf(stderr, "tq_load_model: applied Qwen3.5 RMSNorm +1 weight adjustment\n");
+        fprintf(stderr, "tq_load_model: applied Qwen RMSNorm +1 weight adjustment\n");
     }
 
     /* Gemma3 RMSNorm adjustment: same (1+w) scaling as Qwen3.5 */

wasm/quant.wasm (45 Bytes, binary file not shown)
