
Commit 9b9ce04
Authored by unamedkr and claude
Fix Qwen RMSNorm: revert runtime +1 for GGUF + switch demo to Qwen3.5 (#24)
PR #23 incorrectly added RMSNorm +1 for all Qwen-family GGUF models. Investigation reveals:

- Qwen2/Qwen3: standard RMSNorm (weight * norm(x)), no +1 needed
- Qwen3.5/Gemma: use (1+weight), but llama.cpp's GGUF converter already bakes +1 into the weights during conversion
- Runtime +1 was double-applying for Qwen3.5 and incorrectly applying for Qwen2/3, causing activation explosion

Fix: skip runtime +1 for all GGUF models. Only apply it for non-GGUF (raw checkpoint) DeltaNet models.

Also switch the WASM demo default from Qwen3-0.6B Q4_K_M (broken due to double-quantization on a tiny model) to Qwen3.5-0.8B Q4_K_M (~508 MB), which produces coherent output at 25 tok/s.

Verified:

- Qwen3.5 0.8B Q8_0: coherent English output
- Llama 3.2 1B Q8_0: coherent English output (unchanged)
- Qwen3 0.6B Q4_K_M: real words now (was garbage Unicode), but quality limited by double-quantization on the 0.6B model

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Parent: a44df86

5 files changed: 25 additions & 31 deletions

bindings/python/quantcpp/__init__.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -53,10 +53,10 @@
         "smollm2-135m-instruct-q8_0.gguf",
         135,
     ),
-    "Qwen3-0.6B": (
-        "unsloth/Qwen3-0.6B-GGUF",
-        "Qwen3-0.6B-Q4_K_M.gguf",
-        378,
+    "Qwen3.5-0.8B": (
+        "unsloth/Qwen3.5-0.8B-GGUF",
+        "Qwen3.5-0.8B-Q4_K_M.gguf",
+        508,
     ),
     "Llama-3.2-1B": (
         "hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
```

quant.h

Lines changed: 5 additions & 18 deletions
```diff
@@ -9982,24 +9982,11 @@ static tq_model_t* tq_load_safetensors(const char* path) {
 
     free(tensors);
 
-    /* Qwen RMSNorm adjustment: Qwen's RMSNorm computes
-     * output = norm(x) * (1.0 + weight), NOT norm(x) * weight.
-     * We bake the "+1" into the weight so tq_rmsnorm can stay as
-     * out = x * rsqrt * weight.
-     *
-     * This applies to: input_layernorm, post_attention_layernorm,
-     * model.norm, q_norm, k_norm.
-     * It does NOT apply to: linear_attn.norm (Qwen3_5RMSNormGated
-     * uses plain weight without +1).
-     *
-     * Applies to all Qwen-family models (qwen2, qwen3, qwen3_5, etc.)
-     * Detected by arch string or DeltaNet presence. */
-    int is_qwen_family = (model->config.delta_n_heads > 0);
-    if (model->gguf_ctx) {
-        const tq_gguf_ctx_t* gctx = (const tq_gguf_ctx_t*)model->gguf_ctx;
-        if (strstr(gctx->arch, "qwen") != NULL) is_qwen_family = 1;
-    }
-    if (is_qwen_family) {
+    /* Qwen3.5 (DeltaNet hybrid) RMSNorm adjustment.
+     * Only for non-GGUF models (raw checkpoints). GGUF files from
+     * llama.cpp already have +1 baked in by the converter.
+     * Qwen2/Qwen3 use standard RMSNorm and never need +1. */
+    if (model->config.delta_n_heads > 0 && !model->gguf_ctx) {
         int dim_h = model->config.hidden_dim;
         int head_dim_h = model->config.head_dim;
```

src/engine/tq_model.c

Lines changed: 7 additions & 0 deletions
```diff
@@ -4065,6 +4065,13 @@ skip_q4_conversion: ;
 
 #undef GGUF_KEY
 
+    /* NOTE: No runtime RMSNorm +1 adjustment for GGUF models.
+     * - Qwen2/Qwen3: standard RMSNorm (weight * norm(x)), no +1 needed.
+     * - Qwen3.5/Gemma: use (1+weight) convention, but llama.cpp's GGUF
+     *   converter already bakes +1 into the weights during conversion.
+     *   Adding +1 at runtime would double-apply and cause activation explosion.
+     *   The Gemma heuristic above (mean > 2.0 check) handles the Gemma case. */
+
     /* Initialize persistent Metal GPU buffers for layer-level compute */
 #ifdef TQ_HAS_METAL
     {
```

wasm/index.html

Lines changed: 9 additions & 9 deletions
```diff
@@ -121,11 +121,11 @@ <h2>LLM in Your Browser</h2>
       <p style="margin-bottom:16px; color:#6ee7b7; font-size:15px">No install. No API key. No server. Just click.</p>
 
       <div class="model-cards" id="modelCards">
-        <div class="model-card recommended" onclick="loadDemoModel('qwen3-0.6b')">
-          <div class="name">Qwen3 0.6B</div>
-          <div class="meta">~378 MB download &middot; Q4_K_M</div>
+        <div class="model-card recommended" onclick="loadDemoModel('qwen3.5-0.8b')">
+          <div class="name">Qwen3.5 0.8B</div>
+          <div class="meta">~508 MB download &middot; Q4_K_M</div>
           <span class="tag">Recommended</span>
-          <div class="meta" style="margin-top:4px">Fast, multilingual, good for demo</div>
+          <div class="meta" style="margin-top:4px">Fast, multilingual, best quality/size</div>
         </div>
         <div class="model-card" onclick="loadDemoModel('llama-3.2-1b')">
           <div class="name">Llama 3.2 1B</div>
@@ -167,11 +167,11 @@ <h2>LLM in Your Browser</h2>
 
   // ---- Model registry ----
   const MODELS = {
-    'qwen3-0.6b': {
-      url: 'https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf',
-      name: 'Qwen3-0.6B Q4_K_M',
-      size: '~378 MB',
-      cacheKey: 'qwen3-0.6b-q4km',
+    'qwen3.5-0.8b': {
+      url: 'https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf',
+      name: 'Qwen3.5-0.8B Q4_K_M',
+      size: '~508 MB',
+      cacheKey: 'qwen3.5-0.8b-q4km',
       chatTemplate: (text) => `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`,
     },
     'llama-3.2-1b': {
```

wasm/quant.wasm

-39 bytes (binary file not shown)
