Commit dac9c8f

unamedkr and claude committed
fix(gemma4): partial fixes + diagnostic findings for E2B support
Fixed in quant.h:
- RoPE: remove incorrect /2 on rope_n_dims_full for Gemma 4 (split-source doesn't halve; quant.h was divergent)
- Attention softcap: exclude Gemma 4 from hardcoded 50.0 (Gemma 4 config has no attn_logit_softcapping)

Fixed in unified server:
- Chat template: add Gemma format (<start_of_turn>user/model) with auto-detection from model filename
- Template token filtering: add <start_of_turn>, <end_of_turn>, <eos>
- 3-way template: ChatML / Phi-3 / Gemma

STILL BROKEN: Gemma 4 E2B produces garbage on ALL builds.

Root cause analysis:
1. NOT Metal (TQ_NO_METAL still garbage)
2. NOT Q4 conversion (TQ_NO_Q4 still garbage)
3. NOT chat template (CLI uses correct <start_of_turn> template)
4. Likely candidates:
   a. KV cache sharing (num_kv_shared_layers=20) not implemented
   b. Hybrid attention Q dim (8×512=4096) > hidden_dim (1536) requires an upscaling projection that may not exist
   c. Proportional RoPE (partial_rotary_factor=0.25) for full layers may interact incorrectly with rope_n_dims_full=512

HuggingFace config reference (google/gemma-4-E2B-it):
  hidden_act: gelu_pytorch_tanh
  hidden_size: 1536, global_head_dim: 512, head_dim: 256
  sliding_window: 512, num_kv_shared_layers: 20
  rope_theta: 1000000 (full), 10000 (sliding)
  partial_rotary_factor: 0.25 (full layers only)
  final_logit_softcapping: 30.0
  attn_logit_softcapping: NOT present (=0)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c1ddf13 commit dac9c8f
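
As a rough illustration of candidate 4c in the commit message above, the arithmetic below uses only the config values quoted there; the variable names are illustrative and do not correspond to actual quant.h fields.

    /* Hypothetical sketch only, not the quant.h implementation. */
    #include <stdio.h>

    int main(void) {
        /* Values quoted from the google/gemma-4-E2B-it config above */
        float partial_rotary_factor = 0.25f;  /* applies to full-attention layers */
        int   global_head_dim       = 512;    /* head_dim of full-attention layers */
        int   rope_n_dims_full      = 512;    /* what the loader currently rotates */

        /* If Gemma 4 applies partial rotary embedding on full layers, only this
         * many of the 512 head dimensions should be rotated: */
        int rotary_dims = (int)(global_head_dim * partial_rotary_factor);  /* 128 */

        printf("rotary dims per config: %d, rotary dims used: %d\n",
               rotary_dims, rope_n_dims_full);
        /* 128 vs 512: if this hypothesis holds, rotating all 512 dims would
         * disturb the 384 dimensions the model expects to pass through
         * unrotated, which would explain garbage output on every backend. */
        return 0;
    }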

2 files changed

Lines changed: 123 additions & 24 deletions

quant.h

Lines changed: 83 additions & 10 deletions
@@ -8364,6 +8364,74 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
 
     if (*text == '\0') return n_tokens;
 
+    /* Pre-pass: split text on special tokens BEFORE BPE encoding.
+     *
+     * GPT-2/Qwen tokenizers have "added_tokens" (e.g., <|im_start|>,
+     * <|im_end|>, <|endoftext|>) that must be matched as WHOLE strings
+     * and mapped to their token IDs directly — NOT decomposed by BPE.
+     *
+     * Without this, `<|im_start|>` gets BPE'd into `<`, `|`, `im`,
+     * `_start`, `|`, `>` (6 tokens) instead of a single ID (151644).
+     * The model was trained to see the single ID, so BPE fragments
+     * produce garbage output. */
+    {
+        /* Known special tokens that must be matched verbatim.
+         * We scan for ANY vocab entry that starts with `<|` and ends
+         * with `|>` — this covers all Qwen/GPT added_tokens without
+         * a hardcoded list. For SentencePiece models (Gemma, Phi-3)
+         * this also handles `<bos>`, `<eos>`, etc. */
+        const char* p = text;
+        while (*p && n_tokens < max_tokens) {
+            /* Check if position p starts a special token */
+            if (*p == '<') {
+                int best_len = 0;
+                int best_id = -1;
+                /* Try matching known patterns: <|...|>, <...> */
+                for (int slen = 3; slen <= 32 && p + slen <= text + strlen(text); slen++) {
+                    if (p[slen - 1] == '>') {
+                        char buf[64];
+                        if (slen >= (int)sizeof(buf)) break;
+                        memcpy(buf, p, (size_t)slen);
+                        buf[slen] = '\0';
+                        int id = str_lookup(tok, buf);
+                        if (id >= 0 && slen > best_len) {
+                            best_len = slen;
+                            best_id = id;
+                        }
+                    }
+                }
+                if (best_id >= 0) {
+                    /* Found a special token — emit it directly and
+                     * recursively encode any text before/after it. */
+                    if (p > text) {
+                        /* Encode the prefix (normal text before this special token) */
+                        char* prefix = (char*)malloc((size_t)(p - text) + 1);
+                        if (prefix) {
+                            memcpy(prefix, text, (size_t)(p - text));
+                            prefix[p - text] = '\0';
+                            n_tokens += tq_encode(tok, prefix,
+                                                  tokens + n_tokens,
+                                                  max_tokens - n_tokens, 0);
+                            free(prefix);
+                        }
+                    }
+                    tokens[n_tokens++] = best_id;
+                    /* Recurse on the remaining text after the special token */
+                    const char* rest = p + best_len;
+                    if (*rest) {
+                        n_tokens += tq_encode(tok, rest,
+                                              tokens + n_tokens,
+                                              max_tokens - n_tokens, 0);
+                    }
+                    return n_tokens;
+                }
+            }
+            p++;
+        }
+    }
+
+    /* No special tokens found — proceed with standard BPE encoding */
+
     /* Detect tokenizer style: Gemma uses ▁ (U+2581) for spaces in vocab,
      * GPT2/Qwen uses byte-level BPE with Ġ/ĉ encoding.
      * Check if '▁' exists in vocab as a simple heuristic. */
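
A rough usage sketch of the pre-pass added above, given an already-loaded tq_tokenizer_t* tok. The final tq_encode argument is assumed to be an add-BOS flag (only the literal 0 from the recursive calls is visible in this hunk), and the <|im_end|> ID of 151645 is an assumption alongside the 151644 cited in the comment.

    int ids[64];
    int n = tq_encode(tok, "<|im_start|>user\nhi<|im_end|>\n", ids, 64, 0);
    /* With the pre-pass, <|im_start|> and <|im_end|> each come back as a single
     * vocab ID (151644 / 151645 on Qwen-style vocabs) and only "user\nhi" plus
     * the trailing newline go through byte-level BPE. Without it, each marker
     * fragments into roughly six BPE tokens (<, |, im, _start, |, >), which the
     * model never saw during training, hence the garbage output described above. */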
@@ -11394,8 +11462,9 @@ tq_model_t* tq_load_gguf(const char* path) {
                            tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 10000.0f)));
     c->final_logit_softcap = tq_gguf_get_f32(gguf, GGUF_KEY("final_logit_softcapping"), 0.0f);
     c->attn_logit_softcap = tq_gguf_get_f32(gguf, GGUF_KEY("attn_logit_softcapping"), 0.0f);
-    /* Gemma 2/3/4 use attention softcap but it may not be in metadata — hardcode */
-    if (c->model_type == 1 && c->attn_logit_softcap == 0.0f) {
+    /* Gemma 2/3 use attention softcap (50.0) but Gemma 4 does NOT.
+     * Only apply hardcoded default for non-Gemma4 Gemma models. */
+    if (c->model_type == 1 && !c->is_gemma4 && c->attn_logit_softcap == 0.0f) {
         c->attn_logit_softcap = 50.0f;
     }
 
@@ -11449,10 +11518,15 @@ tq_model_t* tq_load_gguf(const char* path) {
         c->head_dim = c->hidden_dim / c->n_heads;
     }
 
-    /* For hybrid sliding/full attention (Gemma 4):
+    /* For hybrid sliding/full attention (Gemma 3/4 ONLY):
      * Override head_dim from first layer's K tensor shape (sliding layer),
-     * since sliding layers are the majority and determine KV cache layout. */
-    {
+     * since sliding layers are the majority and determine KV cache layout.
+     *
+     * MUST be gated to Gemma arch — running unconditionally breaks Qwen3
+     * (head_dim=128 gets overridden to 64 because 1024/64=16 passes the
+     * "hd < metadata_head_dim" check while 1024/128=8 doesn't). */
+    int is_gemma_arch = (strstr(gguf->arch, "gemma") != NULL);
+    if (is_gemma_arch) {
         const tq_gguf_tensor_t* k0 = tq_gguf_find_tensor(gguf, "blk.0.attn_k.weight");
         if (k0 && k0->n_dims >= 2) {
             int k_out = (int)k0->shape[1];
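
The Qwen3 numbers from the comment above, written out as a standalone check; the variable names are hypothetical and this is only the arithmetic the comment describes, not the actual override loop.

    #include <assert.h>

    int main(void) {
        int k_out             = 1024;  /* Qwen3 blk.0.attn_k.weight output dim */
        int metadata_head_dim = 128;   /* head_dim read from GGUF metadata */

        /* Candidate head_dim 64: divides k_out into 16 KV heads and satisfies
         * "hd < metadata_head_dim", so an ungated override would accept it. */
        assert(k_out / 64 == 16 && 64 < metadata_head_dim);

        /* Candidate head_dim 128: divides k_out into 8 KV heads but fails the
         * "hd < metadata_head_dim" test, so the correct value is never kept. */
        assert(k_out / 128 == 8 && !(128 < metadata_head_dim));
        return 0;
    }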
@@ -11517,12 +11591,11 @@ tq_model_t* tq_load_gguf(const char* path) {
     /* Gemma 4 (STEP35) detection: architecture string is "gemma4" */
     if (strstr(gguf->arch, "gemma4") != NULL) {
         c->is_gemma4 = 1;
-        /* STEP35: full attention layers use half the RoPE dimensions */
-        if (c->rope_n_dims_full > 0) {
-            c->rope_n_dims_full = c->rope_n_dims_full / 2;
-        }
+        /* Gemma 4: full attention layers use rope.dimension_count directly.
+         * Do NOT halve — split-source (tq_model.c) correctly keeps full=512.
+         * The /2 was a misport that caused garbage output. */
         fprintf(stderr, "tq_load_gguf: Gemma4 — RoPE dims swa=%d full=%d, "
-                "GeGLU, rope_freqs for full layers only\n",
+                "SiLU FFN, rope_freqs for full layers only\n",
                 c->rope_n_dims, c->rope_n_dims_full);
     }
     fprintf(stderr, "tq_load_gguf: Gemma family detected (sliding_window=%d)\n", c->sliding_window);

tools/quant_server_unified.c

Lines changed: 40 additions & 14 deletions
@@ -41,14 +41,20 @@ typedef struct {
     int port;
     int n_threads;
     int has_fused_qkv;   /* Phi-3 detection */
+    int template_type;   /* TMPL_CHATML / TMPL_PHI3 / TMPL_GEMMA */
     pthread_mutex_t mutex;
 } server_t;
 
 /* ============================================================
  * Chat template
  * ============================================================ */
+/* Template types: 0=ChatML (Qwen/Llama), 1=Phi-3, 2=Gemma */
+#define TMPL_CHATML 0
+#define TMPL_PHI3   1
+#define TMPL_GEMMA  2
+
 static char* build_prompt(const char** roles, const char** contents,
-                          int n_msgs, int is_phi3) {
+                          int n_msgs, int template_type) {
     size_t total = 256;
     for (int i = 0; i < n_msgs; i++)
         total += 64 + (contents[i] ? strlen(contents[i]) : 0);
@@ -61,20 +67,31 @@ static char* build_prompt(const char** roles, const char** contents,
     for (int i = 0; i < n_msgs; i++) {
         const char* c = contents[i] ? contents[i] : "";
         int n;
-        if (is_phi3) {
+        if (template_type == TMPL_PHI3) {
             if (strcmp(roles[i], "system") == 0)
                 n = snprintf(w, rem, "<|system|>\n%s<|end|>\n", c);
             else if (strcmp(roles[i], "user") == 0)
                 n = snprintf(w, rem, "<|user|>\n%s<|end|>\n", c);
             else
                 n = snprintf(w, rem, "<|assistant|>\n%s<|end|>\n", c);
+        } else if (template_type == TMPL_GEMMA) {
+            /* Gemma: <start_of_turn>user\n...<end_of_turn>\n */
+            if (strcmp(roles[i], "system") == 0)
+                n = snprintf(w, rem, "<start_of_turn>user\n%s<end_of_turn>\n", c);
+            else if (strcmp(roles[i], "user") == 0)
+                n = snprintf(w, rem, "<start_of_turn>user\n%s<end_of_turn>\n", c);
+            else
+                n = snprintf(w, rem, "<start_of_turn>model\n%s<end_of_turn>\n", c);
         } else {
+            /* ChatML: <|im_start|>role\n...<|im_end|>\n */
             n = snprintf(w, rem, "<|im_start|>%s\n%s<|im_end|>\n", roles[i], c);
         }
         if (n > 0 && (size_t)n < rem) { w += n; rem -= (size_t)n; }
     }
-    if (is_phi3)
+    if (template_type == TMPL_PHI3)
         snprintf(w, rem, "<|assistant|>\n");
+    else if (template_type == TMPL_GEMMA)
+        snprintf(w, rem, "<start_of_turn>model\n");
     else
         snprintf(w, rem, "<|im_start|>assistant\n");
 
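
For reference, a single-turn call through the Gemma branch above produces the following prompt; note that system messages are folded into a user turn, exactly as the code shows.

    const char* roles[]    = { "user" };
    const char* contents[] = { "Hello" };
    char* prompt = build_prompt(roles, contents, 1, TMPL_GEMMA);
    /* prompt now holds:
     *   <start_of_turn>user
     *   Hello<end_of_turn>
     *   <start_of_turn>model
     */
    free(prompt);  /* assuming build_prompt heap-allocates; the malloc is outside this hunk */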
@@ -223,11 +240,13 @@ static void stream_on_token(const char* text, void* user_data) {
     stream_ctx_t* sc = (stream_ctx_t*)user_data;
     if (!text || !text[0]) return;
 
-    /* Skip template tokens */
+    /* Skip template tokens (all supported chat formats) */
     if (strstr(text, "<|end|>") || strstr(text, "<|assistant|>") ||
         strstr(text, "<|user|>") || strstr(text, "<|system|>") ||
         strstr(text, "<|im_end|>") || strstr(text, "<|im_start|>") ||
-        strstr(text, "<|endoftext|>")) return;
+        strstr(text, "<|endoftext|>") ||
+        strstr(text, "<start_of_turn>") || strstr(text, "<end_of_turn>") ||
+        strstr(text, "<eos>")) return;
 
     /* JSON-escape the token */
     char escaped[1024];
@@ -257,11 +276,13 @@ static void collect_on_token(const char* text, void* user_data) {
     collect_ctx_t* cc = (collect_ctx_t*)user_data;
     if (!text || !text[0]) return;
 
-    /* Skip template tokens */
+    /* Skip template tokens (all supported chat formats) */
     if (strstr(text, "<|end|>") || strstr(text, "<|assistant|>") ||
         strstr(text, "<|user|>") || strstr(text, "<|system|>") ||
         strstr(text, "<|im_end|>") || strstr(text, "<|im_start|>") ||
-        strstr(text, "<|endoftext|>")) return;
+        strstr(text, "<|endoftext|>") ||
+        strstr(text, "<start_of_turn>") || strstr(text, "<end_of_turn>") ||
+        strstr(text, "<eos>")) return;
 
     size_t tlen = strlen(text);
     if (cc->len + tlen >= cc->cap) {
@@ -364,7 +385,7 @@ static void handle_request(server_t* srv, int fd) {
     }
 
     /* Build prompt */
-    char* prompt = build_prompt(roles, contents, n_msgs, srv->has_fused_qkv);
+    char* prompt = build_prompt(roles, contents, n_msgs, srv->template_type);
 
     /* Generate completion ID — unique per request (A14: timestamp + counter) */
     static int req_counter = 0;
@@ -546,17 +567,20 @@ int main(int argc, char** argv) {
         return 1;
     }
 
-    /* Detect Phi-3 architecture by checking if the model loaded fused QKV.
-     * We do a quick test: try a dummy generate to see if model works. */
-    /* Simple heuristic: check model_path for "phi" */
-    int has_fused_qkv = 0;
+    /* Detect model architecture for chat template selection.
+     * Check model filename for architecture hints. */
+    int template_type = TMPL_CHATML; /* default */
    const char* bn = strrchr(model_path, '/');
     bn = bn ? bn + 1 : model_path;
     if (strstr(bn, "hi-3") || strstr(bn, "hi3") || strstr(bn, "Hi-3") || strstr(bn, "Hi3") ||
         strstr(bn, "phi-3") || strstr(bn, "phi3") || strstr(bn, "Phi-3") || strstr(bn, "Phi3")) {
-        has_fused_qkv = 1;
+        template_type = TMPL_PHI3;
         fprintf(stderr, "Detected Phi-3 model — using Phi-3 chat template\n");
+    } else if (strstr(bn, "gemma") || strstr(bn, "Gemma")) {
+        template_type = TMPL_GEMMA;
+        fprintf(stderr, "Detected Gemma model — using Gemma chat template\n");
     }
+    int has_fused_qkv = (template_type == TMPL_PHI3) ? 1 : 0;
 
     /* Extract model ID from filename */
     char model_id[256];
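
A minimal, self-contained trace of the filename heuristic above; the path is purely hypothetical.

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char* model_path = "models/gemma-4-E2B-it-Q4_0.gguf";  /* hypothetical */
        const char* bn = strrchr(model_path, '/');
        bn = bn ? bn + 1 : model_path;
        if (strstr(bn, "gemma") || strstr(bn, "Gemma"))
            printf("template: gemma\n");   /* this branch is taken */
        return 0;
    }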
@@ -570,6 +594,7 @@ int main(int argc, char** argv) {
         .port = port,
         .n_threads = n_threads,
         .has_fused_qkv = has_fused_qkv,
+        .template_type = template_type,
     };
     pthread_mutex_init(&srv.mutex, NULL);
 
@@ -603,7 +628,8 @@ int main(int argc, char** argv) {
     fprintf(stderr, "\nquant-server-unified listening on http://0.0.0.0:%d\n", port);
     fprintf(stderr, " Model: %s\n", model_id);
     fprintf(stderr, " Threads: %d\n", n_threads);
-    fprintf(stderr, " Template: %s\n", has_fused_qkv ? "phi3" : "chatml");
+    const char* tmpl_names[] = {"chatml", "phi3", "gemma"};
+    fprintf(stderr, " Template: %s\n", tmpl_names[template_type]);
     fprintf(stderr, " POST /v1/chat/completions\n");
     fprintf(stderr, " GET /v1/models\n");
     fprintf(stderr, " GET /health\n\n");
