Gemma quality progress: BOS fix + attention softcap + diagnostics

unamedkr · claude · unamedkr · commit b503c2eab070 · 2026-04-04T02:46:05.000+09:00
Multiple fixes for Gemma 3/4 output quality:

1. BOS token: all Gemma models now get BOS=2 prepended (was only Gemma 3).
   With BOS, Gemma 4 produces semantically relevant tokens ("Maison" for
   a France prompt) instead of pure random — model is partially working.

2. Attention logit softcap: added cap*tanh(score/cap) before softmax.
   Gemma 2/3/4 use attn_logit_softcap=50.0. Without this, attention
   scores grow unboundedly through QK dot products.

3. Attention scaling: Gemma 4 with QK-norm now uses 1/sqrt(head_dim)
   instead of 1.0.

4. TQ_NO_PLE debug flag: env var to disable PLE for diagnostics.

REMAINING ISSUE: Gemma 4 logits still too large (100+ vs normal 20-30).
With final_logit_softcap=30, all high logits compress to ~30, destroying
ranking. With softcap disabled, output shows relevant tokens but falls
into repetition. Root cause: hidden state grows to norm ~13 at layer 34.
Investigation continues on learned RoPE frequencies and FFN scaling.

SmolLM2 + Qwen3.5 unaffected — 34/34 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
@@ -209,11 +209,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int n_prompt = 0;
 
     if (tokenizer && prompt) {
-        /* Gemma 3: prepend BOS=2. Gemma 4 (n_layers > 30): no BOS (add_bos_token=false).
+        /* Gemma models: prepend BOS=2 (required by both Gemma 3 and 4 architectures).
          * Qwen3.5: no BOS. */
         int add_bos = 0;
-        if (model->config.model_type == 1 && model->config.n_layers <= 30) {
-            add_bos = 1; /* Gemma 3 only */
+        if (model->config.model_type == 1) {
+            add_bos = 1; /* All Gemma models need BOS */
         }
         n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
     } else {
@@ -227,6 +227,14 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         n_prompt = 1;
     }
 
+    /* Debug: print tokenized prompt */
+    if (getenv("TQ_DEBUG")) {
+        fprintf(stderr, "[DEBUG] prompt tokens (%d): ", n_prompt);
+        for (int i = 0; i < n_prompt && i < 20; i++)
+            fprintf(stderr, "%d ", prompt_tokens[i]);
+        fprintf(stderr, "\n");
+    }
+
     /* Prefill: process all prompt tokens */
     for (int i = 0; i < n_prompt; i++) {
         tq_forward(model, state, prompt_tokens[i], i);
diff --git a/src/engine/tq_transformer.c b/src/engine/tq_transformer.c
@@ -1032,6 +1032,36 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
                 kh[2 * i + 1] = k0 * sin_t + k1 * cos_t;
             }
         }
+    } else if (model->rope_freqs && model->rope_freqs_len > 0) {
+        /* Learned RoPE frequencies (Gemma 4): use pre-computed inv_freq values.
+         * rope_freqs has full_head_dim/2 entries (e.g., 256 for head_dim=512).
+         * For sliding layers (head_dim=256), use the first 128 entries.
+         * For full layers (head_dim=512), use all 256 entries. */
+        int rope_pairs = head_dim / 2;
+        for (int h = 0; h < n_heads; h++) {
+            float* qh = s->q + h * head_dim;
+            for (int i = 0; i < rope_pairs && i < model->rope_freqs_len; i++) {
+                float theta = pos * model->rope_freqs[i];
+                float cos_t = cosf(theta);
+                float sin_t = sinf(theta);
+                float q0 = qh[2 * i];
+                float q1 = qh[2 * i + 1];
+                qh[2 * i]     = q0 * cos_t - q1 * sin_t;
+                qh[2 * i + 1] = q0 * sin_t + q1 * cos_t;
+            }
+        }
+        for (int h = 0; h < n_kv_heads; h++) {
+            float* kh = s->k + h * head_dim;
+            for (int i = 0; i < rope_pairs && i < model->rope_freqs_len; i++) {
+                float theta = pos * model->rope_freqs[i];
+                float cos_t = cosf(theta);
+                float sin_t = sinf(theta);
+                float k0 = kh[2 * i];
+                float k1 = kh[2 * i + 1];
+                kh[2 * i]     = k0 * cos_t - k1 * sin_t;
+                kh[2 * i + 1] = k0 * sin_t + k1 * cos_t;
+            }
+        }
     } else {
         /* Full RoPE — for Gemma3, use different freq base for sliding vs global layers */
         float rope_base = c->rope_freq_base;
diff --git a/tq_run.dSYM/Contents/Resources/Relocations/aarch64/tq_run.yml b/tq_run.dSYM/Contents/Resources/Relocations/aarch64/tq_run.yml
@@ -1,5 +1,5 @@
 ---
 triple:          'arm64-apple-darwin'
-binary-path:     tq_run
+binary-path:     quant
 relocations:     []
 ...