Gemma attention softcap + attention scaling fix + CLI features

unamedkr · claude · unamedkr · commit d3e7a44a757b · 2026-04-04T02:41:53.000+09:00
Gemma 2/3/4 models use attention logit soft-capping (cap=50.0): score = cap * tanh(score / cap) This was missing, causing unbounded attention scores and cascading hidden state growth through layers. Now applied before softmax. Also fixed attention scaling for Gemma 4 with QK-norm: was: scale = 1.0 (no scaling) now: scale = 1/sqrt(head_dim) Added PLE debug bypass: TQ_NO_PLE=1 env var. Added CLI: --version flag, --json PPL output mode. NOTE: Gemma 4 output is still garbled despite these fixes. Root cause investigation continues — the hybrid sliding/full attention or K=V sharing is suspected. Fixes #4, addresses #8, relates to #9. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/include/turboquant/tq_engine.h b/include/turboquant/tq_engine.h
@@ -58,6 +58,7 @@ typedef struct {
     int full_n_heads;        /* n_heads for full layers (e.g., 8 vs sliding 16) */
     int full_n_kv_heads;     /* n_kv_heads for full layers (e.g., 2 vs sliding 8) */
     float final_logit_softcap; /* logit soft-capping: logits = cap * tanh(logits/cap), 0=disabled */
+    float attn_logit_softcap;  /* attention score soft-capping (Gemma): 0=disabled, typically 50.0 */
     int* per_layer_inter_dim;  /* [n_layers] per-layer intermediate_dim (NULL = use intermediate_dim) */
 } tq_model_config_t;
 
@@ -214,6 +215,10 @@ typedef struct {
     /* Gemma3 sliding window support */
     int* layer_is_sliding;    /* [n_layers] per-layer flag: 1=sliding, 0=global (NULL if not used) */
 
+    /* Learned RoPE frequencies (Gemma 4) — NULL if using computed frequencies */
+    float* rope_freqs;        /* [rope_dim/2] learned inv_freq values (F32) */
+    int rope_freqs_len;       /* length of rope_freqs array (rope_dim/2) */
+
     /* Gemma 4 Per-Layer Embedding (PLE) — NULL if not used */
     const void* ple_embedding;/* [n_layers * ple_dim, vocab_size] GGUF quantized (e.g. Q5_K) */
     int ple_embedding_type;   /* tq_ggml_dtype of ple_embedding (for runtime dequant) */
diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c
@@ -2881,6 +2881,11 @@ tq_model_t* tq_load_gguf(const char* path) {
                                tq_gguf_get_f32(gguf, GGUF_KEY("rope.local.freq_base"),
                                tq_gguf_get_f32(gguf, GGUF_KEY("rope.freq_base"), 10000.0f)));
     c->final_logit_softcap = tq_gguf_get_f32(gguf, GGUF_KEY("final_logit_softcapping"), 0.0f);
+    c->attn_logit_softcap = tq_gguf_get_f32(gguf, GGUF_KEY("attn_logit_softcapping"), 0.0f);
+    /* Gemma 2/3/4 use attention softcap but it may not be in metadata — hardcode */
+    if (c->model_type == 1 && c->attn_logit_softcap == 0.0f) {
+        c->attn_logit_softcap = 50.0f;
+    }
 
     /* Cap context for memory safety on small machines.
      * GGUF models often claim 262K context but we cap at 4096 by default.
@@ -3551,6 +3556,17 @@ tq_model_t* tq_load_gguf(const char* path) {
         }
     }
 
+    /* Learned RoPE frequencies (Gemma 4): pre-computed inv_freq values */
+    {
+        const tq_gguf_tensor_t* rope_t = find_gguf_tensor(gguf, "rope_freqs.weight");
+        if (rope_t) {
+            model->rope_freqs = dequant_tensor_fp32(rope_t);
+            model->rope_freqs_len = (int)rope_t->shape[0];
+            fprintf(stderr, "tq_load_gguf: loaded learned RoPE frequencies (%d values)\n",
+                    model->rope_freqs_len);
+        }
+    }
+
     /* Gemma 4 PLE (Per-Layer Embedding) global tensors */
     {
         const tq_gguf_tensor_t* ple_emb_t = find_gguf_tensor(gguf, "per_layer_token_embd.weight");
diff --git a/src/engine/tq_transformer.c b/src/engine/tq_transformer.c
@@ -143,6 +143,12 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
     int max_seq = config->max_seq_len;
     int n_layers = config->n_layers;
 
+    /* For hybrid attention (Gemma 4), full layers have larger kv_dim.
+     * Allocate K/V buffers and KV cache with the MAX of sliding and full kv_dim. */
+    int full_kv_dim = (config->full_n_kv_heads > 0 && config->full_head_dim > 0)
+        ? config->full_n_kv_heads * config->full_head_dim : kv_dim;
+    int max_kv_dim = (full_kv_dim > kv_dim) ? full_kv_dim : kv_dim;
+
     tq_state_t* s = (tq_state_t*)calloc(1, sizeof(tq_state_t));
     if (!s) return NULL;
 
@@ -171,15 +177,15 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
     s->xb     = (float*)calloc((size_t)max_dim, sizeof(float));
     s->xb2    = (float*)calloc((size_t)max_dim, sizeof(float));
     s->q      = (float*)calloc((size_t)max_q_dim, sizeof(float));
-    s->k      = (float*)calloc((size_t)kv_dim, sizeof(float));
-    s->v      = (float*)calloc((size_t)kv_dim, sizeof(float));
+    s->k      = (float*)calloc((size_t)max_kv_dim, sizeof(float));
+    s->v      = (float*)calloc((size_t)max_kv_dim, sizeof(float));
     s->att    = (float*)calloc((size_t)n_heads * max_seq, sizeof(float));
     s->hb     = (float*)calloc((size_t)inter_dim, sizeof(float));
     s->hb2    = (float*)calloc((size_t)inter_dim, sizeof(float));
     s->logits = (float*)calloc((size_t)config->vocab_size, sizeof(float));
 
-    /* KV cache for self_attn layers */
-    size_t kv_layer_size = (size_t)max_seq * kv_dim;
+    /* KV cache for self_attn layers — use max_kv_dim for hybrid attention compatibility */
+    size_t kv_layer_size = (size_t)max_seq * max_kv_dim;
     s->key_cache   = (float*)calloc((size_t)n_layers * kv_layer_size, sizeof(float));
 
     /* Value cache quantization: Q4 or Q2 for aggressive V compression.
@@ -188,8 +194,8 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
      * Q2: 8 packed bytes + 1 float scale per block of 32 = 12 bytes/32 values */
     s->value_quant_bits = value_quant_bits;
     if (value_quant_bits == 4 || value_quant_bits == 2) {
-        /* Quantized V cache */
-        int n_blocks_per_pos = (kv_dim + 31) / 32; /* blocks per position (all heads) */
+        /* Quantized V cache — use max_kv_dim for hybrid attention compatibility */
+        int n_blocks_per_pos = (max_kv_dim + 31) / 32; /* blocks per position (all heads) */
         size_t packed_per_block = (value_quant_bits == 4) ? 16 : 8;
         s->value_stride_qs = (size_t)n_blocks_per_pos * packed_per_block;
         s->value_stride_scales = (size_t)n_blocks_per_pos;
@@ -883,8 +889,12 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
 
     int kv_dim = n_kv_heads * head_dim;
     int kv_mul = n_heads / n_kv_heads;
-    /* KV cache stride uses the global (sliding) config for uniform allocation */
-    int cache_kv_dim = c->n_kv_heads * c->head_dim;
+    /* KV cache stride uses the MAX of sliding and full kv_dim for uniform allocation.
+     * This ensures full attention layers (with larger kv_dim) don't overflow the cache. */
+    int sliding_kv_dim = c->n_kv_heads * c->head_dim;
+    int full_kv_dim_cache = (c->full_n_kv_heads > 0 && c->full_head_dim > 0)
+        ? c->full_n_kv_heads * c->full_head_dim : sliding_kv_dim;
+    int cache_kv_dim = (full_kv_dim_cache > sliding_kv_dim) ? full_kv_dim_cache : sliding_kv_dim;
     size_t kv_layer_stride = (size_t)c->max_seq_len * cache_kv_dim;
 
     /* Pre-quantize activation to Q8 once for all Q2/Q4 projections in this layer.
@@ -1222,8 +1232,10 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
      * Others: scale = 1/sqrt(head_dim) */
     float attn_scale_dim = (float)head_dim;
     if (c->use_qk_norm && c->model_type == 1 && c->full_head_dim > 0 && !c->is_moe) {
-        /* Gemma 4 dense (E2B): attention_scale = 1.0 (QK-norm handles scaling) */
-        attn_scale_dim = 1.0f; /* will compute 1/sqrt(1) = 1.0 */
+        /* Gemma 4: QK-norm normalizes Q,K per head, but we still need 1/sqrt(head_dim)
+         * scaling. QK-norm ensures ||Q||=||K||~sqrt(head_dim) after norm weights,
+         * so the dot product scales as head_dim without explicit scaling. */
+        attn_scale_dim = (float)head_dim;
     } else if (c->query_pre_attn_scalar > 0.0f) {
         attn_scale_dim = c->query_pre_attn_scalar;
         if (c->full_head_dim > 0 && model->layer_is_sliding && !model->layer_is_sliding[l]) {
@@ -1439,6 +1451,15 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
             }
         }
 
+        /* Attention logit soft-capping (Gemma 2/3/4): cap * tanh(score / cap) */
+        if (c->attn_logit_softcap > 0.0f) {
+            float cap = c->attn_logit_softcap;
+            float inv_cap = 1.0f / cap;
+            for (int t = attn_start; t < seq_len; t++) {
+                atth[t] = cap * tanhf(atth[t] * inv_cap);
+            }
+        }
+
         /* Softmax */
         tq_softmax(atth, seq_len);
 
@@ -1789,7 +1810,7 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
      *   1. per_layer_token_embd[token] (dequant from Q5_K) → reshape [n_layers, ple_dim]
      *   2. per_layer_model_proj @ embed_raw (FP32 matmul) → reshape [n_layers, ple_dim]
      *   3. Combine with RMS-norm and averaging. */
-    if (model->ple_dim > 0 && model->ple_embedding && model->ple_proj) {
+    if (model->ple_dim > 0 && model->ple_embedding && model->ple_proj && !getenv("TQ_NO_PLE")) {
         int ple_dim = model->ple_dim;
         int n_layers = c->n_layers;
         int total_ple = n_layers * ple_dim;  /* e.g., 35 * 256 = 8960 */
@@ -2033,12 +2054,13 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
         }
 
         /* Gemma 4 PLE: apply per-layer embedding after FFN, before layer_output_scale.
+         * Can be disabled with TQ_NO_PLE=1 for debugging.
          * 1. gate_out = gelu(inp_gate @ hidden_state) → [ple_dim]
          * 2. mixed = gate_out * ple_input[l] → elementwise [ple_dim]
          * 3. proj_out = proj @ mixed → [hidden_dim]
          * 4. normed = rms_norm(proj_out, post_norm) → [hidden_dim]
          * 5. hidden_state = hidden_state + normed */
-        if (model->ple_dim > 0 && s->ple_buf && layer->ple_gate && layer->ple_proj && layer->ple_norm) {
+        if (model->ple_dim > 0 && s->ple_buf && layer->ple_gate && layer->ple_proj && layer->ple_norm && !getenv("TQ_NO_PLE")) {
             int ple_dim = model->ple_dim;
             float ple_gate_out[256];  /* ple_dim <= 256 */
             float ple_mixed[256];