fix: resolve 13 GitHub issues + add sync check & hardening

unamedkr · claude · unamedkr · commit 221efbb6c5b8 · 2026-04-12T13:15:20.000+09:00
Bug fixes: - #60: fix 108MB memory leak — free GGUF dequant norm/embedding buffers in tq_free_model() (both quant.h and tq_model.c), gated on gguf_ctx - #61: fix quant.h quantized KV cache stride for hybrid attention — use max(head_dim, full_head_dim) and max(n_kv_heads, full_n_kv_heads) - #63: replace blocking mutex with trylock + HTTP 429 in quant-server - #57: add </s> and <|end|> to CHAT_END_MARKERS filter - #67: port Phi-3 support to split sources (fused QKV/FFN, LongRoPE, NeoX RoPE, BOS handling, state buffer sizing) Hardening (from issue resolution insights): - Add 0-self_attn hard-fail in tq_model.c GGUF loader — prevents silent garbage output on unsupported architectures - Unify BOS token handling: replace model-specific has_fused_qkv check with generic vocab-based <s> auto-detection (covers Phi-3, LLaMA 2, and future models) - Add <|begin_of_text|> to BOS lookup chain in tq_tokenizer.c - Add scripts/check_sync.sh — automated 7-category sync verification between quant.h and split sources (caught 2 drifts on first run) - Add ARM SVE backend stub (src/backend/cpu/tq_sve.c) with dispatch wiring — scaffolding for Graviton3/4 optimization Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/quant.h b/quant.h
@@ -12942,6 +12942,43 @@ void tq_free_model(tq_model_t* model) {
         }
     }
     free(model->moe_config);
+
+    /* Free dequantized norm/embedding buffers (GGUF path only).
+     * In the GGUF path, dequant_tensor_fp32() individually malloc's each
+     * norm weight. In the SafeTensor path, these point into _converted_data
+     * (freed above), so we must NOT free them again. */
+    if (model->gguf_ctx && model->layers) {
+        for (int l = 0; l < model->config.n_layers; l++) {
+            tq_layer_weights_t* layer = &model->layers[l];
+            free(layer->attn_norm);
+            free(layer->ffn_norm);
+            free(layer->q_norm);
+            free(layer->k_norm);
+            free(layer->post_attn_norm);
+            free(layer->post_ffn_norm);
+            free(layer->pre_ffn_norm);
+            free(layer->post_ffn_norm_1);
+            free(layer->pre_ffn_norm_2);
+            free(layer->post_ffn_norm_2);
+            free(layer->ple_norm);
+            free(layer->delta_a_log);
+            free(layer->delta_conv1d);
+            free(layer->delta_dt_bias);
+            free(layer->delta_in_proj_qkv);
+            free(layer->delta_in_proj_z);
+            free(layer->delta_norm);
+            free(layer->delta_in_proj_a);
+            free(layer->delta_in_proj_b);
+            free(layer->delta_out_proj);
+        }
+        free(model->token_embedding);
+        free(model->output_weight);
+        free(model->output_norm);
+        free(model->rope_freqs);
+        free(model->ple_proj);
+        free(model->ple_proj_norm);
+    }
+
     free(model->layers);
 
     /* Free GGUF context (handles munmap internally) */
diff --git a/scripts/check_sync.sh b/scripts/check_sync.sh
@@ -0,0 +1,152 @@
+#!/usr/bin/env bash
+# check_sync.sh — verify critical code sections are in sync between
+# quant.h (single header) and src/ (split sources).
+#
+# This catches the #67-class bug: a feature implemented in quant.h
+# but not ported to the split sources (or vice versa).
+#
+# Usage: bash scripts/check_sync.sh
+# Returns 0 if all checks pass, 1 if any drift is detected.
+
+set -euo pipefail
+
+HEADER="quant.h"
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+ERRORS=0
+
+check_marker_list() {
+    local label="$1"
+    local file1="$2"
+    local file2="$3"
+    local pattern="$4"
+
+    local list1 list2
+    list1=$(grep -o "$pattern" "$file1" 2>/dev/null | sort -u)
+    list2=$(grep -o "$pattern" "$file2" 2>/dev/null | sort -u)
+
+    if [ "$list1" = "$list2" ]; then
+        echo -e "  ${GREEN}✓${NC} $label"
+    else
+        echo -e "  ${RED}✗${NC} $label — MISMATCH"
+        diff <(echo "$list1") <(echo "$list2") || true
+        ERRORS=$((ERRORS + 1))
+    fi
+}
+
+check_field_exists() {
+    local label="$1"
+    local field="$2"
+    local file="$3"
+
+    if grep -q "$field" "$file" 2>/dev/null; then
+        echo -e "  ${GREEN}✓${NC} $label: '$field' found in $(basename $file)"
+    else
+        echo -e "  ${RED}✗${NC} $label: '$field' MISSING in $(basename $file)"
+        ERRORS=$((ERRORS + 1))
+    fi
+}
+
+check_both_have() {
+    local label="$1"
+    local pattern="$2"
+    local file1="$3"
+    local file2="$4"
+
+    local has1 has2
+    has1=$(grep -c "$pattern" "$file1" 2>/dev/null || echo 0)
+    has2=$(grep -c "$pattern" "$file2" 2>/dev/null || echo 0)
+
+    if [ "$has1" -gt 0 ] && [ "$has2" -gt 0 ]; then
+        echo -e "  ${GREEN}✓${NC} $label: present in both files"
+    elif [ "$has1" -eq 0 ] && [ "$has2" -eq 0 ]; then
+        echo -e "  ${YELLOW}—${NC} $label: absent in both (OK if not yet needed)"
+    else
+        local missing
+        [ "$has1" -eq 0 ] && missing="$(basename $file1)" || missing="$(basename $file2)"
+        echo -e "  ${RED}✗${NC} $label: MISSING in $missing"
+        ERRORS=$((ERRORS + 1))
+    fi
+}
+
+echo "=== quant.h ↔ split-source sync check ==="
+echo ""
+
+# --- 1. CHAT_END_MARKERS list ---
+echo "[1] CHAT_END_MARKERS (template token filter)"
+# Extract only the markers from the CHAT_END_MARKERS array definition
+extract_markers() {
+    sed -n '/CHAT_END_MARKERS\[\]/,/NULL/p' "$1" | grep -o '"[^"]*"' | sort -u
+}
+local_m1=$(extract_markers "$HEADER")
+local_m2=$(extract_markers "src/engine/tq_generate.c")
+if [ "$local_m1" = "$local_m2" ]; then
+    echo -e "  ${GREEN}✓${NC} End markers"
+else
+    echo -e "  ${RED}✗${NC} End markers — MISMATCH"
+    diff <(echo "$local_m1") <(echo "$local_m2") || true
+    ERRORS=$((ERRORS + 1))
+fi
+
+# --- 2. Phi-3 fused tensor support ---
+echo ""
+echo "[2] Phi-3 fused tensor fields"
+check_field_exists "Config: has_fused_qkv" "has_fused_qkv" "include/turboquant/tq_engine.h"
+check_field_exists "Config: has_fused_up_gate" "has_fused_up_gate" "include/turboquant/tq_engine.h"
+check_field_exists "Layer: gguf_w_qkv" "gguf_w_qkv" "include/turboquant/tq_engine.h"
+check_field_exists "Layer: gguf_w_up_gate" "gguf_w_up_gate" "include/turboquant/tq_engine.h"
+check_field_exists "Config: rope_factors_short" "rope_factors_short" "include/turboquant/tq_engine.h"
+
+# --- 3. Fused QKV forward path ---
+echo ""
+echo "[3] Fused QKV forward path"
+check_both_have "Fused QKV matmul" "gguf_w_qkv" \
+    "$HEADER" "src/engine/tq_transformer.c"
+check_both_have "Fused FFN gate||up" "gguf_w_up_gate" \
+    "$HEADER" "src/engine/tq_transformer.c"
+
+# --- 4. LongRoPE ---
+echo ""
+echo "[4] LongRoPE rotation"
+check_both_have "rope_factors_short" "rope_factors_short" \
+    "$HEADER" "src/engine/tq_transformer.c"
+check_both_have "rope_factors_long" "rope_factors_long" \
+    "$HEADER" "src/engine/tq_transformer.c"
+
+# --- 5. BOS token handling ---
+echo ""
+echo "[5] BOS token handling"
+check_both_have "BOS <s> lookup in tokenizer" '"<s>"' \
+    "$HEADER" "src/engine/tq_tokenizer.c"
+check_both_have "BOS <s> auto-detect in generate" '"<s>"' \
+    "$HEADER" "src/engine/tq_generate.c"
+check_both_have "BOS <|begin_of_text|> lookup" '"<|begin_of_text|>"' \
+    "$HEADER" "src/engine/tq_tokenizer.c"
+
+# --- 6. Hybrid attention stride (GQA fix) ---
+echo ""
+echo "[6] Hybrid attention cache stride"
+check_both_have "max_head_dim in quant cache" "max_head_dim" \
+    "$HEADER" "src/engine/tq_transformer.c"
+check_both_have "max_kv_heads in quant cache" "max_kv_heads" \
+    "$HEADER" "src/engine/tq_transformer.c"
+
+# --- 7. Memory free completeness ---
+echo ""
+echo "[7] GGUF dequant memory free"
+check_both_have "free(layer->attn_norm)" "free(layer->attn_norm)" \
+    "$HEADER" "src/engine/tq_model.c"
+
+# --- Summary ---
+echo ""
+echo "========================================="
+if [ "$ERRORS" -eq 0 ]; then
+    echo -e "  ${GREEN}ALL CHECKS PASSED${NC}"
+else
+    echo -e "  ${RED}$ERRORS SYNC ISSUES DETECTED${NC}"
+fi
+echo "========================================="
+exit "$ERRORS"
diff --git a/src/backend/cpu/tq_sve.c b/src/backend/cpu/tq_sve.c
@@ -0,0 +1,69 @@
+/**
+ * ARM SVE backend stub — scaffolding for Scalable Vector Extension kernels
+ *
+ * SVE is needed for AWS Graviton3/4 and other modern ARM servers.
+ * Currently all functions delegate to the generic reference implementations.
+ * Replace with SVE intrinsics for real optimization.
+ *
+ * Only compiled when __ARM_FEATURE_SVE is defined.
+ */
+
+#include "turboquant/turboquant.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+
+/* ================================================================
+ * Uniform 4-bit — SVE stubs (delegate to reference)
+ * ================================================================ */
+
+extern void tq_uniform_4b_quantize_ref(const float* src, void* dst, int n);
+extern void tq_uniform_4b_dequantize_ref(const void* src, float* dst, int n);
+
+void tq_uniform_4b_quantize_sve(const float* src, void* dst, int n) {
+    /* TODO: SVE implementation — use svptrue/svld1/svmin/svmax for vectorized min-max */
+    tq_uniform_4b_quantize_ref(src, dst, n);
+}
+
+void tq_uniform_4b_dequantize_sve(const void* src, float* dst, int n) {
+    /* TODO: SVE implementation */
+    tq_uniform_4b_dequantize_ref(src, dst, n);
+}
+
+/* ================================================================
+ * Polar 3/4-bit — SVE stubs (delegate to reference)
+ * ================================================================ */
+
+extern void tq_polar_quantize_ref(const float* src, void* dst, int n);
+extern void tq_polar_dequantize_ref(const void* src, float* dst, int n);
+
+void tq_polar_quantize_sve(const float* src, void* dst, int n) {
+    /* TODO: SVE implementation — vectorize L2 norm + angular quantization */
+    tq_polar_quantize_ref(src, dst, n);
+}
+
+void tq_polar_dequantize_sve(const void* src, float* dst, int n) {
+    /* TODO: SVE implementation */
+    tq_polar_dequantize_ref(src, dst, n);
+}
+
+/* ================================================================
+ * QJL 1-bit — SVE stubs (delegate to reference)
+ * ================================================================ */
+
+extern void tq_qjl_quantize_ref(const float* src, void* dst, int n);
+extern void tq_qjl_attention_ref(const float* q, const void* kv,
+                                  float* s, int seq, int hd);
+
+void tq_qjl_quantize_sve(const float* src, void* dst, int n) {
+    /* TODO: SVE implementation — vectorize sign hashing with svcompact */
+    tq_qjl_quantize_ref(src, dst, n);
+}
+
+void tq_qjl_attention_sve(const float* q, const void* kv,
+                           float* s, int seq, int hd) {
+    /* TODO: SVE implementation — vectorize popcount-based dot product */
+    tq_qjl_attention_ref(q, kv, s, seq, hd);
+}
+
+#endif /* __ARM_FEATURE_SVE */
diff --git a/src/engine/tq_generate.c b/src/engine/tq_generate.c
@@ -219,15 +219,22 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
 
     if (tokenizer && prompt) {
         /* BOS token handling:
-         * Gemma 3/4: BOS=2 (required)
-         * Phi-3: BOS via <s> (required — garbage without it)
-         * LLaMA 3: BOS=128000 (<|begin_of_text|>) — but tokenizer usually adds it
-         * Qwen3.5: no BOS needed */
+         * Gemma 3/4: model_type==1, BOS=2 (required)
+         * Phi-3 / LLaMA 2: vocab has <s> as BOS (required)
+         * LLaMA 3: BOS=128000 (<|begin_of_text|>) — tq_encode lookup chain handles it
+         * Qwen3.5 / GPT-2 BPE: no native BOS, skip */
         int add_bos = 0;
         if (model->config.model_type == 1) {
             add_bos = 1; /* Gemma: always prepend BOS=2 */
-        } else if (model->config.has_fused_qkv) {
-            add_bos = 1; /* Phi-3: requires <s> BOS */
+        } else {
+            /* Auto-detect: if vocab[0..7] contains <s>, add BOS.
+             * This covers Phi-3, LLaMA 2, and any future model
+             * that uses <s> as BOS without needing model-specific flags. */
+            for (int i = 0; i < tokenizer->vocab_size && i < 8; i++) {
+                if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<s>") == 0) {
+                    add_bos = 1; break;
+                }
+            }
         }
         n_prompt = tq_encode(tokenizer, prompt, prompt_tokens, 4096, add_bos);
     } else {
@@ -648,7 +655,16 @@ int tq_generate_continue(tq_model_t* model,
     if (!new_tokens) return -1;
     int n_new = 0;
     if (tokenizer && prompt) {
-        int add_bos = (model->config.model_type == 1 || model->config.has_fused_qkv) ? 1 : 0;
+        int add_bos = 0;
+        if (model->config.model_type == 1) {
+            add_bos = 1;
+        } else {
+            for (int i = 0; i < tokenizer->vocab_size && i < 8; i++) {
+                if (tokenizer->vocab[i] && strcmp(tokenizer->vocab[i], "<s>") == 0) {
+                    add_bos = 1; break;
+                }
+            }
+        }
         n_new = tq_encode(tokenizer, prompt, new_tokens, max_prompt, add_bos);
     }
     if (n_new <= 0) {
diff --git a/src/engine/tq_model.c b/src/engine/tq_model.c
@@ -3739,6 +3739,21 @@ tq_model_t* tq_load_gguf(const char* path) {
             c->is_moe ? ", MoE" : "",
             c->hidden_dim, c->n_heads, c->n_kv_heads, c->vocab_size);
 
+    /* Hard-fail when no attention layers were detected. Without this,
+     * the forward pass runs against zero-initialized weights → garbage.
+     * This was the root cause of the Phi-3 first-time experience bug:
+     * "loaded 32 layers (0 self_attn)" looked like success. */
+    if (n_attn_layers == 0 && c->delta_n_heads == 0) {
+        fprintf(stderr,
+            "tq_load_gguf: ERROR — model architecture '%s' is not supported.\n"
+            "  Detected 0 self_attn layers and no DeltaNet weights.\n"
+            "  This usually means the model uses an unsupported attention\n"
+            "  tensor layout. See docs/supported_models.md.\n",
+            gguf->arch[0] ? gguf->arch : "unknown");
+        tq_free_model(model);
+        return NULL;
+    }
+
     /* ============================================================
      * Load-time weight conversion: GGUF -> Q4
      *
diff --git a/src/engine/tq_tokenizer.c b/src/engine/tq_tokenizer.c
@@ -1187,6 +1187,7 @@ int tq_encode(const tq_tokenizer_t* tok, const char* text,
         /* Look up <bos> token in vocab; default to id 2 (Gemma convention) */
         int bos_id = str_lookup(tok, "<bos>");
         if (bos_id < 0) { bos_id = str_lookup(tok, "<s>"); }
+        if (bos_id < 0) { bos_id = str_lookup(tok, "<|begin_of_text|>"); }
         if (bos_id < 0) { bos_id = str_lookup(tok, "<|im_start|>"); }
         if (bos_id >= 0) {
             tokens[n_tokens++] = bos_id;
diff --git a/src/engine/tq_transformer.c b/src/engine/tq_transformer.c