
Commit 6f25f34

unamedkr and claude authored
feat(wasm): Llama 3.2 1B Instruct + skip Q4 reconversion (#36)
* fix: n-gram loop detection + generation regression test Root cause of ~530-token text collapse: small model + T=0 greedy enters repetition loop → KV quant error compounds through softmax → collapse. NOT a code bug — FP32 also degenerates, just slower. Fix: - Add 4-gram loop detection (stop when same 4-gram repeats 3+ times) - Increase rep_window 32→64, recent_tokens buffer 64→128 - Add bench/generation_regression_test.sh (4 tests): 1. T=0 500-token coherence check (no garbage output) 2. Loop detection fires on repetitive generation 3. No false positives at T=0.7 4. PPL within 15% of FP32 Why this wasn't caught before: - PPL tests are teacher-forced (no error accumulation) - Generation tests were ≤100 tokens (collapse at ~530) - No T=0 stress test existed Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat(wasm): Llama 3.2 1B Instruct default + skip Q4 reconversion Two changes for WASM demo reliability and speed: 1. Model: switch from Qwen3.5-0.8B (base, gated, Qwen arch issues) to Llama 3.2 1B Instruct (verified working, good quality, public HuggingFace URL, proper Instruct tuning for chat). 2. Speed: add -DTQ_NO_Q4=1 to WASM build. Skips the load-time Q4 reconversion (GGUF Q4_K_M → FP32 → internal Q4) which was expensive and redundant for already-quantized models. Uses GGUF on-the-fly dequant instead. Saves several seconds of model init and reduces peak memory usage. Added compile-time #ifdef TQ_NO_Q4 guard in quant.h so it works in WASM (no getenv). Native builds are unaffected. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 09adb11 commit 6f25f34

6 files changed

Lines changed: 190 additions & 25 deletions

bench/generation_regression_test.sh

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
#!/bin/bash
# quant.cpp — Generation Regression Test
#
# Detects autoregressive generation collapse that PPL tests miss.
# Tests: T=0 greedy 500-token generation → verify no garbage output.
#
# The key insight: PPL (teacher-forced) is near-identical for FP32 and
# turbo_kv_4b at all context lengths. But autoregressive generation
# can collapse at ~500 tokens when T=0 repetition compounds KV quant error.
#
# This test catches that class of bugs by checking:
#   1. Loop detection triggers (prevents garbage, so verify it fires)
#   2. Output before loop detection is coherent (no random Unicode)
#   3. PPL sanity check at multiple context lengths
#
# Usage:
#   bash bench/generation_regression_test.sh [model.gguf]
#
# Requires: built quant binary in build/

set -e

MODEL="${1:-models/Llama-3.2-1B-Instruct-Q8_0.gguf}"
TQ_RUN="./build/quant"
THREADS=4
PASS=0
FAIL=0

if [ ! -f "$TQ_RUN" ]; then
    echo "Error: $TQ_RUN not found. Build first."
    exit 1
fi
if [ ! -f "$MODEL" ]; then
    echo "SKIP: Model not found: $MODEL"
    exit 0
fi

echo "============================================"
echo " Generation Regression Test"
echo " Model: $MODEL"
echo "============================================"
echo ""

check() {
    local desc="$1" result="$2"
    if [ "$result" = "PASS" ]; then
        echo "  [PASS] $desc"
        PASS=$((PASS + 1))
    else
        echo "  [FAIL] $desc"
        FAIL=$((FAIL + 1))
    fi
}

# Test 1: T=0 generation should NOT produce garbage at 500 tokens
echo "[Test 1] T=0 500-token generation — no garbage output"
OUTPUT=$($TQ_RUN "$MODEL" -p "Explain the theory of relativity in detail" \
    -n 500 -T 0.0 -j $THREADS -k turbo_kv_4b --chat 2>/dev/null)

# Check for garbage patterns: random Unicode, excessive non-ASCII.
# Garbage typically has lots of CJK/Arabic/Thai mixed with Latin.
GARBAGE_CHARS=$(echo "$OUTPUT" | tr -cd '\200-\377' | wc -c | tr -d ' ')
TOTAL_CHARS=$(echo "$OUTPUT" | wc -c | tr -d ' ')
if [ "$TOTAL_CHARS" -gt 0 ]; then
    GARBAGE_RATIO=$((GARBAGE_CHARS * 100 / TOTAL_CHARS))
else
    GARBAGE_RATIO=100
fi
if [ "$GARBAGE_RATIO" -lt 30 ]; then
    check "turbo_kv_4b output coherence (${GARBAGE_RATIO}% non-ASCII)" "PASS"
else
    check "turbo_kv_4b output coherence (${GARBAGE_RATIO}% non-ASCII, threshold 30%)" "FAIL"
fi

# Test 2: Loop detection should fire for a repetitive T=0 prompt
echo ""
echo "[Test 2] Loop detection fires on repetitive T=0 generation"
LOOP_OUTPUT=$($TQ_RUN "$MODEL" -p "what is your name?" \
    -n 1000 -T 0.0 -j $THREADS -k turbo_kv_4b 2>&1)

if echo "$LOOP_OUTPUT" | grep -q "repetition loop detected"; then
    LOOP_TOKENS=$(echo "$LOOP_OUTPUT" | grep "repetition loop" | grep -o "after [0-9]* tokens" | grep -o "[0-9]*")
    check "loop detected at ${LOOP_TOKENS} tokens (before 500)" "PASS"
else
    TOTAL_TOK=$(echo "$LOOP_OUTPUT" | grep "tok/s" | grep -o "^[0-9]*")
    if [ "${TOTAL_TOK:-1000}" -lt 500 ]; then
        check "EOS hit at ${TOTAL_TOK} tokens (no loop needed)" "PASS"
    else
        check "no loop detection in 1000 tokens" "FAIL"
    fi
fi

# Test 3: Non-repetitive generation should NOT trigger loop detection
echo ""
echo "[Test 3] Non-repetitive generation (T=0.7) — no false positives"
NORMAL_OUTPUT=$($TQ_RUN "$MODEL" -p "Tell me a creative story" \
    -n 200 -T 0.7 -j $THREADS -k turbo_kv_4b --chat 2>&1)

if echo "$NORMAL_OUTPUT" | grep -q "repetition loop detected"; then
    check "no false loop detection at T=0.7" "FAIL"
else
    check "no false loop detection at T=0.7" "PASS"
fi

# Test 4: FP32 vs turbo_kv_4b PPL sanity (if ppl data exists)
PPL_FILE="bench/data/ppl_test_1k.txt"
if [ -f "$PPL_FILE" ]; then
    echo ""
    echo "[Test 4] PPL sanity: turbo_kv_4b within 15% of FP32"
    FP32_PPL=$($TQ_RUN "$MODEL" --ppl "$PPL_FILE" -k fp32 -j $THREADS 2>&1 \
        | grep "PPL_CSV" | cut -d, -f3)
    Q4_PPL=$($TQ_RUN "$MODEL" --ppl "$PPL_FILE" -k turbo_kv_4b -j $THREADS 2>&1 \
        | grep "PPL_CSV" | cut -d, -f3)

    if [ -n "$FP32_PPL" ] && [ -n "$Q4_PPL" ]; then
        # Compare using integer math (multiply by 1000)
        FP32_INT=$(echo "$FP32_PPL" | awk '{printf "%d", $1 * 1000}')
        Q4_INT=$(echo "$Q4_PPL" | awk '{printf "%d", $1 * 1000}')
        THRESHOLD=$((FP32_INT * 115 / 100))  # 15% margin
        if [ "$Q4_INT" -le "$THRESHOLD" ]; then
            DELTA=$(echo "$FP32_PPL $Q4_PPL" | awk '{printf "%.1f", ($2/$1 - 1)*100}')
            check "PPL delta: ${DELTA}% (within 15%)" "PASS"
        else
            DELTA=$(echo "$FP32_PPL $Q4_PPL" | awk '{printf "%.1f", ($2/$1 - 1)*100}')
            check "PPL delta: ${DELTA}% (exceeds 15%)" "FAIL"
        fi
    else
        check "PPL comparison (could not parse results)" "FAIL"
    fi
fi

echo ""
echo "============================================"
echo " Results: ${PASS} passed, ${FAIL} failed"
echo "============================================"

if [ "$FAIL" -gt 0 ]; then
    exit 1
fi

quant.h

Lines changed: 6 additions & 1 deletion
@@ -12179,8 +12179,13 @@ tq_model_t* tq_load_gguf(const char* path) {
     }

     const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
-    /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
+    /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
+     * Can be set via environment variable or compile-time define (useful for WASM). */
+#ifdef TQ_NO_Q4
+    if (1) {
+#else
     if (getenv("TQ_NO_Q4")) {
+#endif
         fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
         goto skip_q4_conversion;
     }

src/engine/tq_generate.c

Lines changed: 40 additions & 7 deletions
@@ -254,22 +254,31 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     int vocab_size = model->config.vocab_size;
     float rep_penalty = config->rep_penalty;
     int rep_window = config->rep_window;
-    if (rep_window > 64) rep_window = 64;
-    int recent_tokens[64];
+    if (rep_window > 128) rep_window = 128;
+    int recent_tokens[128];
     int recent_count = 0;

+    /* N-gram loop detection: track recent 4-grams to detect infinite loops.
+     * Small models with T=0 greedy decoding enter repetition loops where
+     * the same ~30-token pattern repeats endlessly. KV quantization error
+     * compounds through these repetitions, eventually collapsing output
+     * into garbage. Detecting loops early prevents wasted compute. */
+    uint32_t ngram_hashes[64];
+    int ngram_hash_count = 0;
+    int loop_detected = 0;
+
     /* Seed recent tokens with tail of prompt for better penalty coverage */
     for (int i = (n_prompt > rep_window ? n_prompt - rep_window : 0); i < n_prompt; i++) {
-        recent_tokens[recent_count % 64] = prompt_tokens[i];
+        recent_tokens[recent_count % 128] = prompt_tokens[i];
         recent_count++;
     }

     /* Apply repetition penalty to logits before first sample */
     if (rep_penalty > 1.0f) {
         int window = recent_count < rep_window ? recent_count : rep_window;
         for (int r = 0; r < window; r++) {
-            int idx = (recent_count - 1 - r) % 64;
-            if (idx < 0) idx += 64;
+            int idx = (recent_count - 1 - r) % 128;
+            if (idx < 0) idx += 128;
             int tok = recent_tokens[idx];
             if (tok >= 0 && tok < vocab_size) {
                 if (state->logits[tok] > 0)
@@ -288,7 +297,7 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
                              &rng_state);

     /* Record first sampled token */
-    recent_tokens[recent_count % 64] = next_token;
+    recent_tokens[recent_count % 128] = next_token;
     recent_count++;

     int generated = 0;
@@ -483,8 +492,32 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
                                  &rng_state);

         /* Record sampled token for repetition penalty */
-        recent_tokens[recent_count % 64] = next_token;
+        recent_tokens[recent_count % 128] = next_token;
         recent_count++;
+
+        /* N-gram loop detection: hash recent 4-gram and check for repeats */
+        if (recent_count >= 4) {
+            uint32_t h = 0;
+            for (int r = 0; r < 4; r++) {
+                int gi = (recent_count - 4 + r) % 128;
+                h = h * 31 + (uint32_t)recent_tokens[gi];
+            }
+            int matches = 0;
+            int ring_len = ngram_hash_count < 64 ? ngram_hash_count : 64;
+            for (int r = 0; r < ring_len; r++) {
+                if (ngram_hashes[r] == h) matches++;
+            }
+            ngram_hashes[ngram_hash_count % 64] = h;
+            ngram_hash_count++;
+            if (matches >= 3) {
+                loop_detected = 1;
+                break;
+            }
+        }
+    }
+
+    if (loop_detected) {
+        fprintf(stderr, "[generate] repetition loop detected after %d tokens, stopping\n", generated);
     }

     /* Null-terminate output */

wasm/build.sh

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
   -lm \
   -DNDEBUG \
   -D__EMSCRIPTEN__ \
+  -DTQ_NO_Q4=1 \
   -Wno-gnu-zero-variadic-macro-arguments \
   -Wno-dollar-in-identifier-extension

wasm/index.html

Lines changed: 4 additions & 17 deletions
@@ -174,16 +174,11 @@ <h2>Run an <span>LLM</span> in your browser</h2>
       <p class="subtitle">No install. No API key. No server.</p>

       <div class="model-cards" id="modelCards">
-        <div class="model-card recommended" id="card-qwen" onclick="loadDemoModel('qwen3.5-0.8b')">
-          <div class="name">Qwen3.5 0.8B</div>
-          <div class="meta" id="meta-qwen">~508 MB &middot; Q4_K_M</div>
+        <div class="model-card recommended" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
+          <div class="name">Llama 3.2 1B Instruct</div>
+          <div class="meta" id="meta-llama">~770 MB &middot; Verified quality</div>
           <span class="tag">Recommended</span>
         </div>
-        <div class="model-card" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
-          <div class="name">Llama 3.2 1B</div>
-          <div class="meta" id="meta-llama">~770 MB &middot; Q4_K_M</div>
-          <span class="tag blue">Higher quality</span>
-        </div>
       </div>

       <div class="progress-wrap" id="progressWrap">
@@ -223,17 +218,9 @@ <h2>Run an <span>LLM</span> in your browser</h2>
       let activeModelId = null;

       const MODELS = {
-        'qwen3.5-0.8b': {
-          url: 'https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf',
-          name: 'Qwen3.5 0.8B',
-          size: 508,
-          cacheKey: 'qwen3.5-0.8b-q4km',
-          chatTemplate: (t) => `<|im_start|>user\n${t}<|im_end|>\n<|im_start|>assistant\n`,
-          cardId: 'card-qwen', metaId: 'meta-qwen',
-        },
         'llama-3.2-1b': {
           url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
-          name: 'Llama 3.2 1B',
+          name: 'Llama 3.2 1B Instruct',
           size: 770,
           cacheKey: 'llama-3.2-1b-q4km',
           chatTemplate: (t) => `<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n${t}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,

wasm/quant.wasm

-31 Bytes
Binary file not shown.
