|
| 1 | +#!/bin/bash |
| 2 | +# quant.cpp — Generation Regression Test |
| 3 | +# |
| 4 | +# Detects autoregressive generation collapse that PPL tests miss. |
| 5 | +# Tests: T=0 greedy 500-token generation → verify no garbage output. |
| 6 | +# |
| 7 | +# The key insight: PPL (teacher-forced) is near-identical for FP32 and |
| 8 | +# turbo_kv_4b at all context lengths. But autoregressive generation |
| 9 | +# can collapse at ~500 tokens when T=0 repetition compounds KV quant error. |
| 10 | +# |
| 11 | +# This test catches that class of bugs by checking: |
| 12 | +# 1. Loop detection triggers (prevents garbage, so verify it fires) |
| 13 | +# 2. Output before loop detection is coherent (no random Unicode) |
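| 14 | +# 3. PPL sanity check: turbo_kv_4b within 15% of FP32 (run only if bench/data/ppl_test_1k.txt exists) |
| 14 | +# 4. Loop detection does NOT fire on non-repetitive (T=0.7) generation |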
| 15 | +# |
| 16 | +# Usage: |
| 17 | +# bash bench/generation_regression_test.sh [model.gguf] |
| 18 | +# |
| 19 | +# Requires: built quant binary in build/ |
| 20 | + |
# Abort on any unhandled command failure.
set -e

# Configuration shared by every test below. The model path may be overridden
# by the first positional argument; PASS/FAIL are counters updated by check().
MODEL="${1:-models/Llama-3.2-1B-Instruct-Q8_0.gguf}"
TQ_RUN="./build/quant"
THREADS=4
PASS=0
FAIL=0

# A missing (or non-executable) binary is a hard error. The diagnostic goes to
# stderr so it is not mistaken for test output.
if [ ! -f "$TQ_RUN" ] || [ ! -x "$TQ_RUN" ]; then
  echo "Error: $TQ_RUN not found or not executable. Build first." >&2
  exit 1
fi
# A missing model is a skip rather than a failure, hence exit status 0.
if [ ! -f "$MODEL" ]; then
  echo "SKIP: Model not found: $MODEL"
  exit 0
fi

echo "============================================"
echo " Generation Regression Test"
echo " Model: $MODEL"
echo "============================================"
echo ""
| 43 | + |
# Records and prints the outcome of one test.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - human-readable description of the check
#            $2 - "PASS" counts as a pass; any other value counts as a failure
# Outputs:   one " [PASS] ..." or " [FAIL] ..." line on stdout
check() {
  local label="$1" verdict="$2"
  case "$verdict" in
    PASS)
      PASS=$((PASS + 1))
      echo " [PASS] $label"
      ;;
    *)
      FAIL=$((FAIL + 1))
      echo " [FAIL] $label"
      ;;
  esac
}
| 54 | + |
# Test 1: T=0 generation should NOT produce garbage at 500 tokens

# Prints the integer percentage of bytes in $1 that are non-ASCII (0x80-0xFF).
# Arguments: $1 - text to inspect
# Outputs:   0..100 on stdout; empty input prints 100 so that a run which
#            produced no text can never be reported as coherent.
non_ascii_percent() {
  local text="$1" total high
  # printf '%s' (unlike echo) adds no trailing newline, so empty input really
  # counts as zero bytes.
  total=$(printf '%s' "$text" | wc -c | tr -d ' ')
  if [ "$total" -eq 0 ]; then
    echo 100
    return 0
  fi
  # LC_ALL=C makes tr operate on raw bytes regardless of the caller's locale.
  high=$(printf '%s' "$text" | LC_ALL=C tr -cd '\200-\377' | wc -c | tr -d ' ')
  echo $((high * 100 / total))
}

echo "[Test 1] T=0 500-token generation — no garbage output"
# Only stdout is measured (stderr is discarded). The exit status is captured
# explicitly: under `set -e` a bare failing assignment would abort the script
# before any result or the final summary is printed.
GEN_STATUS=0
OUTPUT=$("$TQ_RUN" "$MODEL" -p "Explain the theory of relativity in detail" \
  -n 500 -T 0.0 -j "$THREADS" -k turbo_kv_4b --chat 2>/dev/null) || GEN_STATUS=$?

if [ "$GEN_STATUS" -ne 0 ]; then
  check "turbo_kv_4b output coherence (generation exited with status ${GEN_STATUS})" "FAIL"
else
  # Garbage output typically contains lots of CJK/Arabic/Thai mixed with
  # Latin, i.e. a high share of non-ASCII bytes.
  GARBAGE_RATIO=$(non_ascii_percent "$OUTPUT")
  if [ "$GARBAGE_RATIO" -lt 30 ]; then
    check "turbo_kv_4b output coherence (${GARBAGE_RATIO}% non-ASCII)" "PASS"
  else
    check "turbo_kv_4b output coherence (${GARBAGE_RATIO}% non-ASCII, threshold 30%)" "FAIL"
  fi
fi
| 74 | + |
# Test 2: Loop detection should fire for T=0 repetitive prompt

# Prints N from the first "repetition loop ... after N tokens" line in $1.
# Prints nothing (and still succeeds) when no such line exists, so the caller
# is not aborted by `set -e` on a non-matching grep.
loop_token_count() {
  printf '%s\n' "$1" \
    | grep "repetition loop" \
    | grep -o "after [0-9][0-9]* tokens" \
    | grep -o "[0-9][0-9]*" \
    | head -n 1 || true
}

# Prints the leading integer of the first line in $1 that contains "tok/s".
# Prints nothing (and still succeeds) when no such line exists.
generated_token_count() {
  printf '%s\n' "$1" \
    | grep "tok/s" \
    | grep -o "^[0-9][0-9]*" \
    | head -n 1 || true
}

echo ""
echo "[Test 2] Loop detection fires on repetitive T=0 generation"
# stdout and stderr are merged because both are searched for diagnostics below.
# The status is captured so a failing run is recorded instead of aborting
# the script under `set -e`.
LOOP_STATUS=0
LOOP_OUTPUT=$("$TQ_RUN" "$MODEL" -p "what is your name?" \
  -n 1000 -T 0.0 -j "$THREADS" -k turbo_kv_4b 2>&1) || LOOP_STATUS=$?

if [ "$LOOP_STATUS" -ne 0 ]; then
  check "loop detection run (generation exited with status ${LOOP_STATUS})" "FAIL"
elif grep -q "repetition loop detected" <<<"$LOOP_OUTPUT"; then
  LOOP_TOKENS=$(loop_token_count "$LOOP_OUTPUT")
  if [ -z "$LOOP_TOKENS" ]; then
    check "loop detected (token count not reported)" "PASS"
  elif [ "$LOOP_TOKENS" -lt 500 ]; then
    check "loop detected at ${LOOP_TOKENS} tokens (before 500)" "PASS"
  else
    # The label promises detection before 500 tokens, so enforce it.
    check "loop detected at ${LOOP_TOKENS} tokens (not before 500)" "FAIL"
  fi
else
  # No loop message: accept the run only if generation stopped early.
  # An unparseable token count falls back to 1000, i.e. a failure.
  TOTAL_TOK=$(generated_token_count "$LOOP_OUTPUT")
  if [ "${TOTAL_TOK:-1000}" -lt 500 ]; then
    check "EOS hit at ${TOTAL_TOK} tokens (no loop needed)" "PASS"
  else
    check "no loop detection in 1000 tokens" "FAIL"
  fi
fi
| 92 | + |
# Test 3: Non-repetitive generation should NOT trigger loop detection
echo ""
echo "[Test 3] Non-repetitive generation (T=0.7) — no false positives"
# stdout and stderr are merged because the merged text is searched for the
# loop-detection message. The exit status is captured so that a failing run
# is recorded as a failure instead of aborting the script under `set -e`.
NORMAL_STATUS=0
NORMAL_OUTPUT=$("$TQ_RUN" "$MODEL" -p "Tell me a creative story" \
  -n 200 -T 0.7 -j "$THREADS" -k turbo_kv_4b --chat 2>&1) || NORMAL_STATUS=$?

if [ "$NORMAL_STATUS" -ne 0 ]; then
  check "no false loop detection at T=0.7 (generation exited with status ${NORMAL_STATUS})" "FAIL"
elif grep -q "repetition loop detected" <<<"$NORMAL_OUTPUT"; then
  check "no false loop detection at T=0.7" "FAIL"
else
  check "no false loop detection at T=0.7" "PASS"
fi
| 104 | + |
# Test 4: FP32 vs turbo_kv_4b PPL sanity (if ppl data exists)

# Converts a perplexity value to an integer scaled by 1000.
# Arguments: $1 - candidate number (surrounding blanks / CR are ignored)
# Outputs:   the scaled integer on stdout
# Returns:   1 when $1 is not a single plain non-negative decimal number
#            (e.g. empty, "nan", "inf", negative, or spanning several lines),
#            so such values are never silently treated as 0.
ppl_to_milli() {
  local value
  local number_re='^([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'
  value=$(printf '%s' "$1" | tr -d ' \t\r')
  [[ "$value" =~ $number_re ]] || return 1
  awk -v v="$value" 'BEGIN { printf "%d", v * 1000 }'
}

PPL_FILE="bench/data/ppl_test_1k.txt"
if [ -f "$PPL_FILE" ]; then
  echo ""
  echo "[Test 4] PPL sanity: turbo_kv_4b within 15% of FP32"
  # The perplexity is read from the third comma-separated field of the
  # PPL_CSV line(s) in the merged stdout/stderr of each run.
  FP32_PPL=$("$TQ_RUN" "$MODEL" --ppl "$PPL_FILE" -k fp32 -j "$THREADS" 2>&1 \
    | grep "PPL_CSV" | cut -d, -f3 | tr -d ' \t\r')
  Q4_PPL=$("$TQ_RUN" "$MODEL" --ppl "$PPL_FILE" -k turbo_kv_4b -j "$THREADS" 2>&1 \
    | grep "PPL_CSV" | cut -d, -f3 | tr -d ' \t\r')

  # Compare using integer math (values multiplied by 1000). Unparseable
  # values become empty and are reported as a parse failure below.
  FP32_INT=$(ppl_to_milli "$FP32_PPL") || FP32_INT=""
  Q4_INT=$(ppl_to_milli "$Q4_PPL") || Q4_INT=""

  # FP32_INT must be positive: it is the divisor of the delta computation.
  if [ -n "$FP32_INT" ] && [ -n "$Q4_INT" ] && [ "$FP32_INT" -gt 0 ] 2>/dev/null; then
    THRESHOLD=$((FP32_INT * 115 / 100)) # 15% margin
    DELTA=$(awk -v a="$FP32_PPL" -v b="$Q4_PPL" \
      'BEGIN { printf "%.1f", (b / a - 1) * 100 }')
    if [ "$Q4_INT" -le "$THRESHOLD" ] 2>/dev/null; then
      check "PPL delta: ${DELTA}% (within 15%)" "PASS"
    else
      check "PPL delta: ${DELTA}% (exceeds 15%)" "FAIL"
    fi
  else
    check "PPL comparison (could not parse results)" "FAIL"
  fi
fi
| 131 | + |
# Final tally: print the pass/fail counters accumulated by check() and exit
# with status 1 if at least one check failed.
cat <<EOF

============================================
 Results: ${PASS} passed, ${FAIL} failed
============================================
EOF

if test "$FAIL" -gt 0; then
  exit 1
fi
0 commit comments