Skip to content

Commit 1992bf1

Browse files
unamedkrclaude
andcommitted
Benchmark suite: llama.cpp comparison, standard PPL text, 1K-token verification
bench/compare_llamacpp.sh: side-by-side TurboQuant vs llama.cpp comparison bench/ppl_standard.sh: standardized PPL measurement script bench/data/ppl_test_1k.txt: 1095-word standard test text bench/data/ppl_test_2k.txt: 1910-word extended test text Verified: SmolLM2 1.7B, 1000 tokens: baseline PPL = 12.07 1-bit K PPL = 12.07 (+0.00%) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 417fa3b commit 1992bf1

5 files changed

Lines changed: 579 additions & 0 deletions

File tree

bench/compare_llamacpp.sh

Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
#!/bin/bash
2+
# =============================================================================
3+
# compare_llamacpp.sh — TurboQuant vs llama.cpp KV cache quantization benchmark
4+
# =============================================================================
5+
#
6+
# This script measures TurboQuant's KV cache compression and documents
7+
# the equivalent llama.cpp commands for fair side-by-side comparison.
8+
#
9+
# Usage:
10+
# bash bench/compare_llamacpp.sh <model.gguf> [threads]
11+
#
12+
# Example:
13+
# bash bench/compare_llamacpp.sh models/SmolLM2-1.7B-Instruct-Q8_0.gguf 6
14+
#
15+
# What it measures (TurboQuant -- actually runs):
16+
# - Perplexity (teacher-forced on fixed 1095-word test text)
17+
# - KV cache memory per token
18+
# - Generation speed (tok/s)
19+
#
20+
# What it documents (llama.cpp -- commands printed, not executed):
21+
# - Equivalent llama.cpp commands with --cache-type-k/--cache-type-v flags
22+
# - Expected memory usage based on llama.cpp's quantization formats
23+
#
24+
# IMPORTANT: All measurements use the SAME model, SAME test text, SAME hardware.
25+
# The only variable is the KV cache quantization method.
26+
# =============================================================================
27+
28+
set -e
29+
30+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
31+
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
32+
TQ_RUN="$PROJECT_DIR/build/tq_run"
33+
PPL_TEXT="$SCRIPT_DIR/data/ppl_test_1k.txt"
34+
RESULTS_DIR="$SCRIPT_DIR/compare_results"
35+
36+
MODEL="${1:?Usage: bash bench/compare_llamacpp.sh <model.gguf> [threads]}"
37+
THREADS="${2:-6}"
38+
39+
# ---------------------------------------------------------------------------
40+
# Validate
41+
# ---------------------------------------------------------------------------
42+
if [ ! -f "$TQ_RUN" ]; then
43+
echo "ERROR: $TQ_RUN not found. Build first:"
44+
echo " cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j\$(nproc)"
45+
exit 1
46+
fi
47+
if [ ! -f "$MODEL" ]; then
48+
echo "ERROR: Model not found: $MODEL"
49+
exit 1
50+
fi
51+
if [ ! -f "$PPL_TEXT" ]; then
52+
echo "ERROR: Test text not found: $PPL_TEXT"
53+
exit 1
54+
fi
55+
56+
mkdir -p "$RESULTS_DIR"
57+
58+
MODEL_NAME=$(basename "$MODEL")
59+
DATE_STR=$(date +%Y-%m-%d_%H%M%S)
60+
HOSTNAME_STR=$(hostname -s 2>/dev/null || echo "unknown")
61+
CSV_OUT="$RESULTS_DIR/comparison_${DATE_STR}.csv"
62+
63+
echo ""
64+
echo "================================================================"
65+
echo " TurboQuant vs llama.cpp KV Cache Quantization Comparison"
66+
echo "================================================================"
67+
echo ""
68+
echo " Model: $MODEL_NAME"
69+
echo " Threads: $THREADS"
70+
echo " Host: $HOSTNAME_STR"
71+
echo " Date: $DATE_STR"
72+
echo " Text: ppl_test_1k.txt (1095 words, ~1400 tokens)"
73+
echo ""
74+
echo "================================================================"
75+
76+
# =========================================================================
77+
# SECTION 1: TurboQuant measurements (actually executed)
78+
# =========================================================================
79+
80+
echo ""
81+
echo "================================================================"
82+
echo " SECTION 1: TurboQuant Measurements (live)"
83+
echo "================================================================"
84+
echo ""
85+
86+
# Configs: "label kv_flag v_flag bits_k bits_v"
87+
TQ_CONFIGS=(
88+
"TQ:uniform_4b(K)+FP16(V) uniform_4b fp16 4.0 16.0"
89+
"TQ:turbo_1b(K)+FP16(V) turbo_kv_1b fp16 1.0 16.0"
90+
"TQ:turbo_1b(K)+Q4(V) turbo_kv_1b q4 1.0 4.0"
91+
"TQ:turbo_3b(K)+FP16(V) turbo_kv_3b fp16 3.0 16.0"
92+
"TQ:turbo_3b(K)+Q4(V) turbo_kv_3b q4 3.0 4.0"
93+
)
94+
95+
# Collect results into arrays
96+
declare -a R_LABEL R_PPL R_NLL R_TOKS R_KV_PER_TOK R_COMPRESS R_SAVED
97+
98+
run_tq_config() {
99+
local idx=$1
100+
local config_line="${TQ_CONFIGS[$idx]}"
101+
local label=$(echo "$config_line" | awk '{print $1}')
102+
local kv_type=$(echo "$config_line" | awk '{print $2}')
103+
local v_quant=$(echo "$config_line" | awk '{print $3}')
104+
local bits_k=$(echo "$config_line" | awk '{print $4}')
105+
local bits_v=$(echo "$config_line" | awk '{print $5}')
106+
107+
echo " Running: $label ..."
108+
109+
# --- PPL measurement ---
110+
local ppl_cmd="$TQ_RUN $MODEL --ppl $PPL_TEXT -j $THREADS -k $kv_type"
111+
if [ "$v_quant" != "fp16" ]; then
112+
ppl_cmd="$ppl_cmd -v $v_quant"
113+
fi
114+
local ppl_output
115+
ppl_output=$($ppl_cmd 2>&1) || true
116+
117+
local ppl nll tok_s tokens
118+
ppl=$(echo "$ppl_output" | grep "^PPL_CSV:" | cut -d, -f3)
119+
nll=$(echo "$ppl_output" | grep "^PPL_CSV:" | cut -d, -f2)
120+
tokens=$(echo "$ppl_output" | grep "^PPL_CSV:" | cut -d, -f1 | sed 's/PPL_CSV://')
121+
tok_s=$(echo "$ppl_output" | grep "tok/s" | tail -1 | grep -o '[0-9]*\.[0-9]* tok/s' | grep -o '[0-9]*\.[0-9]*')
122+
123+
# Fallback
124+
if [ -z "$ppl" ]; then
125+
ppl=$(echo "$ppl_output" | grep "Perplexity:" | grep -o '[0-9]*\.[0-9]*')
126+
nll=$(echo "$ppl_output" | grep "Avg NLL:" | grep -o '[0-9]*\.[0-9]*')
127+
fi
128+
129+
# --- Memory measurement (generate 200 tokens to get meaningful KV stats) ---
130+
local mem_cmd="$TQ_RUN $MODEL -p 'The quick brown fox jumps over the lazy dog and continues walking through the forest path.' -n 200 -T 0.0 -j $THREADS -k $kv_type -M"
131+
if [ "$v_quant" != "fp16" ]; then
132+
mem_cmd="$mem_cmd -v $v_quant"
133+
fi
134+
local mem_output
135+
mem_output=$($mem_cmd 2>&1) || true
136+
137+
local kv_per_tok compress_ratio mem_saved
138+
kv_per_tok=$(echo "$mem_output" | grep "Per-token K+V total:" | grep -o '[0-9]*\.[0-9]* KB')
139+
compress_ratio=$(echo "$mem_output" | grep "Compression ratio:" | grep -o '[0-9]*\.[0-9]*x')
140+
mem_saved=$(echo "$mem_output" | grep "Memory saved:" | grep -o '[0-9]*\.[0-9]* MB')
141+
142+
# Store results
143+
R_LABEL[$idx]="$label"
144+
R_PPL[$idx]="${ppl:-N/A}"
145+
R_NLL[$idx]="${nll:-N/A}"
146+
R_TOKS[$idx]="${tok_s:-N/A}"
147+
R_KV_PER_TOK[$idx]="${kv_per_tok:-N/A}"
148+
R_COMPRESS[$idx]="${compress_ratio:-N/A}"
149+
R_SAVED[$idx]="${mem_saved:-N/A}"
150+
}
151+
152+
for i in "${!TQ_CONFIGS[@]}"; do
153+
run_tq_config "$i"
154+
done
155+
156+
# ---------------------------------------------------------------------------
157+
# Print TurboQuant results table
158+
# ---------------------------------------------------------------------------
159+
echo ""
160+
echo " TurboQuant Results:"
161+
echo " -----------------------------------------------------------------------"
162+
printf " %-30s %8s %8s %10s %12s %8s\n" \
163+
"Config" "PPL" "NLL" "tok/s" "KV/tok" "Ratio"
164+
printf " %-30s %8s %8s %10s %12s %8s\n" \
165+
"------" "---" "---" "-----" "------" "-----"
166+
167+
for i in "${!TQ_CONFIGS[@]}"; do
168+
printf " %-30s %8s %8s %10s %12s %8s\n" \
169+
"${R_LABEL[$i]}" "${R_PPL[$i]}" "${R_NLL[$i]}" \
170+
"${R_TOKS[$i]}" "${R_KV_PER_TOK[$i]}" "${R_COMPRESS[$i]}"
171+
done
172+
173+
# =========================================================================
174+
# SECTION 2: llama.cpp equivalent commands (documented, not executed)
175+
# =========================================================================
176+
177+
echo ""
178+
echo "================================================================"
179+
echo " SECTION 2: llama.cpp Equivalent Commands (reference)"
180+
echo "================================================================"
181+
echo ""
182+
echo " These commands are NOT executed by this script. They document"
183+
echo " the equivalent llama.cpp invocations for fair comparison."
184+
echo " Run them separately with a llama.cpp build to get comparable numbers."
185+
echo ""
186+
echo " Prerequisites:"
187+
echo " cd /path/to/llama.cpp"
188+
echo " cmake -B build -DCMAKE_BUILD_TYPE=Release"
189+
echo " cmake --build build -j\$(nproc)"
190+
echo ""
191+
192+
# Create a temporary PPL file path placeholder
193+
LLAMACPP_PPL_TEXT="bench/data/ppl_test_1k.txt"
194+
195+
cat << 'DOCEOF'
196+
-----------------------------------------------------------------------
197+
Config llama.cpp command
198+
-----------------------------------------------------------------------
199+
200+
1. Baseline (FP16 KV cache — no quantization):
201+
202+
./build/bin/llama-perplexity \
203+
-m MODEL.gguf \
204+
-f bench/data/ppl_test_1k.txt \
205+
--cache-type-k f16 \
206+
--cache-type-v f16 \
207+
-t THREADS
208+
209+
Memory: 16 bits/value for K, 16 bits/value for V
210+
Per-token KV = 2 * n_layers * n_kv_heads * head_dim * 2 bytes
211+
212+
2. Q8_0 K cache (8-bit quantized keys):
213+
214+
./build/bin/llama-perplexity \
215+
-m MODEL.gguf \
216+
-f bench/data/ppl_test_1k.txt \
217+
--cache-type-k q8_0 \
218+
--cache-type-v f16 \
219+
-t THREADS
220+
221+
Memory: 8.5 bits/value for K (q8_0 has scale overhead), 16 bits/value for V
222+
Expected: near-lossless, PPL increase < 0.1
223+
224+
3. Q4_0 K cache (4-bit quantized keys):
225+
226+
./build/bin/llama-perplexity \
227+
-m MODEL.gguf \
228+
-f bench/data/ppl_test_1k.txt \
229+
--cache-type-k q4_0 \
230+
--cache-type-v f16 \
231+
-t THREADS
232+
233+
Memory: 4.5 bits/value for K (q4_0 has scale overhead), 16 bits/value for V
234+
Expected: small PPL increase, typically < 0.5
235+
236+
4. Q4_0 K + Q4_0 V (4-bit K and V):
237+
238+
./build/bin/llama-perplexity \
239+
-m MODEL.gguf \
240+
-f bench/data/ppl_test_1k.txt \
241+
--cache-type-k q4_0 \
242+
--cache-type-v q4_0 \
243+
-t THREADS
244+
245+
Memory: 4.5 bits/value for both K and V
246+
Expected: moderate PPL increase
247+
248+
For generation speed measurement:
249+
250+
./build/bin/llama-cli \
251+
-m MODEL.gguf \
252+
-p "The quick brown fox" \
253+
-n 200 \
254+
--cache-type-k {f16|q8_0|q4_0} \
255+
--cache-type-v {f16|q4_0} \
256+
-t THREADS \
257+
--temp 0
258+
259+
-----------------------------------------------------------------------
260+
DOCEOF
261+
262+
# =========================================================================
263+
# SECTION 3: Theoretical comparison table
264+
# =========================================================================
265+
266+
echo ""
267+
echo "================================================================"
268+
echo " SECTION 3: Side-by-Side Comparison"
269+
echo "================================================================"
270+
echo ""
271+
echo " Key comparison points (same model, same text):"
272+
echo ""
273+
echo " -----------------------------------------------------------------------"
274+
printf " %-32s %6s %6s %10s %s\n" \
275+
"Method" "K bit" "V bit" "KV/tok" "Notes"
276+
printf " %-32s %6s %6s %10s %s\n" \
277+
"------" "-----" "-----" "------" "-----"
278+
printf " %-32s %6s %6s %10s %s\n" \
279+
"llama.cpp f16/f16 (baseline)" "16" "16" "~192 KB*" "No compression"
280+
printf " %-32s %6s %6s %10s %s\n" \
281+
"llama.cpp q8_0/f16" "8.5" "16" "~150 KB*" "Near-lossless K"
282+
printf " %-32s %6s %6s %10s %s\n" \
283+
"llama.cpp q4_0/f16" "4.5" "16" "~126 KB*" "4-bit uniform K"
284+
printf " %-32s %6s %6s %10s %s\n" \
285+
"llama.cpp q4_0/q4_0" "4.5" "4.5" "~55 KB*" "Both quantized"
286+
287+
echo " -----------------------------------------------------------------------"
288+
289+
# Now fill in the actual TurboQuant measurements
290+
for i in "${!TQ_CONFIGS[@]}"; do
291+
local_label="${R_LABEL[$i]}"
292+
local_bits_k=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $4}')
293+
local_bits_v=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $5}')
294+
local_kv="${R_KV_PER_TOK[$i]}"
295+
local_ppl="${R_PPL[$i]}"
296+
printf " %-32s %6s %6s %10s %s\n" \
297+
"$local_label" "$local_bits_k" "$local_bits_v" "$local_kv" "PPL=$local_ppl"
298+
done
299+
300+
echo " -----------------------------------------------------------------------"
301+
echo ""
302+
echo " * llama.cpp KV/tok estimates assume: 24 layers, 32 kv_heads, head_dim=64"
303+
echo " Formula: n_layers * n_kv_heads * head_dim * (bits_k + bits_v) / 8 bytes"
304+
echo " Actual values depend on model architecture; run llama.cpp to confirm."
305+
echo ""
306+
307+
# =========================================================================
308+
# SECTION 4: Key insights
309+
# =========================================================================
310+
311+
echo "================================================================"
312+
echo " SECTION 4: Key Comparison Insights"
313+
echo "================================================================"
314+
echo ""
315+
echo " What TurboQuant offers vs llama.cpp KV quantization:"
316+
echo ""
317+
echo " 1. LOWER BIT RATES: TurboQuant achieves 1-bit and 3-bit K cache"
318+
echo " quantization using PolarQuant + QJL algorithms. llama.cpp's"
319+
echo " lowest is q4_0 (4.5 effective bits)."
320+
echo ""
321+
echo " 2. DIFFERENT ALGORITHMS: llama.cpp uses block-wise min-max (uniform)"
322+
echo " quantization. TurboQuant uses:"
323+
echo " - PolarQuant: exploits angular structure of attention keys"
324+
echo " - QJL: Johnson-Lindenstrauss sign hashing for 1-bit keys"
325+
echo " - TurboQuant: progressive residual (Polar 2b + QJL 1b = 3b)"
326+
echo ""
327+
echo " 3. QUALITY AT LOW BITS: The critical comparison is at the low end:"
328+
echo " - TurboQuant 3-bit K vs llama.cpp 4-bit K (q4_0)"
329+
echo " - If TurboQuant 3b matches or beats llama.cpp 4b in PPL,"
330+
echo " that is 25% more compression at equal quality."
331+
echo ""
332+
echo " 4. EXTREME COMPRESSION: TurboQuant 1-bit K + Q4 V achieves"
333+
echo " approximately 5x total KV compression. No llama.cpp equivalent"
334+
echo " exists at this bit rate."
335+
echo ""
336+
337+
# =========================================================================
338+
# SECTION 5: CSV output
339+
# =========================================================================
340+
341+
echo "date,model,method,kv_type,v_quant,bits_k,bits_v,ppl,nll,tok_s,kv_per_tok,compress_ratio" > "$CSV_OUT"
342+
343+
for i in "${!TQ_CONFIGS[@]}"; do
344+
local_label="${R_LABEL[$i]}"
345+
local_kv_type=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $2}')
346+
local_v_quant=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $3}')
347+
local_bits_k=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $4}')
348+
local_bits_v=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $5}')
349+
echo "$DATE_STR,$MODEL_NAME,turboquant,$local_kv_type,$local_v_quant,$local_bits_k,$local_bits_v,${R_PPL[$i]},${R_NLL[$i]},${R_TOKS[$i]},${R_KV_PER_TOK[$i]},${R_COMPRESS[$i]}" >> "$CSV_OUT"
350+
done
351+
352+
# Add llama.cpp reference rows (no measurements, just theoretical)
353+
echo "$DATE_STR,$MODEL_NAME,llamacpp,f16,f16,16,16,---,---,---,~192KB,1.00x" >> "$CSV_OUT"
354+
echo "$DATE_STR,$MODEL_NAME,llamacpp,q8_0,f16,8.5,16,---,---,---,~150KB,1.28x" >> "$CSV_OUT"
355+
echo "$DATE_STR,$MODEL_NAME,llamacpp,q4_0,f16,4.5,16,---,---,---,~126KB,1.52x" >> "$CSV_OUT"
356+
echo "$DATE_STR,$MODEL_NAME,llamacpp,q4_0,q4_0,4.5,4.5,---,---,---,~55KB,3.49x" >> "$CSV_OUT"
357+
358+
echo "================================================================"
359+
echo " Results saved to: $CSV_OUT"
360+
echo "================================================================"
361+
echo ""
362+
echo " To complete the comparison, build llama.cpp and run the"
363+
echo " commands from Section 2 on the same machine with the same model."
364+
echo " Then paste the llama.cpp PPL numbers alongside TurboQuant's"
365+
echo " for a fair apples-to-apples comparison."
366+
echo ""
367+
echo " Quick validation command:"
368+
echo " diff <(bash bench/ppl_standard.sh $MODEL) <(bash bench/ppl_standard.sh $MODEL)"
369+
echo " (should show identical results for reproducibility check)"
370+
echo ""

bench/data/ppl_results.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
date,model,label,kv_type,v_quant,tokens,nll,ppl,tok_s

0 commit comments

Comments
 (0)