#!/bin/bash
# =============================================================================
# compare_llamacpp.sh — TurboQuant vs llama.cpp KV cache quantization benchmark
# =============================================================================
#
# This script measures TurboQuant's KV cache compression and documents
# the equivalent llama.cpp commands for fair side-by-side comparison.
#
# Usage:
#   bash bench/compare_llamacpp.sh <model.gguf> [threads]
#
# Example:
#   bash bench/compare_llamacpp.sh models/SmolLM2-1.7B-Instruct-Q8_0.gguf 6
#
# What it measures (TurboQuant -- actually runs):
#   - Perplexity (teacher-forced on a fixed 1095-word test text)
#   - KV cache memory per token
#   - Generation speed (tok/s)
#
# What it documents (llama.cpp -- commands printed, not executed):
#   - Equivalent llama.cpp commands with --cache-type-k/--cache-type-v flags
#   - Expected memory usage based on llama.cpp's quantization formats
#
# IMPORTANT: All measurements use the SAME model, SAME test text, SAME hardware.
# The only variable is the KV cache quantization method.
# =============================================================================

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
TQ_RUN="$PROJECT_DIR/build/tq_run"
PPL_TEXT="$SCRIPT_DIR/data/ppl_test_1k.txt"
RESULTS_DIR="$SCRIPT_DIR/compare_results"

# ${1:?...} aborts with the usage message when no model path is given;
# ${2:-6} defaults to 6 threads when the second argument is omitted.
MODEL="${1:?Usage: bash bench/compare_llamacpp.sh <model.gguf> [threads]}"
THREADS="${2:-6}"

# ---------------------------------------------------------------------------
# Validate
# ---------------------------------------------------------------------------
if [ ! -f "$TQ_RUN" ]; then
  echo "ERROR: $TQ_RUN not found. Build first:"
  echo "  cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j\$(nproc)"
  exit 1
fi
if [ ! -f "$MODEL" ]; then
  echo "ERROR: Model not found: $MODEL"
  exit 1
fi
if [ ! -f "$PPL_TEXT" ]; then
  echo "ERROR: Test text not found: $PPL_TEXT"
  exit 1
fi

mkdir -p "$RESULTS_DIR"

MODEL_NAME=$(basename "$MODEL")
DATE_STR=$(date +%Y-%m-%d_%H%M%S)
HOSTNAME_STR=$(hostname -s 2>/dev/null || echo "unknown")
CSV_OUT="$RESULTS_DIR/comparison_${DATE_STR}.csv"

echo ""
echo "================================================================"
echo " TurboQuant vs llama.cpp KV Cache Quantization Comparison"
echo "================================================================"
echo ""
echo " Model:   $MODEL_NAME"
echo " Threads: $THREADS"
echo " Host:    $HOSTNAME_STR"
echo " Date:    $DATE_STR"
echo " Text:    ppl_test_1k.txt (1095 words, ~1400 tokens)"
echo ""
echo "================================================================"

# =========================================================================
# SECTION 1: TurboQuant measurements (actually executed)
# =========================================================================

echo ""
echo "================================================================"
echo " SECTION 1: TurboQuant Measurements (live)"
echo "================================================================"
echo ""

# Configs: "label  kv_flag  v_flag  bits_k  bits_v"
TQ_CONFIGS=(
  "TQ:uniform_4b(K)+FP16(V)  uniform_4b   fp16  4.0  16.0"
  "TQ:turbo_1b(K)+FP16(V)    turbo_kv_1b  fp16  1.0  16.0"
  "TQ:turbo_1b(K)+Q4(V)      turbo_kv_1b  q4    1.0   4.0"
  "TQ:turbo_3b(K)+FP16(V)    turbo_kv_3b  fp16  3.0  16.0"
  "TQ:turbo_3b(K)+Q4(V)      turbo_kv_3b  q4    3.0   4.0"
)
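# Each entry expands to one tq_run invocation in run_tq_config below:
#   tq_run <model> ... -k <kv_flag> [-v <v_flag> when it is not fp16]
# bits_k/bits_v are bookkeeping only; they are re-read for the tables and CSV.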

# Collect results into arrays
declare -a R_LABEL R_PPL R_NLL R_TOKS R_KV_PER_TOK R_COMPRESS R_SAVED

run_tq_config() {
  local idx=$1
  local config_line="${TQ_CONFIGS[$idx]}"
  local label=$(echo "$config_line" | awk '{print $1}')
  local kv_type=$(echo "$config_line" | awk '{print $2}')
  local v_quant=$(echo "$config_line" | awk '{print $3}')
  local bits_k=$(echo "$config_line" | awk '{print $4}')
  local bits_v=$(echo "$config_line" | awk '{print $5}')

  echo " Running: $label ..."

  # --- PPL measurement ---
  # Build the command as an array so every argument (paths, flags) stays intact
  # under word splitting.
  local ppl_cmd=("$TQ_RUN" "$MODEL" --ppl "$PPL_TEXT" -j "$THREADS" -k "$kv_type")
  if [ "$v_quant" != "fp16" ]; then
    ppl_cmd+=(-v "$v_quant")
  fi
  local ppl_output
  ppl_output=$("${ppl_cmd[@]}" 2>&1) || true

  # tq_run is expected to emit a machine-readable line of the form
  #   PPL_CSV:<n_tokens>,<avg_nll>,<ppl>
  # (field order inferred from the cuts below). "|| true" keeps set -e from
  # aborting the script when a grep finds no match.
  local ppl nll tok_s tokens
  ppl=$(echo "$ppl_output" | grep "^PPL_CSV:" | cut -d, -f3)
  nll=$(echo "$ppl_output" | grep "^PPL_CSV:" | cut -d, -f2)
  tokens=$(echo "$ppl_output" | grep "^PPL_CSV:" | cut -d, -f1 | sed 's/PPL_CSV://')
  tok_s=$(echo "$ppl_output" | grep "tok/s" | tail -1 | grep -o '[0-9]*\.[0-9]* tok/s' | grep -o '[0-9]*\.[0-9]*') || true

  # Fallback to the human-readable summary if the CSV line is absent
  if [ -z "$ppl" ]; then
    ppl=$(echo "$ppl_output" | grep "Perplexity:" | grep -o '[0-9]*\.[0-9]*') || true
    nll=$(echo "$ppl_output" | grep "Avg NLL:" | grep -o '[0-9]*\.[0-9]*') || true
  fi

  # --- Memory measurement (generate 200 tokens to get meaningful KV stats) ---
  # Array form keeps the multi-word prompt as a single argument; the previous
  # string form split it at every space and passed the quote marks literally.
  local mem_cmd=("$TQ_RUN" "$MODEL" \
    -p "The quick brown fox jumps over the lazy dog and continues walking through the forest path." \
    -n 200 -T 0.0 -j "$THREADS" -k "$kv_type" -M)
  if [ "$v_quant" != "fp16" ]; then
    mem_cmd+=(-v "$v_quant")
  fi
  local mem_output
  mem_output=$("${mem_cmd[@]}" 2>&1) || true

  local kv_per_tok compress_ratio mem_saved
  kv_per_tok=$(echo "$mem_output" | grep "Per-token K+V total:" | grep -o '[0-9]*\.[0-9]* KB') || true
  compress_ratio=$(echo "$mem_output" | grep "Compression ratio:" | grep -o '[0-9]*\.[0-9]*x') || true
  mem_saved=$(echo "$mem_output" | grep "Memory saved:" | grep -o '[0-9]*\.[0-9]* MB') || true

  # Store results (empty parses fall back to N/A)
  R_LABEL[$idx]="$label"
  R_PPL[$idx]="${ppl:-N/A}"
  R_NLL[$idx]="${nll:-N/A}"
  R_TOKS[$idx]="${tok_s:-N/A}"
  R_KV_PER_TOK[$idx]="${kv_per_tok:-N/A}"
  R_COMPRESS[$idx]="${compress_ratio:-N/A}"
  R_SAVED[$idx]="${mem_saved:-N/A}"
}

for i in "${!TQ_CONFIGS[@]}"; do
  run_tq_config "$i"
done

# ---------------------------------------------------------------------------
# Print TurboQuant results table
# ---------------------------------------------------------------------------
echo ""
echo " TurboQuant Results:"
echo " -----------------------------------------------------------------------"
printf " %-30s %8s %8s %10s %12s %8s\n" \
  "Config" "PPL" "NLL" "tok/s" "KV/tok" "Ratio"
printf " %-30s %8s %8s %10s %12s %8s\n" \
  "------" "---" "---" "-----" "------" "-----"

for i in "${!TQ_CONFIGS[@]}"; do
  printf " %-30s %8s %8s %10s %12s %8s\n" \
    "${R_LABEL[$i]}" "${R_PPL[$i]}" "${R_NLL[$i]}" \
    "${R_TOKS[$i]}" "${R_KV_PER_TOK[$i]}" "${R_COMPRESS[$i]}"
done

# =========================================================================
# SECTION 2: llama.cpp equivalent commands (documented, not executed)
# =========================================================================

echo ""
echo "================================================================"
echo " SECTION 2: llama.cpp Equivalent Commands (reference)"
echo "================================================================"
echo ""
echo " These commands are NOT executed by this script. They document"
echo " the equivalent llama.cpp invocations for fair comparison."
echo " Run them separately with a llama.cpp build to get comparable numbers."
echo ""
echo " Prerequisites:"
echo "   cd /path/to/llama.cpp"
echo "   cmake -B build -DCMAKE_BUILD_TYPE=Release"
echo "   cmake --build build -j\$(nproc)"
echo ""

# Path to the test text as referenced from a llama.cpp checkout (it appears in
# the commands below). Note the quoted 'DOCEOF' delimiter: the block is printed
# verbatim, so MODEL.gguf and THREADS are intentional placeholders.
LLAMACPP_PPL_TEXT="bench/data/ppl_test_1k.txt"

cat << 'DOCEOF'
 -----------------------------------------------------------------------
  Config                          llama.cpp command
 -----------------------------------------------------------------------

  1. Baseline (FP16 KV cache — no quantization):

       ./build/bin/llama-perplexity \
         -m MODEL.gguf \
         -f bench/data/ppl_test_1k.txt \
         --cache-type-k f16 \
         --cache-type-v f16 \
         -t THREADS

     Memory: 16 bits/value for K, 16 bits/value for V
     Per-token KV = 2 * n_layers * n_kv_heads * head_dim * 2 bytes
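     Worked example with the reference dims from Section 3 (24 layers,
     32 kv_heads, head_dim=64): 2 * 24 * 32 * 64 * 2 B = 196,608 B ≈ 192 KB/token.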

  2. Q8_0 K cache (8-bit quantized keys):

       ./build/bin/llama-perplexity \
         -m MODEL.gguf \
         -f bench/data/ppl_test_1k.txt \
         --cache-type-k q8_0 \
         --cache-type-v f16 \
         -t THREADS

     Memory: 8.5 bits/value for K (q8_0 has scale overhead), 16 bits/value for V
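     (8.5 bits because a q8_0 block holds 32 int8 values plus one fp16 scale:
      34 bytes per 32 values = 8.5 bits/value)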
     Expected: near-lossless, PPL increase < 0.1

  3. Q4_0 K cache (4-bit quantized keys):

       ./build/bin/llama-perplexity \
         -m MODEL.gguf \
         -f bench/data/ppl_test_1k.txt \
         --cache-type-k q4_0 \
         --cache-type-v f16 \
         -t THREADS

     Memory: 4.5 bits/value for K (q4_0 has scale overhead), 16 bits/value for V
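     (4.5 bits because a q4_0 block holds 32 four-bit values plus one fp16 scale:
      18 bytes per 32 values = 4.5 bits/value)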
     Expected: small PPL increase, typically < 0.5

  4. Q4_0 K + Q4_0 V (4-bit K and V):

       ./build/bin/llama-perplexity \
         -m MODEL.gguf \
         -f bench/data/ppl_test_1k.txt \
         --cache-type-k q4_0 \
         --cache-type-v q4_0 \
         -t THREADS

     Memory: 4.5 bits/value for both K and V
     Expected: moderate PPL increase

  For generation speed measurement:

       ./build/bin/llama-cli \
         -m MODEL.gguf \
         -p "The quick brown fox" \
         -n 200 \
         --cache-type-k {f16|q8_0|q4_0} \
         --cache-type-v {f16|q4_0} \
         -t THREADS \
         --temp 0

 -----------------------------------------------------------------------
DOCEOF

# =========================================================================
# SECTION 3: Theoretical comparison table
# =========================================================================

echo ""
echo "================================================================"
echo " SECTION 3: Side-by-Side Comparison"
echo "================================================================"
echo ""
echo " Key comparison points (same model, same text):"
echo ""
echo " -----------------------------------------------------------------------"
printf " %-32s %6s %6s %10s %s\n" \
  "Method" "K bit" "V bit" "KV/tok" "Notes"
printf " %-32s %6s %6s %10s %s\n" \
  "------" "-----" "-----" "------" "-----"
printf " %-32s %6s %6s %10s %s\n" \
  "llama.cpp f16/f16 (baseline)" "16" "16" "~192 KB*" "No compression"
printf " %-32s %6s %6s %10s %s\n" \
  "llama.cpp q8_0/f16" "8.5" "16" "~150 KB*" "Near-lossless K"
printf " %-32s %6s %6s %10s %s\n" \
  "llama.cpp q4_0/f16" "4.5" "16" "~126 KB*" "4-bit uniform K"
printf " %-32s %6s %6s %10s %s\n" \
  "llama.cpp q4_0/q4_0" "4.5" "4.5" "~55 KB*" "Both quantized"

echo " -----------------------------------------------------------------------"

# Now fill in the actual TurboQuant measurements
for i in "${!TQ_CONFIGS[@]}"; do
  local_label="${R_LABEL[$i]}"
  local_bits_k=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $4}')
  local_bits_v=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $5}')
  local_kv="${R_KV_PER_TOK[$i]}"
  local_ppl="${R_PPL[$i]}"
  printf " %-32s %6s %6s %10s %s\n" \
    "$local_label" "$local_bits_k" "$local_bits_v" "$local_kv" "PPL=$local_ppl"
done

echo " -----------------------------------------------------------------------"
echo ""
echo " * llama.cpp KV/tok estimates assume: 24 layers, 32 kv_heads, head_dim=64"
echo "   Formula: n_layers * n_kv_heads * head_dim * (bits_k + bits_v) / 8 bytes"
echo "   Actual values depend on model architecture; run llama.cpp to confirm."
echo ""
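# Worked instance of the footnote formula, for the q4_0/q4_0 row above
# (same assumed dims): 24 * 32 * 64 * (4.5 + 4.5) / 8 = 55,296 B ≈ 55 KB/token.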

# =========================================================================
# SECTION 4: Key insights
# =========================================================================

echo "================================================================"
echo " SECTION 4: Key Comparison Insights"
echo "================================================================"
echo ""
echo " What TurboQuant offers vs llama.cpp KV quantization:"
echo ""
echo " 1. LOWER BIT RATES: TurboQuant achieves 1-bit and 3-bit K cache"
echo "    quantization using PolarQuant + QJL algorithms. llama.cpp's"
echo "    lowest is q4_0 (4.5 effective bits)."
echo ""
echo " 2. DIFFERENT ALGORITHMS: llama.cpp uses block-wise uniform quantization"
echo "    with a per-block scale. TurboQuant uses:"
echo "    - PolarQuant: exploits angular structure of attention keys"
echo "    - QJL: Johnson-Lindenstrauss sign hashing for 1-bit keys"
echo "    - TurboQuant: progressive residual (Polar 2b + QJL 1b = 3b)"
echo ""
echo " 3. QUALITY AT LOW BITS: The critical comparison is at the low end:"
echo "    - TurboQuant 3-bit K vs llama.cpp 4-bit K (q4_0)"
echo "    - If TurboQuant 3b matches or beats llama.cpp 4b in PPL,"
echo "      that is a ~25% smaller K cache at equal quality."
echo ""
echo " 4. EXTREME COMPRESSION: TurboQuant 1-bit K + Q4 V achieves"
echo "    approximately 5x total KV compression. No llama.cpp equivalent"
echo "    exists at this bit rate."
echo ""
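# Back-of-envelope for the ~5x figure (not measured here): FP16 K+V costs
# 16 + 16 = 32 bits per K/V pair; 1-bit K + 4-bit V costs about 5, so 32/5 ≈ 6.4x
# before per-block scales and other metadata, which bring it to roughly 5x.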

# =========================================================================
# SECTION 5: CSV output
# =========================================================================

echo "date,model,method,kv_type,v_quant,bits_k,bits_v,ppl,nll,tok_s,kv_per_tok,compress_ratio" > "$CSV_OUT"

for i in "${!TQ_CONFIGS[@]}"; do
  local_label="${R_LABEL[$i]}"
  local_kv_type=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $2}')
  local_v_quant=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $3}')
  local_bits_k=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $4}')
  local_bits_v=$(echo "${TQ_CONFIGS[$i]}" | awk '{print $5}')
  echo "$DATE_STR,$MODEL_NAME,turboquant,$local_kv_type,$local_v_quant,$local_bits_k,$local_bits_v,${R_PPL[$i]},${R_NLL[$i]},${R_TOKS[$i]},${R_KV_PER_TOK[$i]},${R_COMPRESS[$i]}" >> "$CSV_OUT"
done

# Add llama.cpp reference rows (no measurements, just theoretical)
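# KV/tok and ratio columns come from the Section 3 footnote formula (24 layers,
# 32 kv_heads, head_dim=64); ratios are fp16 baseline divided by the estimate:
# 192/150 ≈ 1.28x, 192/126 ≈ 1.52x, 192/55 ≈ 3.49x.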
echo "$DATE_STR,$MODEL_NAME,llamacpp,f16,f16,16,16,---,---,---,~192KB,1.00x" >> "$CSV_OUT"
echo "$DATE_STR,$MODEL_NAME,llamacpp,q8_0,f16,8.5,16,---,---,---,~150KB,1.28x" >> "$CSV_OUT"
echo "$DATE_STR,$MODEL_NAME,llamacpp,q4_0,f16,4.5,16,---,---,---,~126KB,1.52x" >> "$CSV_OUT"
echo "$DATE_STR,$MODEL_NAME,llamacpp,q4_0,q4_0,4.5,4.5,---,---,---,~55KB,3.49x" >> "$CSV_OUT"

echo "================================================================"
echo " Results saved to: $CSV_OUT"
echo "================================================================"
echo ""
echo " To complete the comparison, build llama.cpp and run the"
echo " commands from Section 2 on the same machine with the same model."
echo " Then paste the llama.cpp PPL numbers alongside TurboQuant's"
echo " for a fair apples-to-apples comparison."
echo ""
echo " Quick validation command:"
echo "   diff <(bash bench/ppl_standard.sh $MODEL) <(bash bench/ppl_standard.sh $MODEL)"
echo "   (runs the standard PPL benchmark twice; empty diff output confirms the run is reproducible)"
echo ""