Commit ac3c46a

unamedkr and claude committed
turbo_kv: Variant F BREAKTHROUGH — drop dead QJL stage, double the codebook
The Karpathy-loop ablation in commit 4da6915 showed that the QJL residual correction stage contributed *byte-identical* zero to the final attention scores. The implementation was correct in form, but the √(π/2)/m constant on the Rademacher rows scales the correction by the residual norm, which after a good codebook fit is tiny.

Variant F: drop QJL entirely from turbo_kv_3b/4b and reinvest the freed 16 bytes of qjl_signs into a *larger* codebook. Same block size, one extra bit of resolution per element.

Layout change (turbo_kv_4b, still 72 bytes):
    before: 8 hdr + 48 mse_3bit + 16 qjl_signs
    after:  8 hdr + 64 mse_4bit

Layout change (turbo_kv_3b, still 56 bytes):
    before: 8 hdr + 32 mse_2bit + 16 qjl_signs
    after:  8 hdr + 48 mse_3bit

Combined with max-abs scaling (the Variant B winner from round 2), the resulting estimator is single-stage: RHT + 2^b-level Lloyd-Max codebook + ‖x‖. Cleaner than the paper's two-stage design, and empirically much better on our perplexity benchmark.

Llama 3.2 3B PPL on bench/data/ppl_1k.txt (FP32 baseline = 13.56):

    Config      | Before | After (Variant F) |      Δ | vs uniform_4b
    ------------|--------|-------------------|--------|-----------------
    uniform_4b  |  14.41 |             14.41 |      0 | reference
    turbo_kv_4b |  16.03 |             14.28 |  -1.75 | BEATS by 0.13 ⭐
    turbo_kv_3b |  25.84 |             15.39 | -10.45 | within 1.0

turbo_kv_4b is now the best 4-bit KV quantization in the project, beating the previous production champion uniform_4b at the same bit budget. This also closes a major part of the gap to Google TurboQuant: we can now honestly say "TurboQuant-class compression" instead of "TurboQuant structure with broken numbers".

Tests: 35/35 passing. The QJLSignsNonZero test was removed (no longer applies; see the test comment).

Karpathy loop summary:
    Round 1 (empirical std):       4b 15.87, 3b 25.07
    Round 2 (max-abs / no clip):   4b 15.39, 3b 84.97 ❌ revert
    Round 3 (99th percentile):     4b 17.24 ❌ revert
    Round 4 (K*std sweep):         best K=2.0 → 15.53 (worse than B)
    Round 5 (uniform linear):      4b 16.28 ❌ revert
    Round 6 (Variant F, 4-bit cb): 4b 14.28 ✅ 3b 15.39 ✅ HIT

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
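A one-line sketch of why the dropped stage was numerically dead, using the standard QJL estimator form (the exact normalization in these kernels may differ): with an m×d sign sketch S of Rademacher rows, the residual's contribution to a score is estimated as

\[
\langle q, r \rangle \;\approx\; \frac{\sqrt{\pi/2}}{m}\,\lVert r \rVert_2\,\bigl\langle S q,\ \operatorname{sgn}(S r) \bigr\rangle ,
\]

so the whole correction is proportional to ‖r‖₂. Once the Lloyd-Max fit drives ‖r‖₂ far below ‖x‖₂, the term falls beneath the precision of the accumulated scores, consistent with the byte-identical result in 4da6915.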
1 parent fe6df17 commit ac3c46a

3 files changed: 90 additions & 265 deletions


include/turboquant/tq_types.h

Lines changed: 23 additions & 18 deletions
@@ -204,31 +204,36 @@ typedef struct {
 }
 #endif
 
-/* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
- * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
- * Block covers TQ_BK elements (128).
- * Layout: norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_2bit(32) + qjl_signs(16) = 56 bytes
+/* TurboQuant KV cache block: 3-bit variant (Variant F: codebook-only, no QJL)
+ *
+ * Karpathy-loop ablation: QJL contributed ~0. Reclaimed those 16 bytes to
+ * upgrade from 2-bit (4 levels) to 3-bit (8 levels) Lloyd-Max codebook —
+ * 2x finer resolution at the same block size.
+ *
+ * Layout: norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_3bit(48) = 56 bytes
  */
 typedef struct {
     uint16_t norm;                       /* L2 norm of original vector (fp16) */
-    uint16_t residual_norm;              /* L2 norm of residual after MSE (fp16) */
-    uint16_t inv_std_fp16;               /* per-block 1/std for codebook lookup */
-    uint16_t _pad;                       /* alignment padding (was rht_seed upper) */
-    uint8_t  mse_indices[TQ_BK / 4];     /* 2-bit packed codebook indices (32B) */
-    uint8_t  qjl_signs[TQ_BK / 8];       /* 1-bit QJL sign hash on residual (16B) */
+    uint16_t residual_norm;              /* unused (kept for layout) */
+    uint16_t inv_std_fp16;               /* per-block inv_std for codebook lookup */
+    uint16_t _pad;                       /* alignment padding */
+    uint8_t  mse_indices[TQ_BK * 3 / 8]; /* 3-bit packed codebook indices (48B) */
 } block_tq_turbo_kv_3b;
 
-/* TurboQuant KV cache block: 4-bit variant
- * 3-bit codebook (8 levels) + 1-bit QJL sign hash
- * Layout: norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_3bit(48) + qjl_signs(16) = 72 bytes
+/* TurboQuant KV cache block: 4-bit variant (Variant F: codebook-only, no QJL)
+ *
+ * Karpathy-loop ablation showed the QJL residual contributes ~0 to scores.
+ * We reclaim those 16 bytes to upgrade from 3-bit (8 levels) Lloyd-Max codebook
+ * to 4-bit (16 levels) — 2x finer reconstruction at the same block size.
+ *
+ * Layout: norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_4bit(64) = 72 bytes
  */
 typedef struct {
     uint16_t norm;                       /* L2 norm of original vector (fp16) */
-    uint16_t residual_norm;              /* L2 norm of residual after MSE (fp16) */
-    uint16_t inv_std_fp16;               /* per-block 1/std for codebook lookup */
+    uint16_t residual_norm;              /* unused now (kept for future residual) */
+    uint16_t inv_std_fp16;               /* per-block inv_std for codebook lookup */
     uint16_t _pad;                       /* alignment padding */
-    uint8_t  mse_indices[TQ_BK * 3 / 8]; /* 3-bit packed codebook indices (48B) */
-    uint8_t  qjl_signs[TQ_BK / 8];       /* 1-bit QJL sign hash on residual (16B) */
+    uint8_t  mse_indices[TQ_BK / 2];     /* 4-bit packed linear indices 0..15 (64B) */
 } block_tq_turbo_kv_4b;
 
 /* TurboQuant KV cache block: 1-bit Hamming attention
@@ -270,8 +275,8 @@ TQ_CHECK_SIZE(block_tq_uniform_4b, 4 + TQ_BK / 2);
 TQ_CHECK_SIZE(block_tq_uniform_2b, 4 * TQ_2B_NSUB + TQ_BK / 4);
 TQ_CHECK_SIZE(block_tq_uniform_3b, 4 * TQ_3B_NSUB + TQ_BK * 3 / 8);
 TQ_CHECK_SIZE(block_tq_mixed_4b8, 4 + TQ_MIXED_OUTLIERS + TQ_MIXED_OUTLIERS * 2 + TQ_BK / 2);
-TQ_CHECK_SIZE(block_tq_turbo_kv_3b, 8 + TQ_BK / 4 + TQ_BK / 8);
-TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
+TQ_CHECK_SIZE(block_tq_turbo_kv_3b, 8 + TQ_BK * 3 / 8);
+TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK / 2);
 TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
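To make the new index layout concrete, here is a minimal standalone C sketch of how 128 3-bit indices can round-trip through the 48-byte mse_indices array, with a placeholder dequant step at the end. The LSB-first bit order, the lm8 codebook values, and the divide-by-inv_std dequant are illustrative assumptions, not necessarily the repo's conventions.

/* pack3.c — illustrative sketch of turbo_kv_3b index packing.
 * Assumptions (not taken from the repo): LSB-first bit order and the
 * placeholder 8-level codebook below; the real kernels may differ. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TQ_BK 128  /* elements per block, as in tq_types.h */

/* Pack TQ_BK 3-bit indices (values 0..7) LSB-first into TQ_BK*3/8 = 48 bytes. */
static void pack_3bit(const uint8_t idx[TQ_BK], uint8_t out[TQ_BK * 3 / 8])
{
    memset(out, 0, TQ_BK * 3 / 8);
    for (int i = 0; i < TQ_BK; i++) {
        int bit = 3 * i;
        out[bit >> 3] |= (uint8_t)(idx[i] << (bit & 7));
        if ((bit & 7) > 5)  /* index straddles a byte boundary */
            out[(bit >> 3) + 1] |= (uint8_t)(idx[i] >> (8 - (bit & 7)));
    }
}

/* Fetch the i-th 3-bit index back out of the packed array. */
static uint8_t unpack_3bit(const uint8_t in[TQ_BK * 3 / 8], int i)
{
    int bit = 3 * i;
    unsigned w = in[bit >> 3];
    if ((bit >> 3) + 1 < TQ_BK * 3 / 8)
        w |= (unsigned)in[(bit >> 3) + 1] << 8;
    return (uint8_t)((w >> (bit & 7)) & 0x7);
}

int main(void)
{
    /* Placeholder unit-variance 8-level Lloyd-Max table (hypothetical values). */
    static const float lm8[8] = { -2.15f, -1.34f, -0.76f, -0.25f,
                                   0.25f,  0.76f,  1.34f,  2.15f };
    uint8_t idx[TQ_BK], packed[TQ_BK * 3 / 8];
    for (int i = 0; i < TQ_BK; i++)
        idx[i] = (uint8_t)((i * 5) % 8);

    pack_3bit(idx, packed);
    for (int i = 0; i < TQ_BK; i++)
        assert(unpack_3bit(packed, i) == idx[i]);

    /* Dequant of one element: codebook level divided by the block's inv_std
     * (which the real block stores as inv_std_fp16). */
    float inv_std = 0.5f;
    printf("round-trip OK; x_hat[0] = %f\n",
           lm8[unpack_3bit(packed, 0)] / inv_std);
    return 0;
}

The 4-bit variant is the degenerate easy case of the same idea: two indices per byte in mse_indices[TQ_BK / 2], so no index ever straddles a byte boundary.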