Skip to content

Commit fe6df17

Browse files
unamedkr authored and claude committed
turbo_kv: Karpathy loop round 1 — empirical per-block std (Variant A)
Repurposed the always-constant rht_seed field as inv_std_fp16 (block size unchanged, alignment preserved via _pad). Quantize now computes the empirical std of the rotated values per block and stores its inverse for later dequant lookup, instead of the theoretical sqrt(dim).

Llama 3.2 3B PPL on bench/data/ppl_1k.txt:
  turbo_kv_4b: 16.03 → 15.87 (Δ -0.16)
  turbo_kv_3b: 25.84 → 25.07 (Δ -0.77)

Marginal improvement — confirms the variance mismatch was real but small. The dominant bottleneck remains outlier clipping in the Lloyd-Max codebook. Round 2 will try max-abs based scaling.

Target: turbo_kv_4b ≤ 14.5 (matching uniform_4b at the same bit budget).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2520758 commit fe6df17

2 files changed

Lines changed: 44 additions & 29 deletions

File tree

include/turboquant/tq_types.h

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -207,26 +207,28 @@ typedef struct {
207207
/* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
208208
* 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
209209
* Block covers TQ_BK elements (128).
210-
* Layout: norm(2) + residual_norm(2) + rht_seed(4) + mse_2bit(32) + qjl_signs(16) = 56 bytes
210+
* Layout: norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_2bit(32) + qjl_signs(16) = 56 bytes
211211
*/
212212
typedef struct {
213-
uint16_t norm; /* L2 norm of original vector (fp16) */
214-
uint16_t residual_norm; /* L2 norm of residual after MSE (fp16) */
215-
uint32_t rht_seed; /* RHT random seed for this block */
216-
uint8_t mse_indices[TQ_BK / 4]; /* 2-bit packed codebook indices (32B) */
217-
uint8_t qjl_signs[TQ_BK / 8]; /* 1-bit QJL sign hash on residual (16B) */
213+
uint16_t norm; /* L2 norm of original vector (fp16) */
214+
uint16_t residual_norm; /* L2 norm of residual after MSE (fp16) */
215+
uint16_t inv_std_fp16; /* per-block 1/std for codebook lookup */
216+
uint16_t _pad; /* alignment padding (was rht_seed upper) */
217+
uint8_t mse_indices[TQ_BK / 4]; /* 2-bit packed codebook indices (32B) */
218+
uint8_t qjl_signs[TQ_BK / 8]; /* 1-bit QJL sign hash on residual (16B) */
218219
} block_tq_turbo_kv_3b;
219220

220221
/* TurboQuant KV cache block: 4-bit variant
221222
* 3-bit codebook (8 levels) + 1-bit QJL sign hash
222-
* Layout: norm(2) + residual_norm(2) + rht_seed(4) + mse_3bit(48) + qjl_signs(16) = 72 bytes
223+
* Layout: norm(2) + residual_norm(2) + inv_std(2) + _pad(2) + mse_3bit(48) + qjl_signs(16) = 72 bytes
223224
*/
224225
typedef struct {
225-
uint16_t norm; /* L2 norm of original vector (fp16) */
226-
uint16_t residual_norm; /* L2 norm of residual after MSE (fp16) */
227-
uint32_t rht_seed; /* RHT random seed for this block */
228-
uint8_t mse_indices[TQ_BK * 3 / 8]; /* 3-bit packed codebook indices (48B) */
229-
uint8_t qjl_signs[TQ_BK / 8]; /* 1-bit QJL sign hash on residual (16B) */
226+
uint16_t norm; /* L2 norm of original vector (fp16) */
227+
uint16_t residual_norm; /* L2 norm of residual after MSE (fp16) */
228+
uint16_t inv_std_fp16; /* per-block 1/std for codebook lookup */
229+
uint16_t _pad; /* alignment padding */
230+
uint8_t mse_indices[TQ_BK * 3 / 8]; /* 3-bit packed codebook indices (48B) */
231+
uint8_t qjl_signs[TQ_BK / 8]; /* 1-bit QJL sign hash on residual (16B) */
230232
} block_tq_turbo_kv_4b;
231233

232234
/* TurboQuant KV cache block: 1-bit Hamming attention

src/core/tq_turbo_kv.c

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,17 @@ static void compute_qjl_signs(const float* residual, uint8_t* signs,
152152

153153
static void dequant_mse_rotated_2bit(const block_tq_turbo_kv_3b* block,
154154
float* rotated, int dim) {
155-
float inv_std = sqrtf((float)dim);
155+
float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
156+
if (inv_std < 1e-10f) inv_std = sqrtf((float)dim); /* fallback */
156157
uint8_t indices[TQ_BK] = {0};
157158
unpack_2bit(block->mse_indices, indices, dim);
158159
tq_codebook_dequantize(indices, rotated, dim, 2, inv_std);
159160
}
160161

161162
static void dequant_mse_rotated_3bit(const block_tq_turbo_kv_4b* block,
162163
float* rotated, int dim) {
163-
float inv_std = sqrtf((float)dim);
164+
float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
165+
if (inv_std < 1e-10f) inv_std = sqrtf((float)dim); /* fallback */
164166
uint8_t indices[TQ_BK] = {0};
165167
unpack_3bit(block->mse_indices, indices, dim);
166168
tq_codebook_dequantize(indices, rotated, dim, 3, inv_std);
@@ -195,14 +197,20 @@ void tq_turbo_kv_3b_quantize_ref(const float* src, void* dst, int n) {
195197
}
196198

197199
/* Step 3: Apply RHT (in-place on rotated) */
198-
uint32_t seed = TKV_DEFAULT_SEED;
199-
block->rht_seed = seed;
200-
tq_rht_transform(rotated, dim, seed);
201-
202-
/* Step 4: Scalar quantize with 2-bit codebook
203-
* After RHT, coordinates are approximately N(0, 1/sqrt(dim)).
204-
* inv_std = sqrt(dim) to normalize to N(0,1). */
205-
float inv_std = sqrtf((float)dim);
200+
tq_rht_transform(rotated, dim, TKV_DEFAULT_SEED);
201+
202+
/* Step 4: Compute per-block empirical std and quantize with 2-bit codebook.
203+
* Theoretical analysis says rotated coords ~ N(0, 1/dim), but real key
204+
* vectors after a single Hadamard rotation often have heavier tails or
205+
* different variance per block. Using the empirical std adapts the
206+
* codebook to the actual block distribution. */
207+
float var_emp = 0.0f;
208+
for (int i = 0; i < dim; i++) var_emp += rotated[i] * rotated[i];
209+
var_emp /= (float)dim;
210+
float std_emp = sqrtf(var_emp);
211+
if (std_emp < 1e-10f) std_emp = 1.0f / sqrtf((float)dim);
212+
float inv_std = 1.0f / std_emp;
213+
block->inv_std_fp16 = tkv_fp32_to_fp16(inv_std);
206214
uint8_t indices[TQ_BK];
207215
tq_codebook_quantize(rotated, indices, dim, 2, inv_std);
208216

@@ -248,7 +256,7 @@ void tq_turbo_kv_3b_dequantize_ref(const void* src, float* dst, int n) {
248256
if (dim > TQ_BK) dim = TQ_BK;
249257

250258
float norm = tkv_fp16_to_fp32(block->norm);
251-
uint32_t seed = block->rht_seed;
259+
uint32_t seed = TKV_DEFAULT_SEED;
252260

253261
/* MSE-only dequantize in rotated space */
254262
float rotated[TQ_BK];
@@ -432,11 +440,16 @@ void tq_turbo_kv_4b_quantize_ref(const float* src, void* dst, int n) {
432440
rotated[i] = 0.0f;
433441
}
434442

435-
uint32_t seed = TKV_DEFAULT_SEED;
436-
block->rht_seed = seed;
437-
tq_rht_transform(rotated, dim, seed);
443+
tq_rht_transform(rotated, dim, TKV_DEFAULT_SEED);
438444

439-
float inv_std = sqrtf((float)dim);
445+
/* Per-block empirical std (see 3-bit variant for rationale) */
446+
float var_emp = 0.0f;
447+
for (int i = 0; i < dim; i++) var_emp += rotated[i] * rotated[i];
448+
var_emp /= (float)dim;
449+
float std_emp = sqrtf(var_emp);
450+
if (std_emp < 1e-10f) std_emp = 1.0f / sqrtf((float)dim);
451+
float inv_std = 1.0f / std_emp;
452+
block->inv_std_fp16 = tkv_fp32_to_fp16(inv_std);
440453
uint8_t indices[TQ_BK];
441454
tq_codebook_quantize(rotated, indices, dim, 3, inv_std);
442455
pack_3bit(indices, block->mse_indices, dim);
@@ -471,7 +484,7 @@ void tq_turbo_kv_4b_dequantize_ref(const void* src, float* dst, int n) {
471484
if (dim > TQ_BK) dim = TQ_BK;
472485

473486
float norm = tkv_fp16_to_fp32(block->norm);
474-
uint32_t seed = block->rht_seed;
487+
uint32_t seed = TKV_DEFAULT_SEED;
475488

476489
float rotated[TQ_BK];
477490
dequant_mse_rotated_3bit(block, rotated, dim);
@@ -694,7 +707,7 @@ void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n) {
694707
if (sketch_dim < TQ_BK) sketch_dim = TQ_BK;
695708

696709
float norm = tkv_fp16_to_fp32(block->norm);
697-
uint32_t seed = block->rht_seed;
710+
uint32_t seed = TKV_DEFAULT_SEED;
698711

699712
/* Reconstruct sign vector in rotated space.
700713
* After RHT, coordinates are ~N(0, 1/sqrt(dim)).

0 commit comments

Comments
 (0)