New TQ_TYPE_TURBO_KV_4BO type. Each block stores the 8 channels with
the largest |rotated[i]| as exact FP16 values (with their indices) on
top of the existing Variant F 4-bit Lloyd-Max codebook. At dequant
time these 8 positions are overwritten with the stored exact values,
eliminating the worst quantization errors per block.
This is a simpler, local form of the per-channel outlier handling
described in the Google TurboQuant paper.
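For illustration only, a minimal sketch of the dequant-time overwrite,
assuming a 128-value block; apply_outliers() and fp16_to_fp32() are
hypothetical names, the latter standing in for whatever half-to-float
helper the codebase uses:

#include <stdint.h>

extern float fp16_to_fp32(uint16_t h); /* stand-in for the project's half->float helper */

/* After the Variant F 4-bit base has been dequantized into dst[0..127],
   patch the 8 stored outlier positions with their exact FP16 values. */
static void apply_outliers(float *dst, const uint8_t out_indices[8],
                           const uint16_t out_values[8]) {
    for (int k = 0; k < 8; ++k) {
        dst[out_indices[k]] = fp16_to_fp32(out_values[k]);
    }
}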
Llama 3.2 3B PPL on bench/data/ppl_1k.txt (FP32 = 13.56):
  turbo_kv_4b   14.28  (+5.3%)   72 B/block
  turbo_kv_4bo  13.86  (+2.2%)   96 B/block  ← gap cut by 58%
  turbo_kv_5b   13.60  (+0.34%)  88 B/block

SmolLM2 135M PPL (FP32 = 18.62):
  turbo_kv_4b   19.70  (+5.8%)
  turbo_kv_4bo  19.29  (+3.6%)  ← gap cut by 38%
  turbo_kv_5b   18.94  (+1.7%)
The technique works (it validates Issue #15's per-channel outlier
hypothesis), but at 96 bytes this variant is currently larger than 5b
(88 B) without matching its quality. The next iteration will combine
outliers with a 3-bit base codebook (turbo_kv_3bo, ~80 bytes) to test
whether outliers plus a smaller base can beat 5b at a smaller block
size.
Block layout (96 bytes):
norm(2) + residual_norm(2) + inv_std(2) + _pad(2)
mse_indices[64] // 4-bit packed (Variant F base)
out_indices[8] // 1 byte per outlier
out_values[8] // FP16 per outlier
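For illustration, one way this 96-byte layout could map to a C struct;
the struct and typedef names are assumptions, and the 128-value block
size is inferred from the 64 bytes of 4-bit packed indices:

#include <stdint.h>

typedef uint16_t fp16_t;           /* raw IEEE-754 half bits */

typedef struct {
    fp16_t  norm;                  /*  2 B */
    fp16_t  residual_norm;         /*  2 B */
    fp16_t  inv_std;               /*  2 B */
    fp16_t  _pad;                  /*  2 B */
    uint8_t mse_indices[64];       /* 64 B: 128 x 4-bit Variant F codebook indices */
    uint8_t out_indices[8];        /*  8 B: positions of the 8 outlier channels */
    fp16_t  out_values[8];         /* 16 B: exact FP16 outlier values */
} block_turbo_kv_4bo;              /* 8 + 64 + 8 + 16 = 96 bytes */

_Static_assert(sizeof(block_turbo_kv_4bo) == 96, "96-byte block");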
The quantizer finds the top-K outliers by |rotated| and stores them
verbatim. Codebook scaling uses body-only max-abs (excluding the
outliers), so the codebook doesn't waste resolution on tails the
outliers already capture exactly.
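A minimal sketch of the outlier selection and body-only max-abs
scaling, assuming a 128-value block; pick_outliers() and
body_max_abs() are hypothetical helper names, not the repo's API:

#include <math.h>
#include <stdint.h>

enum { QK = 128, N_OUT = 8 };

/* Select the N_OUT positions with the largest |rotated[i]|
   (naive selection; a partial sort would do the same job). */
static void pick_outliers(const float *rotated, uint8_t out_idx[N_OUT]) {
    uint8_t taken[QK] = {0};
    for (int k = 0; k < N_OUT; ++k) {
        int best = 0; float best_abs = -1.0f;
        for (int i = 0; i < QK; ++i) {
            float a = fabsf(rotated[i]);
            if (!taken[i] && a > best_abs) { best_abs = a; best = i; }
        }
        taken[best] = 1;
        out_idx[k] = (uint8_t) best;
    }
}

/* Body-only max-abs: derive the codebook scale from non-outlier
   channels only, so 4-bit resolution is not spent on tails the
   outliers already store exactly. */
static float body_max_abs(const float *rotated, const uint8_t out_idx[N_OUT]) {
    float amax = 0.0f;
    for (int i = 0; i < QK; ++i) {
        int is_outlier = 0;
        for (int k = 0; k < N_OUT; ++k) {
            if (out_idx[k] == i) { is_outlier = 1; break; }
        }
        if (!is_outlier) {
            float a = fabsf(rotated[i]);
            if (a > amax) amax = a;
        }
    }
    return amax;
}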
35/35 tests pass.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>