Skip to content

Commit 5b5e4b7

Browse files
unamedkr authored and claude committed
turbo_kv_3bo: Variant G with 3-bit base + 8 outliers (research)
Smaller base codebook with the same per-block outlier mechanism as 4bo. Block layout: 8 hdr + 48 mse_3bit + 8 out_idx + 16 out_val_fp16 = 80 bytes. Lives between 4b (72B) and 5b (88B) on the size axis. Llama 3.2 3B PPL on bench/data/ppl_1k.txt (FP32 = 13.56): turbo_kv_4b 72B 14.28 (+5.3%) turbo_kv_3bo 80B 14.03 (+3.5%) ← Pareto improvement over 4b turbo_kv_5b 88B 13.60 (+0.34%) turbo_kv_4bo 96B 13.86 (+2.2%) (dominated by 5b) SmolLM2 135M PPL (FP32 = 18.62): turbo_kv_4b 72B 19.70 (+5.8%) turbo_kv_3bo 80B 20.45 (+9.8%) ← regression on this model turbo_kv_5b 88B 18.94 (+1.7%) turbo_kv_4bo 96B 19.29 (+3.6%) Key finding: per-channel outlier handling is **model-dependent**. On Llama 3.2 3B with head_dim=128 and a heavier-tailed distribution, 3bo Pareto-improves over 4b. On SmolLM2 135M with smaller dimensions, the 3-bit base is too coarse even with outliers and we regress past 4b. 5b remains the quality champion across both models. Decision: ship 3bo and 4bo as research/experimental types (selectable via -k turbo_kv_3bo / turbo_kv_4bo). The README headline keeps turbo_kv_4b as default and turbo_kv_5b as the quality option. 35/35 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4576910 commit 5b5e4b7

5 files changed

Lines changed: 181 additions & 2 deletions

File tree

include/turboquant/tq_types.h

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ typedef enum {
5656
TQ_TYPE_UNIFORM_3B= 12, /* Min-Max uniform 3-bit with sub-block scales */
5757
TQ_TYPE_TURBO_KV_5B = 13,/* TurboQuant KV: RHT + 5-bit Lloyd-Max codebook */
5858
TQ_TYPE_TURBO_KV_4BO = 14,/* TurboQuant KV: 4-bit codebook + 8 FP16 outliers */
59-
TQ_TYPE_COUNT = 15
59+
TQ_TYPE_TURBO_KV_3BO = 15,/* TurboQuant KV: 3-bit codebook + 8 FP16 outliers */
60+
TQ_TYPE_COUNT = 16
6061
} tq_type;
6162

6263
/* ============================================================
@@ -245,6 +246,24 @@ typedef struct {
245246
uint16_t out_values[TQ_KV_4BO_OUTLIERS]; /* outlier values FP16 (16B) */
246247
} block_tq_turbo_kv_4bo;
247248

249+
/* TurboQuant KV cache block: 3-bit + per-block outliers (Variant G, smaller base)
250+
*
251+
* Same outlier mechanism as turbo_kv_4bo but with a 3-bit (8-level) codebook
252+
* for the body. Smaller block size at the cost of a coarser codebook.
253+
*
254+
* Layout: 8 hdr + 48 mse_3bit + 8 out_idx + 16 out_val_fp16 = 80 bytes
255+
* Compare: 4b=72B, 4bo=96B, 5b=88B, 3bo=80B
256+
*/
257+
typedef struct {
258+
uint16_t norm; /* L2 norm of original (fp16) */
259+
uint16_t residual_norm; /* unused */
260+
uint16_t inv_std_fp16; /* per-block inv_std */
261+
uint16_t _pad; /* alignment */
262+
uint8_t mse_indices[TQ_BK * 3 / 8]; /* 3-bit packed indices (48B) */
263+
uint8_t out_indices[TQ_KV_4BO_OUTLIERS]; /* outlier channel indices (8B) */
264+
uint16_t out_values[TQ_KV_4BO_OUTLIERS]; /* outlier values FP16 (16B) */
265+
} block_tq_turbo_kv_3bo;
266+
248267
/* TurboQuant KV cache block: 5-bit variant (Variant F architecture)
249268
*
250269
* 5-bit (32-level) Lloyd-Max-Gaussian codebook on RHT-rotated values.
@@ -320,6 +339,7 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_3b, 8 + TQ_BK * 3 / 8);
320339
TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK / 2);
321340
TQ_CHECK_SIZE(block_tq_turbo_kv_5b, 8 + TQ_BK * 5 / 8);
322341
TQ_CHECK_SIZE(block_tq_turbo_kv_4bo, 8 + TQ_BK / 2 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
342+
TQ_CHECK_SIZE(block_tq_turbo_kv_3bo, 8 + TQ_BK * 3 / 8 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
323343
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
324344
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
325345

integrations/llamacpp/tq_kv_cache.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ enum {
4747
GGML_TYPE_TQ_UNIFORM_3B = GGML_TYPE_TQ_BASE + 12,
4848
GGML_TYPE_TQ_TURBO_KV_5B = GGML_TYPE_TQ_BASE + 13,
4949
GGML_TYPE_TQ_TURBO_KV_4BO = GGML_TYPE_TQ_BASE + 14,
50-
GGML_TYPE_TQ_COUNT = 15,
50+
GGML_TYPE_TQ_TURBO_KV_3BO = GGML_TYPE_TQ_BASE + 15,
51+
GGML_TYPE_TQ_COUNT = 16,
5152
};
5253

5354
/* ============================================================
@@ -71,6 +72,7 @@ static int tq_to_ggml_type(tq_type type) {
7172
case TQ_TYPE_UNIFORM_3B: return GGML_TYPE_TQ_UNIFORM_3B;
7273
case TQ_TYPE_TURBO_KV_5B: return GGML_TYPE_TQ_TURBO_KV_5B;
7374
case TQ_TYPE_TURBO_KV_4BO: return GGML_TYPE_TQ_TURBO_KV_4BO;
75+
case TQ_TYPE_TURBO_KV_3BO: return GGML_TYPE_TQ_TURBO_KV_3BO;
7476
default: return -1;
7577
}
7678
}
@@ -92,6 +94,7 @@ static tq_type ggml_to_tq_type(int ggml_id) {
9294
case GGML_TYPE_TQ_UNIFORM_3B: return TQ_TYPE_UNIFORM_3B;
9395
case GGML_TYPE_TQ_TURBO_KV_5B: return TQ_TYPE_TURBO_KV_5B;
9496
case GGML_TYPE_TQ_TURBO_KV_4BO: return TQ_TYPE_TURBO_KV_4BO;
97+
case GGML_TYPE_TQ_TURBO_KV_3BO: return TQ_TYPE_TURBO_KV_3BO;
9598
default: return TQ_TYPE_COUNT;
9699
}
97100
}
@@ -159,6 +162,7 @@ TQ_GGML_WRAPPERS(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
159162
TQ_GGML_WRAPPERS(uniform_3b, TQ_TYPE_UNIFORM_3B)
160163
TQ_GGML_WRAPPERS(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
161164
TQ_GGML_WRAPPERS(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
165+
TQ_GGML_WRAPPERS(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
162166

163167
/* ============================================================
164168
* vec_dot wrappers (quantized key . FP32 query -> scalar)
@@ -214,6 +218,7 @@ TQ_GGML_VEC_DOT(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
214218
TQ_GGML_VEC_DOT(uniform_3b, TQ_TYPE_UNIFORM_3B)
215219
TQ_GGML_VEC_DOT(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
216220
TQ_GGML_VEC_DOT(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
221+
TQ_GGML_VEC_DOT(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
217222

218223
/* ============================================================
219224
* GGML type trait table
@@ -353,6 +358,14 @@ static const tq_ggml_type_trait TQ_GGML_TRAITS[GGML_TYPE_TQ_COUNT] = {
353358
tq_ggml_to_float_turbo_kv_4bo,
354359
tq_ggml_vec_dot_turbo_kv_4bo,
355360
},
361+
{
362+
"tq_turbo_kv_3bo", GGML_TYPE_TQ_TURBO_KV_3BO, TQ_TYPE_TURBO_KV_3BO,
363+
sizeof(block_tq_turbo_kv_3bo), TQ_BK,
364+
(float)sizeof(block_tq_turbo_kv_3bo) * 8.0f / TQ_BK,
365+
tq_ggml_from_float_turbo_kv_3bo,
366+
tq_ggml_to_float_turbo_kv_3bo,
367+
tq_ggml_vec_dot_turbo_kv_3bo,
368+
},
356369
};
357370

358371
#define TQ_GGML_NUM_TYPES (sizeof(TQ_GGML_TRAITS) / sizeof(TQ_GGML_TRAITS[0]))
@@ -446,6 +459,7 @@ tq_type tq_parse_kv_cache_type(const char* arg) {
446459
{ "turbo_kv_4b", TQ_TYPE_TURBO_KV_4B },
447460
{ "turbo_kv_5b", TQ_TYPE_TURBO_KV_5B },
448461
{ "turbo_kv_4bo", TQ_TYPE_TURBO_KV_4BO },
462+
{ "turbo_kv_3bo", TQ_TYPE_TURBO_KV_3BO },
449463
{ "tq-turbo-kv-4b", TQ_TYPE_TURBO_KV_4B },
450464
{ "turbokv4", TQ_TYPE_TURBO_KV_4B },
451465
{ "turbo_kv_1b", TQ_TYPE_TURBO_KV_1B },

src/core/tq_traits.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ extern void tq_turbo_kv_4bo_dequantize_ref(const void* src, float* dst, int n);
5858
extern void tq_turbo_kv_4bo_attention_ref(const float* query, const void* kv,
5959
float* scores, int seq_len, int head_dim);
6060

61+
extern void tq_turbo_kv_3bo_quantize_ref(const float* src, void* dst, int n);
62+
extern void tq_turbo_kv_3bo_dequantize_ref(const void* src, float* dst, int n);
63+
extern void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv,
64+
float* scores, int seq_len, int head_dim);
65+
6166
extern void tq_turbo_kv_1b_quantize_ref(const float* src, void* dst, int n);
6267
extern void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n);
6368
extern void tq_turbo_kv_1b_attention_ref(const float* query, const void* kv,
@@ -190,6 +195,16 @@ tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
190195
.attention = tq_turbo_kv_4bo_attention_ref,
191196
.residual_type = TQ_TYPE_COUNT,
192197
},
198+
[TQ_TYPE_TURBO_KV_3BO] = {
199+
.name = "turbo_kv_3bo",
200+
.block_size = TQ_BK,
201+
.type_size = sizeof(block_tq_turbo_kv_3bo),
202+
.bpe = (float)sizeof(block_tq_turbo_kv_3bo) * 8.0f / TQ_BK,
203+
.quantize = tq_turbo_kv_3bo_quantize_ref,
204+
.dequantize = tq_turbo_kv_3bo_dequantize_ref,
205+
.attention = tq_turbo_kv_3bo_attention_ref,
206+
.residual_type = TQ_TYPE_COUNT,
207+
},
193208
[TQ_TYPE_TURBO_KV_1B] = {
194209
.name = "turbo_kv_1b",
195210
.block_size = TQ_BK,
@@ -293,6 +308,8 @@ tq_format_spec_t tq_get_format_spec(tq_type type) {
293308
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 5; break;
294309
case TQ_TYPE_TURBO_KV_4BO:
295310
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 4; break;
311+
case TQ_TYPE_TURBO_KV_3BO:
312+
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 3; break;
296313
case TQ_TYPE_TURBO_KV_1B:
297314
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 1; break;
298315
case TQ_TYPE_TURBO_KV_2B:

src/core/tq_turbo_kv.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,3 +1228,130 @@ void tq_turbo_kv_4bo_attention_ref(const float* query, const void* kv_cache,
12281228
scores[seq] = norm * mse_dot;
12291229
}
12301230
}
1231+
1232+
/* ============================================================
1233+
* TurboQuant KV 3-bit + outliers (Variant G, smaller base):
1234+
* Same outlier mechanism as 4bo but with a 3-bit codebook for the body.
1235+
* 80 byte block — between 4b (72) and 5b (88).
1236+
* ============================================================ */
1237+
1238+
void tq_turbo_kv_3bo_quantize_ref(const float* src, void* dst, int n) {
1239+
block_tq_turbo_kv_3bo* block = (block_tq_turbo_kv_3bo*)dst;
1240+
int dim = n;
1241+
if (dim > TQ_BK) dim = TQ_BK;
1242+
1243+
float norm_sq = 0.0f;
1244+
for (int i = 0; i < dim; i++) norm_sq += src[i] * src[i];
1245+
float norm = sqrtf(norm_sq);
1246+
block->norm = tkv_fp32_to_fp16(norm);
1247+
block->residual_norm = 0;
1248+
block->_pad = 0;
1249+
1250+
float rotated[TQ_BK];
1251+
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
1252+
for (int i = 0; i < dim; i++) rotated[i] = src[i] * inv_norm;
1253+
for (int i = dim; i < TQ_BK; i++) rotated[i] = 0.0f;
1254+
tq_rht_transform(rotated, dim, TKV_DEFAULT_SEED);
1255+
1256+
/* Find top-K outliers */
1257+
int K = TQ_KV_4BO_OUTLIERS;
1258+
int out_idx[TQ_KV_4BO_OUTLIERS];
1259+
float out_abs[TQ_KV_4BO_OUTLIERS];
1260+
for (int k = 0; k < K; k++) { out_idx[k] = -1; out_abs[k] = -1.0f; }
1261+
1262+
for (int i = 0; i < dim; i++) {
1263+
float a = fabsf(rotated[i]);
1264+
int min_pos = 0;
1265+
for (int k = 1; k < K; k++) {
1266+
if (out_abs[k] < out_abs[min_pos]) min_pos = k;
1267+
}
1268+
if (a > out_abs[min_pos]) {
1269+
out_abs[min_pos] = a;
1270+
out_idx[min_pos] = i;
1271+
}
1272+
}
1273+
for (int k = 0; k < K; k++) {
1274+
int idx = out_idx[k];
1275+
if (idx < 0) {
1276+
block->out_indices[k] = 0;
1277+
block->out_values[k] = 0;
1278+
} else {
1279+
block->out_indices[k] = (uint8_t)idx;
1280+
block->out_values[k] = tkv_fp32_to_fp16(rotated[idx]);
1281+
}
1282+
}
1283+
1284+
/* Body-only max-abs scaling for 3-bit codebook */
1285+
char is_outlier[TQ_BK];
1286+
memset(is_outlier, 0, sizeof(is_outlier));
1287+
for (int k = 0; k < K; k++) {
1288+
if (out_idx[k] >= 0) is_outlier[out_idx[k]] = 1;
1289+
}
1290+
float body_max_abs = 0.0f;
1291+
for (int i = 0; i < dim; i++) {
1292+
if (is_outlier[i]) continue;
1293+
float a = fabsf(rotated[i]);
1294+
if (a > body_max_abs) body_max_abs = a;
1295+
}
1296+
if (body_max_abs < 1e-10f) body_max_abs = 1.0f;
1297+
const float CENT_3BIT_MAX = 2.1520f;
1298+
float inv_std = CENT_3BIT_MAX / body_max_abs;
1299+
block->inv_std_fp16 = tkv_fp32_to_fp16(inv_std);
1300+
1301+
uint8_t indices[TQ_BK];
1302+
tq_codebook_quantize(rotated, indices, dim, 3, inv_std);
1303+
pack_3bit(indices, block->mse_indices, dim);
1304+
}
1305+
1306+
static void dequant_mse_rotated_3bo(const block_tq_turbo_kv_3bo* block,
1307+
float* rotated, int dim) {
1308+
float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
1309+
if (inv_std < 1e-10f) inv_std = sqrtf((float)dim);
1310+
uint8_t indices[TQ_BK] = {0};
1311+
unpack_3bit(block->mse_indices, indices, dim);
1312+
tq_codebook_dequantize(indices, rotated, dim, 3, inv_std);
1313+
1314+
int K = TQ_KV_4BO_OUTLIERS;
1315+
for (int k = 0; k < K; k++) {
1316+
int idx = block->out_indices[k];
1317+
if (idx < dim) {
1318+
rotated[idx] = tkv_fp16_to_fp32(block->out_values[k]);
1319+
}
1320+
}
1321+
}
1322+
1323+
void tq_turbo_kv_3bo_dequantize_ref(const void* src, float* dst, int n) {
1324+
const block_tq_turbo_kv_3bo* block = (const block_tq_turbo_kv_3bo*)src;
1325+
int dim = n;
1326+
if (dim > TQ_BK) dim = TQ_BK;
1327+
1328+
float norm = tkv_fp16_to_fp32(block->norm);
1329+
float rotated[TQ_BK];
1330+
dequant_mse_rotated_3bo(block, rotated, dim);
1331+
tq_rht_inverse(rotated, dim, TKV_DEFAULT_SEED);
1332+
for (int i = 0; i < dim; i++) dst[i] = rotated[i] * norm;
1333+
}
1334+
1335+
void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv_cache,
1336+
float* scores, int seq_len, int head_dim) {
1337+
const block_tq_turbo_kv_3bo* blocks_3bo = (const block_tq_turbo_kv_3bo*)kv_cache;
1338+
int dim = head_dim;
1339+
if (dim > TQ_BK) dim = TQ_BK;
1340+
1341+
float q_rot[TQ_BK];
1342+
memcpy(q_rot, query, (size_t)dim * sizeof(float));
1343+
for (int i = dim; i < TQ_BK; i++) q_rot[i] = 0.0f;
1344+
tq_rht_transform(q_rot, dim, TKV_DEFAULT_SEED);
1345+
1346+
for (int seq = 0; seq < seq_len; seq++) {
1347+
const block_tq_turbo_kv_3bo* block = &blocks_3bo[seq];
1348+
float norm = tkv_fp16_to_fp32(block->norm);
1349+
1350+
float rotated[TQ_BK];
1351+
dequant_mse_rotated_3bo(block, rotated, dim);
1352+
1353+
float mse_dot = 0.0f;
1354+
for (int d = 0; d < dim; d++) mse_dot += q_rot[d] * rotated[d];
1355+
scores[seq] = norm * mse_dot;
1356+
}
1357+
}

tools/quant.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ static tq_type parse_kv_type(const char* s) {
8383
if (strcmp(s, "turbo_kv_4b") == 0) return TQ_TYPE_TURBO_KV_4B;
8484
if (strcmp(s, "turbo_kv_5b") == 0) return TQ_TYPE_TURBO_KV_5B;
8585
if (strcmp(s, "turbo_kv_4bo") == 0) return TQ_TYPE_TURBO_KV_4BO;
86+
if (strcmp(s, "turbo_kv_3bo") == 0) return TQ_TYPE_TURBO_KV_3BO;
8687
if (strcmp(s, "turbo_kv_1b") == 0) return TQ_TYPE_TURBO_KV_1B;
8788
if (strcmp(s, "qjl_1b") == 0) return TQ_TYPE_QJL_1B;
8889
if (strcmp(s, "mixed_4b8") == 0) return TQ_TYPE_MIXED_4B8;

0 commit comments

Comments
 (0)