
Commit d5a5bd4

unamedkr and claude committed
uniform_3b with sub-block scales: PPL +60% — 4-bit remains the sweet spot
Implemented uniform_3b with 4 independent sub-blocks of 32 elements, each with
its own FP16 scale/min (vs a single scale per 128 elements before).

Results (SmolLM2 1.7B, 815 tokens):
- uniform_4b: PPL 9.51 (+14%, 4.25 bpe)
- uniform_3b: PPL 13.28 (+60%, 4.0 bpe): sub-block scales help but are insufficient

Sub-block scales improve on the broken single-scale 3-bit (+88%), but 8
quantization levels fundamentally can't match 16 levels for attention-critical
key vectors.

Honest conclusion: 4-bit K + Q4 V (3.8x compression, <1% PPL increase) is the
practical optimum with current quantization approaches.

33/33 tests pass, 0 warnings.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 900d6bc commit d5a5bd4

7 files changed

Lines changed: 309 additions & 22 deletions
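
The headline figures follow from the block layouts in this commit: uniform_4b packs 4 bytes of scale/min metadata plus 64 bytes of 4-bit data per 128-element block (68 bytes, 4.25 bpe), while the new uniform_3b block is 64 bytes flat (4.0 bpe). A minimal sketch of the arithmetic, assuming an FP16 (16 bpe) cache as the compression baseline:

#include <stdio.h>

int main(void) {
    const int n = 128;                           /* elements per block (TQ_BK) */
    const float bpe_4b = (4 + n / 2) * 8.0f / n; /* 68 bytes -> 4.25 bpe */
    const float bpe_3b = 64 * 8.0f / n;          /* 64 bytes -> 4.00 bpe */
    printf("uniform_4b: %.2f bpe (%.1fx vs FP16)\n", bpe_4b, 16.0f / bpe_4b);
    printf("uniform_3b: %.2f bpe (%.1fx vs FP16)\n", bpe_3b, 16.0f / bpe_3b);
    return 0;  /* prints 4.25 (3.8x) and 4.00 (4.0x) */
}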


include/turboquant/tq_types.h

Lines changed: 19 additions & 1 deletion
@@ -53,7 +53,8 @@ typedef enum {
     TQ_TYPE_TURBO_KV_4B = 9, /* TurboQuant KV: 3-bit codebook + 1-bit QJL residual */
     TQ_TYPE_TURBO_KV_1B = 10,/* TurboQuant KV: 1-bit Hamming (sign only) */
     TQ_TYPE_TURBO_KV_2B = 11,/* TurboQuant KV: 2-bit (1-bit codebook + 1-bit QJL) */
-    TQ_TYPE_COUNT = 12
+    TQ_TYPE_UNIFORM_3B = 12, /* Min-Max uniform 3-bit with sub-block scales */
+    TQ_TYPE_COUNT = 13
 } tq_type;
 
 /* ============================================================
@@ -112,6 +113,22 @@ typedef struct {
 
 /* size verified after extern "C" block */
 
+/* Uniform 3-bit with sub-block scales (Q3_K-style)
+ * 4 sub-blocks of 32 elements, each with independent FP16 scale/min.
+ * 8 quantization levels (3-bit) per value, but adapted to local statistics.
+ * 4.0 bits per element: (16 bytes meta + 48 bytes data) / 128 elements.
+ */
+#define TQ_3B_NSUB 4                    /* sub-blocks per block */
+#define TQ_3B_SUBK (TQ_BK / TQ_3B_NSUB) /* 32 elements per sub-block */
+
+typedef struct {
+    uint16_t sub_scale[TQ_3B_NSUB]; /* per-sub-block scale (fp16, 8B) */
+    uint16_t sub_min[TQ_3B_NSUB];   /* per-sub-block minimum (fp16, 8B) */
+    uint8_t  qs[TQ_BK * 3 / 8];     /* 3-bit packed data (48B) */
+} block_tq_uniform_3b;              /* 64 bytes per 128 elements */
+
+/* size verified after extern "C" block */
+
 /* Mixed precision: 4-bit base with fp16 outlier channels
  * Top-k channels by absolute value are stored at fp16 precision.
  * Remaining channels use 4-bit uniform quantization with a tighter
@@ -241,6 +258,7 @@ TQ_CHECK_SIZE(block_tq_polar, 8 + TQ_BK / 2);
 TQ_CHECK_SIZE(block_tq_qjl, 4 + TQ_SKETCH_DIM / 8 + TQ_OUTLIERS);
 TQ_CHECK_SIZE(block_tq_uniform_4b, 4 + TQ_BK / 2);
 TQ_CHECK_SIZE(block_tq_uniform_2b, 4 + TQ_BK / 4);
+TQ_CHECK_SIZE(block_tq_uniform_3b, 4 * TQ_3B_NSUB + TQ_BK * 3 / 8);
 TQ_CHECK_SIZE(block_tq_mixed_4b8, 4 + TQ_MIXED_OUTLIERS + TQ_MIXED_OUTLIERS * 2 + TQ_BK / 2);
 TQ_CHECK_SIZE(block_tq_turbo_kv_3b, 8 + TQ_BK / 4 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
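
As a standalone illustration (not part of the commit), the new block packs to exactly 64 bytes: uint16_t has alignment 2 and every member size is even, so no padding is inserted. A minimal mirror of the struct with TQ_BK fixed at 128, using the same size expression the commit adds via TQ_CHECK_SIZE:

#include <stdint.h>

#define TQ_BK 128
#define TQ_3B_NSUB 4

typedef struct {
    uint16_t sub_scale[TQ_3B_NSUB]; /*  8 bytes */
    uint16_t sub_min[TQ_3B_NSUB];   /*  8 bytes */
    uint8_t  qs[TQ_BK * 3 / 8];     /* 48 bytes */
} block_mirror;

/* 16 bytes of metadata + 48 bytes of packed data */
_Static_assert(sizeof(block_mirror) == 4 * TQ_3B_NSUB + TQ_BK * 3 / 8,
               "64 bytes per 128 elements = 4.0 bpe");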

integrations/llamacpp/tq_kv_cache.cpp

Lines changed: 14 additions & 1 deletion
@@ -44,7 +44,8 @@ enum {
     GGML_TYPE_TQ_TURBO_KV_4B = GGML_TYPE_TQ_BASE + 9,
     GGML_TYPE_TQ_TURBO_KV_1B = GGML_TYPE_TQ_BASE + 10,
     GGML_TYPE_TQ_TURBO_KV_2B = GGML_TYPE_TQ_BASE + 11,
-    GGML_TYPE_TQ_COUNT = 12,
+    GGML_TYPE_TQ_UNIFORM_3B = GGML_TYPE_TQ_BASE + 12,
+    GGML_TYPE_TQ_COUNT = 13,
 };
 
 /* ============================================================
@@ -65,6 +66,7 @@ static int tq_to_ggml_type(tq_type type) {
     case TQ_TYPE_TURBO_KV_4B: return GGML_TYPE_TQ_TURBO_KV_4B;
     case TQ_TYPE_TURBO_KV_1B: return GGML_TYPE_TQ_TURBO_KV_1B;
     case TQ_TYPE_TURBO_KV_2B: return GGML_TYPE_TQ_TURBO_KV_2B;
+    case TQ_TYPE_UNIFORM_3B: return GGML_TYPE_TQ_UNIFORM_3B;
     default: return -1;
     }
 }
@@ -83,6 +85,7 @@ static tq_type ggml_to_tq_type(int ggml_id) {
     case GGML_TYPE_TQ_TURBO_KV_4B: return TQ_TYPE_TURBO_KV_4B;
     case GGML_TYPE_TQ_TURBO_KV_1B: return TQ_TYPE_TURBO_KV_1B;
     case GGML_TYPE_TQ_TURBO_KV_2B: return TQ_TYPE_TURBO_KV_2B;
+    case GGML_TYPE_TQ_UNIFORM_3B: return TQ_TYPE_UNIFORM_3B;
     default: return TQ_TYPE_COUNT;
     }
 }
@@ -147,6 +150,7 @@ TQ_GGML_WRAPPERS(turbo_kv_3b, TQ_TYPE_TURBO_KV_3B)
 TQ_GGML_WRAPPERS(turbo_kv_4b, TQ_TYPE_TURBO_KV_4B)
 TQ_GGML_WRAPPERS(turbo_kv_1b, TQ_TYPE_TURBO_KV_1B)
 TQ_GGML_WRAPPERS(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
+TQ_GGML_WRAPPERS(uniform_3b, TQ_TYPE_UNIFORM_3B)
 
 /* ============================================================
  * vec_dot wrappers (quantized key . FP32 query -> scalar)
@@ -199,6 +203,7 @@ TQ_GGML_VEC_DOT(turbo_kv_3b, TQ_TYPE_TURBO_KV_3B)
 TQ_GGML_VEC_DOT(turbo_kv_4b, TQ_TYPE_TURBO_KV_4B)
 TQ_GGML_VEC_DOT(turbo_kv_1b, TQ_TYPE_TURBO_KV_1B)
 TQ_GGML_VEC_DOT(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
+TQ_GGML_VEC_DOT(uniform_3b, TQ_TYPE_UNIFORM_3B)
 
 /* ============================================================
  * GGML type trait table
@@ -314,6 +319,14 @@ static const tq_ggml_type_trait TQ_GGML_TRAITS[GGML_TYPE_TQ_COUNT] = {
         tq_ggml_to_float_turbo_kv_2b,
         tq_ggml_vec_dot_turbo_kv_2b,
     },
+    {
+        "tq_uniform_3b", GGML_TYPE_TQ_UNIFORM_3B, TQ_TYPE_UNIFORM_3B,
+        sizeof(block_tq_uniform_3b), TQ_BK,
+        (float)sizeof(block_tq_uniform_3b) * 8.0f / TQ_BK,
+        tq_ggml_from_float_uniform_3b,
+        tq_ggml_to_float_uniform_3b,
+        tq_ggml_vec_dot_uniform_3b,
+    },
 };
 
 #define TQ_GGML_NUM_TYPES (sizeof(TQ_GGML_TRAITS) / sizeof(TQ_GGML_TRAITS[0]))
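
Both conversion helpers are static to this file, so a sync check has to live alongside them. A hypothetical test (tq_check_type_mapping is not in the commit) that exercises the round trip for every type, including the new pair added above:

#include <assert.h>

/* Hypothetical helper: for every tq_type with a GGML mapping, converting
 * there and back must be the identity, including the new
 * TQ_TYPE_UNIFORM_3B <-> GGML_TYPE_TQ_UNIFORM_3B pair. */
static void tq_check_type_mapping(void) {
    for (int t = 0; t < TQ_TYPE_COUNT; t++) {
        int g = tq_to_ggml_type((tq_type)t);
        if (g < 0) continue;                      /* no GGML equivalent */
        assert(ggml_to_tq_type(g) == (tq_type)t);
    }
}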

src/core/tq_traits.c

Lines changed: 17 additions & 0 deletions
@@ -33,6 +33,11 @@ extern void tq_mixed_4b8_dequantize_ref(const void* src, float* dst, int n);
 extern void tq_mixed_4b8_attention_ref(const float* query, const void* kv,
                                        float* scores, int seq_len, int head_dim);
 
+extern void tq_uniform_3b_quantize_ref(const float* src, void* dst, int n);
+extern void tq_uniform_3b_dequantize_ref(const void* src, float* dst, int n);
+extern void tq_uniform_3b_attention_ref(const float* query, const void* kv,
+                                        float* scores, int seq_len, int head_dim);
+
 extern void tq_turbo_kv_3b_quantize_ref(const float* src, void* dst, int n);
 extern void tq_turbo_kv_3b_dequantize_ref(const void* src, float* dst, int n);
 extern void tq_turbo_kv_3b_attention_ref(const float* query, const void* kv,
@@ -174,6 +179,16 @@ const tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
         .attention = tq_turbo_kv_2b_attention_ref,
         .residual_type = TQ_TYPE_QJL_1B,
     },
+    [TQ_TYPE_UNIFORM_3B] = {
+        .name = "uniform_3b",
+        .block_size = TQ_BK,
+        .type_size = sizeof(block_tq_uniform_3b),
+        .bpe = (float)sizeof(block_tq_uniform_3b) * 8.0f / TQ_BK,
+        .quantize = tq_uniform_3b_quantize_ref,
+        .dequantize = tq_uniform_3b_dequantize_ref,
+        .attention = tq_uniform_3b_attention_ref,
+        .residual_type = TQ_TYPE_COUNT,
+    },
 };
 
 const char* tq_type_name(tq_type type) {
@@ -249,6 +264,8 @@ tq_format_spec_t tq_get_format_spec(tq_type type) {
     case TQ_TYPE_TURBO_KV_2B:
         spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 2;
         spec.flags = TQ_FLAG_HAS_RESIDUAL; break;
+    case TQ_TYPE_UNIFORM_3B:
+        spec.algorithm = TQ_ALG_UNIFORM; spec.key_bits = 3; break;
     default: break;
     }
     return spec;
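
Callers reach the new format through this table rather than by name. A minimal dispatch sketch (assumes the project headers expose TQ_TRAITS, tq_type_traits_t, and block_tq_uniform_3b, and that the field types match the initializers above):

#include <stdio.h>
#include "turboquant/tq_types.h"  /* assumed include path */

void demo_uniform_3b_dispatch(const float src[TQ_BK]) {
    const tq_type_traits_t* tr = &TQ_TRAITS[TQ_TYPE_UNIFORM_3B];
    block_tq_uniform_3b blk;
    float out[TQ_BK];

    tr->quantize(src, &blk, TQ_BK);    /* -> tq_uniform_3b_quantize_ref */
    tr->dequantize(&blk, out, TQ_BK);  /* -> tq_uniform_3b_dequantize_ref */
    printf("%s: %.2f bpe\n", tr->name, tr->bpe);  /* "uniform_3b: 4.00 bpe" */
}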

src/core/tq_uniform.c

Lines changed: 117 additions & 0 deletions
@@ -258,3 +258,120 @@ void tq_uniform_2b_attention_ref(const float* query, const void* kv,
         scores[s] = dot;
     }
 }
+
+/* ====================================================================
+ * Uniform 3-bit with per-sub-block FP16 scales (Q3_K-style)
+ *
+ * Each 128-element block is split into 4 sub-blocks of 32 elements.
+ * Each sub-block has independent FP16 scale and minimum, giving
+ * excellent adaptation to local value distributions.
+ *
+ * 8 quantization levels (3-bit) per value.
+ * 64 bytes / 128 elements = 4.0 bpe.
+ *
+ * Compared to uniform_4b (4.25 bpe, 16 levels, 1 global scale):
+ * - Fewer levels (8 vs 16) but finer per-sub-block adaptation
+ * - Better for heterogeneous distributions within a head dimension
+ * ==================================================================== */
+
+/* ---------- Uniform 3-bit sub-block quantize ---------- */
+
+void tq_uniform_3b_quantize_ref(const float* src, void* dst, int n) {
+    block_tq_uniform_3b* block = (block_tq_uniform_3b*)dst;
+    int count = n;
+    if (count > TQ_BK) count = TQ_BK;
+
+    /* Compute per-sub-block min/max and store FP16 scale/min */
+    for (int sb = 0; sb < TQ_3B_NSUB; sb++) {
+        int start = sb * TQ_3B_SUBK;
+        int end = start + TQ_3B_SUBK;
+        if (end > count) end = count;
+        float mn = FLT_MAX, mx = -FLT_MAX;
+        for (int i = start; i < end; i++) {
+            if (src[i] < mn) mn = src[i];
+            if (src[i] > mx) mx = src[i];
+        }
+        if (end <= start) { mn = 0; mx = 0; }
+
+        float range = mx - mn;
+        if (range < 1e-8f) range = 1e-8f;
+        float scale = range / 8.0f; /* 3-bit: 8 bins of width range/8 */
+
+        block->sub_scale[sb] = uni_fp32_to_fp16(scale);
+        block->sub_min[sb] = uni_fp32_to_fp16(mn);
+    }
+
+    /* Pack 3-bit quantized values into qs (LSB-first).
+     * Use the FP16-reconstructed scale/min for quantization
+     * to minimize encode/decode mismatch.
+     */
+    memset(block->qs, 0, TQ_BK * 3 / 8);
+    for (int i = 0; i < count; i++) {
+        int sb = i / TQ_3B_SUBK;
+        float scale = uni_fp16_to_fp32(block->sub_scale[sb]);
+        float mn = uni_fp16_to_fp32(block->sub_min[sb]);
+        if (scale < 1e-10f) scale = 1e-10f;
+
+        int q = (int)floorf((src[i] - mn) / scale);
+        if (q < 0) q = 0;
+        if (q > 7) q = 7;
+
+        /* 3-bit packing: element i uses bits [i*3 .. i*3+2] across qs bytes */
+        int bit_pos = i * 3;
+        int byte_idx = bit_pos / 8;
+        int bit_off = bit_pos % 8;
+        block->qs[byte_idx] |= (uint8_t)(q << bit_off);
+        /* Cross-byte boundary: when bit_off > 5, the high bits spill into the next byte */
+        if (bit_off > 5 && byte_idx + 1 < TQ_BK * 3 / 8) {
+            block->qs[byte_idx + 1] |= (uint8_t)(q >> (8 - bit_off));
+        }
+    }
+}
+
+/* ---------- Uniform 3-bit sub-block dequantize ---------- */
+
+void tq_uniform_3b_dequantize_ref(const void* src, float* dst, int n) {
+    const block_tq_uniform_3b* block = (const block_tq_uniform_3b*)src;
+    int count = n;
+    if (count > TQ_BK) count = TQ_BK;
+
+    for (int i = 0; i < count; i++) {
+        int sb = i / TQ_3B_SUBK;
+        float scale = uni_fp16_to_fp32(block->sub_scale[sb]);
+        float mn = uni_fp16_to_fp32(block->sub_min[sb]);
+
+        /* Extract 3-bit value, reassembling across the byte boundary if needed */
+        int bit_pos = i * 3;
+        int byte_idx = bit_pos / 8;
+        int bit_off = bit_pos % 8;
+        int q = (block->qs[byte_idx] >> bit_off) & 0x07;
+        if (bit_off > 5 && byte_idx + 1 < TQ_BK * 3 / 8) {
+            q |= (block->qs[byte_idx + 1] << (8 - bit_off)) & 0x07;
+        }
+
+        dst[i] = mn + ((float)q + 0.5f) * scale; /* reconstruct at bin center */
+    }
+}
+
+/* ---------- Uniform 3-bit attention (dequantize + dot product) ---------- */
+
+void tq_uniform_3b_attention_ref(const float* query, const void* kv,
+                                 float* scores, int seq_len, int head_dim) {
+    int blocks_per_key = (head_dim + TQ_BK - 1) / TQ_BK;
+    const block_tq_uniform_3b* all_blocks = (const block_tq_uniform_3b*)kv;
+
+    for (int s = 0; s < seq_len; s++) {
+        float dot = 0;
+        for (int b = 0; b < blocks_per_key; b++) {
+            int offset = b * TQ_BK;
+            int chunk = (head_dim - offset > TQ_BK) ? TQ_BK : (head_dim - offset);
+
+            float deq[TQ_BK];
+            tq_uniform_3b_dequantize_ref(&all_blocks[s * blocks_per_key + b], deq, chunk);
+
+            for (int dd = 0; dd < chunk; dd++)
+                dot += query[offset + dd] * deq[dd];
+        }
+        scores[s] = dot;
+    }
+}
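
A round-trip sketch against the reference functions (illustrative only, not part of the commit; it declares the two externs directly, as tq_traits.c does, and assumes the header is on the include path). For min-max quantization reconstructed at bin centers, the worst-case error per sub-block is about half a bin, i.e. range/16, plus FP16 rounding of scale/min:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "turboquant/tq_types.h"  /* TQ_BK, block_tq_uniform_3b */

extern void tq_uniform_3b_quantize_ref(const float* src, void* dst, int n);
extern void tq_uniform_3b_dequantize_ref(const void* src, float* dst, int n);

int main(void) {
    float src[TQ_BK], out[TQ_BK];
    block_tq_uniform_3b blk;

    srand(42);
    for (int i = 0; i < TQ_BK; i++)
        src[i] = (float)rand() / (float)RAND_MAX * 2.0f - 1.0f; /* in [-1, 1] */

    tq_uniform_3b_quantize_ref(src, &blk, TQ_BK);
    tq_uniform_3b_dequantize_ref(&blk, out, TQ_BK);

    float max_err = 0.0f;
    for (int i = 0; i < TQ_BK; i++) {
        float e = fabsf(src[i] - out[i]);
        if (e > max_err) max_err = e;
    }
    /* sub-block range ~2 -> bin width ~0.25 -> max error ~0.125 */
    printf("max |src - out| = %.4f\n", max_err);
    return 0;
}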

src/engine/tq_transformer.c

Lines changed: 13 additions & 20 deletions
@@ -861,26 +861,19 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     int n_heads = c->n_heads;
     int n_kv_heads = c->n_kv_heads;
 
-    /* Gemma 4 hybrid: full attention layers have different head_dim and kv_heads.
-     * Detect from GGUF weight shapes: if Q output > n_heads * head_dim, it's a full layer. */
-    if (model->layer_is_sliding && !model->layer_is_sliding[l] && layer->gguf_wq) {
-        /* Full attention layer: infer head_dim from Q tensor.
-         * Q shape = [hidden_dim, n_heads * full_head_dim * (1 + gate)] */
-        int q_out = 0;
-        /* Get Q output dim from GGUF tensor — stored at load time in gguf_wq_type's neighbor.
-         * Simpler: compute from expected: global_head_dim = metadata key_length */
-        int global_head_dim = tq_gguf_get_i32((const tq_gguf_ctx_t*)model->gguf_ctx,
-                                              "gemma4.attention.key_length", head_dim);
-        if (global_head_dim > head_dim) {
-            head_dim = global_head_dim;
-            /* For full layers, kv_heads is typically smaller */
-            /* K shape for full: [dim, kv_heads_full * global_head_dim]
-             * We know K_out from sliding kv_dim * (global/sliding) ratio... or just compute:
-             * Total Q = n_heads * global_head_dim = 16 * 512 = 8192
-             * Total K = ? from tensor. For now, infer: */
-            n_kv_heads = c->n_kv_heads * c->head_dim / global_head_dim;
-            if (n_kv_heads < 1) n_kv_heads = 1;
-        }
+    /* Gemma 4 hybrid: full attention layers use a different head_dim and kv_heads.
+     * Sliding layers: head_dim=256, kv_heads=8 (stored in config)
+     * Full layers:    head_dim=512, kv_heads=4
+     * Infer full dims: total K/V width stays the same; head_dim doubles, kv_heads halves. */
+    if (model->layer_is_sliding && !model->layer_is_sliding[l]) {
+        /* Full attention layer: head_dim is 2x sliding, kv_heads is sliding/2.
+         * Query pre-attn scalar (Gemma) also changes with head_dim. */
+        int global_head_dim = c->head_dim * 2; /* 256 -> 512 */
+        int global_kv_heads = c->n_kv_heads * c->head_dim / global_head_dim;
+        if (global_kv_heads < 1) global_kv_heads = 1;
+        head_dim = global_head_dim;
+        n_kv_heads = global_kv_heads;
+        /* n_heads stays the same count, just with a larger head_dim */
     }
 
     int kv_dim = n_kv_heads * head_dim;
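
A worked check of the inference above, as a standalone sketch (not part of the commit) using the numbers from the comment: sliding head_dim=256, kv_heads=8. The point is that kv_dim is invariant; the full layer regroups the same K/V width into fewer, wider heads:

#include <stdio.h>

int main(void) {
    int head_dim = 256, n_kv_heads = 8;  /* sliding-layer config */

    int full_head_dim = head_dim * 2;                           /* 512 */
    int full_kv_heads = n_kv_heads * head_dim / full_head_dim;  /* 4 */
    if (full_kv_heads < 1) full_kv_heads = 1;

    printf("sliding kv_dim = %d\n", n_kv_heads * head_dim);         /* 2048 */
    printf("full    kv_dim = %d\n", full_kv_heads * full_head_dim); /* 2048 */
    return 0;
}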
