Skip to content

Commit 5b5e4b7

Browse files
unamedkr authored and claude committed
turbo_kv_3bo: Variant G with 3-bit base + 8 outliers (research)
Smaller base codebook with the same per-block outlier mechanism as 4bo. Block layout: 8 hdr + 48 mse_3bit + 8 out_idx + 16 out_val_fp16 = 80 bytes. Lives between 4b (72B) and 5b (88B) on the size axis. Llama 3.2 3B PPL on bench/data/ppl_1k.txt (FP32 = 13.56): turbo_kv_4b 72B 14.28 (+5.3%) turbo_kv_3bo 80B 14.03 (+3.5%) ← Pareto improvement over 4b turbo_kv_5b 88B 13.60 (+0.34%) turbo_kv_4bo 96B 13.86 (+2.2%) (dominated by 5b) SmolLM2 135M PPL (FP32 = 18.62): turbo_kv_4b 72B 19.70 (+5.8%) turbo_kv_3bo 80B 20.45 (+9.8%) ← regression on this model turbo_kv_5b 88B 18.94 (+1.7%) turbo_kv_4bo 96B 19.29 (+3.6%) Key finding: per-channel outlier handling is **model-dependent**. On Llama 3.2 3B with head_dim=128 and a heavier-tailed distribution, 3bo Pareto-improves over 4b. On SmolLM2 135M with smaller dimensions, the 3-bit base is too coarse even with outliers and we regress past 4b. 5b remains the quality champion across both models. Decision: ship 3bo and 4bo as research/experimental types (selectable via -k turbo_kv_3bo / turbo_kv_4bo). The README headline keeps turbo_kv_4b as default and turbo_kv_5b as the quality option. 35/35 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4576910 commit 5b5e4b7

5 files changed

Lines changed: 181 additions & 2 deletions

File tree

include/turboquant/tq_types.h

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ typedef enum {
5656
TQ_TYPE_UNIFORM_3B= 12, /* Min-Max uniform 3-bit with sub-block scales */
5757
TQ_TYPE_TURBO_KV_5B = 13,/* TurboQuant KV: RHT + 5-bit Lloyd-Max codebook */
5858
TQ_TYPE_TURBO_KV_4BO = 14,/* TurboQuant KV: 4-bit codebook + 8 FP16 outliers */
59-
TQ_TYPE_COUNT = 15
59+
TQ_TYPE_TURBO_KV_3BO = 15,/* TurboQuant KV: 3-bit codebook + 8 FP16 outliers */
60+
TQ_TYPE_COUNT = 16
6061
} tq_type;
6162

6263
/* ============================================================
@@ -245,6 +246,24 @@ typedef struct {
245246
uint16_t out_values[TQ_KV_4BO_OUTLIERS]; /* outlier values FP16 (16B) */
246247
} block_tq_turbo_kv_4bo;
247248

249+
/* TurboQuant KV cache block: 3-bit + per-block outliers (Variant G, smaller base)
250+
*
251+
* Same outlier mechanism as turbo_kv_4bo but with a 3-bit (8-level) codebook
252+
* for the body. Smaller block size at the cost of a coarser codebook.
253+
*
254+
* Layout: 8 hdr + 48 mse_3bit + 8 out_idx + 16 out_val_fp16 = 80 bytes
255+
* Compare: 4b=72B, 4bo=96B, 5b=88B, 3bo=80B
256+
*/
257+
typedef struct {
258+
uint16_t norm; /* L2 norm of original (fp16) */
259+
uint16_t residual_norm; /* unused */
260+
uint16_t inv_std_fp16; /* per-block inv_std */
261+
uint16_t _pad; /* alignment */
262+
uint8_t mse_indices[TQ_BK * 3 / 8]; /* 3-bit packed indices (48B) */
263+
uint8_t out_indices[TQ_KV_4BO_OUTLIERS]; /* outlier channel indices (8B) */
264+
uint16_t out_values[TQ_KV_4BO_OUTLIERS]; /* outlier values FP16 (16B) */
265+
} block_tq_turbo_kv_3bo;
266+
248267
/* TurboQuant KV cache block: 5-bit variant (Variant F architecture)
249268
*
250269
* 5-bit (32-level) Lloyd-Max-Gaussian codebook on RHT-rotated values.
@@ -320,6 +339,7 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_3b, 8 + TQ_BK * 3 / 8);
320339
TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK / 2);
321340
TQ_CHECK_SIZE(block_tq_turbo_kv_5b, 8 + TQ_BK * 5 / 8);
322341
TQ_CHECK_SIZE(block_tq_turbo_kv_4bo, 8 + TQ_BK / 2 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
342+
TQ_CHECK_SIZE(block_tq_turbo_kv_3bo, 8 + TQ_BK * 3 / 8 + TQ_KV_4BO_OUTLIERS + TQ_KV_4BO_OUTLIERS * 2);
323343
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
324344
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
325345

integrations/llamacpp/tq_kv_cache.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ enum {
4747
GGML_TYPE_TQ_UNIFORM_3B = GGML_TYPE_TQ_BASE + 12,
4848
GGML_TYPE_TQ_TURBO_KV_5B = GGML_TYPE_TQ_BASE + 13,
4949
GGML_TYPE_TQ_TURBO_KV_4BO = GGML_TYPE_TQ_BASE + 14,
50-
GGML_TYPE_TQ_COUNT = 15,
50+
GGML_TYPE_TQ_TURBO_KV_3BO = GGML_TYPE_TQ_BASE + 15,
51+
GGML_TYPE_TQ_COUNT = 16,
5152
};
5253

5354
/* ============================================================
@@ -71,6 +72,7 @@ static int tq_to_ggml_type(tq_type type) {
7172
case TQ_TYPE_UNIFORM_3B: return GGML_TYPE_TQ_UNIFORM_3B;
7273
case TQ_TYPE_TURBO_KV_5B: return GGML_TYPE_TQ_TURBO_KV_5B;
7374
case TQ_TYPE_TURBO_KV_4BO: return GGML_TYPE_TQ_TURBO_KV_4BO;
75+
case TQ_TYPE_TURBO_KV_3BO: return GGML_TYPE_TQ_TURBO_KV_3BO;
7476
default: return -1;
7577
}
7678
}
@@ -92,6 +94,7 @@ static tq_type ggml_to_tq_type(int ggml_id) {
9294
case GGML_TYPE_TQ_UNIFORM_3B: return TQ_TYPE_UNIFORM_3B;
9395
case GGML_TYPE_TQ_TURBO_KV_5B: return TQ_TYPE_TURBO_KV_5B;
9496
case GGML_TYPE_TQ_TURBO_KV_4BO: return TQ_TYPE_TURBO_KV_4BO;
97+
case GGML_TYPE_TQ_TURBO_KV_3BO: return TQ_TYPE_TURBO_KV_3BO;
9598
default: return TQ_TYPE_COUNT;
9699
}
97100
}
@@ -159,6 +162,7 @@ TQ_GGML_WRAPPERS(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
159162
TQ_GGML_WRAPPERS(uniform_3b, TQ_TYPE_UNIFORM_3B)
160163
TQ_GGML_WRAPPERS(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
161164
TQ_GGML_WRAPPERS(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
165+
TQ_GGML_WRAPPERS(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
162166

163167
/* ============================================================
164168
* vec_dot wrappers (quantized key . FP32 query -> scalar)
@@ -214,6 +218,7 @@ TQ_GGML_VEC_DOT(turbo_kv_2b, TQ_TYPE_TURBO_KV_2B)
214218
TQ_GGML_VEC_DOT(uniform_3b, TQ_TYPE_UNIFORM_3B)
215219
TQ_GGML_VEC_DOT(turbo_kv_5b, TQ_TYPE_TURBO_KV_5B)
216220
TQ_GGML_VEC_DOT(turbo_kv_4bo, TQ_TYPE_TURBO_KV_4BO)
221+
TQ_GGML_VEC_DOT(turbo_kv_3bo, TQ_TYPE_TURBO_KV_3BO)
217222

218223
/* ============================================================
219224
* GGML type trait table
@@ -353,6 +358,14 @@ static const tq_ggml_type_trait TQ_GGML_TRAITS[GGML_TYPE_TQ_COUNT] = {
353358
tq_ggml_to_float_turbo_kv_4bo,
354359
tq_ggml_vec_dot_turbo_kv_4bo,
355360
},
361+
{
362+
"tq_turbo_kv_3bo", GGML_TYPE_TQ_TURBO_KV_3BO, TQ_TYPE_TURBO_KV_3BO,
363+
sizeof(block_tq_turbo_kv_3bo), TQ_BK,
364+
(float)sizeof(block_tq_turbo_kv_3bo) * 8.0f / TQ_BK,
365+
tq_ggml_from_float_turbo_kv_3bo,
366+
tq_ggml_to_float_turbo_kv_3bo,
367+
tq_ggml_vec_dot_turbo_kv_3bo,
368+
},
356369
};
357370

358371
#define TQ_GGML_NUM_TYPES (sizeof(TQ_GGML_TRAITS) / sizeof(TQ_GGML_TRAITS[0]))
@@ -446,6 +459,7 @@ tq_type tq_parse_kv_cache_type(const char* arg) {
446459
{ "turbo_kv_4b", TQ_TYPE_TURBO_KV_4B },
447460
{ "turbo_kv_5b", TQ_TYPE_TURBO_KV_5B },
448461
{ "turbo_kv_4bo", TQ_TYPE_TURBO_KV_4BO },
462+
{ "turbo_kv_3bo", TQ_TYPE_TURBO_KV_3BO },
449463
{ "tq-turbo-kv-4b", TQ_TYPE_TURBO_KV_4B },
450464
{ "turbokv4", TQ_TYPE_TURBO_KV_4B },
451465
{ "turbo_kv_1b", TQ_TYPE_TURBO_KV_1B },

src/core/tq_traits.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ extern void tq_turbo_kv_4bo_dequantize_ref(const void* src, float* dst, int n);
5858
extern void tq_turbo_kv_4bo_attention_ref(const float* query, const void* kv,
5959
float* scores, int seq_len, int head_dim);
6060

61+
extern void tq_turbo_kv_3bo_quantize_ref(const float* src, void* dst, int n);
62+
extern void tq_turbo_kv_3bo_dequantize_ref(const void* src, float* dst, int n);
63+
extern void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv,
64+
float* scores, int seq_len, int head_dim);
65+
6166
extern void tq_turbo_kv_1b_quantize_ref(const float* src, void* dst, int n);
6267
extern void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n);
6368
extern void tq_turbo_kv_1b_attention_ref(const float* query, const void* kv,
@@ -190,6 +195,16 @@ tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
190195
.attention = tq_turbo_kv_4bo_attention_ref,
191196
.residual_type = TQ_TYPE_COUNT,
192197
},
198+
[TQ_TYPE_TURBO_KV_3BO] = {
199+
.name = "turbo_kv_3bo",
200+
.block_size = TQ_BK,
201+
.type_size = sizeof(block_tq_turbo_kv_3bo),
202+
.bpe = (float)sizeof(block_tq_turbo_kv_3bo) * 8.0f / TQ_BK,
203+
.quantize = tq_turbo_kv_3bo_quantize_ref,
204+
.dequantize = tq_turbo_kv_3bo_dequantize_ref,
205+
.attention = tq_turbo_kv_3bo_attention_ref,
206+
.residual_type = TQ_TYPE_COUNT,
207+
},
193208
[TQ_TYPE_TURBO_KV_1B] = {
194209
.name = "turbo_kv_1b",
195210
.block_size = TQ_BK,
@@ -293,6 +308,8 @@ tq_format_spec_t tq_get_format_spec(tq_type type) {
293308
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 5; break;
294309
case TQ_TYPE_TURBO_KV_4BO:
295310
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 4; break;
311+
case TQ_TYPE_TURBO_KV_3BO:
312+
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 3; break;
296313
case TQ_TYPE_TURBO_KV_1B:
297314
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 1; break;
298315
case TQ_TYPE_TURBO_KV_2B:

src/core/tq_turbo_kv.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,3 +1228,130 @@ void tq_turbo_kv_4bo_attention_ref(const float* query, const void* kv_cache,
12281228
scores[seq] = norm * mse_dot;
12291229
}
12301230
}
1231+
1232+
/* ============================================================
1233+
* TurboQuant KV 3-bit + outliers (Variant G, smaller base):
1234+
* Same outlier mechanism as 4bo but with a 3-bit codebook for the body.
1235+
* 80 byte block — between 4b (72) and 5b (88).
1236+
* ============================================================ */
1237+
1238+
void tq_turbo_kv_3bo_quantize_ref(const float* src, void* dst, int n) {
1239+
block_tq_turbo_kv_3bo* block = (block_tq_turbo_kv_3bo*)dst;
1240+
int dim = n;
1241+
if (dim > TQ_BK) dim = TQ_BK;
1242+
1243+
float norm_sq = 0.0f;
1244+
for (int i = 0; i < dim; i++) norm_sq += src[i] * src[i];
1245+
float norm = sqrtf(norm_sq);
1246+
block->norm = tkv_fp32_to_fp16(norm);
1247+
block->residual_norm = 0;
1248+
block->_pad = 0;
1249+
1250+
float rotated[TQ_BK];
1251+
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
1252+
for (int i = 0; i < dim; i++) rotated[i] = src[i] * inv_norm;
1253+
for (int i = dim; i < TQ_BK; i++) rotated[i] = 0.0f;
1254+
tq_rht_transform(rotated, dim, TKV_DEFAULT_SEED);
1255+
1256+
/* Find top-K outliers */
1257+
int K = TQ_KV_4BO_OUTLIERS;
1258+
int out_idx[TQ_KV_4BO_OUTLIERS];
1259+
float out_abs[TQ_KV_4BO_OUTLIERS];
1260+
for (int k = 0; k < K; k++) { out_idx[k] = -1; out_abs[k] = -1.0f; }
1261+
1262+
for (int i = 0; i < dim; i++) {
1263+
float a = fabsf(rotated[i]);
1264+
int min_pos = 0;
1265+
for (int k = 1; k < K; k++) {
1266+
if (out_abs[k] < out_abs[min_pos]) min_pos = k;
1267+
}
1268+
if (a > out_abs[min_pos]) {
1269+
out_abs[min_pos] = a;
1270+
out_idx[min_pos] = i;
1271+
}
1272+
}
1273+
for (int k = 0; k < K; k++) {
1274+
int idx = out_idx[k];
1275+
if (idx < 0) {
1276+
block->out_indices[k] = 0;
1277+
block->out_values[k] = 0;
1278+
} else {
1279+
block->out_indices[k] = (uint8_t)idx;
1280+
block->out_values[k] = tkv_fp32_to_fp16(rotated[idx]);
1281+
}
1282+
}
1283+
1284+
/* Body-only max-abs scaling for 3-bit codebook */
1285+
char is_outlier[TQ_BK];
1286+
memset(is_outlier, 0, sizeof(is_outlier));
1287+
for (int k = 0; k < K; k++) {
1288+
if (out_idx[k] >= 0) is_outlier[out_idx[k]] = 1;
1289+
}
1290+
float body_max_abs = 0.0f;
1291+
for (int i = 0; i < dim; i++) {
1292+
if (is_outlier[i]) continue;
1293+
float a = fabsf(rotated[i]);
1294+
if (a > body_max_abs) body_max_abs = a;
1295+
}
1296+
if (body_max_abs < 1e-10f) body_max_abs = 1.0f;
1297+
const float CENT_3BIT_MAX = 2.1520f;
1298+
float inv_std = CENT_3BIT_MAX / body_max_abs;
1299+
block->inv_std_fp16 = tkv_fp32_to_fp16(inv_std);
1300+
1301+
uint8_t indices[TQ_BK];
1302+
tq_codebook_quantize(rotated, indices, dim, 3, inv_std);
1303+
pack_3bit(indices, block->mse_indices, dim);
1304+
}
1305+
1306+
static void dequant_mse_rotated_3bo(const block_tq_turbo_kv_3bo* block,
1307+
float* rotated, int dim) {
1308+
float inv_std = tkv_fp16_to_fp32(block->inv_std_fp16);
1309+
if (inv_std < 1e-10f) inv_std = sqrtf((float)dim);
1310+
uint8_t indices[TQ_BK] = {0};
1311+
unpack_3bit(block->mse_indices, indices, dim);
1312+
tq_codebook_dequantize(indices, rotated, dim, 3, inv_std);
1313+
1314+
int K = TQ_KV_4BO_OUTLIERS;
1315+
for (int k = 0; k < K; k++) {
1316+
int idx = block->out_indices[k];
1317+
if (idx < dim) {
1318+
rotated[idx] = tkv_fp16_to_fp32(block->out_values[k]);
1319+
}
1320+
}
1321+
}
1322+
1323+
void tq_turbo_kv_3bo_dequantize_ref(const void* src, float* dst, int n) {
1324+
const block_tq_turbo_kv_3bo* block = (const block_tq_turbo_kv_3bo*)src;
1325+
int dim = n;
1326+
if (dim > TQ_BK) dim = TQ_BK;
1327+
1328+
float norm = tkv_fp16_to_fp32(block->norm);
1329+
float rotated[TQ_BK];
1330+
dequant_mse_rotated_3bo(block, rotated, dim);
1331+
tq_rht_inverse(rotated, dim, TKV_DEFAULT_SEED);
1332+
for (int i = 0; i < dim; i++) dst[i] = rotated[i] * norm;
1333+
}
1334+
1335+
void tq_turbo_kv_3bo_attention_ref(const float* query, const void* kv_cache,
1336+
float* scores, int seq_len, int head_dim) {
1337+
const block_tq_turbo_kv_3bo* blocks_3bo = (const block_tq_turbo_kv_3bo*)kv_cache;
1338+
int dim = head_dim;
1339+
if (dim > TQ_BK) dim = TQ_BK;
1340+
1341+
float q_rot[TQ_BK];
1342+
memcpy(q_rot, query, (size_t)dim * sizeof(float));
1343+
for (int i = dim; i < TQ_BK; i++) q_rot[i] = 0.0f;
1344+
tq_rht_transform(q_rot, dim, TKV_DEFAULT_SEED);
1345+
1346+
for (int seq = 0; seq < seq_len; seq++) {
1347+
const block_tq_turbo_kv_3bo* block = &blocks_3bo[seq];
1348+
float norm = tkv_fp16_to_fp32(block->norm);
1349+
1350+
float rotated[TQ_BK];
1351+
dequant_mse_rotated_3bo(block, rotated, dim);
1352+
1353+
float mse_dot = 0.0f;
1354+
for (int d = 0; d < dim; d++) mse_dot += q_rot[d] * rotated[d];
1355+
scores[seq] = norm * mse_dot;
1356+
}
1357+
}

tools/quant.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ static tq_type parse_kv_type(const char* s) {
8383
if (strcmp(s, "turbo_kv_4b") == 0) return TQ_TYPE_TURBO_KV_4B;
8484
if (strcmp(s, "turbo_kv_5b") == 0) return TQ_TYPE_TURBO_KV_5B;
8585
if (strcmp(s, "turbo_kv_4bo") == 0) return TQ_TYPE_TURBO_KV_4BO;
86+
if (strcmp(s, "turbo_kv_3bo") == 0) return TQ_TYPE_TURBO_KV_3BO;
8687
if (strcmp(s, "turbo_kv_1b") == 0) return TQ_TYPE_TURBO_KV_1B;
8788
if (strcmp(s, "qjl_1b") == 0) return TQ_TYPE_QJL_1B;
8889
if (strcmp(s, "mixed_4b8") == 0) return TQ_TYPE_MIXED_4B8;

0 commit comments

Comments
 (0)