@@ -12942,6 +12942,43 @@ void tq_free_model(tq_model_t* model) {
         }
     }
     free(model->moe_config);
+
+    /* Free dequantized norm/embedding buffers (GGUF path only).
+     * In the GGUF path, dequant_tensor_fp32() individually malloc's each
+     * norm weight. In the SafeTensor path, these point into _converted_data
+     * (freed above), so we must NOT free them again. */
+    if (model->gguf_ctx && model->layers) {
+        for (int l = 0; l < model->config.n_layers; l++) {
+            tq_layer_weights_t* layer = &model->layers[l];
+            free(layer->attn_norm);
+            free(layer->ffn_norm);
+            free(layer->q_norm);
+            free(layer->k_norm);
+            free(layer->post_attn_norm);
+            free(layer->post_ffn_norm);
+            free(layer->pre_ffn_norm);
+            free(layer->post_ffn_norm_1);
+            free(layer->pre_ffn_norm_2);
+            free(layer->post_ffn_norm_2);
+            free(layer->ple_norm);
+            free(layer->delta_a_log);
+            free(layer->delta_conv1d);
+            free(layer->delta_dt_bias);
+            free(layer->delta_in_proj_qkv);
+            free(layer->delta_in_proj_z);
+            free(layer->delta_norm);
+            free(layer->delta_in_proj_a);
+            free(layer->delta_in_proj_b);
+            free(layer->delta_out_proj);
+        }
+        free(model->token_embedding);
+        free(model->output_weight);
+        free(model->output_norm);
+        free(model->rope_freqs);
+        free(model->ple_proj);
+        free(model->ple_proj_norm);
+    }
+
     free(model->layers);
 
     /* Free GGUF context (handles munmap internally) */
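The hunk above frees the per-tensor dequantized buffers only when `model->gguf_ctx` is set, because on the SafeTensor path the same pointers alias one shared `_converted_data` blob that is already freed elsewhere. A minimal sketch of that ownership rule, using a simplified hypothetical struct (only the `gguf_ctx` discriminator mirrors the real code):

```c
#include <stdlib.h>

/* Toy model: field names are illustrative, not the library's. */
typedef struct {
    void*  gguf_ctx;        /* non-NULL => GGUF path, per-tensor mallocs   */
    float* converted_data;  /* SafeTensor path: one shared conversion blob */
    float* attn_norm;       /* own allocation (GGUF) or view into the blob */
} toy_model_t;

static void toy_free(toy_model_t* m) {
    if (m->gguf_ctx) {
        /* GGUF path: each dequantized tensor was malloc'd individually,
         * so each one must be freed individually. */
        free(m->attn_norm);
    }
    /* SafeTensor path: attn_norm points into converted_data; freeing the
     * blob once releases everything, and freeing attn_norm as well would
     * be a double free. */
    free(m->converted_data);
}
```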
@@ -13317,12 +13354,16 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
         s->delta_dvec = (float*)calloc((size_t)dv, sizeof(float));
     }
 
-    /* Quantization workspace */
+    /* Quantization workspace — use MAX head_dim for hybrid attention (Gemma 4).
+     * Sliding layers have head_dim=256, full layers have head_dim=512.
+     * Quantized cache must accommodate the larger dimension. (issue #61) */
     size_t block_size = tq_type_block_size(kv_type);
     size_t type_size = tq_type_type_size(kv_type);
     if (block_size == 0) block_size = TQ_BK;
     if (type_size == 0) type_size = sizeof(block_tq_uniform_4b);
-    size_t n_blocks_per_head = ((size_t)config->head_dim + block_size - 1) / block_size;
+    int max_head_dim = config->head_dim;
+    if (config->full_head_dim > max_head_dim) max_head_dim = config->full_head_dim;
+    size_t n_blocks_per_head = ((size_t)max_head_dim + block_size - 1) / block_size;
     /* quant_key_buf is used as a gather buffer for integer attention:
      * we collect quantized key blocks for one KV head across all seq positions.
      * Size needed: max_seq_len * blocks_per_head * type_size */
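As a rough check of the new sizing, here is the block arithmetic with the head sizes quoted in the comment (256 sliding, 512 full) and an assumed 32-element quant block; the real values come from tq_type_block_size() and may differ:

```c
#include <assert.h>
#include <stddef.h>

int main(void) {
    size_t block_size = 32;                 /* assumed elements per block */
    size_t sliding_hd = 256, full_hd = 512; /* values from the comment    */

    /* Old sizing used only the sliding head_dim. */
    size_t old_blocks = (sliding_hd + block_size - 1) / block_size;   /* 8  */

    /* New sizing takes the max, so full-attention heads also fit. */
    size_t max_hd = full_hd > sliding_hd ? full_hd : sliding_hd;
    size_t new_blocks = (max_hd + block_size - 1) / block_size;       /* 16 */

    assert(old_blocks == 8 && new_blocks == 16);
    return 0;
}
```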
@@ -13337,7 +13378,10 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
      * Layout: [n_layers][max_seq_len][n_kv_heads][blocks_per_head * type_size]
      * Each key vector is quantized when stored, then reused for fast Q4xQ8 attention. */
     s->quant_head_stride = n_blocks_per_head * type_size;
-    size_t quant_pos_stride = s->quant_head_stride * (size_t)config->n_kv_heads;
+    /* Use max kv_heads for position stride (hybrid: sliding=8, full=2 but larger heads) */
+    int max_kv_heads = config->n_kv_heads;
+    if (config->full_n_kv_heads > max_kv_heads) max_kv_heads = config->full_n_kv_heads;
+    size_t quant_pos_stride = s->quant_head_stride * (size_t)max_kv_heads;
     s->quant_kv_stride = quant_pos_stride * (size_t)max_seq;
     if (kv_type < TQ_TYPE_COUNT) {
         s->quant_key_cache = calloc((size_t)n_layers * s->quant_kv_stride, 1);
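The resulting cache layout is [n_layers][max_seq_len][max_kv_heads][blocks_per_head * type_size]. A short stride walk with assumed numbers (only the 8-vs-2 KV-head split comes from the comment above):

```c
#include <stddef.h>
#include <stdio.h>

int main(void) {
    /* Assumed sizes for illustration; not the model's real configuration. */
    size_t blocks_per_head = 16, type_size = 20;
    size_t head_stride = blocks_per_head * type_size;        /* bytes per head     */

    int sliding_kv = 8, full_kv = 2;                          /* from the comment   */
    int max_kv_heads = sliding_kv > full_kv ? sliding_kv : full_kv;

    size_t max_seq = 4096;
    size_t pos_stride = head_stride * (size_t)max_kv_heads;  /* bytes per position */
    size_t kv_stride  = pos_stride * max_seq;                /* bytes per layer    */

    /* Block for (layer=3, pos=100, kv_head=5): plain stride arithmetic. */
    size_t offset = 3 * kv_stride + 100 * pos_stride + 5 * head_stride;
    printf("layer stride %zu B, offset %zu B\n", kv_stride, offset);
    return 0;
}
```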
@@ -14388,15 +14432,17 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     /* Quantized KV cache: stride was allocated with sliding dims (c->n_kv_heads, c->head_dim).
      * For hybrid attention full layers with different head_dim, skip quant cache
      * (quant_head_stride doesn't match). Fall back to FP32 cache for those layers. */
+    /* Hybrid attention KV cache: allocated with max(sliding, full) dimensions.
+     * quant_head_stride uses max_head_dim, quant_pos_stride uses max_kv_heads.
+     * Both sliding and full layers can use the quantized cache. (issue #61) */
     int cache_n_kv_heads = c->n_kv_heads;
-    if (head_dim != c->head_dim) {
-        /* Full layer: head_dim mismatch with quant cache allocation.
-         * Disable both quantized and integer attention → use FP32 path. */
+    if (c->full_n_kv_heads > cache_n_kv_heads) cache_n_kv_heads = c->full_n_kv_heads;
+    if (head_dim != c->head_dim && c->full_head_dim == 0) {
+        /* Non-hybrid head_dim mismatch: disable quantized path */
         use_quant_kv = 0;
         use_int_attn = 0;
-        /* Ensure K is stored in FP32 cache (may have been skipped above) */
         memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
-    } else if (use_int_attn && head_dim != c->head_dim) {
+    } else if (use_int_attn && head_dim != c->head_dim && c->full_head_dim == 0) {
         use_int_attn = 0;
         memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
     }
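With the cache over-allocated to the max dimensions, the quantized path now only needs to be disabled when head_dim mismatches on a non-hybrid model (full_head_dim == 0). A hedged restatement of that rule as a standalone predicate, with a hypothetical config struct carrying only the two fields the check uses:

```c
/* Hypothetical config for illustration; the real tq_model_config_t has
 * many more fields. */
typedef struct {
    int head_dim;       /* sliding-window layers' head size              */
    int full_head_dim;  /* full-attention layers' head size, 0 if absent */
} toy_cfg_t;

/* Returns 1 if a layer with this head_dim may use the quantized KV cache. */
static int can_use_quant_cache(const toy_cfg_t* c, int layer_head_dim) {
    /* Usual case: the layer matches the dimension the cache was sized for. */
    if (layer_head_dim == c->head_dim) return 1;
    /* Hybrid model: strides were built from max(head_dim, full_head_dim),
     * so the larger full-attention heads still fit in the cache. */
    if (c->full_head_dim != 0) return 1;
    /* Non-hybrid mismatch: fall back to the FP32 cache path. */
    return 0;
}
```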
@@ -16297,6 +16343,7 @@ static int chat_find_marker(const char* h, int hlen, const char* m) {
 static const char* const CHAT_END_MARKERS[] = {
     "<|im_end|>", "<|eot_id|>", "<end_of_turn>", "<|endoftext|>",
     "<|im_start|>", "<|start_header_id|>", "<|eom_id|>",
+    "</s>", "<|end|>",
     NULL,
 };
 
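The two new entries extend the stop-marker table scanned during chat decoding. The body of chat_find_marker() is not shown in this hunk, so the scanner below is only an assumed illustration of how a NULL-terminated table like CHAT_END_MARKERS is typically searched:

```c
#include <string.h>

/* Assumed helper: returns the byte offset of the earliest end marker found
 * in the generated text, or -1 if none of the markers appears. */
static int find_first_end_marker(const char* text, const char* const markers[]) {
    int best = -1;
    for (int i = 0; markers[i] != NULL; i++) {
        const char* hit = strstr(text, markers[i]);
        if (hit && (best < 0 || (int)(hit - text) < best))
            best = (int)(hit - text);
    }
    return best;  /* e.g. "...answer</s>" now stops at "</s>" as well */
}
```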