@@ -12942,6 +12942,43 @@ void tq_free_model(tq_model_t* model) {
         }
     }
     free(model->moe_config);
+
+    /* Free dequantized norm/embedding buffers (GGUF path only).
+     * In the GGUF path, dequant_tensor_fp32() individually malloc's each
+     * norm weight. In the SafeTensor path, these point into _converted_data
+     * (freed above), so we must NOT free them again. */
+    if (model->gguf_ctx && model->layers) {
+        for (int l = 0; l < model->config.n_layers; l++) {
+            tq_layer_weights_t* layer = &model->layers[l];
+            free(layer->attn_norm);
+            free(layer->ffn_norm);
+            free(layer->q_norm);
+            free(layer->k_norm);
+            free(layer->post_attn_norm);
+            free(layer->post_ffn_norm);
+            free(layer->pre_ffn_norm);
+            free(layer->post_ffn_norm_1);
+            free(layer->pre_ffn_norm_2);
+            free(layer->post_ffn_norm_2);
+            free(layer->ple_norm);
+            free(layer->delta_a_log);
+            free(layer->delta_conv1d);
+            free(layer->delta_dt_bias);
+            free(layer->delta_in_proj_qkv);
+            free(layer->delta_in_proj_z);
+            free(layer->delta_norm);
+            free(layer->delta_in_proj_a);
+            free(layer->delta_in_proj_b);
+            free(layer->delta_out_proj);
+        }
+        free(model->token_embedding);
+        free(model->output_weight);
+        free(model->output_norm);
+        free(model->rope_freqs);
+        free(model->ple_proj);
+        free(model->ple_proj_norm);
+    }
+
     free(model->layers);
 
     /* Free GGUF context (handles munmap internally) */
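The hunk above frees the per-tensor dequantized buffers only when `model->gguf_ctx` is set, because on the SafeTensor path the same pointers alias one shared `_converted_data` blob that is already freed elsewhere. A minimal sketch of that ownership rule, using a simplified hypothetical struct (only the `gguf_ctx` discriminator mirrors the real code):

```c
#include <stdlib.h>

/* Toy model: field names are illustrative, not the library's. */
typedef struct {
    void*  gguf_ctx;        /* non-NULL => GGUF path, per-tensor mallocs   */
    float* converted_data;  /* SafeTensor path: one shared conversion blob */
    float* attn_norm;       /* own allocation (GGUF) or view into the blob */
} toy_model_t;

static void toy_free(toy_model_t* m) {
    if (m->gguf_ctx) {
        /* GGUF path: each dequantized tensor was malloc'd individually,
         * so each one must be freed individually. */
        free(m->attn_norm);
    }
    /* SafeTensor path: attn_norm points into converted_data; freeing the
     * blob once releases everything, and freeing attn_norm as well would
     * be a double free. */
    free(m->converted_data);
}
```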
@@ -13317,12 +13354,16 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
         s->delta_dvec = (float*)calloc((size_t)dv, sizeof(float));
     }
 
-    /* Quantization workspace */
+    /* Quantization workspace — use MAX head_dim for hybrid attention (Gemma 4).
+     * Sliding layers have head_dim=256, full layers have head_dim=512.
+     * Quantized cache must accommodate the larger dimension. (issue #61) */
     size_t block_size = tq_type_block_size(kv_type);
     size_t type_size = tq_type_type_size(kv_type);
     if (block_size == 0) block_size = TQ_BK;
     if (type_size == 0) type_size = sizeof(block_tq_uniform_4b);
-    size_t n_blocks_per_head = ((size_t)config->head_dim + block_size - 1) / block_size;
+    int max_head_dim = config->head_dim;
+    if (config->full_head_dim > max_head_dim) max_head_dim = config->full_head_dim;
+    size_t n_blocks_per_head = ((size_t)max_head_dim + block_size - 1) / block_size;
     /* quant_key_buf is used as a gather buffer for integer attention:
      * we collect quantized key blocks for one KV head across all seq positions.
      * Size needed: max_seq_len * blocks_per_head * type_size */
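As a rough check of the new sizing, here is the block arithmetic with the head sizes quoted in the comment (256 sliding, 512 full) and an assumed 32-element quant block; the real values come from tq_type_block_size() and may differ:

```c
#include <assert.h>
#include <stddef.h>

int main(void) {
    size_t block_size = 32;                 /* assumed elements per block */
    size_t sliding_hd = 256, full_hd = 512; /* values from the comment    */

    /* Old sizing used only the sliding head_dim. */
    size_t old_blocks = (sliding_hd + block_size - 1) / block_size;   /* 8  */

    /* New sizing takes the max, so full-attention heads also fit. */
    size_t max_hd = full_hd > sliding_hd ? full_hd : sliding_hd;
    size_t new_blocks = (max_hd + block_size - 1) / block_size;       /* 16 */

    assert(old_blocks == 8 && new_blocks == 16);
    return 0;
}
```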
@@ -13337,7 +13378,10 @@ tq_state_t* tq_create_state_ex(const tq_model_config_t* config, tq_type kv_type,
      * Layout: [n_layers][max_seq_len][n_kv_heads][blocks_per_head * type_size]
      * Each key vector is quantized when stored, then reused for fast Q4xQ8 attention. */
     s->quant_head_stride = n_blocks_per_head * type_size;
-    size_t quant_pos_stride = s->quant_head_stride * (size_t)config->n_kv_heads;
+    /* Use max kv_heads for position stride (hybrid: sliding=8, full=2 but larger heads) */
+    int max_kv_heads = config->n_kv_heads;
+    if (config->full_n_kv_heads > max_kv_heads) max_kv_heads = config->full_n_kv_heads;
+    size_t quant_pos_stride = s->quant_head_stride * (size_t)max_kv_heads;
     s->quant_kv_stride = quant_pos_stride * (size_t)max_seq;
     if (kv_type < TQ_TYPE_COUNT) {
         s->quant_key_cache = calloc((size_t)n_layers * s->quant_kv_stride, 1);
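The resulting cache layout is [n_layers][max_seq_len][max_kv_heads][blocks_per_head * type_size]. A short stride walk with assumed numbers (only the 8-vs-2 KV-head split comes from the comment above):

```c
#include <stddef.h>
#include <stdio.h>

int main(void) {
    /* Assumed sizes for illustration; not the model's real configuration. */
    size_t blocks_per_head = 16, type_size = 20;
    size_t head_stride = blocks_per_head * type_size;        /* bytes per head     */

    int sliding_kv = 8, full_kv = 2;                          /* from the comment   */
    int max_kv_heads = sliding_kv > full_kv ? sliding_kv : full_kv;

    size_t max_seq = 4096;
    size_t pos_stride = head_stride * (size_t)max_kv_heads;  /* bytes per position */
    size_t kv_stride  = pos_stride * max_seq;                /* bytes per layer    */

    /* Block for (layer=3, pos=100, kv_head=5): plain stride arithmetic. */
    size_t offset = 3 * kv_stride + 100 * pos_stride + 5 * head_stride;
    printf("layer stride %zu B, offset %zu B\n", kv_stride, offset);
    return 0;
}
```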
@@ -14388,15 +14432,17 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
     /* Quantized KV cache: stride was allocated with sliding dims (c->n_kv_heads, c->head_dim).
      * For hybrid attention full layers with different head_dim, skip quant cache
      * (quant_head_stride doesn't match). Fall back to FP32 cache for those layers. */
+    /* Hybrid attention KV cache: allocated with max(sliding, full) dimensions.
+     * quant_head_stride uses max_head_dim, quant_pos_stride uses max_kv_heads.
+     * Both sliding and full layers can use the quantized cache. (issue #61) */
     int cache_n_kv_heads = c->n_kv_heads;
-    if (head_dim != c->head_dim) {
-        /* Full layer: head_dim mismatch with quant cache allocation.
-         * Disable both quantized and integer attention → use FP32 path. */
+    if (c->full_n_kv_heads > cache_n_kv_heads) cache_n_kv_heads = c->full_n_kv_heads;
+    if (head_dim != c->head_dim && c->full_head_dim == 0) {
+        /* Non-hybrid head_dim mismatch: disable quantized path */
         use_quant_kv = 0;
         use_int_attn = 0;
-        /* Ensure K is stored in FP32 cache (may have been skipped above) */
         memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
-    } else if (use_int_attn && head_dim != c->head_dim) {
+    } else if (use_int_attn && head_dim != c->head_dim && c->full_head_dim == 0) {
         use_int_attn = 0;
         memcpy(key_cache_layer + (size_t)pos * cache_kv_dim, s->k, kv_dim * sizeof(float));
     }
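With the cache over-allocated to the max dimensions, the quantized path now only needs to be disabled when head_dim mismatches on a non-hybrid model (full_head_dim == 0). A hedged restatement of that rule as a standalone predicate, with a hypothetical config struct carrying only the two fields the check uses:

```c
/* Hypothetical config for illustration; the real tq_model_config_t has
 * many more fields. */
typedef struct {
    int head_dim;       /* sliding-window layers' head size              */
    int full_head_dim;  /* full-attention layers' head size, 0 if absent */
} toy_cfg_t;

/* Returns 1 if a layer with this head_dim may use the quantized KV cache. */
static int can_use_quant_cache(const toy_cfg_t* c, int layer_head_dim) {
    /* Usual case: the layer matches the dimension the cache was sized for. */
    if (layer_head_dim == c->head_dim) return 1;
    /* Hybrid model: strides were built from max(head_dim, full_head_dim),
     * so the larger full-attention heads still fit in the cache. */
    if (c->full_head_dim != 0) return 1;
    /* Non-hybrid mismatch: fall back to the FP32 cache path. */
    return 0;
}
```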
@@ -16297,6 +16343,7 @@ static int chat_find_marker(const char* h, int hlen, const char* m) {
 static const char* const CHAT_END_MARKERS[] = {
     "<|im_end|>", "<|eot_id|>", "<end_of_turn>", "<|endoftext|>",
     "<|im_start|>", "<|start_header_id|>", "<|eom_id|>",
+    "</s>", "<|end|>",
     NULL,
 };
 
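The two new entries extend the stop-marker table scanned during chat decoding. The body of chat_find_marker() is not shown in this hunk, so the scanner below is only an assumed illustration of how a NULL-terminated table like CHAT_END_MARKERS is typically searched:

```c
#include <string.h>

/* Assumed helper: returns the byte offset of the earliest end marker found
 * in the generated text, or -1 if none of the markers appears. */
static int find_first_end_marker(const char* text, const char* const markers[]) {
    int best = -1;
    for (int i = 0; markers[i] != NULL; i++) {
        const char* hit = strstr(text, markers[i]);
        if (hit && (best < 0 || (int)(hit - text) < best))
            best = (int)(hit - text);
    }
    return best;  /* e.g. "...answer</s>" now stops at "</s>" as well */
}
```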