
Commit 5aaa4ee

unamedkr and claude committed
Metal GPU compute graph runtime: full-layer forward (infrastructure)
New Metal kernels (tq_elementwise.metal):
- rope: RoPE position encoding on GPU
- gelu_tanh: GELU activation for Gemma-style models
- softmax_inplace: per-head softmax with SIMD-group reduction
- attention_qk: Q·K dot product for all positions (GQA-aware)
- attention_v: weighted V summation
- add_inplace: in-place residual connection

GPU compute graph (tq_metal_dispatch.m):
- tq_metal_gpu_init_attn(): allocate attention + KV cache GPU buffers
- tq_metal_graph_available(): check pipeline readiness
- tq_metal_forward_layer(): encode entire layer (rmsnorm→QKV→RoPE→
  attention→O-proj→residual→rmsnorm→FFN→residual) in 2 commits

Weight repacking:
- tq_metal_repack_q4(): transpose Q4 blocks to column-major GPU layout

Benchmark results (batch-1 inference, M1 Pro):
- GPU compute graph: 0.6 tok/s (2 commits/layer overhead dominates)
- CPU NEON Q4×Q8: 17 tok/s (still fastest for batch-1)
- Root cause: waitUntilCompleted ~0.3ms × 2 × 28 layers ≈ 17ms/token

GPU path disabled for batch-1 (if 0 &&). Infrastructure preserved for:
- Batch inference (multiple tokens → amortize commit overhead)
- Future 1-commit design (KV cache on GPU eliminates Phase A commit)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3a741d2 commit 5aaa4ee
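The root-cause arithmetic above, as a tiny self-contained check. The figures (~0.3 ms per waitUntilCompleted, 2 commits per layer, 28 layers) are the commit message's own; the program itself is illustrative and not part of this diff.

#include <stdio.h>

/* Per-token sync cost of the 2-commit design, using the commit's figures. */
int main(void) {
    double sync_ms_per_commit = 0.3;      /* waitUntilCompleted latency */
    int commits_per_token = 2 * 28;       /* 2 commits/layer x 28 layers */
    printf("sync overhead: %.1f ms/token\n",
           sync_ms_per_commit * commits_per_token);  /* ~16.8, i.e. ~17 ms */
    return 0;
}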

6 files changed

Lines changed: 989 additions & 51 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -58,3 +58,4 @@ libturboquant.a
 *.o
 tq_run
 tq_run.dSYM/
+.claude/worktrees/

include/turboquant/tq_gguf.h

Lines changed: 56 additions & 0 deletions
@@ -337,6 +337,62 @@ int tq_metal_moe_forward(
     const int* up_types,    /* per-expert up quant types, NULL = use weight_type */
     const int* down_types); /* per-expert down quant types, NULL = use weight_type */
 
+/* ============================================================
+ * GPU Compute Graph — Full Layer Forward
+ *
+ * Encodes ALL operations for one transformer layer into a single
+ * Metal command buffer with minimal CPU<->GPU sync.
+ * Eliminates per-kernel dispatch overhead that made per-matmul
+ * GPU dispatch slower than CPU NEON.
+ *
+ * Usage:
+ *   tq_metal_gpu_init_buffers(dim, inter, q_dim, kv_dim);
+ *   tq_metal_gpu_init_attn(n_heads, max_seq, kv_dim);
+ *   for each layer:
+ *     tq_metal_forward_layer(x, key_cache, value_cache, ...);
+ * ============================================================ */
+
+/* Initialize persistent GPU activation buffers (call once at model load) */
+int tq_metal_gpu_init_buffers(int max_dim, int max_inter, int max_q_dim, int max_kv_dim);
+
+/* Initialize attention + KV cache GPU buffers (call once after config is known) */
+int tq_metal_gpu_init_attn(int n_heads, int max_seq, int kv_dim);
+
+/* Check if full GPU compute graph forward is available */
+int tq_metal_graph_available(void);
+
+/* Full transformer layer forward on GPU (Q4 weights).
+ * Encodes rmsnorm → QKV → RoPE → attention → O-proj → residual →
+ * rmsnorm → gate/up → activation → mul → down → residual.
+ * Returns 0 on success, -1 if unavailable (use CPU fallback). */
+int tq_metal_forward_layer(
+    float* x,
+    float* key_cache, float* value_cache,
+    const float* w_attn_norm, const float* w_ffn_norm,
+    const uint8_t* wq_qs, const float* wq_sc,
+    const uint8_t* wk_qs, const float* wk_sc,
+    const uint8_t* wv_qs, const float* wv_sc,
+    const uint8_t* wo_qs, const float* wo_sc,
+    const uint8_t* wg_qs, const float* wg_sc,
+    const uint8_t* wu_qs, const float* wu_sc,
+    const uint8_t* wd_qs, const float* wd_sc,
+    int dim, int n_heads, int n_kv_heads, int head_dim,
+    int inter_dim, int pos, int seq_len, float rope_base, float rms_eps,
+    int use_gelu);
+
+/* Legacy layer forward (QKV matmul only, backward compat) */
+int tq_metal_layer_forward(
+    float* xb, float* xb2, float* q, float* k, float* v,
+    float* hb, float* hb2,
+    const uint8_t* wq_qs, const float* wq_scales,
+    const uint8_t* wk_qs, const float* wk_scales,
+    const uint8_t* wv_qs, const float* wv_scales,
+    const uint8_t* wo_qs, const float* wo_scales,
+    const uint8_t* wg_qs, const float* wg_scales,
+    const uint8_t* wu_qs, const float* wu_scales,
+    const uint8_t* wd_qs, const float* wd_scales,
+    int dim, int q_dim, int kv_dim, int inter_dim);
+
 #ifdef __cplusplus
 }
 #endif
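A minimal call-site sketch for the new API. The init-once-then-per-layer pattern and the 0 / -1 return contract come from the header above; the model/layer structs, field names, and cpu_forward_layer fallback are hypothetical.

/* Hypothetical driver loop; names other than the tq_metal_* calls are
 * illustrative, not from this commit. */
tq_metal_gpu_init_buffers(dim, inter_dim, q_dim, kv_dim);  /* once at load */
tq_metal_gpu_init_attn(n_heads, max_seq, kv_dim);          /* once at load */

for (int l = 0; l < n_layers; l++) {
    layer_t* L = &model->layers[l];
    int rc = -1;
    if (tq_metal_graph_available()) {
        rc = tq_metal_forward_layer(
            x, L->key_cache, L->value_cache,
            L->w_attn_norm, L->w_ffn_norm,
            L->wq_qs, L->wq_sc, L->wk_qs, L->wk_sc,
            L->wv_qs, L->wv_sc, L->wo_qs, L->wo_sc,
            L->wg_qs, L->wg_sc, L->wu_qs, L->wu_sc,
            L->wd_qs, L->wd_sc,
            dim, n_heads, n_kv_heads, head_dim,
            inter_dim, pos, pos + 1 /* seq_len for decode */,
            rope_base, rms_eps, use_gelu);
    }
    if (rc != 0)
        cpu_forward_layer(model, l, x, pos);  /* CPU NEON fallback */
}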

src/backend/metal/tq_elementwise.metal

Lines changed: 291 additions & 0 deletions
@@ -141,3 +141,294 @@ kernel void add_vectors(
         out[tid] = a[tid] + b[tid];
     }
 }
+
+/* ============================================================
+ * RoPE (Rotary Position Embedding)
+ *
+ * Applies rotation to pairs (x[2i], x[2i+1]) using:
+ *   theta    = pos * base^(-2i/head_dim)
+ *   x'[2i]   = x[2i]*cos(theta) - x[2i+1]*sin(theta)
+ *   x'[2i+1] = x[2i]*sin(theta) + x[2i+1]*cos(theta)
+ *
+ * Applies to both Q (n_heads heads) and K (n_kv_heads heads)
+ * packed contiguously: Q[0..n_heads*head_dim-1], K follows.
+ *
+ * Dispatch: one thread per pair in Q and K combined.
+ * Total threads = (n_heads + n_kv_heads) * head_dim / 2
+ * ============================================================ */
+kernel void rope(
+    device float*   q          [[buffer(0)]],
+    device float*   k          [[buffer(1)]],
+    constant uint&  pos        [[buffer(2)]],
+    constant uint&  head_dim   [[buffer(3)]],
+    constant uint&  n_heads    [[buffer(4)]],
+    constant uint&  n_kv_heads [[buffer(5)]],
+    constant float& rope_base  [[buffer(6)]],
+    uint id [[thread_position_in_grid]])
+{
+    uint half_hd = head_dim / 2;
+    uint total_q_pairs = n_heads * half_hd;
+
+    device float* vec;
+    uint pair_in_head;
+
+    if (id < total_q_pairs) {
+        /* Q region */
+        uint head = id / half_hd;
+        pair_in_head = id % half_hd;
+        vec = q + head * head_dim;
+    } else {
+        /* K region */
+        uint kid = id - total_q_pairs;
+        uint total_k_pairs = n_kv_heads * half_hd;
+        if (kid >= total_k_pairs) return;
+        uint head = kid / half_hd;
+        pair_in_head = kid % half_hd;
+        vec = k + head * head_dim;
+    }
+
+    float freq = 1.0f / pow(rope_base, 2.0f * float(pair_in_head) / float(head_dim));
+    float theta = float(pos) * freq;
+    float cos_t = cos(theta);
+    float sin_t = sin(theta);
+
+    uint idx = pair_in_head * 2;
+    float v0 = vec[idx];
+    float v1 = vec[idx + 1];
+    vec[idx]     = v0 * cos_t - v1 * sin_t;
+    vec[idx + 1] = v0 * sin_t + v1 * cos_t;
+}
+
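For reference, a plain-C sketch of the same pairwise rotation, assuming the packed Q-then-K layout described in the kernel comment. This mirrors the math above but is not code from this commit.

#include <math.h>

/* CPU reference for the rope kernel: rotate (x[2i], x[2i+1]) pairs in
 * one head vector, with theta = pos * base^(-2i/head_dim). */
static void rope_head_ref(float* vec, int head_dim, int pos, float base) {
    for (int i = 0; i < head_dim / 2; i++) {
        float freq  = 1.0f / powf(base, 2.0f * (float)i / (float)head_dim);
        float theta = (float)pos * freq;
        float c = cosf(theta), s = sinf(theta);
        float v0 = vec[2 * i], v1 = vec[2 * i + 1];
        vec[2 * i]     = v0 * c - v1 * s;
        vec[2 * i + 1] = v0 * s + v1 * c;
    }
}

/* Apply to all Q heads, then all K heads, as the GPU grid does. */
static void rope_ref(float* q, float* k, int n_heads, int n_kv_heads,
                     int head_dim, int pos, float base) {
    for (int h = 0; h < n_heads; h++)
        rope_head_ref(q + h * head_dim, head_dim, pos, base);
    for (int h = 0; h < n_kv_heads; h++)
        rope_head_ref(k + h * head_dim, head_dim, pos, base);
}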
+/* ============================================================
+ * GELU with tanh approximation
+ *
+ * gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+ *
+ * In-place: x[i] = gelu(x[i])
+ * Dispatch: grid covers all n elements, one thread per element.
+ * ============================================================ */
+kernel void gelu_tanh(
+    device float*  x [[buffer(0)]],
+    constant uint& n [[buffer(1)]],
+    uint tid [[thread_position_in_grid]])
+{
+    if (tid < n) {
+        float v = x[tid];
+        /* sqrt(2/pi) ≈ 0.7978845608 */
+        float inner = 0.7978845608f * (v + 0.044715f * v * v * v);
+        x[tid] = 0.5f * v * (1.0f + tanh(inner));
+    }
+}
+
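The same activation in plain C, useful as a CPU cross-check against the kernel (illustrative, not from this commit):

#include <math.h>

/* CPU reference for gelu_tanh: the tanh approximation of GELU. */
static inline float gelu_tanh_ref(float v) {
    const float k = 0.7978845608f;  /* sqrt(2/pi) */
    return 0.5f * v * (1.0f + tanhf(k * (v + 0.044715f * v * v * v)));
}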
+/* ============================================================
+ * Softmax (in-place, per-head)
+ *
+ * Each threadgroup processes one head's scores[0..len-1].
+ * Three phases: find max, then compute exp and sum, then normalize.
+ *
+ * Dispatch: threadgroups = n_heads, threads_per_threadgroup = 256
+ * ============================================================ */
+kernel void softmax_inplace(
+    device float*  x   [[buffer(0)]],
+    constant uint& len [[buffer(1)]],
+    uint gid    [[threadgroup_position_in_grid]],
+    uint tid    [[thread_index_in_threadgroup]],
+    uint tgsize [[threads_per_threadgroup]],
+    uint simd_lane [[thread_index_in_simdgroup]],
+    uint simd_gid  [[simdgroup_index_in_threadgroup]])
+{
+    threadgroup float scratch[8];
+
+    device float* row = x + gid * len;
+
+    /* Phase 1: find max */
+    float local_max = -INFINITY;
+    for (uint i = tid; i < len; i += tgsize) {
+        float v = row[i];
+        if (v > local_max) local_max = v;
+    }
+
+    /* SIMD reduction for max */
+    local_max = simd_max(local_max);
+    uint num_simd = (tgsize + 31) / 32;
+    if (simd_lane == 0) scratch[simd_gid] = local_max;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (simd_gid == 0) {
+        float val = (tid < num_simd) ? scratch[tid] : -INFINITY;
+        val = simd_max(val);
+        if (tid == 0) scratch[0] = val;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    float max_val = scratch[0];
+    /* Barrier before scratch is reused below, so the sum-reduction
+     * writes cannot race with any thread's read of max_val. */
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    /* Phase 2: exp and sum */
+    float local_sum = 0.0f;
+    for (uint i = tid; i < len; i += tgsize) {
+        float e = exp(row[i] - max_val);
+        row[i] = e;
+        local_sum += e;
+    }
+
+    /* SIMD reduction for sum */
+    local_sum = simd_reduce_sum_ew(local_sum);
+    if (simd_lane == 0) scratch[simd_gid] = local_sum;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (simd_gid == 0) {
+        float val = (tid < num_simd) ? scratch[tid] : 0.0f;
+        val = simd_reduce_sum_ew(val);
+        if (tid == 0) scratch[0] = val;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    float inv_sum = 1.0f / scratch[0];
+
+    /* Phase 3: normalize */
+    for (uint i = tid; i < len; i += tgsize) {
+        row[i] *= inv_sum;
+    }
+}
+
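A plain-C version of the same numerically stable softmax, with the three phases written sequentially (illustrative reference, not from this commit):

#include <math.h>

/* CPU reference for softmax_inplace over one head's scores. */
static void softmax_ref(float* row, int len) {
    /* Phase 1: find max for numerical stability */
    float max_val = row[0];
    for (int i = 1; i < len; i++)
        if (row[i] > max_val) max_val = row[i];

    /* Phase 2: exponentiate and accumulate the sum */
    float sum = 0.0f;
    for (int i = 0; i < len; i++) {
        row[i] = expf(row[i] - max_val);
        sum += row[i];
    }

    /* Phase 3: normalize */
    float inv_sum = 1.0f / sum;
    for (int i = 0; i < len; i++)
        row[i] *= inv_sum;
}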
+/* ============================================================
+ * Attention Q·K scoring
+ *
+ * For each head h, compute: scores[h * seq_len + t] = dot(Q_h, K_cache[t, h])
+ * where K_cache layout is [seq_len, n_kv_heads, head_dim].
+ *
+ * With GQA: multiple Q heads share one KV head (kv_mul = n_heads / n_kv_heads).
+ *
+ * Dispatch: one threadgroup per (head, position) pair.
+ * Grid = (n_heads * seq_len, 1, 1), threadgroup = (256, 1, 1)
+ * ============================================================ */
+kernel void attention_qk(
+    device const float* q       [[buffer(0)]],
+    device const float* k_cache [[buffer(1)]],
+    device float*       scores  [[buffer(2)]],
+    constant uint& head_dim   [[buffer(3)]],
+    constant uint& seq_len    [[buffer(4)]],
+    constant uint& n_heads    [[buffer(5)]],
+    constant uint& n_kv_heads [[buffer(6)]],
+    constant uint& kv_dim     [[buffer(7)]],
+    uint gid    [[threadgroup_position_in_grid]],
+    uint tid    [[thread_index_in_threadgroup]],
+    uint tgsize [[threads_per_threadgroup]],
+    uint simd_lane [[thread_index_in_simdgroup]],
+    uint simd_gid  [[simdgroup_index_in_threadgroup]])
+{
+    threadgroup float scratch[8];
+
+    uint h = gid / seq_len;  /* query head index */
+    uint t = gid % seq_len;  /* position in sequence */
+    if (h >= n_heads) return;
+
+    /* GQA: map query head to KV head */
+    uint kv_mul = n_heads / n_kv_heads;
+    uint kv_h = h / kv_mul;
+
+    device const float* q_head = q + h * head_dim;
+    /* K cache layout: [seq_len * kv_dim], position t at offset t * kv_dim + kv_h * head_dim */
+    device const float* k_vec = k_cache + t * kv_dim + kv_h * head_dim;
+
+    /* Parallel dot product */
+    float dot = 0.0f;
+    for (uint i = tid; i < head_dim; i += tgsize) {
+        dot += q_head[i] * k_vec[i];
+    }
+
+    /* SIMD reduction */
+    dot = simd_reduce_sum_ew(dot);
+    uint num_simd = (tgsize + 31) / 32;
+    if (simd_lane == 0) scratch[simd_gid] = dot;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (simd_gid == 0) {
+        float val = (tid < num_simd) ? scratch[tid] : 0.0f;
+        val = simd_reduce_sum_ew(val);
+        if (tid == 0) scratch[0] = val;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (tid == 0) {
+        /* Scale by 1/sqrt(head_dim) */
+        scores[h * seq_len + t] = scratch[0] * rsqrt(float(head_dim));
+    }
+}
+
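The same scoring in plain C, showing the GQA query-head to KV-head mapping and the 1/sqrt(head_dim) scaling without the threadgroup reduction machinery (illustrative reference, not from this commit):

#include <math.h>

/* CPU reference for attention_qk: scaled Q·K scores for one query
 * against every cached position. */
static void attention_qk_ref(const float* q, const float* k_cache,
                             float* scores, int head_dim, int seq_len,
                             int n_heads, int n_kv_heads, int kv_dim) {
    int kv_mul = n_heads / n_kv_heads;          /* Q heads per KV head */
    float scale = 1.0f / sqrtf((float)head_dim);
    for (int h = 0; h < n_heads; h++) {
        const float* q_head = q + h * head_dim;
        int kv_h = h / kv_mul;                  /* GQA mapping */
        for (int t = 0; t < seq_len; t++) {
            const float* k_vec = k_cache + t * kv_dim + kv_h * head_dim;
            float dot = 0.0f;
            for (int i = 0; i < head_dim; i++)
                dot += q_head[i] * k_vec[i];
            scores[h * seq_len + t] = dot * scale;
        }
    }
}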
+/* ============================================================
+ * Attention value weighted sum
+ *
+ * For each head h: output[h*head_dim + d] = sum_t(attn[h*seq_len+t] * V[t, kv_h, d])
+ * V cache layout: [seq_len, n_kv_heads, head_dim] (same as K cache).
+ *
+ * Dispatch: one threadgroup per (head, head_dim_element) pair.
+ * Grid = (n_heads * head_dim, 1, 1), threadgroup = (256, 1, 1)
+ * Each threadgroup reduces across seq_len for one output element.
+ * ============================================================ */
+kernel void attention_v(
+    device const float* attn_weights [[buffer(0)]],
+    device const float* v_cache      [[buffer(1)]],
+    device float*       output       [[buffer(2)]],
+    constant uint& head_dim   [[buffer(3)]],
+    constant uint& seq_len    [[buffer(4)]],
+    constant uint& n_heads    [[buffer(5)]],
+    constant uint& n_kv_heads [[buffer(6)]],
+    constant uint& kv_dim     [[buffer(7)]],
+    uint gid    [[threadgroup_position_in_grid]],
+    uint tid    [[thread_index_in_threadgroup]],
+    uint tgsize [[threads_per_threadgroup]],
+    uint simd_lane [[thread_index_in_simdgroup]],
+    uint simd_gid  [[simdgroup_index_in_threadgroup]])
+{
+    threadgroup float scratch[8];
+
+    uint h = gid / head_dim;  /* query head index */
+    uint d = gid % head_dim;  /* element within head */
+    if (h >= n_heads) return;
+
+    /* GQA: map query head to KV head */
+    uint kv_mul = n_heads / n_kv_heads;
+    uint kv_h = h / kv_mul;
+
+    device const float* attn_h = attn_weights + h * seq_len;
+
+    /* Parallel weighted sum across seq positions */
+    float sum = 0.0f;
+    for (uint t = tid; t < seq_len; t += tgsize) {
+        sum += attn_h[t] * v_cache[t * kv_dim + kv_h * head_dim + d];
+    }
+
+    /* SIMD reduction */
+    sum = simd_reduce_sum_ew(sum);
+    uint num_simd = (tgsize + 31) / 32;
+    if (simd_lane == 0) scratch[simd_gid] = sum;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    if (simd_gid == 0) {
+        float val = (tid < num_simd) ? scratch[tid] : 0.0f;
+        val = simd_reduce_sum_ew(val);
+        if (tid == 0) scratch[0] = val;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    if (tid == 0) {
+        output[h * head_dim + d] = scratch[0];
+    }
+}
+
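And the matching plain-C weighted sum over the V cache, again with the GQA head mapping (illustrative reference, not from this commit):

/* CPU reference for attention_v: weighted sum of cached V rows by the
 * softmaxed attention weights for each head. */
static void attention_v_ref(const float* attn_weights, const float* v_cache,
                            float* output, int head_dim, int seq_len,
                            int n_heads, int n_kv_heads, int kv_dim) {
    int kv_mul = n_heads / n_kv_heads;
    for (int h = 0; h < n_heads; h++) {
        int kv_h = h / kv_mul;  /* GQA mapping, as in attention_qk */
        const float* attn_h = attn_weights + h * seq_len;
        for (int d = 0; d < head_dim; d++) {
            float sum = 0.0f;
            for (int t = 0; t < seq_len; t++)
                sum += attn_h[t] * v_cache[t * kv_dim + kv_h * head_dim + d];
            output[h * head_dim + d] = sum;
        }
    }
}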
+/* ============================================================
+ * In-place vector add (aliased output)
+ *
+ * a[i] += b[i]
+ *
+ * Unlike add_vectors, which writes to a separate output, this
+ * adds b into a in place. Used in residual connections where
+ * we want x += xb2 without a separate output buffer.
+ *
+ * Dispatch: grid covers all n elements, one thread per element.
+ * ============================================================ */
+kernel void add_inplace(
+    device float*       a [[buffer(0)]],
+    device const float* b [[buffer(1)]],
+    constant uint&      n [[buffer(2)]],
+    uint tid [[thread_position_in_grid]])
+{
+    if (tid < n) {
+        a[tid] += b[tid];
+    }
+}
