quantumaikr
diff --git a/‎src/backend/metal/tq_elementwise.metal‎
Lines changed: 143 additions & 0 deletions b/‎src/backend/metal/tq_elementwise.metal‎
Lines changed: 143 additions & 0 deletions
diff --git a/‎src/backend/metal/tq_matmul.metal‎
Lines changed: 11 additions & 8 deletions b/‎src/backend/metal/tq_matmul.metal‎
Lines changed: 11 additions & 8 deletions
diff --git a/‎src/backend/metal/tq_metal_common.h‎
Lines changed: 32 additions & 0 deletions b/‎src/backend/metal/tq_metal_common.h‎
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,143 @@
+/**
+ * TurboQuant -- Element-wise Metal compute shaders
+ *
+ * Provides GPU kernels for operations between matmuls that would
+ * otherwise force GPU->CPU->GPU round-trips:
+ *   - RMSNorm (with threadgroup reduction)
+ *   - SiLU activation
+ *   - Element-wise multiply
+ *   - Vector add
+ */
+#include <metal_stdlib>
+using namespace metal;
+
+/* ============================================================
+ * SIMD-group sum reduction (matches tq_polar.metal helpers)
+ * ============================================================ */
+
+inline float simd_reduce_sum_ew(float val) {
+    val += simd_shuffle_down(val, 16);
+    val += simd_shuffle_down(val, 8);
+    val += simd_shuffle_down(val, 4);
+    val += simd_shuffle_down(val, 2);
+    val += simd_shuffle_down(val, 1);
+    return val;
+}
+
+/* ============================================================
+ * RMSNorm kernel
+ *
+ * out[i] = (x[i] / rms(x)) * weight[i]
+ * rms(x) = sqrt(mean(x^2) + eps)
+ *
+ * Two-phase design:
+ *   Phase 1: Parallel reduction to compute sum of squares.
+ *   Phase 2: Each thread normalizes and scales its element(s).
+ *
+ * Dispatch: one threadgroup per row (n elements).
+ * Threadgroup size: 256 threads (8 SIMD groups of 32).
+ * Each thread handles ceil(n / tgsize) elements.
+ * ============================================================ */
+kernel void rmsnorm(
+    device const float* x      [[buffer(0)]],
+    device const float* weight [[buffer(1)]],
+    device float*       out    [[buffer(2)]],
+    constant uint&      n      [[buffer(3)]],
+    constant float&     eps    [[buffer(4)]],
+    uint tid        [[thread_index_in_threadgroup]],
+    uint tgsize     [[threads_per_threadgroup]],
+    uint simd_lane  [[thread_index_in_simdgroup]],
+    uint simd_gid   [[simdgroup_index_in_threadgroup]])
+{
+    /* Scratch for cross-SIMD-group reduction (max 8 SIMD groups for TG=256) */
+    threadgroup float scratch[8];
+
+    /* Phase 1: accumulate sum of squares */
+    float ss = 0.0f;
+    for (uint i = tid; i < n; i += tgsize) {
+        float v = x[i];
+        ss += v * v;
+    }
+
+    /* SIMD-group reduction */
+    ss = simd_reduce_sum_ew(ss);
+    uint num_simd_groups = (tgsize + 31) / 32;
+
+    if (simd_lane == 0) {
+        scratch[simd_gid] = ss;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    /* Final reduction in first SIMD group */
+    if (simd_gid == 0) {
+        float val = (tid < num_simd_groups) ? scratch[tid] : 0.0f;
+        val = simd_reduce_sum_ew(val);
+        if (tid == 0) {
+            scratch[0] = rsqrt(val / float(n) + eps);
+        }
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    /* Phase 2: normalize and scale */
+    float inv_rms = scratch[0];
+    for (uint i = tid; i < n; i += tgsize) {
+        out[i] = x[i] * inv_rms * weight[i];
+    }
+}
+
+/* ============================================================
+ * SiLU (Sigmoid Linear Unit) activation
+ *
+ * out[i] = x[i] * sigmoid(x[i]) = x[i] / (1 + exp(-x[i]))
+ *
+ * Dispatch: grid covers all n elements, one thread per element.
+ * ============================================================ */
+kernel void silu(
+    device const float* x   [[buffer(0)]],
+    device float*       out [[buffer(1)]],
+    constant uint&      n   [[buffer(2)]],
+    uint tid [[thread_position_in_grid]])
+{
+    if (tid < n) {
+        float v = x[tid];
+        out[tid] = v / (1.0f + exp(-v));
+    }
+}
+
+/* ============================================================
+ * Element-wise multiply
+ *
+ * out[i] = a[i] * b[i]
+ *
+ * Dispatch: grid covers all n elements, one thread per element.
+ * ============================================================ */
+kernel void mul_elementwise(
+    device const float* a   [[buffer(0)]],
+    device const float* b   [[buffer(1)]],
+    device float*       out [[buffer(2)]],
+    constant uint&      n   [[buffer(3)]],
+    uint tid [[thread_position_in_grid]])
+{
+    if (tid < n) {
+        out[tid] = a[tid] * b[tid];
+    }
+}
+
+/* ============================================================
+ * Vector add
+ *
+ * out[i] = a[i] + b[i]
+ *
+ * Dispatch: grid covers all n elements, one thread per element.
+ * ============================================================ */
+kernel void add_vectors(
+    device const float* a   [[buffer(0)]],
+    device const float* b   [[buffer(1)]],
+    device float*       out [[buffer(2)]],
+    constant uint&      n   [[buffer(3)]],
+    uint tid [[thread_position_in_grid]])
+{
+    if (tid < n) {
+        out[tid] = a[tid] + b[tid];
+    }
+}
@@ -550,16 +550,19 @@ kernel void matmul_tq_q4(
         const float sc = weight_sc[sc_row + b];
         device const uint8_t* qs = weight_qs + qs_row + b * 16;
         const uint base = b * 32;
+        /* Packing: byte j = (q[2j+1] << 4) | q[2j]
+         * Low nibble  (& 0xF) = element at index 2*j
+         * High nibble (>> 4)  = element at index 2*j+1 */
         for (uint k = 0; k < 16; k += 4) {
             uint8_t p0 = qs[k], p1 = qs[k+1], p2 = qs[k+2], p3 = qs[k+3];
-            sum += (float(int(p0 & 0xF) - 8) * input[base + k]
-                 +  float(int(p0 >> 4)  - 8) * input[base + k + 16]
-                 +  float(int(p1 & 0xF) - 8) * input[base + k + 1]
-                 +  float(int(p1 >> 4)  - 8) * input[base + k + 17]
-                 +  float(int(p2 & 0xF) - 8) * input[base + k + 2]
-                 +  float(int(p2 >> 4)  - 8) * input[base + k + 18]
-                 +  float(int(p3 & 0xF) - 8) * input[base + k + 3]
-                 +  float(int(p3 >> 4)  - 8) * input[base + k + 19]) * sc;
+            sum += (float(int(p0 & 0xF) - 8) * input[base + 2*k]
+                 +  float(int(p0 >> 4)  - 8) * input[base + 2*k + 1]
+                 +  float(int(p1 & 0xF) - 8) * input[base + 2*(k+1)]
+                 +  float(int(p1 >> 4)  - 8) * input[base + 2*(k+1) + 1]
+                 +  float(int(p2 & 0xF) - 8) * input[base + 2*(k+2)]
+                 +  float(int(p2 >> 4)  - 8) * input[base + 2*(k+2) + 1]
+                 +  float(int(p3 & 0xF) - 8) * input[base + 2*(k+3)]
+                 +  float(int(p3 >> 4)  - 8) * input[base + 2*(k+3) + 1]) * sc;
         }
     }
 
 
@@ -32,6 +32,10 @@ typedef enum {
     TQ_METAL_PIPE_VALUE_QUANTIZE_4B,
     TQ_METAL_PIPE_VALUE_QUANTIZE_2B,
     TQ_METAL_PIPE_VALUE_DEQUANT_MATMUL,
+    TQ_METAL_PIPE_RMSNORM,
+    TQ_METAL_PIPE_SILU,
+    TQ_METAL_PIPE_MUL_ELEMENTWISE,
+    TQ_METAL_PIPE_ADD_VECTORS,
     TQ_METAL_PIPE_COUNT
 } tq_metal_pipeline_id;
 
@@ -124,6 +128,34 @@ void tq_turbo_quantize_metal(const float* src, void* dst, int n);
 void tq_turbo_attention_metal(const float* query, const void* kv_cache,
                                float* scores, int seq_len, int head_dim);
 
+/* ============================================================
+ * Element-wise operations (between matmuls)
+ * ============================================================ */
+
+/**
+ * RMSNorm on Metal GPU.
+ * out[i] = (x[i] / rms(x)) * weight[i], rms = sqrt(mean(x^2) + eps)
+ */
+int tq_metal_rmsnorm(float* out, const float* x, const float* w, int n, float eps);
+
+/**
+ * SiLU activation on Metal GPU.
+ * out[i] = x[i] / (1 + exp(-x[i]))
+ */
+int tq_metal_silu(float* out, const float* x, int n);
+
+/**
+ * Element-wise multiply on Metal GPU.
+ * out[i] = a[i] * b[i]
+ */
+int tq_metal_mul(float* out, const float* a, const float* b, int n);
+
+/**
+ * Vector add on Metal GPU.
+ * out[i] = a[i] + b[i]
+ */
+int tq_metal_add(float* out, const float* a, const float* b, int n);
+
 #ifdef __cplusplus
 }
 #endif