@@ -102,6 +102,136 @@ func (q *Q4KStorage) DequantizeSubBlock(blkIdx, subIdx int, dst []float32) {
102102 }
103103}
104104
105+ // QuantizeQ4K quantizes float32 values into Q4_K format.
106+ // Q4_K uses asymmetric quantization with per-sub-block 6-bit scales and mins,
107+ // shared fp16 super-block scale and dmin. 256 values per super-block, 144 bytes.
108+ func QuantizeQ4K (src []float32 ) * Q4KStorage {
109+ n := len (src )
110+ nBlocks := (n + q4KSuperBlockSize - 1 ) / q4KSuperBlockSize
111+ raw := make ([]byte , nBlocks * q4KBlockBytes )
112+
113+ for bi := range nBlocks {
114+ off := bi * q4KSuperBlockSize
115+ var values [q4KSuperBlockSize ]float32
116+ end := off + q4KSuperBlockSize
117+ if end > n {
118+ end = n
119+ }
120+ copy (values [:], src [off :end ])
121+
122+ // Compute per-sub-block scale and min.
123+ var subScales , subMins [q4KNumSubBlocks ]float32
124+ for sb := range q4KNumSubBlocks {
125+ sOff := sb * q4KSubBlockSize
126+ minVal , maxVal := values [sOff ], values [sOff ]
127+ for j := 1 ; j < q4KSubBlockSize ; j ++ {
128+ v := values [sOff + j ]
129+ if v < minVal {
130+ minVal = v
131+ }
132+ if v > maxVal {
133+ maxVal = v
134+ }
135+ }
136+ if minVal > 0 {
137+ minVal = 0
138+ }
139+ subScales [sb ] = (maxVal - minVal ) / 15.0
140+ subMins [sb ] = - minVal
141+ }
142+
143+ // Super-block scale and dmin from max sub-block values.
144+ var maxScale , maxMin float32
145+ for sb := range q4KNumSubBlocks {
146+ if subScales [sb ] > maxScale {
147+ maxScale = subScales [sb ]
148+ }
149+ if subMins [sb ] > maxMin {
150+ maxMin = subMins [sb ]
151+ }
152+ }
153+
154+ d := maxScale / 63.0
155+ dmin := maxMin / 63.0
156+
157+ // Quantize sub-block scales and mins to 6-bit.
158+ var scalesQ , minsQ [q4KNumSubBlocks ]uint8
159+ for sb := range q4KNumSubBlocks {
160+ if d > 0 {
161+ v := int (float64 (subScales [sb ]/ d ) + 0.5 )
162+ if v > 63 {
163+ v = 63
164+ }
165+ scalesQ [sb ] = uint8 (v )
166+ }
167+ if dmin > 0 {
168+ v := int (float64 (subMins [sb ]/ dmin ) + 0.5 )
169+ if v > 63 {
170+ v = 63
171+ }
172+ minsQ [sb ] = uint8 (v )
173+ }
174+ }
175+
176+ blk := raw [bi * q4KBlockBytes : (bi + 1 )* q4KBlockBytes ]
177+
178+ // fp16 d and dmin.
179+ dFP16 := float16 .FromFloat32 (d )
180+ dminFP16 := float16 .FromFloat32 (dmin )
181+ binary .LittleEndian .PutUint16 (blk [0 :2 ], dFP16 .Bits ())
182+ binary .LittleEndian .PutUint16 (blk [2 :4 ], dminFP16 .Bits ())
183+
184+ // Pack 6-bit scales and mins into 12 bytes at blk[4:16].
185+ for i := range 4 {
186+ blk [4 + i ] = (scalesQ [i ] & 63 ) | ((scalesQ [4 + i ] >> 4 ) << 6 )
187+ blk [8 + i ] = (minsQ [i ] & 63 ) | ((minsQ [4 + i ] >> 4 ) << 6 )
188+ }
189+ for i := range 4 {
190+ blk [12 + i ] = (scalesQ [4 + i ] & 0xF ) | ((minsQ [4 + i ] & 0xF ) << 4 )
191+ }
192+
193+ // Quantize values to 4-bit per sub-block pair.
194+ dRT := dFP16 .ToFloat32 ()
195+ dminRT := dminFP16 .ToFloat32 ()
196+ for group := range 4 {
197+ sb0 , sb1 := group * 2 , group * 2 + 1
198+ sc0 := dRT * float32 (scalesQ [sb0 ])
199+ mn0 := dminRT * float32 (minsQ [sb0 ])
200+ sc1 := dRT * float32 (scalesQ [sb1 ])
201+ mn1 := dminRT * float32 (minsQ [sb1 ])
202+
203+ var invScale0 , invScale1 float32
204+ if sc0 > 0 {
205+ invScale0 = 1.0 / sc0
206+ }
207+ if sc1 > 0 {
208+ invScale1 = 1.0 / sc1
209+ }
210+
211+ baseOut := group * 64
212+ baseQ := group * 32
213+ for l := range 32 {
214+ v0 := values [baseOut + l ]
215+ v1 := values [baseOut + l + 32 ]
216+ q0 := int (float64 ((v0 + mn0 )* invScale0 ) + 0.5 )
217+ q1 := int (float64 ((v1 + mn1 )* invScale1 ) + 0.5 )
218+ if q0 < 0 {
219+ q0 = 0
220+ } else if q0 > 15 {
221+ q0 = 15
222+ }
223+ if q1 < 0 {
224+ q1 = 0
225+ } else if q1 > 15 {
226+ q1 = 15
227+ }
228+ blk [16 + baseQ + l ] = byte (q0 ) | (byte (q1 ) << 4 )
229+ }
230+ }
231+ }
232+ return & Q4KStorage {raw : raw , len : n }
233+ }
234+
105235// Q4KStorage holds Q4_K quantized tensor data on CPU.
106236type Q4KStorage struct {
107237 raw []byte // raw super-block data
0 commit comments