@@ -3436,18 +3436,25 @@ func (e *GPUEngine[T]) gatherQ8(
34363436 }
34373437
34383438 // Upload indices as int32 to GPU.
3439+ trace := debugGPU || os .Getenv ("UPLOAD_TRACE" ) == "1"
34393440 idx32 := make ([]int32 , N )
34403441 for i , id := range idxData {
34413442 idx32 [i ] = int32 (id )
34423443 }
34433444 idxBytes := N * 4
34443445 devIdx , err := e .pool .Alloc (e .deviceID , idxBytes )
34453446 if err != nil {
3447+ if trace {
3448+ fmt .Fprintf (os .Stderr , "[GATHER_Q8] pool.Alloc(idx %d) FAILED: %v — CPU fallback\n " , idxBytes , err )
3449+ }
34463450 return e .cpu .Gather (context .Background (), params , indices , output )
34473451 }
34483452 defer e .pool .Free (e .deviceID , devIdx , idxBytes )
34493453
34503454 if err := e .runtime .Memcpy (devIdx , unsafe .Pointer (& idx32 [0 ]), idxBytes , gpuapi .MemcpyHostToDevice ); err != nil {
3455+ if trace {
3456+ fmt .Fprintf (os .Stderr , "[GATHER_Q8] Memcpy(idx H2D %d bytes) FAILED: %v — CPU fallback\n " , idxBytes , err )
3457+ }
34513458 return e .cpu .Gather (context .Background (), params , indices , output )
34523459 }
34533460
@@ -3456,6 +3463,9 @@ func (e *GPUEngine[T]) gatherQ8(
34563463 outBytes := outElems * f32Size
34573464 devOut , err := e .pool .Alloc (e .deviceID , outBytes )
34583465 if err != nil {
3466+ if trace {
3467+ fmt .Fprintf (os .Stderr , "[GATHER_Q8] pool.Alloc(out %d) FAILED: %v — CPU fallback\n " , outBytes , err )
3468+ }
34593469 return e .cpu .Gather (context .Background (), params , indices , output )
34603470 }
34613471
0 commit comments