@@ -569,16 +569,6 @@ func (e *GPUEngine[T]) UploadWeights(tensors []*tensor.TensorNumeric[float32]) e
569569 "device" , fmt .Sprintf ("%d" , e .deviceID ),
570570 "method" , method )
571571 }
572- // Check for sticky CUDA errors after UploadWeights.
573- if debugGPU || os .Getenv ("UPLOAD_TRACE" ) == "1" {
574- if e .stream != nil {
575- if syncErr := e .stream .Synchronize (); syncErr != nil {
576- fmt .Fprintf (os .Stderr , "[UPLOAD] CUDA sync error after UploadWeights: %v\n " , syncErr )
577- } else {
578- fmt .Fprintf (os .Stderr , "[UPLOAD] CUDA context clean after UploadWeights\n " )
579- }
580- }
581- }
582572 return nil
583573}
584574
@@ -3421,9 +3411,6 @@ func (e *GPUEngine[T]) gatherQ8(
34213411 devQ8 unsafe.Pointer ,
34223412) error {
34233413 e .setDevice ()
3424- if debugGPU || os .Getenv ("UPLOAD_TRACE" ) == "1" {
3425- fmt .Fprintf (os .Stderr , "[GATHER_Q8] called: V=%d D=%d devQ8=%p\n " , params .Shape ()[0 ], params .Shape ()[1 ], devQ8 )
3426- }
34273414
34283415 pShape := params .Shape ()
34293416 V := pShape [0 ]
@@ -3436,25 +3423,18 @@ func (e *GPUEngine[T]) gatherQ8(
34363423 }
34373424
34383425 // Upload indices as int32 to GPU.
3439- trace := debugGPU || os .Getenv ("UPLOAD_TRACE" ) == "1"
34403426 idx32 := make ([]int32 , N )
34413427 for i , id := range idxData {
34423428 idx32 [i ] = int32 (id )
34433429 }
34443430 idxBytes := N * 4
34453431 devIdx , err := e .pool .Alloc (e .deviceID , idxBytes )
34463432 if err != nil {
3447- if trace {
3448- fmt .Fprintf (os .Stderr , "[GATHER_Q8] pool.Alloc(idx %d) FAILED: %v — CPU fallback\n " , idxBytes , err )
3449- }
34503433 return e .cpu .Gather (context .Background (), params , indices , output )
34513434 }
34523435 defer e .pool .Free (e .deviceID , devIdx , idxBytes )
34533436
34543437 if err := e .runtime .Memcpy (devIdx , unsafe .Pointer (& idx32 [0 ]), idxBytes , gpuapi .MemcpyHostToDevice ); err != nil {
3455- if trace {
3456- fmt .Fprintf (os .Stderr , "[GATHER_Q8] Memcpy(idx H2D %d bytes) FAILED: %v — CPU fallback\n " , idxBytes , err )
3457- }
34583438 return e .cpu .Gather (context .Background (), params , indices , output )
34593439 }
34603440
@@ -3463,23 +3443,14 @@ func (e *GPUEngine[T]) gatherQ8(
34633443 outBytes := outElems * f32Size
34643444 devOut , err := e .pool .Alloc (e .deviceID , outBytes )
34653445 if err != nil {
3466- if trace {
3467- fmt .Fprintf (os .Stderr , "[GATHER_Q8] pool.Alloc(out %d) FAILED: %v — CPU fallback\n " , outBytes , err )
3468- }
34693446 return e .cpu .Gather (context .Background (), params , indices , output )
34703447 }
34713448
34723449 // Launch Q8 gather kernel.
34733450 if err := e .kernels .GatherQ8F32 (devQ8 , devIdx , devOut , N , D , V , e .stream ); err != nil {
3474- if debugGPU || os .Getenv ("UPLOAD_TRACE" ) == "1" {
3475- fmt .Fprintf (os .Stderr , "[GATHER_Q8] kernel FAILED: %v — fallback to CPU\n " , err )
3476- }
34773451 e .pool .Free (e .deviceID , devOut , outBytes )
34783452 return e .cpu .Gather (context .Background (), params , indices , output )
34793453 }
3480- if debugGPU || os .Getenv ("UPLOAD_TRACE" ) == "1" {
3481- fmt .Fprintf (os .Stderr , "[GATHER_Q8] kernel OK, devOut=%p elems=%d\n " , devOut , outElems )
3482- }
34833454
34843455 // Write result into output tensor as GPUStorage (pool-backed).
34853456 gs , err := tensor .NewGPUStorageFromPool [float32 ](devOut , outElems , e .pool , e .deviceID )
0 commit comments