3434namespace cubool {
3535 namespace kernels {
3636
37- template <typename IndexType, size_t blockSize>
37+ template <typename IndexType, size_t threads, size_t blockSize>
3838 __global__ void spgemv (thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
3939 thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
4040 thrust::device_ptr<const IndexType> v, // Input dense v vector
4141 thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
42- thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin
43- IndexType assignedOrder = blockIdx .x ;
44- IndexType id = threadIdx .x ;
42+ thrust::device_ptr<const IndexType> rowConfig, // Rows to process for each bin
43+ IndexType rowsCount) { // Num of rows to process
44+
45+ static const size_t WARP_SIZE = 32 ;
46+
47+ IndexType id = threadIdx .x % threads;
48+ IndexType interBlockId = threadIdx .x / threads;
49+ IndexType assignedOrder = blockIdx .x * (blockSize / threads) + interBlockId;
50+
51+ if (assignedOrder >= rowsCount)
52+ assignedOrder = rowsCount - 1 ;
4553
4654 IndexType i = rowConfig[assignedOrder]; // Row to process
4755
@@ -51,28 +59,38 @@ namespace cubool {
5159 __shared__ IndexType tmp_accum[blockSize];
5260
5361 // Initial zero
54- tmp_accum[id ] = 0 ;
62+ tmp_accum[threadIdx . x ] = 0 ;
5563 __syncthreads ();
5664
5765 // Each thread accum nnz values
58- for (size_t k = id; k < rowSize; k += blockSize ) {
59- tmp_accum[id ] |= v[colIndices[rowBegin + k]];
66+ for (size_t k = id; k < rowSize; k += threads ) {
67+ tmp_accum[threadIdx . x ] |= v[colIndices[rowBegin + k]];
6068 }
6169 __syncthreads ();
6270
6371 // Reduce accum to single value
64- for (size_t s = 1 ; s < blockSize; s *= 2 ) {
// Warp-level stage of the reduction: combine partial results whose stride
// stays below WARP_SIZE, so that __syncwarp() is a sufficient barrier
// between steps. The previous condition `s < threads && warpSize` treated the
// built-in `warpSize` as a boolean (always true), so for threads > WARP_SIZE
// this loop also performed cross-warp reads guarded only by __syncwarp().
// Strides of WARP_SIZE and above are handled by the __syncthreads()-based
// loop that follows.
for (size_t s = 1; s < threads && s < WARP_SIZE; s *= 2) {
    if (id % (2 * s) == 0) {
        tmp_accum[threadIdx.x] |= tmp_accum[threadIdx.x + s];
    }

    // All lanes of the warp reach this barrier; only the update above is predicated.
    __syncwarp();
}
79+
80+ __syncthreads ();
81+
82+ for (size_t s = WARP_SIZE; s < threads; s *= 2 ) {
6583 if (id % (2 * s) == 0 ) {
66- tmp_accum[id ] |= tmp_accum[id + s];
84+ tmp_accum[threadIdx . x ] |= tmp_accum[threadIdx . x + s];
6785 }
6886
6987 __syncthreads ();
7088 }
7189
7290 // 0-thread saves result
7391 if (id == 0 ) {
74- if (tmp_accum[0 ] > 0 ) {
75- x[i] = tmp_accum[0 ];
92+ if (tmp_accum[threadIdx . x ] > 0 ) {
93+ x[i] = tmp_accum[threadIdx . x ];
7694 }
7795 }
7896 }
@@ -95,11 +113,14 @@ namespace cubool {
95113 thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin)
96114
97115 EXPAND_SIDE_EFFECTS (
98- (binSizes[Bins::id] > 0 ?
99- spgemv<IndexType, Bins::blockSize>
100- <<<binSizes[Bins::id], Bins::blockSize, 0 , streamsWrapper.streams[Bins::id]>>>
101- (rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id])
102- : void ())
116+ (binSizes[Bins::id] > 0 ?
117+ spgemv<IndexType, Bins::threads, Bins::blockSize>
118+ <<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio? 1 : 0 ),
119+ Bins::blockSize,
120+ 0 ,
121+ streamsWrapper.streams[Bins::id]>>>
122+ (rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
123+ : void ())
103124 );
104125 }
105126
@@ -143,10 +164,13 @@ namespace cubool {
143164 // Empty out buffer
144165 thrust::fill_n (mOutput .begin (), M, (IndexType) 0 );
145166
146- using ConfigType = Config<Bin<32 , 1 , 32 , 0 >,
147- Bin<64 , 32 , 64 , 1 >,
148- Bin<128 ,64 , 128 ,2 >,
149- Bin<256 ,128 ,max,3 >>;
167+ using ConfigType = Config<Bin<4 , 32 , 1 , 8 , 0 >,
168+ Bin<8 , 32 , 8 , 16 , 1 >,
169+ Bin<16 , 32 , 16 , 32 , 2 >,
170+ Bin<32 , 32 , 32 , 64 , 3 >,
171+ Bin<64 , 64 , 64 , 128 ,4 >,
172+ Bin<128 ,128 ,128 ,256 ,5 >,
173+ Bin<256 ,256 ,256 ,max,6 >>;
150174 ConfigType config;
151175
152176 mRowsConfig .resize (M);
0 commit comments