#pragma unroll for(int i = 0; i < numPerThread; i++){ sum += (idx < N) ? d_in[idx] : 0.0f; idx += blockDim.x; } //do reduction in warp #pragma unroll for(int offset = warpSize / 2; offset > 0; offset /= 2){ sum += __shfl_down_sync(0xffffffff, sum, offset); }
// shared mem for the sum of per warp constint laneId = tid % warpSize; constint warpId = tid / warpSize; int warpNum = blockDim.x / warpSize; __shared__ float warpLevelSums[warpNum];
// move data to warp0 sum = (tid < warpNum)? warpLevelSums[tid]:0; // Final reduce using first warp if(warpId == 0) { #pragma unroll for(int offset = warpSize / 2; offset > 0; offset /= 2){ sum += __shfl_down_sync(0xffffffff, sum, offset); } } // write result for this block to global mem if(tid == 0) d_out[blockIdx.x] = sum; }