__global__ void reduce2(float* d_in, float* d_out, int N) {
    __shared__ float sdata[BLOCK_SIZE];

    // each thread loads one element from global memory into shared memory
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tid = threadIdx.x;
    sdata[tid] = (idx < N) ? d_in[idx] : 0.0f;
    __syncthreads();

    // tree reduction in shared memory
    for (unsigned int s = blockDim.x / 2; s >= 1; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];   // first pass: s = blockDim.x / 2, e.g. 128
        __syncthreads();
    }

    // if the input is large, write one partial sum per block to global memory
    if (tid == 0) d_out[blockIdx.x] = sdata[0];
    /* // if the input is small, a single pass with an atomic add is enough:
    if (tid == 0) { atomicAdd(d_out, sdata[0]); } */
}
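As a usage sketch (not part of the original listing): reduce2 leaves one partial sum per block in d_out, so the host either runs a second pass over those partials or, when the block count is small, finishes the sum on the CPU. The helper name reduceOnDevice and the host-side flow below are assumptions for illustration; BLOCK_SIZE is the same constant the kernel uses.

#include <cuda_runtime.h>
#include <vector>

float reduceOnDevice(const float* h_in, int N) {
    int blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    float *d_in = nullptr, *d_out = nullptr;
    cudaMalloc((void**)&d_in,  N * sizeof(float));
    cudaMalloc((void**)&d_out, blocks * sizeof(float));
    cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);

    // first pass: one partial sum per block
    reduce2<<<blocks, BLOCK_SIZE>>>(d_in, d_out, N);

    // the partials are few, so finish the reduction on the host
    std::vector<float> partial(blocks);
    cudaMemcpy(partial.data(), d_out, blocks * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0.0f;
    for (float p : partial) sum += p;

    cudaFree(d_in);
    cudaFree(d_out);
    return sum;
}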
    // move data to warp 0
    sum = (tid < warpNum) ? warpLevelSums[tid] : 0;

    // Final reduce using first warp
    if (warpId == 0) {
        #pragma unroll
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            sum += __shfl_down_sync(0xffffffff, sum, offset);
        }
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sum;
}
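The same shuffle loop appears twice in this kernel (once inside every warp, once again in warp 0). If you prefer, it can be factored into a small device helper; the name warpReduceSum below is my own, not from the post. A minimal sketch:

__device__ __forceinline__ float warpReduceSum(float sum) {
    // tree reduction within one warp using register shuffles;
    // after the loop, lane 0 holds the warp's total
    #pragma unroll
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }
    return sum;
}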
    // each thread accumulates numPerThread elements, strided by blockDim.x
    #pragma unroll
    for (int i = 0; i < numPerThread; i++) {
        sum += (idx < N) ? d_in[idx] : 0.0f;
        idx += blockDim.x;
    }

    // do reduction within each warp
    #pragma unroll
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    }

    // shared memory for the per-warp sums
    const int laneId = tid % warpSize;
    const int warpId = tid / warpSize;
    int warpNum = blockDim.x / warpSize;
    __shared__ float warpLevelSums[BLOCK_SIZE / 32];   // size must be a compile-time constant

    // lane 0 of each warp publishes its partial sum before the final reduce
    if (laneId == 0) warpLevelSums[warpId] = sum;
    __syncthreads();

    // move data to warp 0
    sum = (tid < warpNum) ? warpLevelSums[tid] : 0;

    // Final reduce using first warp
    if (warpId == 0) {
        #pragma unroll
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            sum += __shfl_down_sync(0xffffffff, sum, offset);
        }
    }

    // write result for this block to global mem
    if (tid == 0) d_out[blockIdx.x] = sum;
}
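The fragment above begins mid-kernel; a typical prologue would set up unsigned int tid = threadIdx.x, unsigned int idx = blockIdx.x * blockDim.x * numPerThread + threadIdx.x, and float sum = 0.0f, but that part is not shown, so treat it as an assumption. Because each thread now folds numPerThread inputs before any shuffle work, the grid shrinks accordingly. A hedged launch sketch, reusing the variable names from the reduceOnDevice sketch earlier; the kernel name reduce_shfl and passing numPerThread as a compile-time constant are likewise assumptions:

// inside the host code (d_in, d_out, N as in the reduceOnDevice sketch above)
const int NUM_PER_THREAD = 4;                          // assumed; must match the kernel's numPerThread
int elemsPerBlock = BLOCK_SIZE * NUM_PER_THREAD;       // inputs consumed per block
int blocks = (N + elemsPerBlock - 1) / elemsPerBlock;
reduce_shfl<<<blocks, BLOCK_SIZE>>>(d_in, d_out, N);   // hypothetical kernel name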