/*! @file ParallelReduction_GPU.cu
    @brief Parallel reduction kernels

 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef _PARALLEL_REDUCTION_H_
#define _PARALLEL_REDUCTION_H_

#include <stdio.h>

#define _CG_ABI_EXPERIMENTAL
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace cg = cooperative_groups;

// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template <class T>
struct SharedMemory {
  __device__ inline operator T *() {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }

  __device__ inline operator const T *() const {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }
};

// specialize for double to avoid unaligned memory
// access compile errors
template <>
struct SharedMemory<double> {
  __device__ inline operator double *() {
    extern __shared__ double __smem_d[];
    return (double *)__smem_d;
  }

  __device__ inline operator const double *() const {
    extern __shared__ double __smem_d[];
    return (double *)__smem_d;
  }
};

template <class T>
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    mySum += __shfl_down_sync(mask, mySum, offset);
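    // Each step folds in the value held `offset` lanes above; for a full warp
    // (mask == 0xffffffff) lane 0 ends up with the sum of all 32 lanes after
    // the offsets 16, 8, 4, 2, 1.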
  }
  return mySum;
}

#if __CUDA_ARCH__ >= 800
// Specialize warpReduceSum for int inputs to use the __reduce_add_sync
// intrinsic when on SM 8.0 or higher
template <>
__device__ __forceinline__ int warpReduceSum<int>(unsigned int mask,
                                                  int mySum) {
  mySum = __reduce_add_sync(mask, mySum);
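  // __reduce_add_sync computes the warp-wide sum across the lanes named in
  // `mask` with a single hardware instruction, replacing the shuffle loop
  // used by the generic version above.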
  return mySum;
}
#endif

/*!
    Parallel sum reduction using shared memory
    - takes log(n) steps for n input elements
    - uses n threads
    - only works for power-of-2 arrays
*/

/*! This reduction interleaves which threads are active by using the modulo
    operator. This operator is very expensive on GPUs, and the interleaved
    inactivity means that no whole warps are active, which is also very
    inefficient.
*/
template <class T>
__global__ void reduce0(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // load shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  sdata[tid] = (i < n) ? g_idata[i] : 0;

  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = 1; s < blockDim.x; s *= 2) {
    // modulo arithmetic is slow!
    if ((tid % (2 * s)) == 0) {
      sdata[tid] += sdata[tid + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/*! This version uses contiguous threads, but its interleaved
    addressing results in many shared memory bank conflicts.
*/
template <class T>
__global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // load shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  sdata[tid] = (i < n) ? g_idata[i] : 0;

  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = 1; s < blockDim.x; s *= 2) {
    int index = 2 * s * tid;

    if (index < blockDim.x) {
      sdata[index] += sdata[index + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/*!
    This version uses sequential addressing -- no divergence or bank conflicts.
*/
template <class T>
__global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // load shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  sdata[tid] = (i < n) ? g_idata[i] : 0;

  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      sdata[tid] += sdata[tid + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/*!
    This version uses n/2 threads --
    it performs the first level of reduction when reading from global memory.
*/
template <class T>
__global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

  T mySum = (i < n) ? g_idata[i] : 0;

  if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];

  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      sdata[tid] = mySum = mySum + sdata[tid + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = mySum;
}

/*!
    This version uses the warp shuffle operation if available to reduce
    warp synchronization. When shuffle is not available the final warp's
    worth of work is unrolled to reduce looping overhead.

    See
    http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
    for additional information about using shuffle to perform a reduction
    within a warp.

    Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
    In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
    If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize>
__global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

  T mySum = (i < n) ? g_idata[i] : 0;

  if (i + blockSize < n) mySum += g_idata[i + blockSize];

  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
    if (tid < s) {
      sdata[tid] = mySum = mySum + sdata[tid + s];
    }

    cg::sync(cta);
  }

  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  if (cta.thread_rank() < 32) {
    // Fetch final intermediate sum from 2nd warp
    if (blockSize >= 64) mySum += sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      mySum += tile32.shfl_down(mySum, offset);
    }
  }

  // write result for this block to global mem
  if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}

/*!
    This version is completely unrolled, unless warp shuffle is available, then
    shuffle is used within a loop. It uses a template parameter to achieve
    optimal code for any (power of 2) number of threads. This requires a switch
    statement in the host code to handle all the different thread block sizes at
    compile time. When shuffle is available, it is used to reduce warp
    synchronization.

    Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
    In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
    If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize>
__global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x;

  T mySum = (i < n) ? g_idata[i] : 0;

  if (i + blockSize < n) mySum += g_idata[i + blockSize];

  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  if ((blockSize >= 512) && (tid < 256)) {
    sdata[tid] = mySum = mySum + sdata[tid + 256];
  }

  cg::sync(cta);

  if ((blockSize >= 256) && (tid < 128)) {
    sdata[tid] = mySum = mySum + sdata[tid + 128];
  }

  cg::sync(cta);

  if ((blockSize >= 128) && (tid < 64)) {
    sdata[tid] = mySum = mySum + sdata[tid + 64];
  }

  cg::sync(cta);

  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  if (cta.thread_rank() < 32) {
    // Fetch final intermediate sum from 2nd warp
    if (blockSize >= 64) mySum += sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      mySum += tile32.shfl_down(mySum, offset);
    }
  }

  // write result for this block to global mem
  if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}

/*!
    This version adds multiple elements per thread sequentially. This reduces
    the overall cost of the algorithm while keeping the work complexity O(n) and
    the step complexity O(log n). (Brent's Theorem optimization)

    Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
    In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
    If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = blockSize * gridDim.x;

  T mySum = 0;

  // we reduce multiple elements per thread. The number is determined by the
  // number of active thread blocks (via gridDim). More blocks will result
  // in a larger gridSize and therefore fewer elements per thread
  if (nIsPow2) {
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    gridSize = gridSize << 1;

    while (i < n) {
      mySum += g_idata[i];
      // ensure we don't read out of bounds -- this is optimized away for
      // powerOf2 sized arrays
      if ((i + blockSize) < n) {
        mySum += g_idata[i + blockSize];
      }
      i += gridSize;
    }
  } else {
    unsigned int i = blockIdx.x * blockSize + threadIdx.x;
    while (i < n) {
      mySum += g_idata[i];
      i += gridSize;
    }
  }

  // each thread puts its local sum into shared memory
  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  if ((blockSize >= 512) && (tid < 256)) {
    sdata[tid] = mySum = mySum + sdata[tid + 256];
  }

  cg::sync(cta);

  if ((blockSize >= 256) && (tid < 128)) {
    sdata[tid] = mySum = mySum + sdata[tid + 128];
  }

  cg::sync(cta);

  if ((blockSize >= 128) && (tid < 64)) {
    sdata[tid] = mySum = mySum + sdata[tid + 64];
  }

  cg::sync(cta);

  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  if (cta.thread_rank() < 32) {
    // Fetch final intermediate sum from 2nd warp
    if (blockSize >= 64) mySum += sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      mySum += tile32.shfl_down(mySum, offset);
    }
  }

  // write result for this block to global mem
  if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}

template <typename T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce7(const T *__restrict__ g_idata, T *__restrict__ g_odata,
                        unsigned int n) {
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = blockSize * gridDim.x;
  unsigned int maskLength = (blockSize & 31);  // 31 = warpSize-1
  maskLength = (maskLength > 0) ? (32 - maskLength) : maskLength;
  const unsigned int mask = (0xffffffff) >> maskLength;
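  // The mask selects only the lanes that carry valid data when blockSize is
  // smaller than a warp, e.g. blockSize == 16 gives maskLength == 16 and
  // mask == 0x0000ffff, while any multiple of 32 gives the full 0xffffffff.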

  T mySum = 0;

  // we reduce multiple elements per thread. The number is determined by the
  // number of active thread blocks (via gridDim). More blocks will result
  // in a larger gridSize and therefore fewer elements per thread
  if (nIsPow2) {
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    gridSize = gridSize << 1;

    while (i < n) {
      mySum += g_idata[i];
      // ensure we don't read out of bounds -- this is optimized away for
      // powerOf2 sized arrays
      if ((i + blockSize) < n) {
        mySum += g_idata[i + blockSize];
      }
      i += gridSize;
    }
  } else {
    unsigned int i = blockIdx.x * blockSize + threadIdx.x;
    while (i < n) {
      mySum += g_idata[i];
      i += gridSize;
    }
  }

  // Reduce within warp using shuffle, or reduce_add if T==int and
  // __CUDA_ARCH__ is SM 8.0 or higher
  mySum = warpReduceSum<T>(mask, mySum);

  // the first thread of each warp writes its warp's sum into shared memory
  if ((tid % warpSize) == 0) {
    sdata[tid / warpSize] = mySum;
  }

  __syncthreads();

  const unsigned int shmem_extent =
      (blockSize / warpSize) > 0 ? (blockSize / warpSize) : 1;
  const unsigned int ballot_result = __ballot_sync(mask, tid < shmem_extent);
  if (tid < shmem_extent) {
    mySum = sdata[tid];
    // Reduce final warp using shuffle, or reduce_add if T==int and
    // __CUDA_ARCH__ is SM 8.0 or higher
    mySum = warpReduceSum<T>(ballot_result, mySum);
  }

  // write result for this block to global mem
  if (tid == 0) {
    g_odata[blockIdx.x] = mySum;
  }
}

// Sums `in` across all threads of the cooperative group `threads`
template <typename T, typename Group>
__device__ T cg_reduce_n(T in, Group &threads) {
  return cg::reduce(threads, in, cg::plus<T>());
}

template <class T>
__global__ void cg_reduce(T *g_idata, T *g_odata, unsigned int n) {
  // Shared memory for intermediate steps
  T *sdata = SharedMemory<T>();
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  // Handle to tile in thread block
  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);

  unsigned int ctaSize = cta.size();
  unsigned int numCtas = gridDim.x;
  unsigned int threadRank = cta.thread_rank();
  unsigned int threadIndex = (blockIdx.x * ctaSize) + threadRank;

  T threadVal = 0;
  {
    unsigned int i = threadIndex;
    unsigned int indexStride = (numCtas * ctaSize);
    while (i < n) {
      threadVal += g_idata[i];
      i += indexStride;
    }
    sdata[threadRank] = threadVal;
  }

  // Wait for all tiles to finish and reduce within CTA
  {
    unsigned int ctaSteps = tile.meta_group_size();
    unsigned int ctaIndex = ctaSize >> 1;
    while (ctaIndex >= 32) {
      cta.sync();
      if (threadRank < ctaIndex) {
        threadVal += sdata[threadRank + ctaIndex];
        sdata[threadRank] = threadVal;
      }
      ctaSteps >>= 1;
      ctaIndex >>= 1;
    }

    // Shuffle redux instead of smem redux
    {
      cta.sync();
      if (tile.meta_group_rank() == 0) {
        threadVal = cg_reduce_n(threadVal, tile);
      }
    }
  }

  if (threadRank == 0) g_odata[blockIdx.x] = threadVal;
}

template <class T, size_t BlockSize, size_t MultiWarpGroupSize>
__global__ void multi_warp_cg_reduce(T *g_idata, T *g_odata, unsigned int n) {
  // Shared memory for intermediate steps
  T *sdata = SharedMemory<T>();
  __shared__ cg::experimental::block_tile_memory<sizeof(T), BlockSize> scratch;

  // Handle to thread block group
  auto cta = cg::experimental::this_thread_block(scratch);
  // Handle to multiWarpTile in thread block
  auto multiWarpTile = cg::experimental::tiled_partition<MultiWarpGroupSize>(cta);

  unsigned int gridSize = BlockSize * gridDim.x;
  T threadVal = 0;

  // we reduce multiple elements per thread. The number is determined by the
  // number of active thread blocks (via gridDim). More blocks will result
  // in a larger gridSize and therefore fewer elements per thread
  int nIsPow2 = !(n & (n - 1));
  if (nIsPow2) {
    unsigned int i = blockIdx.x * BlockSize * 2 + threadIdx.x;
    gridSize = gridSize << 1;

    while (i < n) {
      threadVal += g_idata[i];
      // ensure we don't read out of bounds -- this is optimized away for
      // powerOf2 sized arrays
      if ((i + BlockSize) < n) {
        threadVal += g_idata[i + BlockSize];
      }
      i += gridSize;
    }
  } else {
    unsigned int i = blockIdx.x * BlockSize + threadIdx.x;
    while (i < n) {
      threadVal += g_idata[i];
      i += gridSize;
    }
  }

  threadVal = cg_reduce_n(threadVal, multiWarpTile);

  if (multiWarpTile.thread_rank() == 0) {
    sdata[multiWarpTile.meta_group_rank()] = threadVal;
  }
  cg::sync(cta);

  if (threadIdx.x == 0) {
    threadVal = 0;
    for (int i = 0; i < multiWarpTile.meta_group_size(); i++) {
      threadVal += sdata[i];
    }
    g_odata[blockIdx.x] = threadVal;
  }
}

extern "C" bool isPow2(unsigned int x);
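// isPow2 is provided by the host-side support code. For reference, a minimal
// implementation (an assumption of this sketch; not defined here to avoid
// clashing with the external definition) would be:
//   bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); }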

////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template <class T>
void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata,
            T *d_odata) {
  dim3 dimBlock(threads, 1, 1);
  dim3 dimGrid(blocks, 1, 1);

  // when there is only one warp per block, we need to allocate two warps
  // worth of shared memory so that we don't index shared memory out of bounds
  int smemSize =
      (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
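  // e.g. threads = 32  -> smemSize = 64 * sizeof(T) (the documented minimum
  // for the unrolled kernels above); threads = 256 -> 256 * sizeof(T).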

  // as kernel 9 - multi_warp_cg_reduce cannot work for fewer than 64 threads,
  // we fall back to kernel 7 in that case.
  if (threads < 64 && whichKernel == 9) {
    whichKernel = 7;
  }

  // choose which of the optimized versions of reduction to launch
  switch (whichKernel) {
    case 0:
      reduce0<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 1:
      reduce1<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 2:
      reduce2<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 3:
      reduce3<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 4:
      switch (threads) {
        case 512:
          reduce4<T, 512>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 256:
          reduce4<T, 256>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 128:
          reduce4<T, 128>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 64:
          reduce4<T, 64>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 32:
          reduce4<T, 32>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 16:
          reduce4<T, 16>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 8:
          reduce4<T, 8>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 4:
          reduce4<T, 4>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 2:
          reduce4<T, 2>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 1:
          reduce4<T, 1>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
      }
      break;

    case 5:
      switch (threads) {
        case 512:
          reduce5<T, 512>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 256:
          reduce5<T, 256>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 128:
          reduce5<T, 128>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 64:
          reduce5<T, 64>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 32:
          reduce5<T, 32>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 16:
          reduce5<T, 16>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 8:
          reduce5<T, 8>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 4:
          reduce5<T, 4>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 2:
          reduce5<T, 2>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 1:
          reduce5<T, 1>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
      }
      break;

    case 6:
      if (isPow2(size)) {
        switch (threads) {
          case 512:
            reduce6<T, 512, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 256:
            reduce6<T, 256, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 128:
            reduce6<T, 128, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 64:
            reduce6<T, 64, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 32:
            reduce6<T, 32, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 16:
            reduce6<T, 16, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 8:
            reduce6<T, 8, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 4:
            reduce6<T, 4, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 2:
            reduce6<T, 2, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 1:
            reduce6<T, 1, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      } else {
        switch (threads) {
          case 512:
            reduce6<T, 512, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 256:
            reduce6<T, 256, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 128:
            reduce6<T, 128, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 64:
            reduce6<T, 64, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 32:
            reduce6<T, 32, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 16:
            reduce6<T, 16, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 8:
            reduce6<T, 8, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 4:
            reduce6<T, 4, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 2:
            reduce6<T, 2, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 1:
            reduce6<T, 1, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      }
      break;

    case 7:
      // For reduce7 kernel we require only blockSize/warpSize
      // number of elements in shared memory
      smemSize = ((threads / 32) + 1) * sizeof(T);
      if (isPow2(size)) {
        switch (threads) {
          case 1024:
            reduce7<T, 1024, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 512:
            reduce7<T, 512, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 256:
            reduce7<T, 256, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 128:
            reduce7<T, 128, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 64:
            reduce7<T, 64, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 32:
            reduce7<T, 32, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 16:
            reduce7<T, 16, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 8:
            reduce7<T, 8, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 4:
            reduce7<T, 4, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 2:
            reduce7<T, 2, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 1:
            reduce7<T, 1, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      } else {
        switch (threads) {
          case 1024:
            reduce7<T, 1024, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 512:
            reduce7<T, 512, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 256:
            reduce7<T, 256, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 128:
            reduce7<T, 128, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 64:
            reduce7<T, 64, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 32:
            reduce7<T, 32, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 16:
            reduce7<T, 16, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 8:
            reduce7<T, 8, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 4:
            reduce7<T, 4, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 2:
            reduce7<T, 2, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 1:
            reduce7<T, 1, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      }
      break;

    case 8:
      cg_reduce<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 9:
      constexpr int numOfMultiWarpGroups = 2;
      smemSize = numOfMultiWarpGroups * sizeof(T);
      switch (threads) {
        case 1024:
          multi_warp_cg_reduce<T, 1024, 1024 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 512:
          multi_warp_cg_reduce<T, 512, 512 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 256:
          multi_warp_cg_reduce<T, 256, 256 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 128:
          multi_warp_cg_reduce<T, 128, 128 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        case 64:
          multi_warp_cg_reduce<T, 64, 64 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
        default:
          printf("thread block size of < 64 is not supported for this kernel\n");
          break;
      }
      break;
  }
}

// Instantiate the reduction function for 3 types
template void reduce<int>(int size, int threads, int blocks, int whichKernel,
                          int *d_idata, int *d_odata);

template void reduce<float>(int size, int threads, int blocks, int whichKernel,
                            float *d_idata, float *d_odata);

template void reduce<double>(int size, int threads, int blocks, int whichKernel,
                             double *d_idata, double *d_odata);
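
// Illustrative host-side usage (a sketch, not part of the original sample):
// the wrapper above only produces one partial sum per block, so a final pass
// is still needed. The helper name `exampleReduceSum`, the launch
// configuration (256 threads, 64 blocks) and the choice of kernel 7 are
// assumptions made for this example only; enable the block to try it.
#if 0
static float exampleReduceSum(const float *h_data, int n) {
  const int threads = 256;
  const int blocks = 64;
  float *d_idata = nullptr, *d_odata = nullptr;
  cudaMalloc((void **)&d_idata, n * sizeof(float));
  cudaMalloc((void **)&d_odata, blocks * sizeof(float));
  cudaMemcpy(d_idata, h_data, n * sizeof(float), cudaMemcpyHostToDevice);

  // First pass: each block writes its partial sum into d_odata.
  reduce<float>(n, threads, blocks, 7, d_idata, d_odata);

  // Final pass: accumulate the per-block partial sums on the host.
  float h_partial[blocks];
  cudaMemcpy(h_partial, d_odata, blocks * sizeof(float),
             cudaMemcpyDeviceToHost);
  float sum = 0.0f;
  for (int i = 0; i < blocks; ++i) sum += h_partial[i];

  cudaFree(d_idata);
  cudaFree(d_odata);
  return sum;
}
#endif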

#endif  // #ifndef _PARALLEL_REDUCTION_H_