/*! @file ParallelReduction_GPU.cu
    @brief Parallel reduction kernels

 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * * Neither the name of NVIDIA CORPORATION nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _PARALLEL_REDUCTION_H_
#define _PARALLEL_REDUCTION_H_

#define _CG_ABI_EXPERIMENTAL
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <stdio.h>

namespace cg = cooperative_groups;

// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template <class T>
struct SharedMemory {
  __device__ inline operator T *() {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }

  __device__ inline operator const T *() const {
    extern __shared__ int __smem[];
    return (T *)__smem;
  }
};

// specialize for double to avoid unaligned memory
// access compile errors
template <>
struct SharedMemory<double> {
  __device__ inline operator double *() {
    extern __shared__ double __smem_d[];
    return (double *)__smem_d;
  }

  __device__ inline operator const double *() const {
    extern __shared__ double __smem_d[];
    return (double *)__smem_d;
  }
};

template <class T>
__device__ __forceinline__ T warpReduceSum(unsigned int mask, T mySum) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    mySum += __shfl_down_sync(mask, mySum, offset);
  }
  return mySum;
}

#if __CUDA_ARCH__ >= 800
// Specialize warpReduceSum for int inputs to use the __reduce_add_sync
// intrinsic when on SM 8.0 or higher
template <>
__device__ __forceinline__ int warpReduceSum<int>(unsigned int mask,
                                                  int mySum) {
  mySum = __reduce_add_sync(mask, mySum);
  return mySum;
}
#endif
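
// --- Illustrative usage sketch (not part of the original NVIDIA sample) ---
// A minimal kernel showing how warpReduceSum() might be used on its own:
// every thread contributes one element and lane 0 of each warp writes that
// warp's partial sum. The full mask 0xffffffff assumes the block size is a
// multiple of warpSize; the kernel name and output layout are hypothetical.
template <class T>
__global__ void warpReduceSumExample(const T *g_in, T *g_warp_sums,
                                     unsigned int n) {
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  T val = (i < n) ? g_in[i] : 0;
  // butterfly-style shuffle reduction within the warp
  val = warpReduceSum<T>(0xffffffff, val);
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    // one partial sum per warp in the grid
    g_warp_sums[i / warpSize] = val;
  }
}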

/*
    Parallel sum reduction using shared memory
    - takes log(n) steps for n input elements
    - uses n threads
    - only works for power-of-2 arrays
*/

/*! This reduction interleaves which threads are active by using the modulo
    operator. This operator is very expensive on GPUs, and the interleaved
    inactivity means that no whole warps are active, which is also very
    inefficient */
template <class T>
__global__ void reduce0(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // load shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  sdata[tid] = (i < n) ? g_idata[i] : 0;

  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = 1; s < blockDim.x; s *= 2) {
    // modulo arithmetic is slow!
    if ((tid % (2 * s)) == 0) {
      sdata[tid] += sdata[tid + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/*! This version uses contiguous threads, but its interleaved
    addressing results in many shared memory bank conflicts.
*/
template <class T>
__global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // load shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  sdata[tid] = (i < n) ? g_idata[i] : 0;

  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = 1; s < blockDim.x; s *= 2) {
    int index = 2 * s * tid;

    if (index < blockDim.x) {
      sdata[index] += sdata[index + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/*
    This version uses sequential addressing -- no divergence or bank conflicts.
*/
template <class T>
__global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // load shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  sdata[tid] = (i < n) ? g_idata[i] : 0;

  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      sdata[tid] += sdata[tid + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/*
    This version uses n/2 threads --
    it performs the first level of reduction when reading from global memory.
*/
template <class T>
__global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

  T mySum = (i < n) ? g_idata[i] : 0;

  if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];

  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) {
      sdata[tid] = mySum = mySum + sdata[tid + s];
    }

    cg::sync(cta);
  }

  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = mySum;
}

/*
    This version uses the warp shuffle operation if available to reduce
    warp synchronization. When shuffle is not available the final warp's
    worth of work is unrolled to reduce looping overhead.
    See
    http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
    for additional information about using shuffle to perform a reduction
    within a warp.
    Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
    In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
    If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize>
__global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

  T mySum = (i < n) ? g_idata[i] : 0;

  if (i + blockSize < n) mySum += g_idata[i + blockSize];

  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
    if (tid < s) {
      sdata[tid] = mySum = mySum + sdata[tid + s];
    }

    cg::sync(cta);
  }

  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  if (cta.thread_rank() < 32) {
    // Fetch final intermediate sum from 2nd warp
    if (blockSize >= 64) mySum += sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      mySum += tile32.shfl_down(mySum, offset);
    }
  }

  // write result for this block to global mem
  if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}

/*
    This version is completely unrolled, unless warp shuffle is available, then
    shuffle is used within a loop. It uses a template parameter to achieve
    optimal code for any (power of 2) number of threads. This requires a switch
    statement in the host code to handle all the different thread block sizes at
    compile time. When shuffle is available, it is used to reduce warp
    synchronization.
    Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
    In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
    If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize>
__global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x;

  T mySum = (i < n) ? g_idata[i] : 0;

  if (i + blockSize < n) mySum += g_idata[i + blockSize];

  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  if ((blockSize >= 512) && (tid < 256)) {
    sdata[tid] = mySum = mySum + sdata[tid + 256];
  }

  cg::sync(cta);

  if ((blockSize >= 256) && (tid < 128)) {
    sdata[tid] = mySum = mySum + sdata[tid + 128];
  }

  cg::sync(cta);

  if ((blockSize >= 128) && (tid < 64)) {
    sdata[tid] = mySum = mySum + sdata[tid + 64];
  }

  cg::sync(cta);

  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  if (cta.thread_rank() < 32) {
    // Fetch final intermediate sum from 2nd warp
    if (blockSize >= 64) mySum += sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      mySum += tile32.shfl_down(mySum, offset);
    }
  }

  // write result for this block to global mem
  if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}

/*
    This version adds multiple elements per thread sequentially. This reduces
    the overall cost of the algorithm while keeping the work complexity O(n) and
    the step complexity O(log n). (Brent's Theorem optimization)
    Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
    In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
    If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) {
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = blockSize * gridDim.x;

  T mySum = 0;

  // we reduce multiple elements per thread. The number is determined by the
  // number of active thread blocks (via gridDim). More blocks will result
  // in a larger gridSize and therefore fewer elements per thread
  if (nIsPow2) {
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    gridSize = gridSize << 1;

    while (i < n) {
      mySum += g_idata[i];
      // ensure we don't read out of bounds -- this is optimized away for
      // powerOf2 sized arrays
      if ((i + blockSize) < n) {
        mySum += g_idata[i + blockSize];
      }
      i += gridSize;
    }
  } else {
    unsigned int i = blockIdx.x * blockSize + threadIdx.x;
    while (i < n) {
      mySum += g_idata[i];
      i += gridSize;
    }
  }

  // each thread puts its local sum into shared memory
  sdata[tid] = mySum;
  cg::sync(cta);

  // do reduction in shared mem
  if ((blockSize >= 512) && (tid < 256)) {
    sdata[tid] = mySum = mySum + sdata[tid + 256];
  }

  cg::sync(cta);

  if ((blockSize >= 256) && (tid < 128)) {
    sdata[tid] = mySum = mySum + sdata[tid + 128];
  }

  cg::sync(cta);

  if ((blockSize >= 128) && (tid < 64)) {
    sdata[tid] = mySum = mySum + sdata[tid + 64];
  }

  cg::sync(cta);

  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

  if (cta.thread_rank() < 32) {
    // Fetch final intermediate sum from 2nd warp
    if (blockSize >= 64) mySum += sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
      mySum += tile32.shfl_down(mySum, offset);
    }
  }

  // write result for this block to global mem
  if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}

template <typename T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce7(const T *__restrict__ g_idata, T *__restrict__ g_odata,
                        unsigned int n) {
  T *sdata = SharedMemory<T>();

  // perform first level of reduction,
  // reading from global memory, writing to shared memory
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = blockSize * gridDim.x;
  unsigned int maskLength = (blockSize & 31);  // 31 = warpSize-1
  maskLength = (maskLength > 0) ? (32 - maskLength) : maskLength;
  const unsigned int mask = (0xffffffff) >> maskLength;

  T mySum = 0;

  // we reduce multiple elements per thread. The number is determined by the
  // number of active thread blocks (via gridDim). More blocks will result
  // in a larger gridSize and therefore fewer elements per thread
  if (nIsPow2) {
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    gridSize = gridSize << 1;

    while (i < n) {
      mySum += g_idata[i];
      // ensure we don't read out of bounds -- this is optimized away for
      // powerOf2 sized arrays
      if ((i + blockSize) < n) {
        mySum += g_idata[i + blockSize];
      }
      i += gridSize;
    }
  } else {
    unsigned int i = blockIdx.x * blockSize + threadIdx.x;
    while (i < n) {
      mySum += g_idata[i];
      i += gridSize;
    }
  }

  // Reduce within warp using shuffle or reduce_add if T==int & CUDA_ARCH ==
  // SM 8.0
  mySum = warpReduceSum<T>(mask, mySum);

  // the first thread of each warp writes its warp's partial sum into shared
  // memory
  if ((tid % warpSize) == 0) {
    sdata[tid / warpSize] = mySum;
  }

  __syncthreads();

  const unsigned int shmem_extent =
      (blockSize / warpSize) > 0 ? (blockSize / warpSize) : 1;
  const unsigned int ballot_result = __ballot_sync(mask, tid < shmem_extent);
  if (tid < shmem_extent) {
    mySum = sdata[tid];
    // Reduce final warp using shuffle or reduce_add if T==int & CUDA_ARCH ==
    // SM 8.0
    mySum = warpReduceSum<T>(ballot_result, mySum);
  }

  // write result for this block to global mem
  if (tid == 0) {
    g_odata[blockIdx.x] = mySum;
  }
}

// Reduces a per-thread value across the given cooperative group using a
// plus-reduction and returns the result to the calling thread
template <typename T, typename Group>
__device__ T cg_reduce_n(T in, Group &threads) {
  return cg::reduce(threads, in, cg::plus<T>());
}

template <class T>
__global__ void cg_reduce(T *g_idata, T *g_odata, unsigned int n) {
  // Shared memory for intermediate steps
  T *sdata = SharedMemory<T>();
  // Handle to thread block group
  cg::thread_block cta = cg::this_thread_block();
  // Handle to tile in thread block
  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cta);

  unsigned int ctaSize = cta.size();
  unsigned int numCtas = gridDim.x;
  unsigned int threadRank = cta.thread_rank();
  unsigned int threadIndex = (blockIdx.x * ctaSize) + threadRank;

  T threadVal = 0;
  {
    unsigned int i = threadIndex;
    unsigned int indexStride = (numCtas * ctaSize);
    while (i < n) {
      threadVal += g_idata[i];
      i += indexStride;
    }
    sdata[threadRank] = threadVal;
  }

  // Wait for all tiles to finish and reduce within CTA
  {
    unsigned int ctaSteps = tile.meta_group_size();
    unsigned int ctaIndex = ctaSize >> 1;
    while (ctaIndex >= 32) {
      cta.sync();
      if (threadRank < ctaIndex) {
        threadVal += sdata[threadRank + ctaIndex];
        sdata[threadRank] = threadVal;
      }
      ctaSteps >>= 1;
      ctaIndex >>= 1;
    }
  }

  // Shuffle redux instead of smem redux
  {
    cta.sync();
    if (tile.meta_group_rank() == 0) {
      threadVal = cg_reduce_n(threadVal, tile);
    }
  }

  if (threadRank == 0) g_odata[blockIdx.x] = threadVal;
}

template <class T, size_t BlockSize, size_t MultiWarpGroupSize>
__global__ void multi_warp_cg_reduce(T *g_idata, T *g_odata, unsigned int n) {
  // Shared memory for intermediate steps
  T *sdata = SharedMemory<T>();
  __shared__ cg::experimental::block_tile_memory<sizeof(T), BlockSize> scratch;

  // Handle to thread block group
  auto cta = cg::experimental::this_thread_block(scratch);
  // Handle to multiWarpTile in thread block
  auto multiWarpTile = cg::experimental::tiled_partition<MultiWarpGroupSize>(cta);

  unsigned int gridSize = BlockSize * gridDim.x;
  T threadVal = 0;

  // we reduce multiple elements per thread. The number is determined by the
  // number of active thread blocks (via gridDim). More blocks will result
  // in a larger gridSize and therefore fewer elements per thread
  int nIsPow2 = !(n & (n - 1));
  if (nIsPow2) {
    unsigned int i = blockIdx.x * BlockSize * 2 + threadIdx.x;
    gridSize = gridSize << 1;

    while (i < n) {
      threadVal += g_idata[i];
      // ensure we don't read out of bounds -- this is optimized away for
      // powerOf2 sized arrays
      if ((i + BlockSize) < n) {
        threadVal += g_idata[i + BlockSize];
      }
      i += gridSize;
    }
  } else {
    unsigned int i = blockIdx.x * BlockSize + threadIdx.x;
    while (i < n) {
      threadVal += g_idata[i];
      i += gridSize;
    }
  }

  threadVal = cg_reduce_n(threadVal, multiWarpTile);

  if (multiWarpTile.thread_rank() == 0) {
    sdata[multiWarpTile.meta_group_rank()] = threadVal;
  }
  cg::sync(cta);

  if (threadIdx.x == 0) {
    threadVal = 0;
    for (int i = 0; i < multiWarpTile.meta_group_size(); i++) {
      threadVal += sdata[i];
    }
    g_odata[blockIdx.x] = threadVal;
  }
}

extern "C" bool isPow2(unsigned int x);
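// Note: isPow2() is only declared here and is assumed to be defined elsewhere
// in HyPar; it is expected to return true when x is a nonzero power of two,
// i.e. the equivalent of (x != 0) && ((x & (x - 1)) == 0), matching the inline
// power-of-two test used in multi_warp_cg_reduce above.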

////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template <class T>
void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata,
            T *d_odata) {
  dim3 dimBlock(threads, 1, 1);
  dim3 dimGrid(blocks, 1, 1);

  // when there is only one warp per block, we need to allocate two warps
  // worth of shared memory so that we don't index shared memory out of bounds
  int smemSize =
      (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);

  // kernel 9 (multi_warp_cg_reduce) does not support fewer than 64 threads
  // per block, so fall back to kernel 7 in that case
  if (threads < 64 && whichKernel == 9) {
    whichKernel = 7;
  }

  // choose which of the optimized versions of reduction to launch
  switch (whichKernel) {
    case 0:
      reduce0<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 1:
      reduce1<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 2:
      reduce2<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 3:
      reduce3<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;

    case 4:
      switch (threads) {
        case 512:
          reduce4<T, 512>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 256:
          reduce4<T, 256>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 128:
          reduce4<T, 128>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 64:
          reduce4<T, 64>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 32:
          reduce4<T, 32>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 16:
          reduce4<T, 16>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 8:
          reduce4<T, 8>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 4:
          reduce4<T, 4>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 2:
          reduce4<T, 2>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 1:
          reduce4<T, 1>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
      }

      break;

    case 5:
      switch (threads) {
        case 512:
          reduce5<T, 512>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 256:
          reduce5<T, 256>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 128:
          reduce5<T, 128>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 64:
          reduce5<T, 64>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 32:
          reduce5<T, 32>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 16:
          reduce5<T, 16>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 8:
          reduce5<T, 8>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 4:
          reduce5<T, 4>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 2:
          reduce5<T, 2>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 1:
          reduce5<T, 1>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;
      }

      break;

    case 6:
      if (isPow2(size)) {
        switch (threads) {
          case 512:
            reduce6<T, 512, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 256:
            reduce6<T, 256, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 128:
            reduce6<T, 128, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 64:
            reduce6<T, 64, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 32:
            reduce6<T, 32, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 16:
            reduce6<T, 16, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 8:
            reduce6<T, 8, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 4:
            reduce6<T, 4, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 2:
            reduce6<T, 2, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 1:
            reduce6<T, 1, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      } else {
        switch (threads) {
          case 512:
            reduce6<T, 512, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 256:
            reduce6<T, 256, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 128:
            reduce6<T, 128, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 64:
            reduce6<T, 64, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 32:
            reduce6<T, 32, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 16:
            reduce6<T, 16, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 8:
            reduce6<T, 8, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 4:
            reduce6<T, 4, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 2:
            reduce6<T, 2, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 1:
            reduce6<T, 1, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      }

      break;

    case 7:
      // For reduce7 kernel we require only blockSize/warpSize
      // number of elements in shared memory
      smemSize = ((threads / 32) + 1) * sizeof(T);
      if (isPow2(size)) {
        switch (threads) {
          case 1024:
            reduce7<T, 1024, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 512:
            reduce7<T, 512, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 256:
            reduce7<T, 256, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 128:
            reduce7<T, 128, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 64:
            reduce7<T, 64, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 32:
            reduce7<T, 32, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 16:
            reduce7<T, 16, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 8:
            reduce7<T, 8, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 4:
            reduce7<T, 4, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 2:
            reduce7<T, 2, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 1:
            reduce7<T, 1, true>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      } else {
        switch (threads) {
          case 1024:
            reduce7<T, 1024, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
          case 512:
            reduce7<T, 512, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 256:
            reduce7<T, 256, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 128:
            reduce7<T, 128, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 64:
            reduce7<T, 64, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 32:
            reduce7<T, 32, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 16:
            reduce7<T, 16, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 8:
            reduce7<T, 8, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 4:
            reduce7<T, 4, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 2:
            reduce7<T, 2, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;

          case 1:
            reduce7<T, 1, false>
                <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
            break;
        }
      }

      break;
    case 8:
      cg_reduce<T><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
      break;
    case 9:
      constexpr int numOfMultiWarpGroups = 2;
      smemSize = numOfMultiWarpGroups * sizeof(T);
      switch (threads) {
        case 1024:
          multi_warp_cg_reduce<T, 1024, 1024 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 512:
          multi_warp_cg_reduce<T, 512, 512 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 256:
          multi_warp_cg_reduce<T, 256, 256 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 128:
          multi_warp_cg_reduce<T, 128, 128 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        case 64:
          multi_warp_cg_reduce<T, 64, 64 / numOfMultiWarpGroups>
              <<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, size);
          break;

        default:
          printf("thread block size of < 64 is not supported for this kernel\n");
          break;
      }
      break;
  }
}

// Instantiate the reduction function for 3 types
template void reduce<int>(int size, int threads, int blocks, int whichKernel,
                          int *d_idata, int *d_odata);

template void reduce<float>(int size, int threads, int blocks, int whichKernel,
                            float *d_idata, float *d_odata);

template void reduce<double>(int size, int threads, int blocks, int whichKernel,
                             double *d_idata, double *d_odata);
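
// --- Illustrative host-side usage sketch (not part of the original file) ---
// One possible multi-pass driver built on reduce<T>() above: each pass turns
// the current array into one partial sum per block, and the loop repeats on
// the partial sums until a single value remains. The helper name, the fixed
// 256-thread launch, and the choice of kernel 7 are assumptions for the
// sketch, not HyPar's actual calling code. It assumes d_idata holds n device
// elements, d_odata has room for the first pass's partial sums, and that
// this .cu file is compiled with nvcc (so cudaMemcpy is available). Note
// that both buffers are overwritten as the passes ping-pong between them.
template <class T>
T reduceToScalarSketch(T *d_idata, T *d_odata, int n) {
  const int threads = 256;
  const int whichKernel = 7;
  int remaining = n;
  T *in = d_idata, *out = d_odata;

  while (remaining > 1) {
    // with two elements read per thread on the first level, one block
    // handles up to 2*threads inputs
    int blocks = (remaining + threads * 2 - 1) / (threads * 2);
    reduce<T>(remaining, threads, blocks, whichKernel, in, out);
    remaining = blocks;
    // ping-pong the buffers between passes
    T *tmp = in;
    in = out;
    out = tmp;
  }

  // after the final swap, `in` points at the buffer holding the total sum
  T result;
  cudaMemcpy(&result, in, sizeof(T), cudaMemcpyDeviceToHost);
  return result;
}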

#endif  // _PARALLEL_REDUCTION_H_