#include <cuda_profiler_api.h>

printf("ERROR! WENO functions other than yc or mapped have not been implemented on GPUs.\n");

#ifdef CUDA_VAR_ORDERDING_AOS
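/* Array-of-structures (AoS) variants: the nvars components of each grid point
   are stored contiguously, so stencil point q is addressed as q*nvars */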
int p = threadIdx.x + (blockDim.x * blockIdx.x);
if (p < ngrid_points) {
  const int max_ndims = 3;
  const double thirteen_by_twelve = 13.0 / 12.0;
  const double one_fourth = 1.0 / 4.0;

  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
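  /* w1, w2, w3 each hold one of the three WENO weights for four stencil
     families, stored in consecutive blocks of weno_size: left-biased flux (LF)
     at offset, left-biased solution (LU) at +weno_size, right-biased flux (RF)
     at +2*weno_size, right-biased solution (RU) at +3*weno_size */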
  ww1RF = w1 + 2*weno_size + offset;
  ww2RF = w2 + 2*weno_size + offset;
  ww3RF = w3 + 2*weno_size + offset;
  ww1LU = w1 + weno_size + offset;
  ww2LU = w2 + weno_size + offset;
  ww3LU = w3 + weno_size + offset;
  ww1RU = w1 + 3*weno_size + offset;
  ww2RU = w2 + 3*weno_size + offset;
  ww3RU = w3 + 3*weno_size + offset;

  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
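  /* cell indices for the two stencils at interface indexI[dir]: the
     left-biased stencil is centered on cell indexI[dir]-1, the right-biased
     stencil on cell indexI[dir] with its offsets mirrored */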
  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
  qm3L = qm1L - 2*stride;
  qm2L = qm1L - stride;
  qp1L = qm1L + stride;
  qp2L = qm1L + 2*stride;

  indexC[dir] = indexI[dir]; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
  qm3R = qm1R + 2*stride;
  qm2R = qm1R + stride;
  qp1R = qm1R - stride;
  qp2R = qm1R - 2*stride;
  const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
  m3LF = (fC+qm3L*nvars);
  m2LF = (fC+qm2L*nvars);
  m1LF = (fC+qm1L*nvars);
  p1LF = (fC+qp1L*nvars);
  p2LF = (fC+qp2L*nvars);
  const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
  m3RF = (fC+qm3R*nvars);
  m2RF = (fC+qm2R*nvars);
  m1RF = (fC+qm1R*nvars);
  p1RF = (fC+qp1R*nvars);
  p2RF = (fC+qp2R*nvars);
  const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
  m3LU = (uC+qm3L*nvars);
  m2LU = (uC+qm2L*nvars);
  m1LU = (uC+qm1L*nvars);
  p1LU = (uC+qp1L*nvars);
  p2LU = (uC+qp2L*nvars);
  const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
  m3RU = (uC+qm3R*nvars);
  m2RU = (uC+qm2R*nvars);
  m1RU = (uC+qm1R*nvars);
  p1RU = (uC+qp1R*nvars);
  p2RU = (uC+qp2R*nvars);
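  /* the optimal weights c1, c2, c3 (set in code elided here) depend on this
     boundary check: for CRWENO the compact optimal weights apply only at
     interior interfaces, while physical-boundary interfaces fall back to the
     standard WENO values */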
  if (   (is_mpi_ip_zero && (indexI[dir] == 0))
      || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {

  _WENOWeights_v_M_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
  _WENOWeights_v_M_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
  _WENOWeights_v_M_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
  _WENOWeights_v_M_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
double * __restrict__ fC,
double * __restrict__ uC,
double * __restrict__ x,

int ghosts = solver->ghosts;
int ndims  = solver->ndims;
int nvars  = solver->nvars;

int offset = weno->offset[dir];
int bounds_inter[ndims];
_ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
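/* does this MPI rank own the first/last slice of the physical domain along
   dir? (needed to detect physical-boundary interfaces in the kernel) */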
int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

#if defined(GPU_STAT)
cudaEvent_t startEvent, stopEvent;
float milliseconds = 0;

checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
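/* estimated bytes moved: 12 weight arrays are written (4 stencil families x 3
   weights each), plus the fC/uC reads, with ghost points counted only along
   the sweep direction */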
int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
int fCuC_memory_accessed = 1;
for (int d=0; d<ndims; d++) {
  if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
  else          fCuC_memory_accessed *= dim[d];
}
fCuC_memory_accessed *= nvars*sizeof(double);

checkCuda( cudaEventRecord(startEvent, 0) );
#endif
WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
  npoints_grid, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,

cudaDeviceSynchronize();

#if defined(GPU_STAT)
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
       "WENOFifthOrderCalculateWeightsM", milliseconds*1e-3, dir,
       (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds));

checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
#endif
int p = threadIdx.x + (blockDim.x * blockIdx.x);
if (p < ngrid_points) {
  const int max_ndims = 3;
  const double thirteen_by_twelve = 13.0 / 12.0;
  const double one_fourth = 1.0 / 4.0;

  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;

  ww1RF = w1 + 2*weno_size + offset;
  ww2RF = w2 + 2*weno_size + offset;
  ww3RF = w3 + 2*weno_size + offset;
  ww1LU = w1 + weno_size + offset;
  ww2LU = w2 + weno_size + offset;
  ww3LU = w3 + weno_size + offset;
  ww1RU = w1 + 3*weno_size + offset;
  ww2RU = w2 + 3*weno_size + offset;
  ww3RU = w3 + 3*weno_size + offset;
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;

  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
  qm3L = qm1L - 2*stride;
  qm2L = qm1L - stride;
  qp1L = qm1L + stride;
  qp2L = qm1L + 2*stride;

  indexC[dir] = indexI[dir]; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
  qm3R = qm1R + 2*stride;
  qm2R = qm1R + stride;
  qp1R = qm1R - stride;
  qp2R = qm1R - 2*stride;
  const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
  m3LF = (fC+qm3L*nvars);
  m2LF = (fC+qm2L*nvars);
  m1LF = (fC+qm1L*nvars);
  p1LF = (fC+qp1L*nvars);
  p2LF = (fC+qp2L*nvars);
  const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
  m3RF = (fC+qm3R*nvars);
  m2RF = (fC+qm2R*nvars);
  m1RF = (fC+qm1R*nvars);
  p1RF = (fC+qp1R*nvars);
  p2RF = (fC+qp2R*nvars);
  const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
  m3LU = (uC+qm3L*nvars);
  m2LU = (uC+qm2L*nvars);
  m1LU = (uC+qm1L*nvars);
  p1LU = (uC+qp1L*nvars);
  p2LU = (uC+qp2L*nvars);
  const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
  m3RU = (uC+qm3R*nvars);
  m2RU = (uC+qm2R*nvars);
  m1RU = (uC+qm1R*nvars);
  p1RU = (uC+qp1R*nvars);
  p2RU = (uC+qp2R*nvars);
  if (   (is_mpi_ip_zero && (indexI[dir] == 0))
      || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {

  _WENOWeights_v_YC_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
  _WENOWeights_v_YC_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
  _WENOWeights_v_YC_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
  _WENOWeights_v_YC_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
double * __restrict__ fC,
double * __restrict__ uC,
double * __restrict__ x,

int ghosts = solver->ghosts;
int ndims  = solver->ndims;
int nvars  = solver->nvars;

int offset = weno->offset[dir];
int bounds_inter[ndims];
_ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
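/* ceiling division: enough 256-thread blocks to cover all interfaces; this
   variant hard-codes the block size instead of using GPU_THREADS_PER_BLOCK */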
int nblocks = (ngrid_points - 1) / 256 + 1;
int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

cudaEvent_t startEvent, stopEvent;
float milliseconds = 0;

checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );

checkCuda( cudaEventRecord(startEvent, 0) );
WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, 256>>>(
  ngrid_points, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,

checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
printf("WENOFifthOrderCalculateWeightsYC_GPU:\n");
printf("  GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f weno->size = %d weno->offset[1] = %d\n",
       milliseconds*1e-3, dir,
       (1e-6*((12*weno->offset[1]+2*(nvars*(dim[0]+2*ghosts)*dim[1]))*sizeof(double)))/milliseconds,

checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
int npoints_local_wghosts,

int p = threadIdx.x + (blockDim.x * blockIdx.x);
if (p < npoints_grid) {
  const int max_ndims = 3;
  const double thirteen_by_twelve = 13.0 / 12.0;
  const double one_fourth = 1.0 / 4.0;

  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;

  ww1RF = w1 + 2*weno_size + offset;
  ww2RF = w2 + 2*weno_size + offset;
  ww3RF = w3 + 2*weno_size + offset;
  ww1LU = w1 + weno_size + offset;
  ww2LU = w2 + weno_size + offset;
  ww3LU = w3 + weno_size + offset;
  ww1RU = w1 + 3*weno_size + offset;
  ww2RU = w2 + 3*weno_size + offset;
  ww3RU = w3 + 3*weno_size + offset;
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;

  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
  qm3L = qm1L - 2*stride;
  qm2L = qm1L - stride;
  qp1L = qm1L + stride;
  qp2L = qm1L + 2*stride;

  indexC[dir] = indexI[dir]; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
  qm3R = qm1R + 2*stride;
  qm2R = qm1R + stride;
  qp1R = qm1R - stride;
  qp2R = qm1R - 2*stride;
  for (int j=0; j<nvars; j++) {

    const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
    const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
    const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
    const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
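    /* structure-of-arrays layout: each variable's field occupies a contiguous
       block of npoints_local_wghosts entries, so all stencil indices advance
       by one block per variable */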
    qm3L += npoints_local_wghosts;
    qm2L += npoints_local_wghosts;
    qm1L += npoints_local_wghosts;
    qp1L += npoints_local_wghosts;
    qp2L += npoints_local_wghosts;

    qm3R += npoints_local_wghosts;
    qm2R += npoints_local_wghosts;
    qm1R += npoints_local_wghosts;
    qp1R += npoints_local_wghosts;
    qp2R += npoints_local_wghosts;

    if (   (is_mpi_ip_zero && (indexI[dir] == 0))
        || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {

    _WENOWeights_v_M_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
    _WENOWeights_v_M_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
    _WENOWeights_v_M_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
    _WENOWeights_v_M_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
double * __restrict__ fC,
double * __restrict__ uC,
double * __restrict__ x,

int ghosts = solver->ghosts;
int ndims  = solver->ndims;
int nvars  = solver->nvars;

int offset = weno->offset[dir];
int bounds_inter[ndims];
_ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;

int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

#if defined(GPU_STAT)
cudaEvent_t startEvent, stopEvent;
float milliseconds = 0;

checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );

int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
int fCuC_memory_accessed = 1;
for (int d=0; d<ndims; d++) {
  if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
  else          fCuC_memory_accessed *= dim[d];
}
fCuC_memory_accessed *= nvars*sizeof(double);

checkCuda( cudaEventRecord(startEvent, 0) );
#endif

WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,

cudaDeviceSynchronize();

#if defined(GPU_STAT)
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
       "WENOFifthOrderCalculateWeightsM2", milliseconds*1e-3, dir,
       (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),

checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
#endif
int npoints_local_wghosts,

const int * __restrict__ dim,
const double * __restrict__ fC,
const double * __restrict__ uC,
double * __restrict__ w1,
double * __restrict__ w2,
double * __restrict__ w3

int p = threadIdx.x + (blockDim.x * blockIdx.x);
if (p < npoints_grid) {
  const int max_ndims = 3;
  const double thirteen_by_twelve = 13.0 / 12.0;
  const double one_fourth = 1.0 / 4.0;

  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;

  ww1RF = w1 + 2*weno_size + offset;
  ww2RF = w2 + 2*weno_size + offset;
  ww3RF = w3 + 2*weno_size + offset;
  ww1LU = w1 + weno_size + offset;
  ww2LU = w2 + weno_size + offset;
  ww3LU = w3 + weno_size + offset;
  ww1RU = w1 + 3*weno_size + offset;
  ww2RU = w2 + 3*weno_size + offset;
  ww3RU = w3 + 3*weno_size + offset;

  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;

  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
  qm3L = qm1L - 2*stride;
  qm2L = qm1L - stride;
  qp1L = qm1L + stride;
  qp2L = qm1L + 2*stride;

  indexC[dir] = indexI[dir]; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
  qm3R = qm1R + 2*stride;
  qm2R = qm1R + stride;
  qp1R = qm1R - stride;
  qp2R = qm1R - 2*stride;
  for (int j=0; j<nvars; j++) {

    const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
    const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
    const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
    const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
    qm3L += npoints_local_wghosts;
    qm2L += npoints_local_wghosts;
    qm1L += npoints_local_wghosts;
    qp1L += npoints_local_wghosts;
    qp2L += npoints_local_wghosts;

    qm3R += npoints_local_wghosts;
    qm2R += npoints_local_wghosts;
    qm1R += npoints_local_wghosts;
    qp1R += npoints_local_wghosts;
    qp2R += npoints_local_wghosts;

    if (   (is_mpi_ip_zero && (indexI[dir] == 0))
        || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {

    _WENOWeights_v_YC_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
    _WENOWeights_v_YC_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
    _WENOWeights_v_YC_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
    _WENOWeights_v_YC_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
double * __restrict__ fC,
double * __restrict__ uC,
double * __restrict__ x,

int ghosts = solver->ghosts;
int ndims  = solver->ndims;
int nvars  = solver->nvars;

int offset = weno->offset[dir];
int bounds_inter[ndims];
_ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;

int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

#if defined(GPU_STAT)
cudaEvent_t startEvent, stopEvent;
float milliseconds = 0;

checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );

int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
int fCuC_memory_accessed = 1;
for (int d=0; d<ndims; d++) {
  if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
  else          fCuC_memory_accessed *= dim[d];
}
fCuC_memory_accessed *= nvars*sizeof(double);

checkCuda( cudaEventRecord(startEvent, 0) );
#endif

WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,

cudaDeviceSynchronize();

#if defined(GPU_STAT)
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
       "WENOFifthOrderCalculateWeightsYC2", milliseconds*1e-3, dir,
       (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),

checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
#endif