HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
WENOFifthOrderCalculateWeights_GPU.cu
Go to the documentation of this file.
1 
6 #include <stdlib.h>
7 #include <string.h>
8 #include <time.h>
9 
10 #include <cuda_profiler_api.h>
11 
12 #include <basic.h>
13 #include <basic_gpu.h>
14 #include <arrayfunctions.h>
15 #include <matmult_native.h>
16 #include <mathfunctions.h>
17 #include <interpolation.h>
18 #include <mpivars.h>
19 #include <hypar.h>
20 
21 #include <arrayfunctions_gpu.h>
22 
23 
24 static int gpuWENOFifthOrderCalculateWeightsYC(double*,double*,double*,int,void*,void*);
25 static int gpuWENOFifthOrderCalculateWeightsM(double*,double*,double*,int,void*,void*);
26 
/*! Compute the nonlinear WENO weights on the GPU for the fifth-order
    WENO-type schemes. Dispatches to the Yamaleev-Carpenter (ESWENO) or
    mapped-WENO implementation depending on the WENO parameters; any
    other weight formulation is not available on GPUs.

    Returns 0 on success, 1 if the requested formulation is unsupported. */
int gpuWENOFifthOrderCalculateWeights(
  double *fC, /*!< Array of cell-centered values of the function to interpolate */
  double *uC, /*!< Array of cell-centered values of the solution */
  double *x,  /*!< Grid coordinates (passed through to the implementations) */
  int dir,    /*!< Spatial dimension along which to interpolate */
  void *s,    /*!< Solver object of type #HyPar */
  void *m     /*!< MPI object of type #MPIVariables */
)
{
  HyPar          *solver = (HyPar*)          s;
  WENOParameters *weno   = (WENOParameters*) solver->interp;
  MPIVariables   *mpi    = (MPIVariables*)   m;

  if      (weno->yc)     return(gpuWENOFifthOrderCalculateWeightsYC(fC,uC,x,dir,solver,mpi));
  else if (weno->mapped) return(gpuWENOFifthOrderCalculateWeightsM (fC,uC,x,dir,solver,mpi));
  else {
    /* error path: report on stderr, not stdout */
    fprintf(stderr,
            "ERROR! WENO functions other than yc or mapped have not been implemented on GPUs.\n");
    return 1;
  }
}
50 
51 #ifdef CUDA_VAR_ORDERDING_AOS
52 
/*! GPU kernel: compute the mapped (Henrick-Aslam-Powers-type) WENO weights
    at one grid interface (AoS data layout — the nvars components of a grid
    point are contiguous in memory). One thread handles one interface point
    and computes four sets of weights: left- and right-biased stencils, for
    both the flux array fC and the solution array uC. */
__global__
void WENOFifthOrderCalculateWeightsM_kernel(
  int ngrid_points,     /*!< number of interface points along dir */
  int ndims,            /*!< number of spatial dimensions */
  int dir,              /*!< direction along which weights are computed */
  int ghosts,           /*!< number of ghost points */
  int nvars,            /*!< number of vector components of the solution */
  int weno_size,        /*!< size of one partition of the weight arrays */
  int offset,           /*!< offset of direction dir within the weight arrays */
  int stride,           /*!< array stride (with ghosts) along dir */
  int is_crweno,        /*!< 1 if the spatial scheme is CRWENO5, else 0 */
  int is_mpi_ip_zero,   /*!< 1 if this MPI rank is the first along dir */
  int is_mpi_ip_proc,   /*!< 1 if this MPI rank is the last along dir */
  double weno_eps,      /*!< WENO epsilon parameter */
  const int *dim,       /*!< local grid dimensions (device copy) */
  const double *fC,     /*!< cell-centered flux values */
  const double *uC,     /*!< cell-centered solution values */
  double *w1,           /*!< first nonlinear weights (output) */
  double *w2,           /*!< second nonlinear weights (output) */
  double *w3            /*!< third nonlinear weights (output) */
)
{
  int p = threadIdx.x + (blockDim.x * blockIdx.x);
  if (p < ngrid_points) {
    const int max_ndims = 3;
    /* constants referenced by the _WENOWeights_v_M_ macro expansion */
    const double thirteen_by_twelve = 13.0 / 12.0;
    const double one_fourth = 1.0 / 4.0;

    int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
    int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;

    /* the weight arrays are partitioned as [LF | LU | RF | RU],
       each partition weno_size long, offset by this direction's offset */
    double *ww1LF = w1 + offset;
    double *ww2LF = w2 + offset;
    double *ww3LF = w3 + offset;
    double *ww1RF = w1 + 2*weno_size + offset;
    double *ww2RF = w2 + 2*weno_size + offset;
    double *ww3RF = w3 + 2*weno_size + offset;
    double *ww1LU = w1 + weno_size + offset;
    double *ww2LU = w2 + weno_size + offset;
    double *ww3LU = w3 + weno_size + offset;
    double *ww1RU = w1 + 2*weno_size + weno_size + offset;
    double *ww2RU = w2 + 2*weno_size + weno_size + offset;
    double *ww3RU = w3 + 2*weno_size + weno_size + offset;

    /* recover the multi-dimensional interface index from the flat thread id */
    _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
    _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
    _ArrayCopy1D_(indexC,indexI,ndims);

    /* left-biased stencil: anchored on the cell left of the interface */
    indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
    qm3L = qm1L - 2*stride;
    qm2L = qm1L - stride;
    qp1L = qm1L + stride;
    qp2L = qm1L + 2*stride;

    /* right-biased stencil: anchored on the cell right of the interface,
       mirrored relative to the left-biased one */
    indexC[dir] = indexI[dir]  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
    qm3R = qm1R + 2*stride;
    qm2R = qm1R + stride;
    qp1R = qm1R - stride;
    qp2R = qm1R - 2*stride;

    /* Defining stencil points (AoS: grid point q starts at q*nvars) */
    const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
    m3LF = (fC+qm3L*nvars);
    m2LF = (fC+qm2L*nvars);
    m1LF = (fC+qm1L*nvars);
    p1LF = (fC+qp1L*nvars);
    p2LF = (fC+qp2L*nvars);
    const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
    m3RF = (fC+qm3R*nvars);
    m2RF = (fC+qm2R*nvars);
    m1RF = (fC+qm1R*nvars);
    p1RF = (fC+qp1R*nvars);
    p2RF = (fC+qp2R*nvars);
    const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
    m3LU = (uC+qm3L*nvars);
    m2LU = (uC+qm2L*nvars);
    m1LU = (uC+qm1L*nvars);
    p1LU = (uC+qp1L*nvars);
    p2LU = (uC+qp2L*nvars);
    const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
    m3RU = (uC+qm3R*nvars);
    m2RU = (uC+qm2R*nvars);
    m1RU = (uC+qm1R*nvars);
    p1RU = (uC+qp1R*nvars);
    p2RU = (uC+qp2R*nvars);

    /* choose the optimal (linear) weights */
    double c1, c2, c3;
    if (is_crweno) {
      if (   (is_mpi_ip_zero && (indexI[dir] == 0       ))
          || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
        /* Use WENO5 at the physical boundaries */
        c1 = _WENO_OPTIMAL_WEIGHT_1_;
        c2 = _WENO_OPTIMAL_WEIGHT_2_;
        c3 = _WENO_OPTIMAL_WEIGHT_3_;
      } else {
        /* CRWENO5 at the interior points */
        c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
        c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
        c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
      }
    } else {
      /* WENO5 and HCWENO5 */
      c1 = _WENO_OPTIMAL_WEIGHT_1_;
      c2 = _WENO_OPTIMAL_WEIGHT_2_;
      c3 = _WENO_OPTIMAL_WEIGHT_3_;
    }

    /* calculate WENO weights */
    _WENOWeights_v_M_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
    _WENOWeights_v_M_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
    _WENOWeights_v_M_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
    _WENOWeights_v_M_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
  }

  return;
}
172 
/*! Host wrapper: compute the mapped WENO weights on the GPU (AoS layout).
    Launches one thread per grid interface along dir; the kernel writes the
    weights into weno->w1/w2/w3. Synchronizes with the device before
    returning so the weights are ready for use.

    Returns 0 on success. */
static int gpuWENOFifthOrderCalculateWeightsM(
  double * __restrict__ fC, /*!< Cell-centered values of the flux function */
  double * __restrict__ uC, /*!< Cell-centered values of the solution */
  double * __restrict__ x,  /*!< Grid coordinates (unused in this routine) */
  int dir,                  /*!< Spatial dimension along which to interpolate */
  void *s,                  /*!< Solver object of type #HyPar */
  void *m                   /*!< MPI object of type #MPIVariables */
)
{
  HyPar          *solver = (HyPar*)          s;
  WENOParameters *weno   = (WENOParameters*) solver->interp;
  MPIVariables   *mpi    = (MPIVariables*)   m;

  int ghosts  = solver->ghosts;
  int ndims   = solver->ndims;
  int nvars   = solver->nvars;
  int *dim    = solver->dim_local;
  int *stride = solver->stride_with_ghosts;

  /* calculate dimension offset and number of interface points
     (one more interface than cells along dir) */
  int offset = weno->offset[dir];
  int bounds_inter[ndims];
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
  int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1; /* ceil division */

  int is_crweno      = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds = 0;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  /* bytes touched, for a rough effective-bandwidth estimate;
     size_t because these products overflow a 32-bit int on large grids */
  size_t weno_memory_accessed = 12*(size_t)npoints_grid*nvars*sizeof(double);
  size_t fCuC_memory_accessed = 1;
  for (int d=0; d<ndims; d++) {
    if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
    else          fCuC_memory_accessed *= dim[d];
  }
  fCuC_memory_accessed *= nvars*sizeof(double);

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
    is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
    solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
  );
  checkCuda( cudaGetLastError() ); /* catch launch-configuration errors */
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
         "WENOFifthOrderCalculateWeightsM", milliseconds*1e-3, dir,
         (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds));

  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
#endif

  return 0;
}
270 
/*! GPU kernel: compute the Yamaleev-Carpenter (ESWENO-type) WENO weights
    at one grid interface (AoS data layout — the nvars components of a grid
    point are contiguous in memory). One thread handles one interface point
    and computes four sets of weights: left- and right-biased stencils, for
    both the flux array fC and the solution array uC. */
__global__
void WENOFifthOrderCalculateWeightsYC_kernel(
  int ngrid_points,     /*!< number of interface points along dir */
  int ndims,            /*!< number of spatial dimensions */
  int dir,              /*!< direction along which weights are computed */
  int ghosts,           /*!< number of ghost points */
  int nvars,            /*!< number of vector components of the solution */
  int weno_size,        /*!< size of one partition of the weight arrays */
  int offset,           /*!< offset of direction dir within the weight arrays */
  int stride,           /*!< array stride (with ghosts) along dir */
  int is_crweno,        /*!< 1 if the spatial scheme is CRWENO5, else 0 */
  int is_mpi_ip_zero,   /*!< 1 if this MPI rank is the first along dir */
  int is_mpi_ip_proc,   /*!< 1 if this MPI rank is the last along dir */
  double weno_eps,      /*!< WENO epsilon parameter */
  const int *dim,       /*!< local grid dimensions (device copy) */
  const double *fC,     /*!< cell-centered flux values */
  const double *uC,     /*!< cell-centered solution values */
  double *w1,           /*!< first nonlinear weights (output) */
  double *w2,           /*!< second nonlinear weights (output) */
  double *w3            /*!< third nonlinear weights (output) */
)
{
  int p = threadIdx.x + (blockDim.x * blockIdx.x);
  if (p < ngrid_points) {
    const int max_ndims = 3;
    /* constants referenced by the _WENOWeights_v_YC_ macro expansion */
    const double thirteen_by_twelve = 13.0 / 12.0;
    const double one_fourth = 1.0 / 4.0;

    int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
    int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;

    /* the weight arrays are partitioned as [LF | LU | RF | RU],
       each partition weno_size long, offset by this direction's offset */
    double *ww1LF = w1 + offset;
    double *ww2LF = w2 + offset;
    double *ww3LF = w3 + offset;
    double *ww1RF = w1 + 2*weno_size + offset;
    double *ww2RF = w2 + 2*weno_size + offset;
    double *ww3RF = w3 + 2*weno_size + offset;
    double *ww1LU = w1 + weno_size + offset;
    double *ww2LU = w2 + weno_size + offset;
    double *ww3LU = w3 + weno_size + offset;
    double *ww1RU = w1 + 2*weno_size + weno_size + offset;
    double *ww2RU = w2 + 2*weno_size + weno_size + offset;
    double *ww3RU = w3 + 2*weno_size + weno_size + offset;

    /* recover the multi-dimensional interface index from the flat thread id */
    _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
    _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
    _ArrayCopy1D_(indexC,indexI,ndims);

    /* left-biased stencil: anchored on the cell left of the interface */
    indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
    qm3L = qm1L - 2*stride;
    qm2L = qm1L - stride;
    qp1L = qm1L + stride;
    qp2L = qm1L + 2*stride;

    /* right-biased stencil: anchored on the cell right of the interface,
       mirrored relative to the left-biased one */
    indexC[dir] = indexI[dir]  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
    qm3R = qm1R + 2*stride;
    qm2R = qm1R + stride;
    qp1R = qm1R - stride;
    qp2R = qm1R - 2*stride;

    /* Defining stencil points (AoS: grid point q starts at q*nvars) */
    const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
    m3LF = (fC+qm3L*nvars);
    m2LF = (fC+qm2L*nvars);
    m1LF = (fC+qm1L*nvars);
    p1LF = (fC+qp1L*nvars);
    p2LF = (fC+qp2L*nvars);
    const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
    m3RF = (fC+qm3R*nvars);
    m2RF = (fC+qm2R*nvars);
    m1RF = (fC+qm1R*nvars);
    p1RF = (fC+qp1R*nvars);
    p2RF = (fC+qp2R*nvars);
    const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
    m3LU = (uC+qm3L*nvars);
    m2LU = (uC+qm2L*nvars);
    m1LU = (uC+qm1L*nvars);
    p1LU = (uC+qp1L*nvars);
    p2LU = (uC+qp2L*nvars);
    const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
    m3RU = (uC+qm3R*nvars);
    m2RU = (uC+qm2R*nvars);
    m1RU = (uC+qm1R*nvars);
    p1RU = (uC+qp1R*nvars);
    p2RU = (uC+qp2R*nvars);

    /* choose the optimal (linear) weights */
    double c1, c2, c3;
    if (is_crweno) {
      if (   (is_mpi_ip_zero && (indexI[dir] == 0       ))
          || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
        /* Use WENO5 at the physical boundaries */
        c1 = _WENO_OPTIMAL_WEIGHT_1_;
        c2 = _WENO_OPTIMAL_WEIGHT_2_;
        c3 = _WENO_OPTIMAL_WEIGHT_3_;
      } else {
        /* CRWENO5 at the interior points */
        c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
        c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
        c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
      }
    } else {
      /* WENO5 and HCWENO5 */
      c1 = _WENO_OPTIMAL_WEIGHT_1_;
      c2 = _WENO_OPTIMAL_WEIGHT_2_;
      c3 = _WENO_OPTIMAL_WEIGHT_3_;
    }

    /* calculate WENO weights */
    _WENOWeights_v_YC_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
    _WENOWeights_v_YC_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
    _WENOWeights_v_YC_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
    _WENOWeights_v_YC_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
  }

  return;
}
390 
/*! Host wrapper: compute the Yamaleev-Carpenter (ESWENO) WENO weights on
    the GPU (AoS layout). Launches one thread per grid interface along dir;
    the kernel writes the weights into weno->w1/w2/w3.

    Fixes relative to the previous revision: the block size is
    GPU_THREADS_PER_BLOCK (a hard-coded 256 debug value was left in, with
    the correct line commented out), and the event timing + bandwidth
    printout is guarded by GPU_STAT — consistent with the sibling
    gpuWENOFifthOrderCalculateWeightsM. The bandwidth estimate now uses the
    dimension-general byte counts instead of a 2D-only hard-coded formula.

    Returns 0 on success. */
static int gpuWENOFifthOrderCalculateWeightsYC(
  double * __restrict__ fC, /*!< Cell-centered values of the flux function */
  double * __restrict__ uC, /*!< Cell-centered values of the solution */
  double * __restrict__ x,  /*!< Grid coordinates (unused in this routine) */
  int dir,                  /*!< Spatial dimension along which to interpolate */
  void *s,                  /*!< Solver object of type #HyPar */
  void *m                   /*!< MPI object of type #MPIVariables */
)
{
  HyPar          *solver = (HyPar*)          s;
  WENOParameters *weno   = (WENOParameters*) solver->interp;
  MPIVariables   *mpi    = (MPIVariables*)   m;

  int ghosts  = solver->ghosts;
  int ndims   = solver->ndims;
  int nvars   = solver->nvars;
  int *dim    = solver->dim_local;
  int *stride = solver->stride_with_ghosts;

  /* calculate dimension offset and number of interface points
     (one more interface than cells along dir) */
  int offset = weno->offset[dir];
  int bounds_inter[ndims];
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
  int ngrid_points; _ArrayProduct1D_(bounds_inter,ndims,ngrid_points);
  int nblocks = (ngrid_points-1) / GPU_THREADS_PER_BLOCK + 1; /* ceil division */

  int is_crweno      = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds = 0;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  /* bytes touched, for a rough effective-bandwidth estimate;
     size_t because these products overflow a 32-bit int on large grids */
  size_t weno_memory_accessed = 12*(size_t)ngrid_points*nvars*sizeof(double);
  size_t fCuC_memory_accessed = 1;
  for (int d=0; d<ndims; d++) {
    if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
    else          fCuC_memory_accessed *= dim[d];
  }
  fCuC_memory_accessed *= nvars*sizeof(double);

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    ngrid_points, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
    is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
    solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
  );
  checkCuda( cudaGetLastError() ); /* catch launch-configuration errors */
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
         "WENOFifthOrderCalculateWeightsYC", milliseconds*1e-3, dir,
         (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds));

  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
#endif

  return (0);
}
480 
481 #else
482 
/*! GPU kernel: compute the mapped (Henrick-Aslam-Powers-type) WENO weights
    at one grid interface (SoA data layout — consecutive grid points of one
    solution component are contiguous; components are npoints_local_wghosts
    apart). One thread handles one interface point and loops over the nvars
    components, computing four sets of weights each: left- and right-biased
    stencils, for both the flux array fC and the solution array uC. */
__global__
void WENOFifthOrderCalculateWeightsM_kernel(
  int npoints_grid,          /*!< number of interface points along dir */
  int npoints_local_wghosts, /*!< local grid points incl. ghosts (SoA component stride) */
  int ndims,                 /*!< number of spatial dimensions */
  int dir,                   /*!< direction along which weights are computed */
  int ghosts,                /*!< number of ghost points */
  int nvars,                 /*!< number of vector components of the solution */
  int weno_size,             /*!< size of one partition of the weight arrays */
  int offset,                /*!< offset of direction dir within the weight arrays */
  int stride,                /*!< array stride (with ghosts) along dir */
  int is_crweno,             /*!< 1 if the spatial scheme is CRWENO5, else 0 */
  int is_mpi_ip_zero,        /*!< 1 if this MPI rank is the first along dir */
  int is_mpi_ip_proc,        /*!< 1 if this MPI rank is the last along dir */
  double weno_eps,           /*!< WENO epsilon parameter */
  const int *dim,            /*!< local grid dimensions (device copy) */
  const double *fC,          /*!< cell-centered flux values */
  const double *uC,          /*!< cell-centered solution values */
  double *w1,                /*!< first nonlinear weights (output) */
  double *w2,                /*!< second nonlinear weights (output) */
  double *w3                 /*!< third nonlinear weights (output) */
)
{
  int p = threadIdx.x + (blockDim.x * blockIdx.x);
  if (p < npoints_grid) {
    const int max_ndims = 3;
    /* constants referenced by the _WENOWeights_v_M_Scalar_ macro expansion */
    const double thirteen_by_twelve = 13.0 / 12.0;
    const double one_fourth = 1.0 / 4.0;

    int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
    int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;

    /* the weight arrays are partitioned as [LF | LU | RF | RU],
       each partition weno_size long, offset by this direction's offset */
    double *ww1LF = w1 + offset;
    double *ww2LF = w2 + offset;
    double *ww3LF = w3 + offset;
    double *ww1RF = w1 + 2*weno_size + offset;
    double *ww2RF = w2 + 2*weno_size + offset;
    double *ww3RF = w3 + 2*weno_size + offset;
    double *ww1LU = w1 + weno_size + offset;
    double *ww2LU = w2 + weno_size + offset;
    double *ww3LU = w3 + weno_size + offset;
    double *ww1RU = w1 + 2*weno_size + weno_size + offset;
    double *ww2RU = w2 + 2*weno_size + weno_size + offset;
    double *ww3RU = w3 + 2*weno_size + weno_size + offset;

    /* recover the multi-dimensional interface index from the flat thread id */
    _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
    _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
    _ArrayCopy1D_(indexC,indexI,ndims);

    /* left-biased stencil: anchored on the cell left of the interface */
    indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
    qm3L = qm1L - 2*stride;
    qm2L = qm1L - stride;
    qp1L = qm1L + stride;
    qp2L = qm1L + 2*stride;

    /* right-biased stencil: anchored on the cell right of the interface,
       mirrored relative to the left-biased one */
    indexC[dir] = indexI[dir]  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
    qm3R = qm1R + 2*stride;
    qm2R = qm1R + stride;
    qp1R = qm1R - stride;
    qp2R = qm1R - 2*stride;

    for (int j=0; j<nvars; j++) {
      /* Defining stencil points for component j (SoA: same q indices,
         advanced by npoints_local_wghosts per component below) */
      const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
      m3LF = (fC+qm3L);
      m2LF = (fC+qm2L);
      m1LF = (fC+qm1L);
      p1LF = (fC+qp1L);
      p2LF = (fC+qp2L);
      const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
      m3RF = (fC+qm3R);
      m2RF = (fC+qm2R);
      m1RF = (fC+qm1R);
      p1RF = (fC+qp1R);
      p2RF = (fC+qp2R);
      const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
      m3LU = (uC+qm3L);
      m2LU = (uC+qm2L);
      m1LU = (uC+qm1L);
      p1LU = (uC+qp1L);
      p2LU = (uC+qp2L);
      const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
      m3RU = (uC+qm3R);
      m2RU = (uC+qm2R);
      m1RU = (uC+qm1R);
      p1RU = (uC+qp1R);
      p2RU = (uC+qp2R);

      /* advance the flat indices to the next solution component */
      qm3L += npoints_local_wghosts;
      qm2L += npoints_local_wghosts;
      qm1L += npoints_local_wghosts;
      qp1L += npoints_local_wghosts;
      qp2L += npoints_local_wghosts;

      qm3R += npoints_local_wghosts;
      qm2R += npoints_local_wghosts;
      qm1R += npoints_local_wghosts;
      qp1R += npoints_local_wghosts;
      qp2R += npoints_local_wghosts;

      /* choose the optimal (linear) weights */
      double c1, c2, c3;
      if (is_crweno) {
        if (   (is_mpi_ip_zero && (indexI[dir] == 0       ))
            || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
          /* Use WENO5 at the physical boundaries */
          c1 = _WENO_OPTIMAL_WEIGHT_1_;
          c2 = _WENO_OPTIMAL_WEIGHT_2_;
          c3 = _WENO_OPTIMAL_WEIGHT_3_;
        } else {
          /* CRWENO5 at the interior points */
          c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
          c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
          c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
        }
      } else {
        /* WENO5 and HCWENO5 */
        c1 = _WENO_OPTIMAL_WEIGHT_1_;
        c2 = _WENO_OPTIMAL_WEIGHT_2_;
        c3 = _WENO_OPTIMAL_WEIGHT_3_;
      }

      /* calculate WENO weights */
      _WENOWeights_v_M_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
      _WENOWeights_v_M_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
      _WENOWeights_v_M_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
      _WENOWeights_v_M_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
      p += npoints_grid; /* weight arrays are also SoA: next component's slot */
    }
  }

  return;
}
618 
619 
/*! Host wrapper: compute the mapped WENO weights on the GPU (SoA layout).
    Launches one thread per grid interface along dir; the kernel loops over
    the solution components internally and writes the weights into
    weno->w1/w2/w3. Synchronizes with the device before returning.

    Returns 0 on success. */
static int gpuWENOFifthOrderCalculateWeightsM(
  double * __restrict__ fC, /*!< Cell-centered values of the flux function */
  double * __restrict__ uC, /*!< Cell-centered values of the solution */
  double * __restrict__ x,  /*!< Grid coordinates (unused in this routine) */
  int dir,                  /*!< Spatial dimension along which to interpolate */
  void *s,                  /*!< Solver object of type #HyPar */
  void *m                   /*!< MPI object of type #MPIVariables */
)
{
  HyPar          *solver = (HyPar*)          s;
  WENOParameters *weno   = (WENOParameters*) solver->interp;
  MPIVariables   *mpi    = (MPIVariables*)   m;

  int ghosts  = solver->ghosts;
  int ndims   = solver->ndims;
  int nvars   = solver->nvars;
  int *dim    = solver->dim_local;
  int *stride = solver->stride_with_ghosts;

  /* calculate dimension offset and number of interface points
     (one more interface than cells along dir) */
  int offset = weno->offset[dir];
  int bounds_inter[ndims];
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
  int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1; /* ceil division */

  int is_crweno      = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds = 0;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  /* bytes touched, for a rough effective-bandwidth estimate;
     size_t because these products overflow a 32-bit int on large grids */
  size_t weno_memory_accessed = 12*(size_t)npoints_grid*nvars*sizeof(double);
  size_t fCuC_memory_accessed = 1;
  for (int d=0; d<ndims; d++) {
    if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
    else          fCuC_memory_accessed *= dim[d];
  }
  fCuC_memory_accessed *= nvars*sizeof(double);

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
    is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
    solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
  );
  checkCuda( cudaGetLastError() ); /* catch launch-configuration errors */
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
         "WENOFifthOrderCalculateWeightsM2", milliseconds*1e-3, dir,
         (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),
         stride[dir]);

  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
#endif

  return 0;
}
718 
/*! GPU kernel: compute the Yamaleev-Carpenter (ESWENO-type) WENO weights
    at one grid interface (SoA data layout — consecutive grid points of one
    solution component are contiguous; components are npoints_local_wghosts
    apart). One thread handles one interface point and loops over the nvars
    components, computing four sets of weights each: left- and right-biased
    stencils, for both the flux array fC and the solution array uC. */
__global__
void WENOFifthOrderCalculateWeightsYC_kernel(
  int npoints_grid,          /*!< number of interface points along dir */
  int npoints_local_wghosts, /*!< local grid points incl. ghosts (SoA component stride) */
  int ndims,                 /*!< number of spatial dimensions */
  int dir,                   /*!< direction along which weights are computed */
  int ghosts,                /*!< number of ghost points */
  int nvars,                 /*!< number of vector components of the solution */
  int weno_size,             /*!< size of one partition of the weight arrays */
  int offset,                /*!< offset of direction dir within the weight arrays */
  int stride,                /*!< array stride (with ghosts) along dir */
  int is_crweno,             /*!< 1 if the spatial scheme is CRWENO5, else 0 */
  int is_mpi_ip_zero,        /*!< 1 if this MPI rank is the first along dir */
  int is_mpi_ip_proc,        /*!< 1 if this MPI rank is the last along dir */
  double weno_eps,           /*!< WENO epsilon parameter */
  const int * __restrict__ dim,   /*!< local grid dimensions (device copy) */
  const double * __restrict__ fC, /*!< cell-centered flux values */
  const double * __restrict__ uC, /*!< cell-centered solution values */
  double * __restrict__ w1,       /*!< first nonlinear weights (output) */
  double * __restrict__ w2,       /*!< second nonlinear weights (output) */
  double * __restrict__ w3        /*!< third nonlinear weights (output) */
)
{
  int p = threadIdx.x + (blockDim.x * blockIdx.x);
  if (p < npoints_grid) {
    const int max_ndims = 3;
    /* constants referenced by the _WENOWeights_v_YC_Scalar_ macro expansion */
    const double thirteen_by_twelve = 13.0 / 12.0;
    const double one_fourth = 1.0 / 4.0;

    int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
    int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;

    /* the weight arrays are partitioned as [LF | LU | RF | RU],
       each partition weno_size long, offset by this direction's offset */
    double *ww1LF = w1 + offset;
    double *ww2LF = w2 + offset;
    double *ww3LF = w3 + offset;
    double *ww1RF = w1 + 2*weno_size + offset;
    double *ww2RF = w2 + 2*weno_size + offset;
    double *ww3RF = w3 + 2*weno_size + offset;
    double *ww1LU = w1 + weno_size + offset;
    double *ww2LU = w2 + weno_size + offset;
    double *ww3LU = w3 + weno_size + offset;
    double *ww1RU = w1 + 2*weno_size + weno_size + offset;
    double *ww2RU = w2 + 2*weno_size + weno_size + offset;
    double *ww3RU = w3 + 2*weno_size + weno_size + offset;

    /* recover the multi-dimensional interface index from the flat thread id */
    _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
    _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
    _ArrayCopy1D_(indexC,indexI,ndims);

    /* left-biased stencil: anchored on the cell left of the interface */
    indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
    qm3L = qm1L - 2*stride;
    qm2L = qm1L - stride;
    qp1L = qm1L + stride;
    qp2L = qm1L + 2*stride;

    /* right-biased stencil: anchored on the cell right of the interface,
       mirrored relative to the left-biased one */
    indexC[dir] = indexI[dir]  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
    qm3R = qm1R + 2*stride;
    qm2R = qm1R + stride;
    qp1R = qm1R - stride;
    qp2R = qm1R - 2*stride;

    for (int j=0; j<nvars; j++) {
      /* Defining stencil points for component j (SoA: same q indices,
         advanced by npoints_local_wghosts per component below) */
      const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
      m3LF = (fC+qm3L);
      m2LF = (fC+qm2L);
      m1LF = (fC+qm1L);
      p1LF = (fC+qp1L);
      p2LF = (fC+qp2L);
      const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
      m3RF = (fC+qm3R);
      m2RF = (fC+qm2R);
      m1RF = (fC+qm1R);
      p1RF = (fC+qp1R);
      p2RF = (fC+qp2R);
      const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
      m3LU = (uC+qm3L);
      m2LU = (uC+qm2L);
      m1LU = (uC+qm1L);
      p1LU = (uC+qp1L);
      p2LU = (uC+qp2L);
      const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
      m3RU = (uC+qm3R);
      m2RU = (uC+qm2R);
      m1RU = (uC+qm1R);
      p1RU = (uC+qp1R);
      p2RU = (uC+qp2R);

      /* advance the flat indices to the next solution component */
      qm3L += npoints_local_wghosts;
      qm2L += npoints_local_wghosts;
      qm1L += npoints_local_wghosts;
      qp1L += npoints_local_wghosts;
      qp2L += npoints_local_wghosts;

      qm3R += npoints_local_wghosts;
      qm2R += npoints_local_wghosts;
      qm1R += npoints_local_wghosts;
      qp1R += npoints_local_wghosts;
      qp2R += npoints_local_wghosts;

      /* choose the optimal (linear) weights */
      double c1, c2, c3;
      if (is_crweno) {
        if (   (is_mpi_ip_zero && (indexI[dir] == 0       ))
            || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
          /* Use WENO5 at the physical boundaries */
          c1 = _WENO_OPTIMAL_WEIGHT_1_;
          c2 = _WENO_OPTIMAL_WEIGHT_2_;
          c3 = _WENO_OPTIMAL_WEIGHT_3_;
        } else {
          /* CRWENO5 at the interior points */
          c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
          c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
          c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
        }
      } else {
        /* WENO5 and HCWENO5 */
        c1 = _WENO_OPTIMAL_WEIGHT_1_;
        c2 = _WENO_OPTIMAL_WEIGHT_2_;
        c3 = _WENO_OPTIMAL_WEIGHT_3_;
      }

      /* calculate WENO weights */
      _WENOWeights_v_YC_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
      _WENOWeights_v_YC_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
      _WENOWeights_v_YC_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
      _WENOWeights_v_YC_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
      p += npoints_grid; /* weight arrays are also SoA: next component's slot */
    }
  }

  return;
}
854 
/*! Host wrapper: compute the Yamaleev-Carpenter (ESWENO) WENO weights on
    the GPU (SoA layout). Launches one thread per grid interface along dir;
    the kernel loops over the solution components internally and writes the
    weights into weno->w1/w2/w3. Synchronizes with the device before
    returning.

    Returns 0 on success. */
static int gpuWENOFifthOrderCalculateWeightsYC(
  double * __restrict__ fC, /*!< Cell-centered values of the flux function */
  double * __restrict__ uC, /*!< Cell-centered values of the solution */
  double * __restrict__ x,  /*!< Grid coordinates (unused in this routine) */
  int dir,                  /*!< Spatial dimension along which to interpolate */
  void *s,                  /*!< Solver object of type #HyPar */
  void *m                   /*!< MPI object of type #MPIVariables */
)
{
  HyPar          *solver = (HyPar*)          s;
  WENOParameters *weno   = (WENOParameters*) solver->interp;
  MPIVariables   *mpi    = (MPIVariables*)   m;

  int ghosts  = solver->ghosts;
  int ndims   = solver->ndims;
  int nvars   = solver->nvars;
  int *dim    = solver->dim_local;
  int *stride = solver->stride_with_ghosts;

  /* calculate dimension offset and number of interface points
     (one more interface than cells along dir) */
  int offset = weno->offset[dir];
  int bounds_inter[ndims];
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
  int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1; /* ceil division */

  int is_crweno      = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds = 0;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  /* bytes touched, for a rough effective-bandwidth estimate;
     size_t because these products overflow a 32-bit int on large grids */
  size_t weno_memory_accessed = 12*(size_t)npoints_grid*nvars*sizeof(double);
  size_t fCuC_memory_accessed = 1;
  for (int d=0; d<ndims; d++) {
    if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
    else          fCuC_memory_accessed *= dim[d];
  }
  fCuC_memory_accessed *= nvars*sizeof(double);

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
    is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
    solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
  );
  checkCuda( cudaGetLastError() ); /* catch launch-configuration errors */
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
         "WENOFifthOrderCalculateWeightsYC2", milliseconds*1e-3, dir,
         (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),
         stride[dir]);

  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
#endif

  return (0);
}
955 
956 #endif
957 
char spatial_scheme_hyp[_MAX_STRING_SIZE_]
Definition: hypar.h:84
void * interp
Definition: hypar.h:362
int gpuWENOFifthOrderCalculateWeights(double *fC, double *uC, double *x, int dir, void *s, void *m)
#define _WENOWeights_v_M_(w1, w2, w3, c1, c2, c3, m3, m2, m1, p1, p2, eps, N)
int npoints_local_wghosts
Definition: hypar.h:42
Definitions for the functions computing the interpolated value of the primitive at the cell interface...
#define _WENOWeights_v_YC_Scalar_(w1, w2, w3, c1, c2, c3, m3, m2, m1, p1, p2, eps, idx)
#define _ArrayIndexnD_(N, index, imax, i, ghost)
int * dim_local
Definition: hypar.h:37
MPI related function definitions.
#define _WENO_OPTIMAL_WEIGHT_2_
#define GPU_THREADS_PER_BLOCK
Definition: basic_gpu.h:7
Structure of variables/parameters needed by the WENO-type scheme.
int ghosts
Definition: hypar.h:52
#define _ArrayIndex1D_(N, imax, i, ghost, index)
static int gpuWENOFifthOrderCalculateWeightsYC(double *, double *, double *, int, void *, void *)
#define _CRWENO_OPTIMAL_WEIGHT_1_
#define _WENO_OPTIMAL_WEIGHT_3_
int * gpu_dim_local
Definition: hypar.h:455
Contains function definitions for common mathematical functions.
#define _WENO_OPTIMAL_WEIGHT_1_
Contains function definitions for common array operations on GPU.
int * stride_with_ghosts
Definition: hypar.h:414
#define _WENOWeights_v_M_Scalar_(w1, w2, w3, c1, c2, c3, m3, m2, m1, p1, p2, eps, idx)
#define _ArrayCopy1D_(x, y, size)
int nvars
Definition: hypar.h:29
Contains structure definition for hypar.
#define _CRWENO_OPTIMAL_WEIGHT_2_
__global__ void WENOFifthOrderCalculateWeightsM_kernel(int npoints_grid, int npoints_local_wghosts, int ndims, int dir, int ghosts, int nvars, int weno_size, int offset, int stride, int is_crweno, int is_mpi_ip_zero, int is_mpi_ip_proc, double weno_eps, const int *dim, const double *fC, const double *uC, double *w1, double *w2, double *w3)
#define _CRWENO_OPTIMAL_WEIGHT_3_
Some basic definitions and macros.
int ndims
Definition: hypar.h:26
#define _WENOWeights_v_YC_(w1, w2, w3, c1, c2, c3, m3, m2, m1, p1, p2, eps, N)
Contains macros and function definitions for common array operations.
#define _FIFTH_ORDER_CRWENO_
Definition: interpolation.h:28
#define _ArrayProduct1D_(x, size, p)
Structure of MPI-related variables.
Structure containing all solver-specific variables and functions.
Definition: hypar.h:23
static int gpuWENOFifthOrderCalculateWeightsM(double *, double *, double *, int, void *, void *)
__global__ void WENOFifthOrderCalculateWeightsYC_kernel(int npoints_grid, int npoints_local_wghosts, int ndims, int dir, int ghosts, int nvars, int weno_size, int offset, int stride, int is_crweno, int is_mpi_ip_zero, int is_mpi_ip_proc, double weno_eps, const int *__restrict__ dim, const double *__restrict__ fC, const double *__restrict__ uC, double *__restrict__ w1, double *__restrict__ w2, double *__restrict__ w3)
Contains macros and function definitions for common matrix multiplication.