HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Interp1PrimFifthOrderWENO_GPU.cu
Go to the documentation of this file.
1 
6 #include <basic.h>
7 #include <basic_gpu.h>
8 #include <arrayfunctions_gpu.h>
9 #include <mathfunctions.h>
10 #include <interpolation.h>
11 #include <mpivars.h>
12 #include <hypar.h>
13 
14 #undef _MINIMUM_GHOSTS_
15 
19 #define _MINIMUM_GHOSTS_ 3
20 
21 #ifdef CUDA_VAR_ORDERDING_AOS
22 
24 __global__
26  int ngrid_points,
27  int ndims,
28  int dir,
29  int ghosts,
30  int nvars,
31  int weno_size,
32  int offset,
33  int stride,
34  int upw,
35  int uflag,
36  const int *dim,
37  const double *fC,
38  const double *w1,
39  const double *w2,
40  const double *w3,
41  double *fI
42 )
43 {
44  int p = threadIdx.x + (blockDim.x * blockIdx.x);
45  if (p < ngrid_points) {
46  int bounds_inter[GPU_MAX_NDIMS], indexC[GPU_MAX_NDIMS], indexI[GPU_MAX_NDIMS];
47  int qm1,qm2,qm3,qp1,qp2;
48  const double *ww1, *ww2, *ww3;
49 
50  ww1 = w1 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
51  ww2 = w2 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
52  ww3 = w3 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
53 
54  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
55  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
56  _ArrayCopy1D_(indexC,indexI,ndims);
57 
58  if (upw > 0) {
59  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
60  qm3 = qm1 - 2*stride;
61  qm2 = qm1 - stride;
62  qp1 = qm1 + stride;
63  qp2 = qm1 + 2*stride;
64  } else {
65  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
66  qm3 = qm1 + 2*stride;
67  qm2 = qm1 + stride;
68  qp1 = qm1 - stride;
69  qp2 = qm1 - 2*stride;
70  }
71 
72  /* Defining stencil points */
73  const double *fm3, *fm2, *fm1, *fp1, *fp2;
74  fm3 = (fC+qm3*nvars);
75  fm2 = (fC+qm2*nvars);
76  fm1 = (fC+qm1*nvars);
77  fp1 = (fC+qp1*nvars);
78  fp2 = (fC+qp2*nvars);
79 
80  /* Candidate stencils and their optimal weights*/
81  double f1[GPU_MAX_NVARS], f2[GPU_MAX_NVARS], f3[GPU_MAX_NVARS];
82 
83  _ArrayAXBYCZ_(f1,(2*GPU_ONE_SIXTH),fm3,(-7*GPU_ONE_SIXTH) ,fm2,(11*GPU_ONE_SIXTH) ,fm1,nvars);
84  _ArrayAXBYCZ_(f2,(-GPU_ONE_SIXTH) ,fm2,(5*GPU_ONE_SIXTH) ,fm1,(2*GPU_ONE_SIXTH) ,fp1,nvars);
85  _ArrayAXBYCZ_(f3,(2*GPU_ONE_SIXTH),fm1,(5*GPU_ONE_SIXTH) ,fp1,(-GPU_ONE_SIXTH) ,fp2,nvars);
86 
87  /* calculate WENO weights */
88  const double *cur_w1, *cur_w2, *cur_w3;
89  cur_w1 = (ww1+p*nvars);
90  cur_w2 = (ww2+p*nvars);
91  cur_w3 = (ww3+p*nvars);
92 
93  _ArrayMultiply3Add1D_((fI+p*nvars),cur_w1,f1,cur_w2,f2,cur_w3,f3,nvars);
94  }
95 
96  return;
97 }
98 
146 extern "C" int gpuInterp1PrimFifthOrderWENO(
147  double *fI,
148  double *fC,
149  double *u,
150  double *x,
151  int upw,
152  int dir,
153  void *s,
154  void *m,
155  int uflag
156  )
157 {
158  HyPar *solver = (HyPar*) s;
159  WENOParameters *weno = (WENOParameters*) solver->interp;
160 
161  int ghosts = solver->ghosts;
162  int ndims = solver->ndims;
163  int nvars = solver->nvars;
164  int *dim = solver->dim_local;
165  int *stride= solver->stride_with_ghosts;
166 
167  /* calculate dimension offset */
168  int offset = weno->offset[dir];
169  int bounds_inter[ndims];
170  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
171  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
172  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
173 
174 #if defined(GPU_STAT)
175  cudaEvent_t start, stop;
176  float milliseconds = 0;
177  checkCuda( cudaEventCreate(&start) );
178  checkCuda( cudaEventCreate(&stop) );
179 
180  int weno_memory_accessed = 3*npoints_grid*nvars*sizeof(double);
181  int fI_memory_accessed = npoints_grid*nvars*sizeof(double);
182  int fC_memory_accessed = 1;
183  for (int d=0; d<ndims; d++) {
184  if (d == dir) fC_memory_accessed *= (dim[d]+2*ghosts);
185  else fC_memory_accessed *= dim[d];
186  }
187  fC_memory_accessed *= nvars*sizeof(double);
188 
189  checkCuda( cudaEventRecord(start) );
190 #endif
191 
192  Interp1PrimFifthOrderWENO_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
193  npoints_grid, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir], upw, uflag,
194  solver->gpu_dim_local, fC, weno->w1, weno->w2, weno->w3, fI
195  );
196  cudaDeviceSynchronize();
197 
198 #if defined(GPU_STAT)
199  checkCuda( cudaEventRecord(stop) );
200  checkCuda( cudaEventSynchronize(stop) );
201  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
202 
203  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
204  "Interp1PrimFifthOrderWENO", milliseconds*1e-3, dir,
205  (1e-6*(weno_memory_accessed+fI_memory_accessed+fC_memory_accessed))/milliseconds);
206 
207  checkCuda( cudaEventDestroy(start) );
208  checkCuda( cudaEventDestroy(stop) );
209 #endif
210 
211  return 0;
212 }
213 
214 #else
215 
217 __global__
219  int npoints_grid,
220  int npoints_local_wghosts,
221  int ndims,
222  int dir,
223  int ghosts,
224  int nvars,
225  int weno_size,
226  int offset,
227  int stride,
228  int upw,
229  int uflag,
230  const int *dim,
231  const double *fC,
232  const double *w1,
233  const double *w2,
234  const double *w3,
235  double *fI
236 )
237 {
238  int p = threadIdx.x + (blockDim.x * blockIdx.x);
239  if (p < npoints_grid) {
240  int bounds_inter[GPU_MAX_NDIMS], indexC[GPU_MAX_NDIMS], indexI[GPU_MAX_NDIMS];
241  int qm1,qm2,qm3,qp1,qp2;
242  const double *ww1, *ww2, *ww3;
243 
244  ww1 = w1 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
245  ww2 = w2 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
246  ww3 = w3 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
247 
248  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
249  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
250  _ArrayCopy1D_(indexC,indexI,ndims);
251 
252  if (upw > 0) {
253  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
254  qm3 = qm1 - 2*stride;
255  qm2 = qm1 - stride;
256  qp1 = qm1 + stride;
257  qp2 = qm1 + 2*stride;
258  } else {
259  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
260  qm3 = qm1 + 2*stride;
261  qm2 = qm1 + stride;
262  qp1 = qm1 - stride;
263  qp2 = qm1 - 2*stride;
264  }
265 
266  /* Defining stencil points */
267  const double *fm3, *fm2, *fm1, *fp1, *fp2;
268  /* Candidate stencils and their optimal weights*/
269  double f1, f2, f3;
270 
271  int l = p;
272  for (int j = 0; j < nvars; j++) {
273  fm3 = (fC+qm3);
274  fm2 = (fC+qm2);
275  fm1 = (fC+qm1);
276  fp1 = (fC+qp1);
277  fp2 = (fC+qp2);
278 
279  f1 = (2*GPU_ONE_SIXTH)*fm3[0] + (-7*GPU_ONE_SIXTH)*fm2[0] + (11*GPU_ONE_SIXTH)*fm1[0];
280  f2 = ( -GPU_ONE_SIXTH)*fm2[0] + ( 5*GPU_ONE_SIXTH)*fm1[0] + ( 2*GPU_ONE_SIXTH)*fp1[0];
281  f3 = (2*GPU_ONE_SIXTH)*fm1[0] + ( 5*GPU_ONE_SIXTH)*fp1[0] + ( -GPU_ONE_SIXTH)*fp2[0];
282 
283  qm3 += npoints_local_wghosts;
284  qm2 += npoints_local_wghosts;
285  qm1 += npoints_local_wghosts;
286  qp1 += npoints_local_wghosts;
287  qp2 += npoints_local_wghosts;
288 
289  /*
290  _ArrayAXBYCZ_(f1,(2*GPU_ONE_SIXTH),fm3,(-7*GPU_ONE_SIXTH) ,fm2,(11*GPU_ONE_SIXTH) ,fm1,nvars);
291  _ArrayAXBYCZ_(f2,(-GPU_ONE_SIXTH) ,fm2,(5*GPU_ONE_SIXTH) ,fm1,(2*GPU_ONE_SIXTH) ,fp1,nvars);
292  _ArrayAXBYCZ_(f3,(2*GPU_ONE_SIXTH),fm1,(5*GPU_ONE_SIXTH) ,fp1,(-GPU_ONE_SIXTH) ,fp2,nvars);
293  */
294 
295  /* calculate WENO weights */
296  fI[l] = (ww1+l)[0]*f1 + (ww2+l)[0]*f2 + (ww3+l)[0]*f3;
297  l += npoints_grid;
298  //_ArrayMultiply3Add1D_((fI+p),cur_w1,f1,cur_w2,f2,cur_w3,f3,1);
299  }
300  }
301 
302  return;
303 }
304 
353  double *fI,
354  double *fC,
355  double *u,
356  double *x,
357  int upw,
358  int dir,
359  void *s,
360  void *m,
361  int uflag
362  )
363 {
364  HyPar *solver = (HyPar*) s;
365  WENOParameters *weno = (WENOParameters*) solver->interp;
366 
367  int ghosts = solver->ghosts;
368  int ndims = solver->ndims;
369  int nvars = solver->nvars;
370  int *dim = solver->dim_local;
371  int *stride= solver->stride_with_ghosts;
372 
373  /* calculate dimension offset */
374  int offset = weno->offset[dir];
375  int bounds_inter[ndims];
376  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
377  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
378  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
379 
380 #if defined(GPU_STAT)
381  cudaEvent_t start, stop;
382  float milliseconds = 0;
383  checkCuda( cudaEventCreate(&start) );
384  checkCuda( cudaEventCreate(&stop) );
385 
386 
387  int weno_memory_accessed = 3*npoints_grid*nvars*sizeof(double);
388  int fI_memory_accessed = npoints_grid*nvars*sizeof(double);
389  int fC_memory_accessed = 1;
390  for (int d=0; d<ndims; d++) {
391  if (d == dir) fC_memory_accessed *= (dim[d]+2*ghosts);
392  else fC_memory_accessed *= dim[d];
393  }
394  fC_memory_accessed *= nvars*sizeof(double);
395 
396  checkCuda( cudaEventRecord(start) );
397 #endif
398 
399  Interp1PrimFifthOrderWENO_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
400  npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir], upw, uflag,
401  solver->gpu_dim_local, fC, weno->w1, weno->w2, weno->w3, fI
402  );
403 #if defined(GPU_STAT)
404  checkCuda( cudaEventRecord(stop) );
405  checkCuda( cudaEventSynchronize(stop) );
406 #endif
407 
408  cudaDeviceSynchronize();
409 
410 #if defined(GPU_STAT)
411  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
412 
413  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
414  "Interp1PrimFifthOrderWENO2", milliseconds*1e-3, dir,
415  (1e-6*(weno_memory_accessed+fI_memory_accessed+fC_memory_accessed))/milliseconds);
416 
417  checkCuda( cudaEventDestroy(start) );
418  checkCuda( cudaEventDestroy(stop) );
419 #endif
420 
421  return 0;
422 }
423 
424 #endif
void * interp
Definition: hypar.h:362
int npoints_local_wghosts
Definition: hypar.h:42
Definitions for the functions computing the interpolated value of the primitive at the cell interface...
#define _ArrayIndexnD_(N, index, imax, i, ghost)
int * dim_local
Definition: hypar.h:37
MPI related function definitions.
#define GPU_THREADS_PER_BLOCK
Definition: basic_gpu.h:7
#define GPU_MAX_NVARS
Definition: basic_gpu.h:9
Structure of variables/parameters needed by the WENO-type scheme.
#define GPU_MAX_NDIMS
Definition: basic_gpu.h:8
int ghosts
Definition: hypar.h:52
#define _ArrayIndex1D_(N, imax, i, ghost, index)
#define _ArrayMultiply3Add1D_(x, a, b, c, d, e, f, size)
int * gpu_dim_local
Definition: hypar.h:455
Contains function definitions for common mathematical functions.
Contains function definitions for common array operations on GPU.
int * stride_with_ghosts
Definition: hypar.h:414
#define _ArrayCopy1D_(x, y, size)
int nvars
Definition: hypar.h:29
int gpuInterp1PrimFifthOrderWENO(double *, double *, double *, double *, int, int, void *, void *, int)
5th order WENO reconstruction (component-wise) on a uniform grid
#define _ArrayAXBYCZ_(w, a, x, b, y, c, z, size)
Contains structure definition for hypar.
Some basic definitions and macros.
int ndims
Definition: hypar.h:26
#define _ArrayProduct1D_(x, size, p)
#define GPU_ONE_SIXTH
Definition: basic_gpu.h:6
Structure containing all solver-specific variables and functions.
Definition: hypar.h:23
__global__ void Interp1PrimFifthOrderWENO_kernel(int npoints_grid, int npoints_local_wghosts, int ndims, int dir, int ghosts, int nvars, int weno_size, int offset, int stride, int upw, int uflag, const int *dim, const double *fC, const double *w1, const double *w2, const double *w3, double *fI)