HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
Interp1PrimFifthOrderWENO_GPU.cu
Go to the documentation of this file.
1 /*! @file Interp1PrimFifthOrderWENO_GPU.cu
2  * @brief WENO5 Scheme (Component-wise application to vectors).
3  * @author Youngdae Kim
4 */
5 
6 #include <basic.h>
7 #include <basic_gpu.h>
8 #include <arrayfunctions_gpu.h>
9 #include <mathfunctions.h>
10 #include <interpolation.h>
11 #include <mpivars.h>
12 #include <hypar.h>
13 
14 #undef _MINIMUM_GHOSTS_
15 /*! \def _MINIMUM_GHOSTS_
16  * Minimum number of ghost points required for this interpolation
17  * method.
18 */
19 #define _MINIMUM_GHOSTS_ 3
20 
21 #ifdef CUDA_VAR_ORDERDING_AOS
22 
23 /*! Kernel for gpuInterp1PrimFifthOrderWENO() */
24 __global__
25 void Interp1PrimFifthOrderWENO_kernel(
26  int ngrid_points,
27  int ndims,
28  int dir,
29  int ghosts,
30  int nvars,
31  int weno_size,
32  int offset,
33  int stride,
34  int upw,
35  int uflag,
36  const int *dim,
37  const double *fC,
38  const double *w1,
39  const double *w2,
40  const double *w3,
41  double *fI
42 )
43 {
44  int p = threadIdx.x + (blockDim.x * blockIdx.x);
45  if (p < ngrid_points) {
46  int bounds_inter[GPU_MAX_NDIMS], indexC[GPU_MAX_NDIMS], indexI[GPU_MAX_NDIMS];
47  int qm1,qm2,qm3,qp1,qp2;
48  const double *ww1, *ww2, *ww3;
49 
50  ww1 = w1 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
51  ww2 = w2 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
52  ww3 = w3 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
53 
54  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
55  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
56  _ArrayCopy1D_(indexC,indexI,ndims);
57 
58  if (upw > 0) {
59  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
60  qm3 = qm1 - 2*stride;
61  qm2 = qm1 - stride;
62  qp1 = qm1 + stride;
63  qp2 = qm1 + 2*stride;
64  } else {
65  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
66  qm3 = qm1 + 2*stride;
67  qm2 = qm1 + stride;
68  qp1 = qm1 - stride;
69  qp2 = qm1 - 2*stride;
70  }
71 
72  /* Defining stencil points */
73  const double *fm3, *fm2, *fm1, *fp1, *fp2;
74  fm3 = (fC+qm3*nvars);
75  fm2 = (fC+qm2*nvars);
76  fm1 = (fC+qm1*nvars);
77  fp1 = (fC+qp1*nvars);
78  fp2 = (fC+qp2*nvars);
79 
80  /* Candidate stencils and their optimal weights*/
81  double f1[GPU_MAX_NVARS], f2[GPU_MAX_NVARS], f3[GPU_MAX_NVARS];
82 
83  _ArrayAXBYCZ_(f1,(2*GPU_ONE_SIXTH),fm3,(-7*GPU_ONE_SIXTH) ,fm2,(11*GPU_ONE_SIXTH) ,fm1,nvars);
84  _ArrayAXBYCZ_(f2,(-GPU_ONE_SIXTH) ,fm2,(5*GPU_ONE_SIXTH) ,fm1,(2*GPU_ONE_SIXTH) ,fp1,nvars);
85  _ArrayAXBYCZ_(f3,(2*GPU_ONE_SIXTH),fm1,(5*GPU_ONE_SIXTH) ,fp1,(-GPU_ONE_SIXTH) ,fp2,nvars);
86 
87  /* calculate WENO weights */
88  const double *cur_w1, *cur_w2, *cur_w3;
89  cur_w1 = (ww1+p*nvars);
90  cur_w2 = (ww2+p*nvars);
91  cur_w3 = (ww3+p*nvars);
92 
93  _ArrayMultiply3Add1D_((fI+p*nvars),cur_w1,f1,cur_w2,f2,cur_w3,f3,nvars);
94  }
95 
96  return;
97 }
98 
99 /*! @brief 5th order WENO reconstruction (component-wise) on a uniform grid
100 
101  Computes the interpolated values of the first primitive of a function \f${\bf f}\left({\bf u}\right)\f$
102  at the interfaces from the cell-centered values of the function using the fifth order WENO scheme on a
103  uniform grid. The first primitive is defined as a function \f${\bf h}\left({\bf u}\right)\f$ that satisfies:
104  \f{equation}{
105  {\bf f}\left({\bf u}\left(x\right)\right) = \frac{1}{\Delta x} \int_{x-\Delta x/2}^{x+\Delta x/2} {\bf h}\left({\bf u}\left(\zeta\right)\right)d\zeta,
106  \f}
107  where \f$x\f$ is the spatial coordinate along the dimension of the interpolation. This function computes the 5th order WENO numerical approximation
108  \f$\hat{\bf f}_{j+1/2} \approx {\bf h}_{j+1/2}\f$ as the convex combination of three 3rd order methods:
109  \f{align}{
110  &\ \omega_1\ \times\ \left[ \hat{\bf f}_{j+1/2}^1 = \frac{1}{3} {\bf f}_{j-2} - \frac{7}{6} {\bf f}_{j-1} + \frac{11}{6} {\bf f}_j \right]\\
111  + &\ \omega_2\ \times\ \left[ \hat{\bf f}_{j+1/2}^2 = -\frac{1}{6} {\bf f}_{j-1} + \frac{5}{6} {\bf f}_j + \frac{1}{3} {\bf f}_{j+1} \right]\\
112  + &\ \omega_3\ \times\ \left[ \hat{\bf f}_{j+1/2}^3 = \frac{1}{3} {\bf f}_j + \frac{5}{6} {\bf f}_{j+1} - \frac{1}{6} {\bf f}_{j+2} \right]\\
113  \Rightarrow &\ \hat{\bf f}_{j+1/2} = \frac{\omega_1}{3} {\bf f}_{j-2} - \frac{1}{6}(7\omega_1+\omega_2){\bf f}_{j-1} + \frac{1}{6}(11\omega_1+5\omega_2+2\omega_3){\bf f}_j + \frac{1}{6}(2\omega_2+5\omega_3){\bf f}_{j+1} - \frac{\omega_3}{6}{\bf f}_{j+2},
114  \f}
115  where \f$\omega_k; k=1,2,3\f$ are the nonlinear WENO weights computed in WENOFifthOrderCalculateWeights() (note that the \f$\omega\f$ are different for each component of the vector \f$\hat{\bf f}\f$).
116 
117  \b Implementation \b Notes:
118  + This method assumes a uniform grid in the spatial dimension corresponding to the interpolation.
119  + The method described above corresponds to a left-biased interpolation. The corresponding right-biased
120  interpolation can be obtained by reflecting the equations about interface j+1/2.
121  + The scalar interpolation method is applied to the vector function in a component-wise manner.
122  + The function computes the interpolant for the entire grid in one call. It loops over all the grid lines along the interpolation direction
123  and carries out the 1D interpolation along these grid lines.
124  + Location of cell-centers and cell interfaces along the spatial dimension of the interpolation is shown in the following figure:
125  @image html chap1_1Ddomain.png
126  @image latex chap1_1Ddomain.eps width=0.9\textwidth
127 
128  \b Function \b arguments:
129 
130  Argument | Type | Explanation
131  --------- | --------- | ---------------------------------------------
132  fI | double* | Array to hold the computed interpolant at the grid interfaces. This array must have the same layout as the solution, but with \b no \b ghost \b points. Its size should be the same as u in all dimensions, except dir (the dimension along which to interpolate) along which it should be larger by 1 (number of interfaces is 1 more than the number of interior cell centers).
133  fC | double* | Array with the cell-centered values of the flux function \f${\bf f}\left({\bf u}\right)\f$. This array must have the same layout and size as the solution, \b with \b ghost \b points.
134  u | double* | The solution array \f${\bf u}\f$ (with ghost points). If the interpolation is characteristic based, this is needed to compute the eigendecomposition. For a multidimensional problem, the layout is as follows: u is a contiguous 1D array of size (nvars*dim[0]*dim[1]*...*dim[D-1]) corresponding to the multi-dimensional solution, with the following ordering - nvars, dim[0], dim[1], ..., dim[D-1], where nvars is the number of solution components (#HyPar::nvars), dim is the local size (#HyPar::dim_local), D is the number of spatial dimensions.
135  x | double* | The grid array (with ghost points). This is used only by non-uniform-grid interpolation methods. For multidimensional problems, the layout is as follows: x is a contiguous 1D array of size (dim[0]+dim[1]+...+dim[D-1]), with the spatial coordinates along dim[0] stored from 0,...,dim[0]-1, the spatial coordinates along dim[1] stored along dim[0],...,dim[0]+dim[1]-1, and so forth.
136  upw | int | Upwinding direction: if positive, a left-biased interpolant will be computed; if negative, a right-biased interpolant will be computed. If the interpolation method is central, then this has no effect.
137  dir | int | Spatial dimension along which to interpolate (eg: 0 for 1D; 0 or 1 for 2D; 0,1 or 2 for 3D)
138  s | void* | Solver object of type #HyPar: the following variables are needed - #HyPar::ghosts, #HyPar::ndims, #HyPar::nvars, #HyPar::dim_local.
139  m | void* | MPI object of type #MPIVariables: this is needed only by compact interpolation method that need to solve a global implicit system across MPI ranks.
140  uflag | int | A flag indicating if the function being interpolated \f${\bf f}\f$ is the solution itself \f${\bf u}\f$ (if 1, \f${\bf f}\left({\bf u}\right) \equiv {\bf u}\f$).
141 
142 
143  \b Reference:
144  + Jiang, G.-S., Shu, C.-W., Efficient Implementation of Weighted ENO Schemes, J. Comput. Phys., 126 (1), 1996, pp. 202-228, http://dx.doi.org/10.1006/jcph.1996.0130
145  */
146 extern "C" int gpuInterp1PrimFifthOrderWENO(
147  double *fI, /*!< Array of interpolated function values at the interfaces */
148  double *fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
149  double *u, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
150  double *x, /*!< Grid coordinates */
151  int upw, /*!< Upwind direction (left or right biased) */
152  int dir, /*!< Spatial dimension along which to interpolation */
153  void *s, /*!< Object of type #HyPar containing solver-related variables */
154  void *m, /*!< Object of type #MPIVariables containing MPI-related variables */
155  int uflag /*!< Flag to indicate if \f$f(u) \equiv u\f$, i.e, if the solution is being reconstructed */
156  )
157 {
158  HyPar *solver = (HyPar*) s;
159  WENOParameters *weno = (WENOParameters*) solver->interp;
160 
161  int ghosts = solver->ghosts;
162  int ndims = solver->ndims;
163  int nvars = solver->nvars;
164  int *dim = solver->dim_local;
165  int *stride= solver->stride_with_ghosts;
166 
167  /* calculate dimension offset */
168  int offset = weno->offset[dir];
169  int bounds_inter[ndims];
170  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
171  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
172  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
173 
174 #if defined(GPU_STAT)
175  cudaEvent_t start, stop;
176  float milliseconds = 0;
177  checkCuda( cudaEventCreate(&start) );
178  checkCuda( cudaEventCreate(&stop) );
179 
180  int weno_memory_accessed = 3*npoints_grid*nvars*sizeof(double);
181  int fI_memory_accessed = npoints_grid*nvars*sizeof(double);
182  int fC_memory_accessed = 1;
183  for (int d=0; d<ndims; d++) {
184  if (d == dir) fC_memory_accessed *= (dim[d]+2*ghosts);
185  else fC_memory_accessed *= dim[d];
186  }
187  fC_memory_accessed *= nvars*sizeof(double);
188 
189  checkCuda( cudaEventRecord(start) );
190 #endif
191 
192  Interp1PrimFifthOrderWENO_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
193  npoints_grid, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir], upw, uflag,
194  solver->gpu_dim_local, fC, weno->w1, weno->w2, weno->w3, fI
195  );
196  cudaDeviceSynchronize();
197 
198 #if defined(GPU_STAT)
199  checkCuda( cudaEventRecord(stop) );
200  checkCuda( cudaEventSynchronize(stop) );
201  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
202 
203  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
204  "Interp1PrimFifthOrderWENO", milliseconds*1e-3, dir,
205  (1e-6*(weno_memory_accessed+fI_memory_accessed+fC_memory_accessed))/milliseconds);
206 
207  checkCuda( cudaEventDestroy(start) );
208  checkCuda( cudaEventDestroy(stop) );
209 #endif
210 
211  return 0;
212 }
213 
214 #else
215 
216 /*! Kernel for gpuInterp1PrimFifthOrderWENO() */
217 __global__
218 void Interp1PrimFifthOrderWENO_kernel(
219  int npoints_grid,
220  int npoints_local_wghosts,
221  int ndims,
222  int dir,
223  int ghosts,
224  int nvars,
225  int weno_size,
226  int offset,
227  int stride,
228  int upw,
229  int uflag,
230  const int *dim,
231  const double *fC,
232  const double *w1,
233  const double *w2,
234  const double *w3,
235  double *fI
236 )
237 {
238  int p = threadIdx.x + (blockDim.x * blockIdx.x);
239  if (p < npoints_grid) {
240  int bounds_inter[GPU_MAX_NDIMS], indexC[GPU_MAX_NDIMS], indexI[GPU_MAX_NDIMS];
241  int qm1,qm2,qm3,qp1,qp2;
242  const double *ww1, *ww2, *ww3;
243 
244  ww1 = w1 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
245  ww2 = w2 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
246  ww3 = w3 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
247 
248  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
249  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
250  _ArrayCopy1D_(indexC,indexI,ndims);
251 
252  if (upw > 0) {
253  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
254  qm3 = qm1 - 2*stride;
255  qm2 = qm1 - stride;
256  qp1 = qm1 + stride;
257  qp2 = qm1 + 2*stride;
258  } else {
259  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
260  qm3 = qm1 + 2*stride;
261  qm2 = qm1 + stride;
262  qp1 = qm1 - stride;
263  qp2 = qm1 - 2*stride;
264  }
265 
266  /* Defining stencil points */
267  const double *fm3, *fm2, *fm1, *fp1, *fp2;
268  /* Candidate stencils and their optimal weights*/
269  double f1, f2, f3;
270 
271  int l = p;
272  for (int j = 0; j < nvars; j++) {
273  fm3 = (fC+qm3);
274  fm2 = (fC+qm2);
275  fm1 = (fC+qm1);
276  fp1 = (fC+qp1);
277  fp2 = (fC+qp2);
278 
279  f1 = (2*GPU_ONE_SIXTH)*fm3[0] + (-7*GPU_ONE_SIXTH)*fm2[0] + (11*GPU_ONE_SIXTH)*fm1[0];
280  f2 = ( -GPU_ONE_SIXTH)*fm2[0] + ( 5*GPU_ONE_SIXTH)*fm1[0] + ( 2*GPU_ONE_SIXTH)*fp1[0];
281  f3 = (2*GPU_ONE_SIXTH)*fm1[0] + ( 5*GPU_ONE_SIXTH)*fp1[0] + ( -GPU_ONE_SIXTH)*fp2[0];
282 
283  qm3 += npoints_local_wghosts;
284  qm2 += npoints_local_wghosts;
285  qm1 += npoints_local_wghosts;
286  qp1 += npoints_local_wghosts;
287  qp2 += npoints_local_wghosts;
288 
289  /*
290  _ArrayAXBYCZ_(f1,(2*GPU_ONE_SIXTH),fm3,(-7*GPU_ONE_SIXTH) ,fm2,(11*GPU_ONE_SIXTH) ,fm1,nvars);
291  _ArrayAXBYCZ_(f2,(-GPU_ONE_SIXTH) ,fm2,(5*GPU_ONE_SIXTH) ,fm1,(2*GPU_ONE_SIXTH) ,fp1,nvars);
292  _ArrayAXBYCZ_(f3,(2*GPU_ONE_SIXTH),fm1,(5*GPU_ONE_SIXTH) ,fp1,(-GPU_ONE_SIXTH) ,fp2,nvars);
293  */
294 
295  /* calculate WENO weights */
296  fI[l] = (ww1+l)[0]*f1 + (ww2+l)[0]*f2 + (ww3+l)[0]*f3;
297  l += npoints_grid;
298  //_ArrayMultiply3Add1D_((fI+p),cur_w1,f1,cur_w2,f2,cur_w3,f3,1);
299  }
300  }
301 
302  return;
303 }
304 
305 /*! @brief 5th order WENO reconstruction (component-wise) on a uniform grid
306 
307  Computes the interpolated values of the first primitive of a function \f${\bf f}\left({\bf u}\right)\f$
308  at the interfaces from the cell-centered values of the function using the fifth order WENO scheme on a
309  uniform grid. The first primitive is defined as a function \f${\bf h}\left({\bf u}\right)\f$ that satisfies:
310  \f{equation}{
311  {\bf f}\left({\bf u}\left(x\right)\right) = \frac{1}{\Delta x} \int_{x-\Delta x/2}^{x+\Delta x/2} {\bf h}\left({\bf u}\left(\zeta\right)\right)d\zeta,
312  \f}
313  where \f$x\f$ is the spatial coordinate along the dimension of the interpolation. This function computes the 5th order WENO numerical approximation
314  \f$\hat{\bf f}_{j+1/2} \approx {\bf h}_{j+1/2}\f$ as the convex combination of three 3rd order methods:
315  \f{align}{
316  &\ \omega_1\ \times\ \left[ \hat{\bf f}_{j+1/2}^1 = \frac{1}{3} {\bf f}_{j-2} - \frac{7}{6} {\bf f}_{j-1} + \frac{11}{6} {\bf f}_j \right]\\
317  + &\ \omega_2\ \times\ \left[ \hat{\bf f}_{j+1/2}^2 = -\frac{1}{6} {\bf f}_{j-1} + \frac{5}{6} {\bf f}_j + \frac{1}{3} {\bf f}_{j+1} \right]\\
318  + &\ \omega_3\ \times\ \left[ \hat{\bf f}_{j+1/2}^3 = \frac{1}{3} {\bf f}_j + \frac{5}{6} {\bf f}_{j+1} - \frac{1}{6} {\bf f}_{j+2} \right]\\
319  \Rightarrow &\ \hat{\bf f}_{j+1/2} = \frac{\omega_1}{3} {\bf f}_{j-2} - \frac{1}{6}(7\omega_1+\omega_2){\bf f}_{j-1} + \frac{1}{6}(11\omega_1+5\omega_2+2\omega_3){\bf f}_j + \frac{1}{6}(2\omega_2+5\omega_3){\bf f}_{j+1} - \frac{\omega_3}{6}{\bf f}_{j+2},
320  \f}
321  where \f$\omega_k; k=1,2,3\f$ are the nonlinear WENO weights computed in WENOFifthOrderCalculateWeights() (note that the \f$\omega\f$ are different for each component of the vector \f$\hat{\bf f}\f$).
322 
323  \b Implementation \b Notes:
324  + This method assumes a uniform grid in the spatial dimension corresponding to the interpolation.
325  + The method described above corresponds to a left-biased interpolation. The corresponding right-biased
326  interpolation can be obtained by reflecting the equations about interface j+1/2.
327  + The scalar interpolation method is applied to the vector function in a component-wise manner.
328  + The function computes the interpolant for the entire grid in one call. It loops over all the grid lines along the interpolation direction
329  and carries out the 1D interpolation along these grid lines.
330  + Location of cell-centers and cell interfaces along the spatial dimension of the interpolation is shown in the following figure:
331  @image html chap1_1Ddomain.png
332  @image latex chap1_1Ddomain.eps width=0.9\textwidth
333 
334  \b Function \b arguments:
335 
336  Argument | Type | Explanation
337  --------- | --------- | ---------------------------------------------
338  fI | double* | Array to hold the computed interpolant at the grid interfaces. This array must have the same layout as the solution, but with \b no \b ghost \b points. Its size should be the same as u in all dimensions, except dir (the dimension along which to interpolate) along which it should be larger by 1 (number of interfaces is 1 more than the number of interior cell centers).
339  fC | double* | Array with the cell-centered values of the flux function \f${\bf f}\left({\bf u}\right)\f$. This array must have the same layout and size as the solution, \b with \b ghost \b points.
340  u | double* | The solution array \f${\bf u}\f$ (with ghost points). If the interpolation is characteristic based, this is needed to compute the eigendecomposition. For a multidimensional problem, the layout is as follows: u is a contiguous 1D array of size (nvars*dim[0]*dim[1]*...*dim[D-1]) corresponding to the multi-dimensional solution, with the following ordering - nvars, dim[0], dim[1], ..., dim[D-1], where nvars is the number of solution components (#HyPar::nvars), dim is the local size (#HyPar::dim_local), D is the number of spatial dimensions.
341  x | double* | The grid array (with ghost points). This is used only by non-uniform-grid interpolation methods. For multidimensional problems, the layout is as follows: x is a contiguous 1D array of size (dim[0]+dim[1]+...+dim[D-1]), with the spatial coordinates along dim[0] stored from 0,...,dim[0]-1, the spatial coordinates along dim[1] stored along dim[0],...,dim[0]+dim[1]-1, and so forth.
342  upw | int | Upwinding direction: if positive, a left-biased interpolant will be computed; if negative, a right-biased interpolant will be computed. If the interpolation method is central, then this has no effect.
343  dir | int | Spatial dimension along which to interpolate (eg: 0 for 1D; 0 or 1 for 2D; 0,1 or 2 for 3D)
344  s | void* | Solver object of type #HyPar: the following variables are needed - #HyPar::ghosts, #HyPar::ndims, #HyPar::nvars, #HyPar::dim_local.
345  m | void* | MPI object of type #MPIVariables: this is needed only by compact interpolation method that need to solve a global implicit system across MPI ranks.
346  uflag | int | A flag indicating if the function being interpolated \f${\bf f}\f$ is the solution itself \f${\bf u}\f$ (if 1, \f${\bf f}\left({\bf u}\right) \equiv {\bf u}\f$).
347 
348 
349  \b Reference:
350  + Jiang, G.-S., Shu, C.-W., Efficient Implementation of Weighted ENO Schemes, J. Comput. Phys., 126 (1), 1996, pp. 202-228, http://dx.doi.org/10.1006/jcph.1996.0130
351  */
352 extern "C" int gpuInterp1PrimFifthOrderWENO(
353  double *fI, /*!< Array of interpolated function values at the interfaces */
354  double *fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
355  double *u, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
356  double *x, /*!< Grid coordinates */
357  int upw, /*!< Upwind direction (left or right biased) */
358  int dir, /*!< Spatial dimension along which to interpolation */
359  void *s, /*!< Object of type #HyPar containing solver-related variables */
360  void *m, /*!< Object of type #MPIVariables containing MPI-related variables */
361  int uflag /*!< Flag to indicate if \f$f(u) \equiv u\f$, i.e, if the solution is being reconstructed */
362  )
363 {
364  HyPar *solver = (HyPar*) s;
365  WENOParameters *weno = (WENOParameters*) solver->interp;
366 
367  int ghosts = solver->ghosts;
368  int ndims = solver->ndims;
369  int nvars = solver->nvars;
370  int *dim = solver->dim_local;
371  int *stride= solver->stride_with_ghosts;
372 
373  /* calculate dimension offset */
374  int offset = weno->offset[dir];
375  int bounds_inter[ndims];
376  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
377  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
378  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
379 
380 #if defined(GPU_STAT)
381  cudaEvent_t start, stop;
382  float milliseconds = 0;
383  checkCuda( cudaEventCreate(&start) );
384  checkCuda( cudaEventCreate(&stop) );
385 
386 
387  int weno_memory_accessed = 3*npoints_grid*nvars*sizeof(double);
388  int fI_memory_accessed = npoints_grid*nvars*sizeof(double);
389  int fC_memory_accessed = 1;
390  for (int d=0; d<ndims; d++) {
391  if (d == dir) fC_memory_accessed *= (dim[d]+2*ghosts);
392  else fC_memory_accessed *= dim[d];
393  }
394  fC_memory_accessed *= nvars*sizeof(double);
395 
396  checkCuda( cudaEventRecord(start) );
397 #endif
398 
399  Interp1PrimFifthOrderWENO_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
400  npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir], upw, uflag,
401  solver->gpu_dim_local, fC, weno->w1, weno->w2, weno->w3, fI
402  );
403 #if defined(GPU_STAT)
404  checkCuda( cudaEventRecord(stop) );
405  checkCuda( cudaEventSynchronize(stop) );
406 #endif
407 
408  cudaDeviceSynchronize();
409 
410 #if defined(GPU_STAT)
411  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
412 
413  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
414  "Interp1PrimFifthOrderWENO2", milliseconds*1e-3, dir,
415  (1e-6*(weno_memory_accessed+fI_memory_accessed+fC_memory_accessed))/milliseconds);
416 
417  checkCuda( cudaEventDestroy(start) );
418  checkCuda( cudaEventDestroy(stop) );
419 #endif
420 
421  return 0;
422 }
423 
424 #endif