HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
WENOFifthOrderCalculateWeights_GPU.cu
Go to the documentation of this file.
1 /*! @file WENOFifthOrderCalculateWeights_GPU.cu
2  @brief Functions to compute the nonlinear weights for WENO-type schemes
3  @author Youngdae Kim
4 */
5 
6 #include <stdlib.h>
7 #include <string.h>
8 #include <time.h>
9 
10 #include <cuda_profiler_api.h>
11 
12 #include <basic.h>
13 #include <basic_gpu.h>
14 #include <arrayfunctions.h>
15 #include <matmult_native.h>
16 #include <mathfunctions.h>
17 #include <interpolation.h>
18 #include <mpivars.h>
19 #include <hypar.h>
20 
21 #include <arrayfunctions_gpu.h>
22 
23 
24 static int gpuWENOFifthOrderCalculateWeightsYC(double*,double*,double*,int,void*,void*);
25 static int gpuWENOFifthOrderCalculateWeightsM(double*,double*,double*,int,void*,void*);
26 
27 /*! Compute the nonlinear weights for 5th order WENO-type schemes. This function is a wrapper that
28  calls the appropriate function, depending on the type of WENO weights.
29 */
30 extern "C" int gpuWENOFifthOrderCalculateWeights(
31  double *fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
32  double *uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
33  double *x, /*!< Grid coordinates */
34  int dir, /*!< Spatial dimension along which to interpolation */
35  void *s, /*!< Object of type #HyPar containing solver-related variables */
36  void *m /*!< Object of type #MPIVariables containing MPI-related variables */
37 )
38 {
39  HyPar *solver = (HyPar*) s;
40  WENOParameters *weno = (WENOParameters*) solver->interp;
41  MPIVariables *mpi = (MPIVariables*) m;
42 
43  if (weno->yc) return(gpuWENOFifthOrderCalculateWeightsYC (fC,uC,x,dir,solver,mpi));
44  else if (weno->mapped) return(gpuWENOFifthOrderCalculateWeightsM (fC,uC,x,dir,solver,mpi));
45  else {
46  printf("ERROR! WENO functions other than yc or mapped have not been implemented on GPUs.\n");
47  return 1;
48  }
49 }
50 
51 #ifdef CUDA_VAR_ORDERDING_AOS
52 
53 /*! Kernel for gpuWENOFifthOrderCalculateWeightsM() */
54 __global__
55 void WENOFifthOrderCalculateWeightsM_kernel(
56  int ngrid_points,
57  int ndims,
58  int dir,
59  int ghosts,
60  int nvars,
61  int weno_size,
62  int offset,
63  int stride,
64  int is_crweno,
65  int is_mpi_ip_zero,
66  int is_mpi_ip_proc,
67  double weno_eps,
68  const int *dim,
69  const double *fC,
70  const double *uC,
71  double *w1,
72  double *w2,
73  double *w3
74 )
75 {
76  int p = threadIdx.x + (blockDim.x * blockIdx.x);
77  if (p < ngrid_points) {
78  const int max_ndims = 3;
79  const double thirteen_by_twelve = 13.0 / 12.0;
80  const double one_fourth = 1.0 / 4.0;
81 
82  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
83  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
84  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
85  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
86 
87  ww1LF = w1 + offset;
88  ww2LF = w2 + offset;
89  ww3LF = w3 + offset;
90  ww1RF = w1 + 2*weno_size + offset;
91  ww2RF = w2 + 2*weno_size + offset;
92  ww3RF = w3 + 2*weno_size + offset;
93  ww1LU = w1 + weno_size + offset;
94  ww2LU = w2 + weno_size + offset;
95  ww3LU = w3 + weno_size + offset;
96  ww1RU = w1 + 2*weno_size + weno_size + offset;
97  ww2RU = w2 + 2*weno_size + weno_size + offset;
98  ww3RU = w3 + 2*weno_size + weno_size + offset;
99 
100  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
101  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
102  _ArrayCopy1D_(indexC,indexI,ndims);
103 
104  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
105  qm3L = qm1L - 2*stride;
106  qm2L = qm1L - stride;
107  qp1L = qm1L + stride;
108  qp2L = qm1L + 2*stride;
109 
110  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
111  qm3R = qm1R + 2*stride;
112  qm2R = qm1R + stride;
113  qp1R = qm1R - stride;
114  qp2R = qm1R - 2*stride;
115 
116  /* Defining stencil points */
117  const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
118  m3LF = (fC+qm3L*nvars);
119  m2LF = (fC+qm2L*nvars);
120  m1LF = (fC+qm1L*nvars);
121  p1LF = (fC+qp1L*nvars);
122  p2LF = (fC+qp2L*nvars);
123  const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
124  m3RF = (fC+qm3R*nvars);
125  m2RF = (fC+qm2R*nvars);
126  m1RF = (fC+qm1R*nvars);
127  p1RF = (fC+qp1R*nvars);
128  p2RF = (fC+qp2R*nvars);
129  const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
130  m3LU = (uC+qm3L*nvars);
131  m2LU = (uC+qm2L*nvars);
132  m1LU = (uC+qm1L*nvars);
133  p1LU = (uC+qp1L*nvars);
134  p2LU = (uC+qp2L*nvars);
135  const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
136  m3RU = (uC+qm3R*nvars);
137  m2RU = (uC+qm2R*nvars);
138  m1RU = (uC+qm1R*nvars);
139  p1RU = (uC+qp1R*nvars);
140  p2RU = (uC+qp2R*nvars);
141 
142  double c1, c2, c3;
143  if (is_crweno) {
144  if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
145  || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
146  /* Use WENO5 at the physical boundaries */
147  c1 = _WENO_OPTIMAL_WEIGHT_1_;
148  c2 = _WENO_OPTIMAL_WEIGHT_2_;
149  c3 = _WENO_OPTIMAL_WEIGHT_3_;
150  } else {
151  /* CRWENO5 at the interior points */
152  c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
153  c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
154  c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
155  }
156  } else {
157  /* WENO5 and HCWENO5 */
158  c1 = _WENO_OPTIMAL_WEIGHT_1_;
159  c2 = _WENO_OPTIMAL_WEIGHT_2_;
160  c3 = _WENO_OPTIMAL_WEIGHT_3_;
161  }
162 
163  /* calculate WENO weights */
164  _WENOWeights_v_M_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
165  _WENOWeights_v_M_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
166  _WENOWeights_v_M_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
167  _WENOWeights_v_M_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
168  }
169 
170  return;
171 }
172 
173 /*!
174  Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the mapped formulation of Henrick, Aslam & Powers:
175  \f{eqnarray}{
176  \omega_k &=& \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = \frac {\tilde{\omega}_k \left( c_k + c_k^2 - 3c_k\tilde{\omega}_k + \tilde{\omega}_k^2\right)} {c_k^2 + \tilde{\omega}_k\left(1-2c_k\right)}, \\
177  \tilde{\omega}_k &=& \frac {\tilde{a}_k} {\sum_{j=1}^3 \tilde{a}_j },\ \tilde{a}_k = \frac {c_k} {\left(\beta_k+\epsilon\right)^p},\ k = 1,2,3,
178  \f}
179  where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
180  (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
181  \f{eqnarray}{
182  \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
183  \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
184  \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2
185  \f}
186 
187  \b Notes:
188  + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
189  Thus, it loops over all grid lines along \a dir.
190  + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
191  flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
192  storing the weights is quite memory-intensive.
193  + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
194  current interpolation dimension (\a dir ) are stored.
195 
196  \b Reference:
197  + Henrick, Aslam, Powers, Mapped weighted essentially non-oscillatory schemes: Achieving optimal order near critical
198  points, J. Comput. Phys., 2005. http://dx.doi.org/10.1016/j.jcp.2005.01.023
199 */
200 int gpuWENOFifthOrderCalculateWeightsM(
201  double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
202  double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
203  double * __restrict__ x, /*!< Grid coordinates */
204  int dir, /*!< Spatial dimension along which to interpolation */
205  void *s, /*!< Object of type #HyPar containing solver-related variables */
206  void *m /*!< Object of type #MPIVariables containing MPI-related variables */
207 )
208 {
209  HyPar *solver = (HyPar*) s;
210  WENOParameters *weno = (WENOParameters*) solver->interp;
211  MPIVariables *mpi = (MPIVariables*) m;
212 
213  int ghosts = solver->ghosts;
214  int ndims = solver->ndims;
215  int nvars = solver->nvars;
216  int *dim = solver->dim_local;
217  int *stride= solver->stride_with_ghosts;
218 
219  /* calculate dimension offset */
220  int offset = weno->offset[dir];
221  int bounds_inter[ndims];
222  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
223  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
224  int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1;
225 
226  int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
227  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
228  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
229 
230 #if defined(GPU_STAT)
231  cudaEvent_t startEvent, stopEvent;
232  float milliseconds = 0;
233 
234  checkCuda( cudaEventCreate(&startEvent) );
235  checkCuda( cudaEventCreate(&stopEvent) );
236 
237  int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
238  int fCuC_memory_accessed = 1;
239  for (int d=0; d<ndims; d++) {
240  if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
241  else fCuC_memory_accessed *= dim[d];
242  }
243  fCuC_memory_accessed *= nvars*sizeof(double);
244 
245  checkCuda( cudaEventRecord(startEvent, 0) );
246 #endif
247 
248  WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
249  npoints_grid, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
250  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
251  solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
252  );
253  cudaDeviceSynchronize();
254 
255 #if defined(GPU_STAT)
256  checkCuda( cudaEventRecord(stopEvent, 0) );
257  checkCuda( cudaEventSynchronize(stopEvent) );
258  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
259 
260  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
261  "WENOFifthOrderCalculateWeightsM", milliseconds*1e-3, dir,
262  (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds));
263 
264  checkCuda( cudaEventDestroy(startEvent) );
265  checkCuda( cudaEventDestroy(stopEvent) );
266 #endif
267 
268  return 0;
269 }
270 
271 /*! Kernel for gpuWENOFifthOrderCalculateWeightsYC() */
272 __global__
273 void WENOFifthOrderCalculateWeightsYC_kernel(
274  int ngrid_points,
275  int ndims,
276  int dir,
277  int ghosts,
278  int nvars,
279  int weno_size,
280  int offset,
281  int stride,
282  int is_crweno,
283  int is_mpi_ip_zero,
284  int is_mpi_ip_proc,
285  double weno_eps,
286  const int *dim,
287  const double *fC,
288  const double *uC,
289  double *w1,
290  double *w2,
291  double *w3
292 )
293 {
294  int p = threadIdx.x + (blockDim.x * blockIdx.x);
295  if (p < ngrid_points) {
296  const int max_ndims = 3;
297  const double thirteen_by_twelve = 13.0 / 12.0;
298  const double one_fourth = 1.0 / 4.0;
299 
300  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
301  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
302  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
303  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
304 
305  ww1LF = w1 + offset;
306  ww2LF = w2 + offset;
307  ww3LF = w3 + offset;
308  ww1RF = w1 + 2*weno_size + offset;
309  ww2RF = w2 + 2*weno_size + offset;
310  ww3RF = w3 + 2*weno_size + offset;
311  ww1LU = w1 + weno_size + offset;
312  ww2LU = w2 + weno_size + offset;
313  ww3LU = w3 + weno_size + offset;
314  ww1RU = w1 + 2*weno_size + weno_size + offset;
315  ww2RU = w2 + 2*weno_size + weno_size + offset;
316  ww3RU = w3 + 2*weno_size + weno_size + offset;
317 
318  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
319  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
320  _ArrayCopy1D_(indexC,indexI,ndims);
321 
322  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
323  qm3L = qm1L - 2*stride;
324  qm2L = qm1L - stride;
325  qp1L = qm1L + stride;
326  qp2L = qm1L + 2*stride;
327 
328  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
329  qm3R = qm1R + 2*stride;
330  qm2R = qm1R + stride;
331  qp1R = qm1R - stride;
332  qp2R = qm1R - 2*stride;
333 
334  /* Defining stencil points */
335  const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
336  m3LF = (fC+qm3L*nvars);
337  m2LF = (fC+qm2L*nvars);
338  m1LF = (fC+qm1L*nvars);
339  p1LF = (fC+qp1L*nvars);
340  p2LF = (fC+qp2L*nvars);
341  const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
342  m3RF = (fC+qm3R*nvars);
343  m2RF = (fC+qm2R*nvars);
344  m1RF = (fC+qm1R*nvars);
345  p1RF = (fC+qp1R*nvars);
346  p2RF = (fC+qp2R*nvars);
347  const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
348  m3LU = (uC+qm3L*nvars);
349  m2LU = (uC+qm2L*nvars);
350  m1LU = (uC+qm1L*nvars);
351  p1LU = (uC+qp1L*nvars);
352  p2LU = (uC+qp2L*nvars);
353  const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
354  m3RU = (uC+qm3R*nvars);
355  m2RU = (uC+qm2R*nvars);
356  m1RU = (uC+qm1R*nvars);
357  p1RU = (uC+qp1R*nvars);
358  p2RU = (uC+qp2R*nvars);
359 
360  double c1, c2, c3;
361  if (is_crweno) {
362  if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
363  || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
364  /* Use WENO5 at the physical boundaries */
365  c1 = _WENO_OPTIMAL_WEIGHT_1_;
366  c2 = _WENO_OPTIMAL_WEIGHT_2_;
367  c3 = _WENO_OPTIMAL_WEIGHT_3_;
368  } else {
369  /* CRWENO5 at the interior points */
370  c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
371  c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
372  c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
373  }
374  } else {
375  /* WENO5 and HCWENO5 */
376  c1 = _WENO_OPTIMAL_WEIGHT_1_;
377  c2 = _WENO_OPTIMAL_WEIGHT_2_;
378  c3 = _WENO_OPTIMAL_WEIGHT_3_;
379  }
380 
381  /* calculate WENO weights */
382  _WENOWeights_v_YC_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
383  _WENOWeights_v_YC_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
384  _WENOWeights_v_YC_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
385  _WENOWeights_v_YC_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
386  }
387 
388  return;
389 }
390 
391 /*!
392  Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the ESWENO formulation of
393  Yamaleev & Carpenter. Note that only the formulation for the nonlinear weights is adopted and implemented here, not
394  the ESWENO scheme as a whole.
395  \f{equation}{
396  \omega_k = \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = c_k \left( 1 + \frac{\tau_5}{\beta_k+\epsilon} \right)^p,\ k = 1,2,3,
397  \f}
398  where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
399  (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
400  \f{eqnarray}{
401  \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
402  \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
403  \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2,
404  \f}
405  and \f$\tau_5 = \left( f_{j-2}-4f_{j-1}+6f_j-4f_{j+1}+f_{j+2} \right)^2\f$.
406 
407  \b Notes:
408  + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
409  Thus, it loops over all grid lines along \a dir.
410  + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
411  flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
412  storing the weights is quite memory-intensive.
413  + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
414  current interpolation dimension (\a dir ) are stored.
415 
416  \b Reference:
417  + Yamaleev, Carpenter, A systematic methodology for constructing high-order energy stable WENO schemes,
418  J. Comput. Phys., 2009. http://dx.doi.org/10.1016/j.jcp.2009.03.002
419 */
420 int gpuWENOFifthOrderCalculateWeightsYC(
421  double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
422  double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
423  double * __restrict__ x, /*!< Grid coordinates */
424  int dir, /*!< Spatial dimension along which to interpolation */
425  void *s, /*!< Object of type #HyPar containing solver-related variables */
426  void *m /*!< Object of type #MPIVariables containing MPI-related variables */
427 )
428 {
429  HyPar *solver = (HyPar*) s;
430  WENOParameters *weno = (WENOParameters*) solver->interp;
431  MPIVariables *mpi = (MPIVariables*) m;
432 
433  int ghosts = solver->ghosts;
434  int ndims = solver->ndims;
435  int nvars = solver->nvars;
436  int *dim = solver->dim_local;
437  int *stride= solver->stride_with_ghosts;
438 
439  /* calculate dimension offset */
440  int offset = weno->offset[dir];
441  int bounds_inter[ndims];
442  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
443  int ngrid_points; _ArrayProduct1D_(bounds_inter,ndims,ngrid_points);
444  //int nblocks = (ngrid_points - 1) / GPU_THREADS_PER_BLOCK + 1;
445  int nblocks = (ngrid_points - 1) / 256 + 1;
446 
447  int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
448  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
449  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
450 
451  cudaEvent_t startEvent, stopEvent;
452  float milliseconds = 0;
453 
454  checkCuda( cudaEventCreate(&startEvent) );
455  checkCuda( cudaEventCreate(&stopEvent) );
456 
457  //cudaProfilerStart();
458  checkCuda( cudaEventRecord(startEvent, 0) );
459  WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, 256>>>(
460  ngrid_points, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
461  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
462  solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
463  );
464  checkCuda( cudaEventRecord(stopEvent, 0) );
465  checkCuda( cudaEventSynchronize(stopEvent) );
466  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
467  //cudaProfilerStop();
468 
469  printf("WENOFifthOrderCalculateWeightsYC_GPU:\n");
470  printf(" GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f weno->size = %d weno->offset[1] = %d\n",
471  milliseconds*1e-3, dir,
472  (1e-6*((12*weno->offset[1]+2*(nvars*(dim[0]+2*ghosts)*dim[1]))*sizeof(double)))/milliseconds,
473  weno->size, weno->offset[1]);
474 
475  checkCuda( cudaEventDestroy(startEvent) );
476  checkCuda( cudaEventDestroy(stopEvent) );
477 
478  return (0);
479 }
480 
481 #else
482 
483 /*! Kernel for gpuWENOFifthOrderCalculateWeightsM() */
484 __global__
485 void WENOFifthOrderCalculateWeightsM_kernel(
486  int npoints_grid,
487  int npoints_local_wghosts,
488  int ndims,
489  int dir,
490  int ghosts,
491  int nvars,
492  int weno_size,
493  int offset,
494  int stride,
495  int is_crweno,
496  int is_mpi_ip_zero,
497  int is_mpi_ip_proc,
498  double weno_eps,
499  const int *dim,
500  const double *fC,
501  const double *uC,
502  double *w1,
503  double *w2,
504  double *w3
505 )
506 {
507  int p = threadIdx.x + (blockDim.x * blockIdx.x);
508  if (p < npoints_grid) {
509  const int max_ndims = 3;
510  const double thirteen_by_twelve = 13.0 / 12.0;
511  const double one_fourth = 1.0 / 4.0;
512 
513  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
514  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
515  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
516  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
517 
518  ww1LF = w1 + offset;
519  ww2LF = w2 + offset;
520  ww3LF = w3 + offset;
521  ww1RF = w1 + 2*weno_size + offset;
522  ww2RF = w2 + 2*weno_size + offset;
523  ww3RF = w3 + 2*weno_size + offset;
524  ww1LU = w1 + weno_size + offset;
525  ww2LU = w2 + weno_size + offset;
526  ww3LU = w3 + weno_size + offset;
527  ww1RU = w1 + 2*weno_size + weno_size + offset;
528  ww2RU = w2 + 2*weno_size + weno_size + offset;
529  ww3RU = w3 + 2*weno_size + weno_size + offset;
530 
531  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
532  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
533  _ArrayCopy1D_(indexC,indexI,ndims);
534 
535  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
536  qm3L = qm1L - 2*stride;
537  qm2L = qm1L - stride;
538  qp1L = qm1L + stride;
539  qp2L = qm1L + 2*stride;
540 
541  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
542  qm3R = qm1R + 2*stride;
543  qm2R = qm1R + stride;
544  qp1R = qm1R - stride;
545  qp2R = qm1R - 2*stride;
546 
547  for (int j=0; j<nvars; j++) {
548  /* Defining stencil points */
549  const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
550  m3LF = (fC+qm3L);
551  m2LF = (fC+qm2L);
552  m1LF = (fC+qm1L);
553  p1LF = (fC+qp1L);
554  p2LF = (fC+qp2L);
555  const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
556  m3RF = (fC+qm3R);
557  m2RF = (fC+qm2R);
558  m1RF = (fC+qm1R);
559  p1RF = (fC+qp1R);
560  p2RF = (fC+qp2R);
561  const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
562  m3LU = (uC+qm3L);
563  m2LU = (uC+qm2L);
564  m1LU = (uC+qm1L);
565  p1LU = (uC+qp1L);
566  p2LU = (uC+qp2L);
567  const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
568  m3RU = (uC+qm3R);
569  m2RU = (uC+qm2R);
570  m1RU = (uC+qm1R);
571  p1RU = (uC+qp1R);
572  p2RU = (uC+qp2R);
573 
574  qm3L += npoints_local_wghosts;
575  qm2L += npoints_local_wghosts;
576  qm1L += npoints_local_wghosts;
577  qp1L += npoints_local_wghosts;
578  qp2L += npoints_local_wghosts;
579 
580  qm3R += npoints_local_wghosts;
581  qm2R += npoints_local_wghosts;
582  qm1R += npoints_local_wghosts;
583  qp1R += npoints_local_wghosts;
584  qp2R += npoints_local_wghosts;
585 
586  double c1, c2, c3;
587  if (is_crweno) {
588  if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
589  || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
590  /* Use WENO5 at the physical boundaries */
591  c1 = _WENO_OPTIMAL_WEIGHT_1_;
592  c2 = _WENO_OPTIMAL_WEIGHT_2_;
593  c3 = _WENO_OPTIMAL_WEIGHT_3_;
594  } else {
595  /* CRWENO5 at the interior points */
596  c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
597  c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
598  c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
599  }
600  } else {
601  /* WENO5 and HCWENO5 */
602  c1 = _WENO_OPTIMAL_WEIGHT_1_;
603  c2 = _WENO_OPTIMAL_WEIGHT_2_;
604  c3 = _WENO_OPTIMAL_WEIGHT_3_;
605  }
606 
607  /* calculate WENO weights */
608  _WENOWeights_v_M_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
609  _WENOWeights_v_M_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
610  _WENOWeights_v_M_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
611  _WENOWeights_v_M_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
612  p += npoints_grid;
613  }
614  }
615 
616  return;
617 }
618 
619 
620 /*!
621  Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the mapped formulation of Henrick, Aslam & Powers:
622  \f{eqnarray}{
623  \omega_k &=& \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = \frac {\tilde{\omega}_k \left( c_k + c_k^2 - 3c_k\tilde{\omega}_k + \tilde{\omega}_k^2\right)} {c_k^2 + \tilde{\omega}_k\left(1-2c_k\right)}, \\
624  \tilde{\omega}_k &=& \frac {\tilde{a}_k} {\sum_{j=1}^3 \tilde{a}_j },\ \tilde{a}_k = \frac {c_k} {\left(\beta_k+\epsilon\right)^p},\ k = 1,2,3,
625  \f}
626  where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
627  (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
628  \f{eqnarray}{
629  \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
630  \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
631  \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2
632  \f}
633 
634  \b Notes:
635  + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
636  Thus, it loops over all grid lines along \a dir.
637  + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
638  flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
639  storing the weights is quite memory-intensive.
640  + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
641  current interpolation dimension (\a dir ) are stored.
642 
643  \b Reference:
644  + Henrick, Aslam, Powers, Mapped weighted essentially non-oscillatory schemes: Achieving optimal order near critical
645  points, J. Comput. Phys., 2005. http://dx.doi.org/10.1016/j.jcp.2005.01.023
646 */
647 int gpuWENOFifthOrderCalculateWeightsM(
648  double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
649  double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
650  double * __restrict__ x, /*!< Grid coordinates */
651  int dir, /*!< Spatial dimension along which to interpolation */
652  void *s, /*!< Object of type #HyPar containing solver-related variables */
653  void *m /*!< Object of type #MPIVariables containing MPI-related variables */
654 )
655 {
656  HyPar *solver = (HyPar*) s;
657  WENOParameters *weno = (WENOParameters*) solver->interp;
658  MPIVariables *mpi = (MPIVariables*) m;
659 
660  int ghosts = solver->ghosts;
661  int ndims = solver->ndims;
662  int nvars = solver->nvars;
663  int *dim = solver->dim_local;
664  int *stride= solver->stride_with_ghosts;
665 
666  /* calculate dimension offset */
667  int offset = weno->offset[dir];
668  int bounds_inter[ndims];
669  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
670  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
671  int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1;
672 
673  int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
674  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
675  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
676 
677 #if defined(GPU_STAT)
678  cudaEvent_t startEvent, stopEvent;
679  float milliseconds = 0;
680 
681  checkCuda( cudaEventCreate(&startEvent) );
682  checkCuda( cudaEventCreate(&stopEvent) );
683 
684  int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
685  int fCuC_memory_accessed = 1;
686  for (int d=0; d<ndims; d++) {
687  if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
688  else fCuC_memory_accessed *= dim[d];
689  }
690  fCuC_memory_accessed *= nvars*sizeof(double);
691 
692  checkCuda( cudaEventRecord(startEvent, 0) );
693 #endif
694 
695  WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
696  npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
697  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
698  solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
699  );
700  cudaDeviceSynchronize();
701 
702 #if defined(GPU_STAT)
703  checkCuda( cudaEventRecord(stopEvent, 0) );
704  checkCuda( cudaEventSynchronize(stopEvent) );
705  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
706 
707  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
708  "WENOFifthOrderCalculateWeightsM2", milliseconds*1e-3, dir,
709  (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),
710  stride[dir]);
711 
712  checkCuda( cudaEventDestroy(startEvent) );
713  checkCuda( cudaEventDestroy(stopEvent) );
714 #endif
715 
716  return 0;
717 }
718 
719 /*! Kernel for gpuWENOFifthOrderCalculateWeightsYC() */
720 __global__
721 void WENOFifthOrderCalculateWeightsYC_kernel(
722  int npoints_grid,
723  int npoints_local_wghosts,
724  int ndims,
725  int dir,
726  int ghosts,
727  int nvars,
728  int weno_size,
729  int offset,
730  int stride,
731  int is_crweno,
732  int is_mpi_ip_zero,
733  int is_mpi_ip_proc,
734  double weno_eps,
735  const int * __restrict__ dim,
736  const double * __restrict__ fC,
737  const double * __restrict__ uC,
738  double * __restrict__ w1,
739  double * __restrict__ w2,
740  double * __restrict__ w3
741 )
742 {
743  int p = threadIdx.x + (blockDim.x * blockIdx.x);
744  if (p < npoints_grid) {
745  const int max_ndims = 3;
746  const double thirteen_by_twelve = 13.0 / 12.0;
747  const double one_fourth = 1.0 / 4.0;
748 
749  int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
750  int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
751  double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
752  double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
753 
754  ww1LF = w1 + offset;
755  ww2LF = w2 + offset;
756  ww3LF = w3 + offset;
757  ww1RF = w1 + 2*weno_size + offset;
758  ww2RF = w2 + 2*weno_size + offset;
759  ww3RF = w3 + 2*weno_size + offset;
760  ww1LU = w1 + weno_size + offset;
761  ww2LU = w2 + weno_size + offset;
762  ww3LU = w3 + weno_size + offset;
763  ww1RU = w1 + 2*weno_size + weno_size + offset;
764  ww2RU = w2 + 2*weno_size + weno_size + offset;
765  ww3RU = w3 + 2*weno_size + weno_size + offset;
766 
767  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
768  _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
769  _ArrayCopy1D_(indexC,indexI,ndims);
770 
771  indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
772  qm3L = qm1L - 2*stride;
773  qm2L = qm1L - stride;
774  qp1L = qm1L + stride;
775  qp2L = qm1L + 2*stride;
776 
777  indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
778  qm3R = qm1R + 2*stride;
779  qm2R = qm1R + stride;
780  qp1R = qm1R - stride;
781  qp2R = qm1R - 2*stride;
782 
783  for (int j=0; j <nvars; j++) {
784  /* Defining stencil points */
785  const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
786  m3LF = (fC+qm3L);
787  m2LF = (fC+qm2L);
788  m1LF = (fC+qm1L);
789  p1LF = (fC+qp1L);
790  p2LF = (fC+qp2L);
791  const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
792  m3RF = (fC+qm3R);
793  m2RF = (fC+qm2R);
794  m1RF = (fC+qm1R);
795  p1RF = (fC+qp1R);
796  p2RF = (fC+qp2R);
797  const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
798  m3LU = (uC+qm3L);
799  m2LU = (uC+qm2L);
800  m1LU = (uC+qm1L);
801  p1LU = (uC+qp1L);
802  p2LU = (uC+qp2L);
803  const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
804  m3RU = (uC+qm3R);
805  m2RU = (uC+qm2R);
806  m1RU = (uC+qm1R);
807  p1RU = (uC+qp1R);
808  p2RU = (uC+qp2R);
809 
810  qm3L += npoints_local_wghosts;
811  qm2L += npoints_local_wghosts;
812  qm1L += npoints_local_wghosts;
813  qp1L += npoints_local_wghosts;
814  qp2L += npoints_local_wghosts;
815 
816  qm3R += npoints_local_wghosts;
817  qm2R += npoints_local_wghosts;
818  qm1R += npoints_local_wghosts;
819  qp1R += npoints_local_wghosts;
820  qp2R += npoints_local_wghosts;
821 
822  double c1, c2, c3;
823  if (is_crweno) {
824  if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
825  || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
826  /* Use WENO5 at the physical boundaries */
827  c1 = _WENO_OPTIMAL_WEIGHT_1_;
828  c2 = _WENO_OPTIMAL_WEIGHT_2_;
829  c3 = _WENO_OPTIMAL_WEIGHT_3_;
830  } else {
831  /* CRWENO5 at the interior points */
832  c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
833  c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
834  c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
835  }
836  } else {
837  /* WENO5 and HCWENO5 */
838  c1 = _WENO_OPTIMAL_WEIGHT_1_;
839  c2 = _WENO_OPTIMAL_WEIGHT_2_;
840  c3 = _WENO_OPTIMAL_WEIGHT_3_;
841  }
842 
843  /* calculate WENO weights */
844  _WENOWeights_v_YC_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
845  _WENOWeights_v_YC_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
846  _WENOWeights_v_YC_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
847  _WENOWeights_v_YC_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
848  p += npoints_grid;
849  }
850  }
851 
852  return;
853 }
854 
855 /*!
856  Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the ESWENO formulation of
857  Yamaleev & Carpenter. Note that only the formulation for the nonlinear weights is adopted and implemented here, not
858  the ESWENO scheme as a whole.
859  \f{equation}{
860  \omega_k = \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = c_k \left( 1 + \frac{\tau_5}{\beta_k+\epsilon} \right)^p,\ k = 1,2,3,
861  \f}
862  where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
863  (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
864  \f{eqnarray}{
865  \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
866  \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
867  \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2,
868  \f}
869  and \f$\tau_5 = \left( f_{j-2}-4f_{j-1}+6f_j-4f_{j+1}+f_{j+2} \right)^2\f$.
870 
871  \b Notes:
872  + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
873  Thus, it loops over all grid lines along \a dir.
874  + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
875  flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
876  storing the weights is quite memory-intensive.
877  + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
878  current interpolation dimension (\a dir ) are stored.
879 
880  \b Reference:
881  + Yamaleev, Carpenter, A systematic methodology for constructing high-order energy stable WENO schemes,
882  J. Comput. Phys., 2009. http://dx.doi.org/10.1016/j.jcp.2009.03.002
883 */
884 int gpuWENOFifthOrderCalculateWeightsYC(
885  double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
886  double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
887  double * __restrict__ x, /*!< Grid coordinates */
888  int dir, /*!< Spatial dimension along which to interpolation */
889  void *s, /*!< Object of type #HyPar containing solver-related variables */
890  void *m /*!< Object of type #MPIVariables containing MPI-related variables */
891 )
892 {
893  HyPar *solver = (HyPar*) s;
894  WENOParameters *weno = (WENOParameters*) solver->interp;
895  MPIVariables *mpi = (MPIVariables*) m;
896 
897  int ghosts = solver->ghosts;
898  int ndims = solver->ndims;
899  int nvars = solver->nvars;
900  int *dim = solver->dim_local;
901  int *stride= solver->stride_with_ghosts;
902 
903  /* calculate dimension offset */
904  int offset = weno->offset[dir];
905  int bounds_inter[ndims];
906  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
907  int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
908  int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1;
909 
910  int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
911  int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
912  int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
913 
914 #if defined(GPU_STAT)
915  cudaEvent_t startEvent, stopEvent;
916  float milliseconds = 0;
917 
918  checkCuda( cudaEventCreate(&startEvent) );
919  checkCuda( cudaEventCreate(&stopEvent) );
920 
921  int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
922  int fCuC_memory_accessed = 1;
923  for (int d=0; d<ndims; d++) {
924  if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
925  else fCuC_memory_accessed *= dim[d];
926  }
927  fCuC_memory_accessed *= nvars*sizeof(double);
928 
929  checkCuda( cudaEventRecord(startEvent, 0) );
930 #endif
931 
932  WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
933  npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
934  is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
935  solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
936  );
937  cudaDeviceSynchronize();
938 
939 #if defined(GPU_STAT)
940  checkCuda( cudaEventRecord(stopEvent, 0) );
941  checkCuda( cudaEventSynchronize(stopEvent) );
942  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
943 
944  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
945  "WENOFifthOrderCalculateWeightsYC2", milliseconds*1e-3, dir,
946  (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),
947  stride[dir]);
948 
949  checkCuda( cudaEventDestroy(startEvent) );
950  checkCuda( cudaEventDestroy(stopEvent) );
951 #endif
952 
953  return (0);
954 }
955 
956 #endif
957