HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
HyperbolicFunction_GPU.cu
1 /*! @file HyperbolicFunction_GPU.cu
2  @author Youngdae Kim
3  @brief Compute the hyperbolic term of the governing equations
4 */
5 
6 #include <arrayfunctions_gpu.h>
7 #include <basic_gpu.h>
8 #include <mpivars.h>
9 #include <hypar.h>
10 
11 static int ReconstructHyperbolic (double*,double*,double*,double*,int,void*,void*,double,int,
12  int(*)(double*,double*,double*,double*,
13  double*,double*,int,void*,double));
14 static int DefaultUpwinding (double*,double*,double*,double*,
15  double*,double*,int,void*,double);
16 
17 /*! Kernel for stage boundary integral calculation */
18 template <int block_size>
19 __global__
20 void StageBoundaryIntegral_kernel(
21  int n,
22  int nvars,
23  int sign,
24  const double * __restrict__ FluxI,
25  double * __restrict__ StageBoundaryIntegral
26 )
27 {
28  extern __shared__ double smem[];
29 
30  int tid = threadIdx.x;
31  int grid_size = block_size * gridDim.x;
32  int i = blockIdx.x * block_size + threadIdx.x;
33  int j;
34  double tid_sum[GPU_MAX_NVARS] = { 0 };
35 
36  while (i < n) {
37  for (j=0; j<nvars; j++) tid_sum[j] += FluxI[i*nvars+j];
38  i += grid_size;
39  }
40  for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j];
41  __syncthreads();
42 
43 
44  if (block_size >= 512 && tid < 256) {
45  for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j] = tid_sum[j] + smem[(tid+256)*nvars+j];
46  }
47  __syncthreads();
48  if (block_size >= 256 && tid < 128) {
49  for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j] = tid_sum[j] + smem[(tid+128)*nvars+j];
50  }
51  __syncthreads();
52  if (block_size >= 128 && tid < 64) {
53  for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j] = tid_sum[j] + smem[(tid+64)*nvars+j];
54  }
55  __syncthreads();
56 
57  if (tid < 32) {
58  if (block_size >= 64) {
59  for (j=0; j<nvars; j++) tid_sum[j] += smem[(tid+32)*nvars+j];
60  }
61  for (int offset=16; offset>0; offset/=2) {
62  for (j=0; j<nvars; j++) {
63  tid_sum[j] += __shfl_down_sync(0xffffffff, tid_sum[j], offset);
64  }
65  }
66  }
67  __syncthreads();
68 
69  if (tid == 0) {
70  for (j=0; j<nvars; j++) StageBoundaryIntegral[j] = (sign == 1) ? tid_sum[j] : -tid_sum[j];
71  }
72  return;
73 }
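/* For reference, a minimal serial sketch (illustrative only, not part of HyPar) of the
   quantity the reduction kernel above computes: for each of the nvars components, sum
   FluxI over the n boundary points and apply the same sign convention as the kernel.
   (Component v of boundary point i is stored at FluxI[i*nvars+v].) */
static void StageBoundaryIntegral_serial_sketch(
  int n, int nvars, int sign,
  const double *FluxI,            /* n boundary points, nvars components each */
  double *StageBoundaryIntegral   /* nvars components */
)
{
  for (int j = 0; j < nvars; j++) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum += FluxI[i*nvars+j];
    StageBoundaryIntegral[j] = (sign == 1) ? sum : -sum;
  }
}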
74 
75 #ifdef CUDA_VAR_ORDERDING_AOS
76 
77 /*! 3D Kernel for gpuHyperbolicFunction() */
78 __global__
79 void HyperbolicFunction_dim3_kernel(
80  int npoints_grid,
81  int d,
82  int nvars,
83  int ghosts,
84  int offset,
85  const int * __restrict__ dim,
86  const double * __restrict__ FluxI,
87  const double * __restrict__ dxinv,
88  double * __restrict__ hyp,
89  double * __restrict__ FluxI_p1,
90  double * __restrict__ FluxI_p2
91 )
92 {
93  int tx = threadIdx.x + (blockDim.x * blockIdx.x);
94  if (tx < npoints_grid) {
95  int v, p, p1, p2, b;
96  int dim_interface[3];
97  int index[3], index1[3], index2[3];
98 
99  _ArrayIndexnD_(3,tx,dim,index,0);
100  _ArrayCopy1D_(dim,dim_interface,3);
101  dim_interface[d] += 1;
102 
103  _ArrayCopy1D_(index,index1,3);
104  _ArrayCopy1D_(index,index2,3); index2[d]++;
105  _ArrayIndex1D_(3,dim ,index ,ghosts,p);
106  _ArrayIndex1D_(3,dim_interface,index1,0 ,p1);
107  _ArrayIndex1D_(3,dim_interface,index2,0 ,p2);
108  for (v=0; v<nvars; v++) hyp[nvars*p+v] += dxinv[offset+ghosts+index[d]]
109  * (FluxI[nvars*p2+v]-FluxI[nvars*p1+v]);
110  if (index[d] == 0 || index[d] == dim[d]-1) {
111  if (d == 0) {
112  b = index[1] + index[2]*dim[1];
113  } else if (d == 1) {
114  b = index[0] + index[2]*dim[0];
115  } else {
116  b = index[0] + index[1]*dim[0];
117  }
118  if (index[d] == 0)
119  for (v=0; v<nvars; v++) FluxI_p1[b*nvars+v] = FluxI[nvars*p1+v];
120  else
121  for (v=0; v<nvars; v++) FluxI_p2[b*nvars+v] = FluxI[nvars*p2+v];
122  }
123  }
124  return;
125 }
126 
127 /*! 2D Kernel for gpuHyperbolicFunction() */
128 __global__
129 void HyperbolicFunction_dim2_kernel(
130  int npoints_grid,
131  int d,
132  int nvars,
133  int ghosts,
134  int offset,
135  const int * __restrict__ dim,
136  const double * __restrict__ FluxI,
137  const double * __restrict__ dxinv,
138  double * __restrict__ hyp,
139  double * __restrict__ FluxI_p1,
140  double * __restrict__ FluxI_p2
141 )
142 {
143  int tx = threadIdx.x + (blockDim.x * blockIdx.x);
144  if (tx < npoints_grid) {
145  int v, p, p1, p2, b;
146  int dim_interface[2];
147  int index[2], index1[2], index2[2];
148 
149  _ArrayIndexnD_(2,tx,dim,index,0);
150  _ArrayCopy1D_(dim,dim_interface,2);
151  dim_interface[d] += 1;
152 
153  _ArrayCopy1D_(index,index1,2);
154  _ArrayCopy1D_(index,index2,2); index2[d]++;
155  _ArrayIndex1D_(2,dim ,index ,ghosts,p);
156  _ArrayIndex1D_(2,dim_interface,index1,0 ,p1);
157  _ArrayIndex1D_(2,dim_interface,index2,0 ,p2);
158  for (v=0; v<nvars; v++) hyp[nvars*p+v] += dxinv[offset+ghosts+index[d]]
159  * (FluxI[nvars*p2+v]-FluxI[nvars*p1+v]);
160  if (index[d] == 0 || index[d] == dim[d]-1) {
161  b = (d == 0) ? index[1] : index[0];
162  if (index[d] == 0)
163  for (v=0; v<nvars; v++) FluxI_p1[b*nvars+v] = FluxI[nvars*p1+v];
164  else
165  for (v=0; v<nvars; v++) FluxI_p2[b*nvars+v] = FluxI[nvars*p2+v];
166  }
167  }
168  return;
169 }
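/* A serial 1-D sketch (illustrative only; the function and array names are hypothetical) of
   the conservative finite-difference update the kernels above perform along one dimension:
   at each cell, add dxinv times the difference of the two interface fluxes bracketing it. */
static void hyperbolic_update_1d_sketch(
  int N, int nvars, int ghosts,
  const double *FluxI,   /* N+1 interface fluxes, no ghost points */
  const double *dxinv,   /* N+2*ghosts entries of 1/dx            */
  double *hyp            /* N+2*ghosts cells, same layout as u    */
)
{
  for (int j = 0; j < N; j++) {
    int p  = j + ghosts;   /* cell index, including ghost offset */
    int p1 = j;            /* left  interface j-1/2              */
    int p2 = j + 1;        /* right interface j+1/2              */
    for (int v = 0; v < nvars; v++) {
      hyp[nvars*p+v] += dxinv[p] * (FluxI[nvars*p2+v] - FluxI[nvars*p1+v]);
    }
  }
}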
170 
171 /*! This function computes the hyperbolic term of the governing equations, expressed as follows:-
172  \f{equation}{
173  {\bf F} \left({\bf u}\right) = \sum_{d=0}^{D-1} \frac {\partial {\bf f}_d\left({\bf u}\right)} {\partial x_d}
174  \f}
175  using a conservative finite-difference discretization in space:
176  \f{equation}{
177  \hat{\bf F} \left({\bf u}\right) = \sum_{d=0}^{D-1} \frac {1} {\Delta x_d} \left[ \hat{\bf f}_{d,j+1/2} - \hat{\bf f}_{d,j-1/2} \right]
178  \f}
179  where \f$d\f$ denotes the spatial dimension, \f$D\f$ denotes the total number of spatial dimensions, the hat denotes
180  the discretized quantity, and \f$j\f$ is the grid coordinate along dimension \f$d\f$.
181  The approximations to the flux function \f${\bf f}_d\f$ at the interfaces \f$j\pm1/2\f$, denoted by \f$\hat{\bf f}_{d,j\pm 1/2}\f$,
182  are computed using the function ReconstructHyperbolic().
183 */
184 extern "C" int gpuHyperbolicFunction(
185  double *hyp, /*!< Array to hold the computed hyperbolic term (shares the same layout as u) */
186  double *gpu_u, /*!< Solution array */
187  void *s, /*!< Solver object of type #HyPar */
188  void *m, /*!< MPI object of type #MPIVariables */
189  double t, /*!< Current simulation time */
190  int LimFlag, /*!< Flag to indicate if the nonlinear coefficients for the solution-dependent
191  interpolation method should be recomputed (see ReconstructHyperbolic() for
192  an explanation of why this is needed) */
193  /*! Function pointer to the flux function for the hyperbolic term */
194  int(*FluxFunction)(double*,double*,int,void*,double),
195  /*! Function pointer to the upwinding function for the hyperbolic term */
196  int(*UpwindFunction)(double*,double*,double*,double*,double*,double*,int,void*,double)
197 )
198 {
199  HyPar *solver = (HyPar*) s;
200  MPIVariables *mpi = (MPIVariables*) m;
201  int d;
202  double *gpu_FluxI = solver->fluxI; /* interface flux */
203  double *gpu_FluxC = solver->fluxC; /* cell centered flux */
204 
205  int ndims = solver->ndims;
206  int nvars = solver->nvars;
207  int ghosts = solver->ghosts;
208  int *dim = solver->dim_local;
209  int size = solver->npoints_local_wghosts;
210  double *gpu_x = solver->gpu_x;
211  double *gpu_dxinv = solver->gpu_dxinv;
212  double *gpu_FluxI_p1, *gpu_FluxI_p2;
213 
214  LimFlag = (LimFlag && solver->flag_nonlinearinterp && solver->SetInterpLimiterVar);
215 
216  gpuMemset(hyp, 0, size*nvars*sizeof(double));
217  gpuMemset(solver->StageBoundaryBuffer, 0, solver->StageBoundaryBuffer_size*sizeof(double));
218  gpuMemset(solver->StageBoundaryIntegral, 0, 2*ndims*nvars*sizeof(double));
219 
220  if (!FluxFunction) return(0); /* zero hyperbolic term */
221  /*solver->count_hyp++;*/
222 
223  int npoints_grid = 1; for (d = 0; d < ndims; d++) npoints_grid *= dim[d];
224  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
225  int offset = 0;
226 
227 #if defined(GPU_STAT)
228  cudaEvent_t start, stop;
229  float milliseconds = 0;
230 
231  checkCuda( cudaEventCreate(&start) );
232  checkCuda( cudaEventCreate(&stop) );
233 #endif
234 
235  for (d = 0; d < ndims; d++) {
236  /* evaluate cell-centered flux */
237  FluxFunction(gpu_FluxC,gpu_u,d,solver,t);
238  /* compute interface fluxes */
239  ReconstructHyperbolic(gpu_FluxI,gpu_FluxC,gpu_u,gpu_x+offset,d,solver,mpi,t,LimFlag,UpwindFunction);
240 
241  gpu_FluxI_p1 = solver->StageBoundaryBuffer+(solver->gpu_npoints_boundary_offset[d]*nvars);
242  gpu_FluxI_p2 = solver->StageBoundaryBuffer+(solver->gpu_npoints_boundary_offset[d]+solver->gpu_npoints_boundary[d])*nvars;
243 
244 #if defined(GPU_STAT)
245  checkCuda( cudaEventRecord(start, 0) );
246 #endif
247 
248  if (ndims == 3) {
249  HyperbolicFunction_dim3_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
250  npoints_grid, d, nvars, ghosts, offset,
251  solver->gpu_dim_local, gpu_FluxI, gpu_dxinv,
252  hyp, gpu_FluxI_p1, gpu_FluxI_p2
253  );
254  } else if (ndims == 2) {
255  HyperbolicFunction_dim2_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
256  npoints_grid, d, nvars, ghosts, offset,
257  solver->gpu_dim_local, gpu_FluxI, gpu_dxinv,
258  hyp, gpu_FluxI_p1, gpu_FluxI_p2
259  );
260  } else {
261  fprintf(stderr,"gpuHyperbolicFunction(): Not implemented for ndims = %d!\n", ndims);
262  exit(1);
263  }
264  cudaDeviceSynchronize();
265 
266 #if defined(GPU_STAT)
267  checkCuda( cudaEventRecord(stop, 0) );
268  checkCuda( cudaEventSynchronize(stop) );
269  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
270  printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
271  "HyperbolicFunction", milliseconds*1e-3, d, (1e-6*2*npoints_grid*nvars*sizeof(double))/milliseconds);
272 
273  checkCuda( cudaEventRecord(start, 0) );
274 #endif
275 
276  StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK, GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
277  solver->gpu_npoints_boundary[d], nvars, -1, gpu_FluxI_p1,
278  solver->StageBoundaryIntegral + 2*d*nvars
279  );
280  StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK, GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
281  solver->gpu_npoints_boundary[d], nvars, 1, gpu_FluxI_p2,
282  solver->StageBoundaryIntegral + (2*d+1)*nvars
283  );
284  cudaDeviceSynchronize();
285 
286 #if defined(GPU_STAT)
287  checkCuda( cudaEventRecord(stop, 0) );
288  checkCuda( cudaEventSynchronize(stop) );
289  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
290  printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
291  "StageBoundaryIntegral", milliseconds*1e-3, d, (1e-6*2*solver->gpu_npoints_boundary[d]*nvars*sizeof(double))/milliseconds);
292 #endif
293 
294  offset += dim[d] + 2*ghosts;
295  }
296 
297  if (solver->flag_ib) gpuArrayBlockMultiply(hyp, solver->gpu_iblank, size, nvars);
298 
299 #if defined(GPU_STAT)
300  checkCuda(cudaEventDestroy(start));
301  checkCuda(cudaEventDestroy(stop));
302 #endif
303 
304  return(0);
305 }
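/* Note on the offset arithmetic used above (a sketch under the assumption, consistent with
   the loop, that gpu_x and gpu_dxinv store the 1-D arrays of all dimensions back to back,
   each including ghost points): the starting index of dimension d can be recovered as below.
   The helper name is illustrative, not part of HyPar. */
static int dimension_offset_sketch(int d, const int *dim, int ghosts)
{
  int offset = 0;
  for (int i = 0; i < d; i++) offset += dim[i] + 2*ghosts;
  return offset; /* gpu_x[offset] is the first (ghost) point along dimension d */
}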
306 
307 /*! This function computes the numerical flux \f$\hat{\bf f}_{j+1/2}\f$ at the interface from the cell-centered
308  flux function \f${\bf f}_j\f$. This happens in two steps:-
309 
310  \b Interpolation: High-order accurate approximations to the flux at the interface \f$j+1/2\f$ are computed from
311  the cell-centered flux with left- and right-biased interpolation methods. This is done by the
312  #HyPar::InterpolateInterfacesHyp function. This can be expressed as follows:
313  \f{align}{
314  \hat{\bf f}^L_{j+1/2} &= \mathcal{I}\left({\bf f}_j,+1\right), \\
315  \hat{\bf f}^R_{j+1/2} &= \mathcal{I}\left({\bf f}_j,-1\right),
316  \f}
317  where the \f$\pm 1\f$ indicates the interpolation bias, and \f$\mathcal{I}\f$ is the interpolation operator
318  pointed to by #HyPar::InterpolateInterfacesHyp (see \b src/InterpolationFunctions for all the available operators).
319  The interface values of the solution are similarly computed:
320  \f{align}{
321  \hat{\bf u}^L_{j+1/2} &= \mathcal{I}\left({\bf u}_j,+1\right), \\
322  \hat{\bf u}^R_{j+1/2} &= \mathcal{I}\left({\bf u}_j,-1\right),
323  \f}
324  The specific choice of \f$\mathcal{I}\f$ is set based on #HyPar::spatial_scheme_hyp.
325 
326  \b Upwinding: The final flux at the interface is computed as
327  \f{equation}{
328  \hat{\bf f}_{j+1/2} = \mathcal{U}\left( \hat{\bf f}^L_{j+1/2}, \hat{\bf f}^R_{j+1/2}, \hat{\bf u}^L_{j+1/2}, \hat{\bf u}^R_{j+1/2} \right),
329  \f}
330  where \f$\mathcal{U}\f$ denotes the upwinding function UpwindFunction() passed as an argument (if NULL, DefaultUpwinding() is used). The
331  upwinding function is specified by the physical model.
332 
333  \b Note:
334  Solution-dependent, nonlinear interpolation methods (such as WENO, CRWENO) are implemented in a way that separates the calculation of the
335  nonlinear interpolation weights (based on, say, the smoothness of the flux function), and the actual evaluation of the interpolant, into
336  different functions. This allows the flexibility to choose if and when the nonlinear coefficients are evaluated (or previously computed
337  values are reused). Some possible scenarios are:
338  + For explicit time integration, they are recomputed every time the hyperbolic flux term is evaluated.
339  + For implicit time integration, consistency or linearization may require that they be computed and "frozen"
340  at the beginning of a time step or stage.
341 
342  The argument \b LimFlag controls this behavior:
343  + LimFlag = 1 means recompute the nonlinear coefficients.
344  + LimFlag = 0 means reuse the previously computed coefficients.
345 */
346 int ReconstructHyperbolic(
347  double *gpu_fluxI, /*!< Array to hold the computed interface fluxes. This array does not
348  have ghost points. The dimensions are the same as those of u without
349  ghost points in all dimensions, except along dir, where it is one more */
350  double *gpu_fluxC, /*!< Array of the flux function computed at the cell centers
351  (same layout as u) */
352  double *gpu_u, /*!< Solution array */
353  double *gpu_x, /*!< Array of spatial coordinates */
354  int dir, /*!< Spatial dimension along which to reconstruct the interface fluxes */
355  void *s, /*!< Solver object of type #HyPar */
356  void *m, /*!< MPI object of type #MPIVariables */
357  double t, /*!< Current solution time */
358  int LimFlag, /*!< Flag to indicate if the nonlinear coefficients for the solution-dependent
359  interpolation method should be recomputed */
360  /*! Function pointer to the upwinding function for the interface flux computation. If NULL,
361  DefaultUpwinding() will be used. */
362  int(*UpwindFunction)(double*,double*,double*,double*,
363  double*,double*,int,void*,double)
364 )
365 {
366  HyPar *solver = (HyPar*) s;
367  MPIVariables *mpi = (MPIVariables*) m;
368 
369  double *gpu_uC = NULL;
370  double *gpu_uL = solver->uL;
371  double *gpu_uR = solver->uR;
372  double *gpu_fluxL = solver->fL;
373  double *gpu_fluxR = solver->fR;
374 
375  /*
376  precalculate the non-linear interpolation coefficients if required
377  else reuse the weights previously calculated
378  */
379  if (LimFlag) solver->SetInterpLimiterVar(gpu_fluxC,gpu_u,gpu_x,dir,solver,mpi);
380 
381  /* if defined, calculate the modified u-function to be used for upwinding
382  e.g.: used in well-balanced schemes for Euler/Navier-Stokes with gravity
383  otherwise, just copy u to uC */
384  if (solver->UFunction) {
385  gpu_uC = solver->uC;
386  solver->UFunction(gpu_uC,gpu_u,dir,solver,mpi,t);
387  } else gpu_uC = gpu_u;
388 
389  /* Interpolation -> to calculate the left- and right-biased interface flux and state variables */
390  solver->InterpolateInterfacesHyp(gpu_uL ,gpu_uC ,gpu_u,gpu_x, 1,dir,solver,mpi,1);
391  solver->InterpolateInterfacesHyp(gpu_uR ,gpu_uC ,gpu_u,gpu_x,-1,dir,solver,mpi,1);
392  solver->InterpolateInterfacesHyp(gpu_fluxL,gpu_fluxC,gpu_u,gpu_x, 1,dir,solver,mpi,0);
393  solver->InterpolateInterfacesHyp(gpu_fluxR,gpu_fluxC,gpu_u,gpu_x,-1,dir,solver,mpi,0);
394 
395  /* Upwind -> to calculate the final interface flux */
396  if (UpwindFunction) { UpwindFunction (gpu_fluxI,gpu_fluxL,gpu_fluxR,gpu_uL,gpu_uR,gpu_u,dir,solver,t); }
397  else { DefaultUpwinding (gpu_fluxI,gpu_fluxL,gpu_fluxR,NULL,NULL,NULL,dir,solver,t); }
398 
399  return(0);
400 }
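/* A hedged sketch (hypothetical caller, not HyPar code) of how the LimFlag argument can be
   used to "freeze" the nonlinear interpolation weights across the stages of a multi-stage
   method: recompute the weights only at the first stage and reuse them afterwards. */
static int evaluate_stages_sketch(
  int nstages, double *hyp, double *u, void *solver, void *mpi, double t,
  int (*FluxFunction)(double*,double*,int,void*,double),
  int (*UpwindFunction)(double*,double*,double*,double*,double*,double*,int,void*,double)
)
{
  for (int stage = 0; stage < nstages; stage++) {
    int LimFlag = (stage == 0); /* 1: recompute the weights, 0: reuse the frozen weights */
    gpuHyperbolicFunction(hyp, u, solver, mpi, t, LimFlag, FluxFunction, UpwindFunction);
  }
  return 0;
}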
401 
402 #else
403 
404 /*! 3D Kernel for gpuHyperbolicFunction() */
405 __global__
406 void HyperbolicFunction_dim3_kernel(
407  int npoints_grid,
408  int npoints_local_wghosts,
409  int npoints_dim_interface,
410  int d,
411  int nvars,
412  int ghosts,
413  int offset,
414  const int * __restrict__ dim,
415  const double * __restrict__ FluxI,
416  const double * __restrict__ dxinv,
417  double * __restrict__ hyp,
418  double * __restrict__ FluxI_p1,
419  double * __restrict__ FluxI_p2
420 )
421 {
422  int tx = threadIdx.x + (blockDim.x * blockIdx.x);
423  if (tx < npoints_grid) {
424  int v, p, p1, p2, b;
425  int dim_interface[3];
426  int index[3], index1[3], index2[3];
427 
428  _ArrayIndexnD_(3,tx,dim,index,0);
429  _ArrayCopy1D_(dim,dim_interface,3);
430  dim_interface[d] += 1;
431 
432  _ArrayCopy1D_(index,index1,3);
433  _ArrayCopy1D_(index,index2,3); index2[d]++;
434  _ArrayIndex1D_(3,dim ,index ,ghosts,p);
435  _ArrayIndex1D_(3,dim_interface,index1,0 ,p1);
436  _ArrayIndex1D_(3,dim_interface,index2,0 ,p2);
437  for (v=0; v<nvars; v++) {
438  hyp[p+v*npoints_local_wghosts] += dxinv[offset+ghosts+index[d]]
439  * (FluxI[p2+v*npoints_dim_interface]-FluxI[p1+v*npoints_dim_interface]);
440  }
441  if (index[d] == 0 || index[d] == dim[d]-1) {
442  if (d == 0) {
443  b = index[1] + index[2]*dim[1];
444  } else if (d == 1) {
445  b = index[0] + index[2]*dim[0];
446  } else {
447  b = index[0] + index[1]*dim[0];
448  }
449  if (index[d] == 0) {
450  for (v=0; v<nvars; v++) FluxI_p1[b*nvars+v] = FluxI[p1+v*npoints_dim_interface];
451  } else {
452  for (v=0; v<nvars; v++) FluxI_p2[b*nvars+v] = FluxI[p2+v*npoints_dim_interface];
453  }
454  }
455  }
456  return;
457 }
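/* The kernel above differs from the AoS version mainly in how a component of hyp and FluxI
   is addressed: the AoS branch stores component v of point p at a[nvars*p+v], while this
   branch uses a structure-of-arrays layout, a[p+v*npoints] (the boundary buffers FluxI_p1
   and FluxI_p2 remain AoS in both branches). Hypothetical helpers, shown only to make the
   two addressing conventions explicit: */
static inline int index_aos(int p, int v, int nvars)   { return nvars*p + v; }
static inline int index_soa(int p, int v, int npoints) { return p + v*npoints; }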
458 
459 
460 /*! This function computes the hyperbolic term of the governing equations, expressed as follows:-
461  \f{equation}{
462  {\bf F} \left({\bf u}\right) = \sum_{d=0}^{D-1} \frac {\partial {\bf f}_d\left({\bf u}\right)} {\partial x_d}
463  \f}
464  using a conservative finite-difference discretization in space:
465  \f{equation}{
466  \hat{\bf F} \left({\bf u}\right) = \sum_{d=0}^{D-1} \frac {1} {\Delta x_d} \left[ \hat{\bf f}_{d,j+1/2} - \hat{\bf f}_{d,j-1/2} \right]
467  \f}
468  where \f$d\f$ denotes the spatial dimension, \f$D\f$ denotes the total number of spatial dimensions, the hat denotes
469  the discretized quantity, and \f$j\f$ is the grid coordinate along dimension \f$d\f$.
470  The approximations to the flux function \f${\bf f}_d\f$ at the interfaces \f$j\pm1/2\f$, denoted by \f$\hat{\bf f}_{d,j\pm 1/2}\f$,
471  are computed using the function ReconstructHyperbolic().
472 */
473 extern "C" int gpuHyperbolicFunction(
474  double *hyp, /*!< Array to hold the computed hyperbolic term (shares the same layout as u) */
475  double *u, /*!< Solution array */
476  void *s, /*!< Solver object of type #HyPar */
477  void *m, /*!< MPI object of type #MPIVariables */
478  double t, /*!< Current simulation time */
479  int LimFlag, /*!< Flag to indicate if the nonlinear coefficients for the solution-dependent
480  interpolation method should be recomputed (see ReconstructHyperbolic() for
481  an explanation of why this is needed) */
482  /*! Function pointer to the flux function for the hyperbolic term */
483  int(*FluxFunction)(double*,double*,int,void*,double),
484  /*! Function pointer to the upwinding function for the hyperbolic term */
485  int(*UpwindFunction)( double*,double*,double*,double*,
486  double*,double*,int,void*,double)
487 )
488 {
489  HyPar *solver = (HyPar*) s;
490  MPIVariables *mpi = (MPIVariables*) m;
491  int d;
492  double *FluxI = solver->fluxI; /* interface flux */
493  double *FluxC = solver->fluxC; /* cell centered flux */
494 
495  int ndims = solver->ndims;
496  int nvars = solver->nvars;
497  int ghosts = solver->ghosts;
498  int *dim = solver->dim_local;
499  int size = solver->npoints_local_wghosts;
500  double *x = solver->gpu_x;
501  double *dxinv = solver->gpu_dxinv;
502  double *FluxI_p1, *FluxI_p2;
503 
504  LimFlag = (LimFlag && solver->flag_nonlinearinterp && solver->SetInterpLimiterVar);
505 
506  gpuMemset(hyp, 0, size*nvars*sizeof(double));
507  gpuMemset(solver->StageBoundaryBuffer, 0, solver->StageBoundaryBuffer_size*sizeof(double));
508  gpuMemset(solver->StageBoundaryIntegral, 0, 2*ndims*nvars*sizeof(double));
509 
510  if (!FluxFunction) return(0); /* zero hyperbolic term */
511  /*solver->count_hyp++;*/
512 
513  int npoints_grid = 1; for (d = 0; d < ndims; d++) npoints_grid *= dim[d];
514  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
515  int offset = 0;
516 
517 #if defined(GPU_STAT)
518  cudaEvent_t start, stop;
519  float milliseconds = 0;
520 
521  checkCuda( cudaEventCreate(&start) );
522  checkCuda( cudaEventCreate(&stop) );
523 #endif
524 
525  for (d = 0; d < ndims; d++) {
526  int npoints_dim_interface = 1; for (int i=0; i<ndims; i++) npoints_dim_interface *= (i==d) ? (dim[i]+1) : dim[i];
527  /* evaluate cell-centered flux */
528  FluxFunction(FluxC,u,d,solver,t);
529  /* compute interface fluxes */
530  ReconstructHyperbolic(FluxI,FluxC,u,x+offset,d,solver,mpi,t,LimFlag,UpwindFunction);
531 
532  FluxI_p1 = solver->StageBoundaryBuffer+(solver->gpu_npoints_boundary_offset[d]*nvars);
533  FluxI_p2 = solver->StageBoundaryBuffer+(solver->gpu_npoints_boundary_offset[d]+solver->gpu_npoints_boundary[d])*nvars;
534 
535 #if defined(GPU_STAT)
536  checkCuda( cudaEventRecord(start, 0) );
537 #endif
538  if (ndims == 3) {
539  HyperbolicFunction_dim3_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
540  npoints_grid, size, npoints_dim_interface, d, nvars, ghosts, offset,
541  solver->gpu_dim_local, FluxI, dxinv,
542  hyp, FluxI_p1, FluxI_p2
543  );
544  } else {
545  fprintf(stderr,"gpuHyperbolicFunction(): Not implemented for ndims = %d!\n", ndims);
546  exit(1);
547  }
548  cudaDeviceSynchronize();
549 
550 #if defined(GPU_STAT)
551  checkCuda( cudaEventRecord(stop, 0) );
552  checkCuda( cudaEventSynchronize(stop) );
553  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
554  printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
555  "HyperbolicFunction", milliseconds*1e-3, d, (1e-6*2*npoints_grid*nvars*sizeof(double))/milliseconds);
556 
557  checkCuda( cudaEventRecord(start, 0) );
558 #endif
559 
560  StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK, GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
561  solver->gpu_npoints_boundary[d], nvars, -1, FluxI_p1,
562  solver->StageBoundaryIntegral + 2*d*nvars
563  );
564  StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK, GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
565  solver->gpu_npoints_boundary[d], nvars, 1, FluxI_p2,
566  solver->StageBoundaryIntegral + (2*d+1)*nvars
567  );
568  cudaDeviceSynchronize();
569 
570 #if defined(GPU_STAT)
571  checkCuda( cudaEventRecord(stop, 0) );
572  checkCuda( cudaEventSynchronize(stop) );
573  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
574  printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
575  "StageBoundaryIntegral", milliseconds*1e-3, d, (1e-6*2*solver->gpu_npoints_boundary[d]*nvars*sizeof(double))/milliseconds);
576 #endif
577  offset += dim[d] + 2*ghosts;
578  }
579 
580  if (solver->flag_ib) gpuArrayBlockMultiply(hyp, solver->gpu_iblank, size, nvars);
581 
582 #if defined(GPU_STAT)
583  checkCuda(cudaEventDestroy(start));
584  checkCuda(cudaEventDestroy(stop));
585 #endif
586 
587  return(0);
588 }
589 
590 /*! This function computes the numerical flux \f$\hat{\bf f}_{j+1/2}\f$ at the interface from the cell-centered
591  flux function \f${\bf f}_j\f$. This happens in two steps:-
592 
593  \b Interpolation: High-order accurate approximations to the flux at the interface \f$j+1/2\f$ are computed from
594  the cell-centered flux with left- and right-biased interpolation methods. This is done by the
595  #HyPar::InterpolateInterfacesHyp function. This can be expressed as follows:
596  \f{align}{
597  \hat{\bf f}^L_{j+1/2} &= \mathcal{I}\left({\bf f}_j,+1\right), \\
598  \hat{\bf f}^R_{j+1/2} &= \mathcal{I}\left({\bf f}_j,-1\right),
599  \f}
600  where the \f$\pm 1\f$ indicates the interpolation bias, and \f$\mathcal{I}\f$ is the interpolation operator
601  pointed to by #HyPar::InterpolateInterfacesHyp (see \b src/InterpolationFunctions for all the available operators).
602  The interface values of the solution are similarly computed:
603  \f{align}{
604  \hat{\bf u}^L_{j+1/2} &= \mathcal{I}\left({\bf u}_j,+1\right), \\
605  \hat{\bf u}^R_{j+1/2} &= \mathcal{I}\left({\bf u}_j,-1\right),
606  \f}
607  The specific choice of \f$\mathcal{I}\f$ is set based on #HyPar::spatial_scheme_hyp.
608 
609  \b Upwinding: The final flux at the interface is computed as
610  \f{equation}{
611  \hat{\bf f}_{j+1/2} = \mathcal{U}\left( \hat{\bf f}^L_{j+1/2}, \hat{\bf f}^R_{j+1/2}, \hat{\bf u}^L_{j+1/2}, \hat{\bf u}^R_{j+1/2} \right),
612  \f}
613  where \f$\mathcal{U}\f$ denotes the upwinding function UpwindFunction() passed as an argument (if NULL, DefaultUpwinding() is used). The
614  upwinding function is specified by the physical model.
615 
616  \b Note:
617  Solution-dependent, nonlinear interpolation methods (such as WENO, CRWENO) are implemented in a way that separates the calculation of the
618  nonlinear interpolation weights (based on, say, the smoothness of the flux function), and the actual evaluation of the interpolant, into
619  different functions. This allows the flexibility to choose if and when the nonlinear coefficients are evaluated (or previously computed
620  values are reused). Some possible scenarios are:
621  + For explicit time integration, they are recomputed every time the hyperbolic flux term is evaluated.
622  + For implicit time integration, consistency or linearization may require that they be computed and "frozen"
623  at the beginning of a time step or stage.
624 
625  The argument \b LimFlag controls this behavior:
626  + LimFlag = 1 means recompute the nonlinear coefficients.
627  + LimFlag = 0 means reuse the previously computed coefficients.
628 */
629 int ReconstructHyperbolic(
630  double *fluxI, /*!< Array to hold the computed interface fluxes. This array does not
631  have ghost points. The dimensions are the same as those of u without
632  ghost points in all dimensions, except along dir, where it is one more */
633  double *fluxC, /*!< Array of the flux function computed at the cell centers
634  (same layout as u) */
635  double *u, /*!< Solution array */
636  double *x, /*!< Array of spatial coordinates */
637  int dir, /*!< Spatial dimension along which to reconstruct the interface fluxes */
638  void *s, /*!< Solver object of type #HyPar */
639  void *m, /*!< MPI object of type #MPIVariables */
640  double t, /*!< Current solution time */
641  int LimFlag, /*!< Flag to indicate if the nonlinear coefficients for the solution-dependent
642  interpolation method should be recomputed */
643  /*! Function pointer to the upwinding function for the interface flux computation. If NULL,
644  DefaultUpwinding() will be used. */
645  int(*UpwindFunction)(double*,double*,double*,double*,
646  double*,double*,int,void*,double)
647 )
648 {
649  HyPar *solver = (HyPar*) s;
650  MPIVariables *mpi = (MPIVariables*) m;
651 
652  double *uC = NULL;
653  double *uL = solver->uL;
654  double *uR = solver->uR;
655  double *fluxL = solver->fL;
656  double *fluxR = solver->fR;
657 
658  /*
659  precalculate the non-linear interpolation coefficients if required
660  else reuse the weights previously calculated
661  */
662 
663  if (LimFlag) solver->SetInterpLimiterVar(fluxC,u,x,dir,solver,mpi);
664 
665  /* if defined, calculate the modified u-function to be used for upwinding
666  e.g.: used in well-balanced schemes for Euler/Navier-Stokes with gravity
667  otherwise, just copy u to uC */
668  if (solver->UFunction) {
669  uC = solver->uC;
670  solver->UFunction(uC,u,dir,solver,mpi,t);
671  } else uC = u;
672 
673 
674  /* Interpolation -> to calculate the left- and right-biased interface flux and state variables */
675  solver->InterpolateInterfacesHyp(uL ,uC ,u,x, 1,dir,solver,mpi,1);
676  solver->InterpolateInterfacesHyp(uR ,uC ,u,x,-1,dir,solver,mpi,1);
677  solver->InterpolateInterfacesHyp(fluxL,fluxC,u,x, 1,dir,solver,mpi,0);
678  solver->InterpolateInterfacesHyp(fluxR,fluxC,u,x,-1,dir,solver,mpi,0);
679 
680  /* Upwind -> to calculate the final interface flux */
681  if (UpwindFunction) { UpwindFunction (fluxI,fluxL,fluxR,uL,uR,u,dir,solver,t); }
682  else { DefaultUpwinding (fluxI,fluxL,fluxR,NULL,NULL,NULL,dir,solver,t); }
683 
684  return(0);
685 }
686 
687 #endif
688 
689 /*! If no upwinding scheme is specified, this function defines the "upwind" flux as the
690  arithmetic mean of the left- and right-biased fluxes. */
691 int DefaultUpwinding(
692  double *fI, /*!< Computed upwind interface flux */
693  double *fL, /*!< Left-biased reconstructed interface flux */
694  double *fR, /*!< Right-biased reconstructed interface flux */
695  double *uL, /*!< Left-biased reconstructed interface solution */
696  double *uR, /*!< Right-biased reconstructed interface solution */
697  double *u, /*!< Cell-centered solution */
698  int dir, /*!< Spatial dimension */
699  void *s, /*!< Solver object of type #HyPar */
700  double t /*!< Current solution time */
701 )
702 {
703  HyPar *solver = (HyPar*) s;
704  int done;
705 
706  int *dim = solver->dim_local;
707  int ndims = solver->ndims;
708  int nvars = solver->nvars;
709 
710  int bounds_outer[ndims], bounds_inter[ndims];
711  _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
712  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
713 
714  done = 0; int index_outer[ndims], index_inter[ndims];
715  _ArraySetValue_(index_outer,ndims,0);
716  while (!done) {
717  _ArrayCopy1D_(index_outer,index_inter,ndims);
718  for (index_inter[dir] = 0; index_inter[dir] < bounds_inter[dir]; index_inter[dir]++) {
719  int p; _ArrayIndex1D_(ndims,bounds_inter,index_inter,0,p);
720  int v; for (v=0; v<nvars; v++) fI[nvars*p+v] = 0.5 * (fL[nvars*p+v]+fR[nvars*p+v]);
721  }
722  _ArrayIncrementIndex_(ndims,bounds_outer,index_outer,done);
723  }
724 
725  return(0);
726 }