1 /*! @file WENOFifthOrderCalculateWeights_GPU.cu
2 @brief Functions to compute the nonlinear weights for WENO-type schemes
10 #include <cuda_profiler_api.h>
13 #include <basic_gpu.h>
14 #include <arrayfunctions.h>
15 #include <matmult_native.h>
16 #include <mathfunctions.h>
17 #include <interpolation.h>
21 #include <arrayfunctions_gpu.h>
24 static int gpuWENOFifthOrderCalculateWeightsYC(double*,double*,double*,int,void*,void*);
25 static int gpuWENOFifthOrderCalculateWeightsM(double*,double*,double*,int,void*,void*);
27 /*! Compute the nonlinear weights for 5th order WENO-type schemes. This function is a wrapper that
28 calls the appropriate function, depending on the type of WENO weights.
30 extern "C" int gpuWENOFifthOrderCalculateWeights(
31 double *fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
32 double *uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
33 double *x, /*!< Grid coordinates */
34 int dir, /*!< Spatial dimension along which to interpolation */
35 void *s, /*!< Object of type #HyPar containing solver-related variables */
36 void *m /*!< Object of type #MPIVariables containing MPI-related variables */
39 HyPar *solver = (HyPar*) s;
40 WENOParameters *weno = (WENOParameters*) solver->interp;
41 MPIVariables *mpi = (MPIVariables*) m;
43 if (weno->yc) return(gpuWENOFifthOrderCalculateWeightsYC (fC,uC,x,dir,solver,mpi));
44 else if (weno->mapped) return(gpuWENOFifthOrderCalculateWeightsM (fC,uC,x,dir,solver,mpi));
46 printf("ERROR! WENO functions other than yc or mapped have not been implemented on GPUs.\n");
51 #ifdef CUDA_VAR_ORDERDING_AOS
53 /*! Kernel for gpuWENOFifthOrderCalculateWeightsM() */
55 void WENOFifthOrderCalculateWeightsM_kernel(
76 int p = threadIdx.x + (blockDim.x * blockIdx.x);
77 if (p < ngrid_points) {
78 const int max_ndims = 3;
79 const double thirteen_by_twelve = 13.0 / 12.0;
80 const double one_fourth = 1.0 / 4.0;
82 int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
83 int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
84 double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
85 double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
90 ww1RF = w1 + 2*weno_size + offset;
91 ww2RF = w2 + 2*weno_size + offset;
92 ww3RF = w3 + 2*weno_size + offset;
93 ww1LU = w1 + weno_size + offset;
94 ww2LU = w2 + weno_size + offset;
95 ww3LU = w3 + weno_size + offset;
96 ww1RU = w1 + 2*weno_size + weno_size + offset;
97 ww2RU = w2 + 2*weno_size + weno_size + offset;
98 ww3RU = w3 + 2*weno_size + weno_size + offset;
100 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
101 _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
102 _ArrayCopy1D_(indexC,indexI,ndims);
104 indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
105 qm3L = qm1L - 2*stride;
106 qm2L = qm1L - stride;
107 qp1L = qm1L + stride;
108 qp2L = qm1L + 2*stride;
110 indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
111 qm3R = qm1R + 2*stride;
112 qm2R = qm1R + stride;
113 qp1R = qm1R - stride;
114 qp2R = qm1R - 2*stride;
116 /* Defining stencil points */
117 const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
118 m3LF = (fC+qm3L*nvars);
119 m2LF = (fC+qm2L*nvars);
120 m1LF = (fC+qm1L*nvars);
121 p1LF = (fC+qp1L*nvars);
122 p2LF = (fC+qp2L*nvars);
123 const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
124 m3RF = (fC+qm3R*nvars);
125 m2RF = (fC+qm2R*nvars);
126 m1RF = (fC+qm1R*nvars);
127 p1RF = (fC+qp1R*nvars);
128 p2RF = (fC+qp2R*nvars);
129 const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
130 m3LU = (uC+qm3L*nvars);
131 m2LU = (uC+qm2L*nvars);
132 m1LU = (uC+qm1L*nvars);
133 p1LU = (uC+qp1L*nvars);
134 p2LU = (uC+qp2L*nvars);
135 const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
136 m3RU = (uC+qm3R*nvars);
137 m2RU = (uC+qm2R*nvars);
138 m1RU = (uC+qm1R*nvars);
139 p1RU = (uC+qp1R*nvars);
140 p2RU = (uC+qp2R*nvars);
144 if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
145 || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
146 /* Use WENO5 at the physical boundaries */
147 c1 = _WENO_OPTIMAL_WEIGHT_1_;
148 c2 = _WENO_OPTIMAL_WEIGHT_2_;
149 c3 = _WENO_OPTIMAL_WEIGHT_3_;
151 /* CRWENO5 at the interior points */
152 c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
153 c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
154 c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
157 /* WENO5 and HCWENO5 */
158 c1 = _WENO_OPTIMAL_WEIGHT_1_;
159 c2 = _WENO_OPTIMAL_WEIGHT_2_;
160 c3 = _WENO_OPTIMAL_WEIGHT_3_;
163 /* calculate WENO weights */
164 _WENOWeights_v_M_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
165 _WENOWeights_v_M_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
166 _WENOWeights_v_M_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
167 _WENOWeights_v_M_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
174 Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the mapped formulation of Henrick, Aslam & Powers:
176 \omega_k &=& \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = \frac {\tilde{\omega}_k \left( c_k + c_k^2 - 3c_k\tilde{\omega}_k + \tilde{\omega}_k^2\right)} {c_k^2 + \tilde{\omega}_k\left(1-2c_k\right)}, \\
177 \tilde{\omega}_k &=& \frac {\tilde{a}_k} {\sum_{j=1}^3 \tilde{a}_j },\ \tilde{a}_k = \frac {c_k} {\left(\beta_k+\epsilon\right)^p},\ k = 1,2,3,
179 where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
180 (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
182 \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
183 \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
184 \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2
188 + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
189 Thus, it loops over all grid lines along \a dir.
190 + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
191 flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
192 storing the weights is quite memory-intensive.
193 + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
194 current interpolation dimension (\a dir ) are stored.
197 + Henrick, Aslam, Powers, Mapped weighted essentially non-oscillatory schemes: Achieving optimal order near critical
198 points, J. Comput. Phys., 2005. http://dx.doi.org/10.1016/j.jcp.2005.01.023
200 int gpuWENOFifthOrderCalculateWeightsM(
201 double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
202 double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
203 double * __restrict__ x, /*!< Grid coordinates */
204 int dir, /*!< Spatial dimension along which to interpolation */
205 void *s, /*!< Object of type #HyPar containing solver-related variables */
206 void *m /*!< Object of type #MPIVariables containing MPI-related variables */
209 HyPar *solver = (HyPar*) s;
210 WENOParameters *weno = (WENOParameters*) solver->interp;
211 MPIVariables *mpi = (MPIVariables*) m;
213 int ghosts = solver->ghosts;
214 int ndims = solver->ndims;
215 int nvars = solver->nvars;
216 int *dim = solver->dim_local;
217 int *stride= solver->stride_with_ghosts;
219 /* calculate dimension offset */
220 int offset = weno->offset[dir];
221 int bounds_inter[ndims];
222 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
223 int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
224 int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1;
226 int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
227 int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
228 int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
230 #if defined(GPU_STAT)
231 cudaEvent_t startEvent, stopEvent;
232 float milliseconds = 0;
234 checkCuda( cudaEventCreate(&startEvent) );
235 checkCuda( cudaEventCreate(&stopEvent) );
237 int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
238 int fCuC_memory_accessed = 1;
239 for (int d=0; d<ndims; d++) {
240 if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
241 else fCuC_memory_accessed *= dim[d];
243 fCuC_memory_accessed *= nvars*sizeof(double);
245 checkCuda( cudaEventRecord(startEvent, 0) );
248 WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
249 npoints_grid, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
250 is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
251 solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
253 cudaDeviceSynchronize();
255 #if defined(GPU_STAT)
256 checkCuda( cudaEventRecord(stopEvent, 0) );
257 checkCuda( cudaEventSynchronize(stopEvent) );
258 checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
260 printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
261 "WENOFifthOrderCalculateWeightsM", milliseconds*1e-3, dir,
262 (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds));
264 checkCuda( cudaEventDestroy(startEvent) );
265 checkCuda( cudaEventDestroy(stopEvent) );
271 /*! Kernel for gpuWENOFifthOrderCalculateWeightsYC() */
273 void WENOFifthOrderCalculateWeightsYC_kernel(
294 int p = threadIdx.x + (blockDim.x * blockIdx.x);
295 if (p < ngrid_points) {
296 const int max_ndims = 3;
297 const double thirteen_by_twelve = 13.0 / 12.0;
298 const double one_fourth = 1.0 / 4.0;
300 int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
301 int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
302 double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
303 double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
308 ww1RF = w1 + 2*weno_size + offset;
309 ww2RF = w2 + 2*weno_size + offset;
310 ww3RF = w3 + 2*weno_size + offset;
311 ww1LU = w1 + weno_size + offset;
312 ww2LU = w2 + weno_size + offset;
313 ww3LU = w3 + weno_size + offset;
314 ww1RU = w1 + 2*weno_size + weno_size + offset;
315 ww2RU = w2 + 2*weno_size + weno_size + offset;
316 ww3RU = w3 + 2*weno_size + weno_size + offset;
318 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
319 _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
320 _ArrayCopy1D_(indexC,indexI,ndims);
322 indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
323 qm3L = qm1L - 2*stride;
324 qm2L = qm1L - stride;
325 qp1L = qm1L + stride;
326 qp2L = qm1L + 2*stride;
328 indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
329 qm3R = qm1R + 2*stride;
330 qm2R = qm1R + stride;
331 qp1R = qm1R - stride;
332 qp2R = qm1R - 2*stride;
334 /* Defining stencil points */
335 const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
336 m3LF = (fC+qm3L*nvars);
337 m2LF = (fC+qm2L*nvars);
338 m1LF = (fC+qm1L*nvars);
339 p1LF = (fC+qp1L*nvars);
340 p2LF = (fC+qp2L*nvars);
341 const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
342 m3RF = (fC+qm3R*nvars);
343 m2RF = (fC+qm2R*nvars);
344 m1RF = (fC+qm1R*nvars);
345 p1RF = (fC+qp1R*nvars);
346 p2RF = (fC+qp2R*nvars);
347 const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
348 m3LU = (uC+qm3L*nvars);
349 m2LU = (uC+qm2L*nvars);
350 m1LU = (uC+qm1L*nvars);
351 p1LU = (uC+qp1L*nvars);
352 p2LU = (uC+qp2L*nvars);
353 const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
354 m3RU = (uC+qm3R*nvars);
355 m2RU = (uC+qm2R*nvars);
356 m1RU = (uC+qm1R*nvars);
357 p1RU = (uC+qp1R*nvars);
358 p2RU = (uC+qp2R*nvars);
362 if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
363 || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
364 /* Use WENO5 at the physical boundaries */
365 c1 = _WENO_OPTIMAL_WEIGHT_1_;
366 c2 = _WENO_OPTIMAL_WEIGHT_2_;
367 c3 = _WENO_OPTIMAL_WEIGHT_3_;
369 /* CRWENO5 at the interior points */
370 c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
371 c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
372 c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
375 /* WENO5 and HCWENO5 */
376 c1 = _WENO_OPTIMAL_WEIGHT_1_;
377 c2 = _WENO_OPTIMAL_WEIGHT_2_;
378 c3 = _WENO_OPTIMAL_WEIGHT_3_;
381 /* calculate WENO weights */
382 _WENOWeights_v_YC_((ww1LF+p*nvars),(ww2LF+p*nvars),(ww3LF+p*nvars),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,nvars);
383 _WENOWeights_v_YC_((ww1RF+p*nvars),(ww2RF+p*nvars),(ww3RF+p*nvars),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,nvars);
384 _WENOWeights_v_YC_((ww1LU+p*nvars),(ww2LU+p*nvars),(ww3LU+p*nvars),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,nvars);
385 _WENOWeights_v_YC_((ww1RU+p*nvars),(ww2RU+p*nvars),(ww3RU+p*nvars),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,nvars);
392 Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the ESWENO formulation of
393 Yamaleev & Carpenter. Note that only the formulation for the nonlinear weights is adopted and implemented here, not
394 the ESWENO scheme as a whole.
396 \omega_k = \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = c_k \left( 1 + \frac{\tau_5}{\beta_k+\epsilon} \right)^p,\ k = 1,2,3,
398 where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
399 (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
401 \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
402 \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
403 \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2,
405 and \f$\tau_5 = \left( f_{j-2}-4f_{j-1}+6f_j-4f_{j+1}+f_{j+2} \right)^2\f$.
408 + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
409 Thus, it loops over all grid lines along \a dir.
410 + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
411 flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
412 storing the weights is quite memory-intensive.
413 + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
414 current interpolation dimension (\a dir ) are stored.
417 + Yamaleev, Carpenter, A systematic methodology for constructing high-order energy stable WENO schemes,
418 J. Comput. Phys., 2009. http://dx.doi.org/10.1016/j.jcp.2009.03.002
420 int gpuWENOFifthOrderCalculateWeightsYC(
421 double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
422 double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
423 double * __restrict__ x, /*!< Grid coordinates */
424 int dir, /*!< Spatial dimension along which to interpolation */
425 void *s, /*!< Object of type #HyPar containing solver-related variables */
426 void *m /*!< Object of type #MPIVariables containing MPI-related variables */
429 HyPar *solver = (HyPar*) s;
430 WENOParameters *weno = (WENOParameters*) solver->interp;
431 MPIVariables *mpi = (MPIVariables*) m;
433 int ghosts = solver->ghosts;
434 int ndims = solver->ndims;
435 int nvars = solver->nvars;
436 int *dim = solver->dim_local;
437 int *stride= solver->stride_with_ghosts;
439 /* calculate dimension offset */
440 int offset = weno->offset[dir];
441 int bounds_inter[ndims];
442 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
443 int ngrid_points; _ArrayProduct1D_(bounds_inter,ndims,ngrid_points);
444 //int nblocks = (ngrid_points - 1) / GPU_THREADS_PER_BLOCK + 1;
445 int nblocks = (ngrid_points - 1) / 256 + 1;
447 int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
448 int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
449 int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
451 cudaEvent_t startEvent, stopEvent;
452 float milliseconds = 0;
454 checkCuda( cudaEventCreate(&startEvent) );
455 checkCuda( cudaEventCreate(&stopEvent) );
457 //cudaProfilerStart();
458 checkCuda( cudaEventRecord(startEvent, 0) );
459 WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, 256>>>(
460 ngrid_points, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
461 is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
462 solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
464 checkCuda( cudaEventRecord(stopEvent, 0) );
465 checkCuda( cudaEventSynchronize(stopEvent) );
466 checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
467 //cudaProfilerStop();
469 printf("WENOFifthOrderCalculateWeightsYC_GPU:\n");
470 printf(" GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f weno->size = %d weno->offset[1] = %d\n",
471 milliseconds*1e-3, dir,
472 (1e-6*((12*weno->offset[1]+2*(nvars*(dim[0]+2*ghosts)*dim[1]))*sizeof(double)))/milliseconds,
473 weno->size, weno->offset[1]);
475 checkCuda( cudaEventDestroy(startEvent) );
476 checkCuda( cudaEventDestroy(stopEvent) );
483 /*! Kernel for gpuWENOFifthOrderCalculateWeightsM() */
485 void WENOFifthOrderCalculateWeightsM_kernel(
487 int npoints_local_wghosts,
507 int p = threadIdx.x + (blockDim.x * blockIdx.x);
508 if (p < npoints_grid) {
509 const int max_ndims = 3;
510 const double thirteen_by_twelve = 13.0 / 12.0;
511 const double one_fourth = 1.0 / 4.0;
513 int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
514 int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
515 double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
516 double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
521 ww1RF = w1 + 2*weno_size + offset;
522 ww2RF = w2 + 2*weno_size + offset;
523 ww3RF = w3 + 2*weno_size + offset;
524 ww1LU = w1 + weno_size + offset;
525 ww2LU = w2 + weno_size + offset;
526 ww3LU = w3 + weno_size + offset;
527 ww1RU = w1 + 2*weno_size + weno_size + offset;
528 ww2RU = w2 + 2*weno_size + weno_size + offset;
529 ww3RU = w3 + 2*weno_size + weno_size + offset;
531 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
532 _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
533 _ArrayCopy1D_(indexC,indexI,ndims);
535 indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
536 qm3L = qm1L - 2*stride;
537 qm2L = qm1L - stride;
538 qp1L = qm1L + stride;
539 qp2L = qm1L + 2*stride;
541 indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
542 qm3R = qm1R + 2*stride;
543 qm2R = qm1R + stride;
544 qp1R = qm1R - stride;
545 qp2R = qm1R - 2*stride;
547 for (int j=0; j<nvars; j++) {
548 /* Defining stencil points */
549 const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
555 const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
561 const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
567 const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
574 qm3L += npoints_local_wghosts;
575 qm2L += npoints_local_wghosts;
576 qm1L += npoints_local_wghosts;
577 qp1L += npoints_local_wghosts;
578 qp2L += npoints_local_wghosts;
580 qm3R += npoints_local_wghosts;
581 qm2R += npoints_local_wghosts;
582 qm1R += npoints_local_wghosts;
583 qp1R += npoints_local_wghosts;
584 qp2R += npoints_local_wghosts;
588 if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
589 || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
590 /* Use WENO5 at the physical boundaries */
591 c1 = _WENO_OPTIMAL_WEIGHT_1_;
592 c2 = _WENO_OPTIMAL_WEIGHT_2_;
593 c3 = _WENO_OPTIMAL_WEIGHT_3_;
595 /* CRWENO5 at the interior points */
596 c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
597 c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
598 c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
601 /* WENO5 and HCWENO5 */
602 c1 = _WENO_OPTIMAL_WEIGHT_1_;
603 c2 = _WENO_OPTIMAL_WEIGHT_2_;
604 c3 = _WENO_OPTIMAL_WEIGHT_3_;
607 /* calculate WENO weights */
608 _WENOWeights_v_M_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
609 _WENOWeights_v_M_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
610 _WENOWeights_v_M_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
611 _WENOWeights_v_M_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
621 Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the mapped formulation of Henrick, Aslam & Powers:
623 \omega_k &=& \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = \frac {\tilde{\omega}_k \left( c_k + c_k^2 - 3c_k\tilde{\omega}_k + \tilde{\omega}_k^2\right)} {c_k^2 + \tilde{\omega}_k\left(1-2c_k\right)}, \\
624 \tilde{\omega}_k &=& \frac {\tilde{a}_k} {\sum_{j=1}^3 \tilde{a}_j },\ \tilde{a}_k = \frac {c_k} {\left(\beta_k+\epsilon\right)^p},\ k = 1,2,3,
626 where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
627 (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
629 \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
630 \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
631 \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2
635 + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
636 Thus, it loops over all grid lines along \a dir.
637 + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
638 flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
639 storing the weights is quite memory-intensive.
640 + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
641 current interpolation dimension (\a dir ) are stored.
644 + Henrick, Aslam, Powers, Mapped weighted essentially non-oscillatory schemes: Achieving optimal order near critical
645 points, J. Comput. Phys., 2005. http://dx.doi.org/10.1016/j.jcp.2005.01.023
647 int gpuWENOFifthOrderCalculateWeightsM(
648 double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
649 double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
650 double * __restrict__ x, /*!< Grid coordinates */
651 int dir, /*!< Spatial dimension along which to interpolation */
652 void *s, /*!< Object of type #HyPar containing solver-related variables */
653 void *m /*!< Object of type #MPIVariables containing MPI-related variables */
656 HyPar *solver = (HyPar*) s;
657 WENOParameters *weno = (WENOParameters*) solver->interp;
658 MPIVariables *mpi = (MPIVariables*) m;
660 int ghosts = solver->ghosts;
661 int ndims = solver->ndims;
662 int nvars = solver->nvars;
663 int *dim = solver->dim_local;
664 int *stride= solver->stride_with_ghosts;
666 /* calculate dimension offset */
667 int offset = weno->offset[dir];
668 int bounds_inter[ndims];
669 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
670 int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
671 int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1;
673 int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
674 int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
675 int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
677 #if defined(GPU_STAT)
678 cudaEvent_t startEvent, stopEvent;
679 float milliseconds = 0;
681 checkCuda( cudaEventCreate(&startEvent) );
682 checkCuda( cudaEventCreate(&stopEvent) );
684 int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
685 int fCuC_memory_accessed = 1;
686 for (int d=0; d<ndims; d++) {
687 if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
688 else fCuC_memory_accessed *= dim[d];
690 fCuC_memory_accessed *= nvars*sizeof(double);
692 checkCuda( cudaEventRecord(startEvent, 0) );
695 WENOFifthOrderCalculateWeightsM_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
696 npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
697 is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
698 solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
700 cudaDeviceSynchronize();
702 #if defined(GPU_STAT)
703 checkCuda( cudaEventRecord(stopEvent, 0) );
704 checkCuda( cudaEventSynchronize(stopEvent) );
705 checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
707 printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
708 "WENOFifthOrderCalculateWeightsM2", milliseconds*1e-3, dir,
709 (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),
712 checkCuda( cudaEventDestroy(startEvent) );
713 checkCuda( cudaEventDestroy(stopEvent) );
719 /*! Kernel for gpuWENOFifthOrderCalculateWeightsYC() */
721 void WENOFifthOrderCalculateWeightsYC_kernel(
723 int npoints_local_wghosts,
735 const int * __restrict__ dim,
736 const double * __restrict__ fC,
737 const double * __restrict__ uC,
738 double * __restrict__ w1,
739 double * __restrict__ w2,
740 double * __restrict__ w3
743 int p = threadIdx.x + (blockDim.x * blockIdx.x);
744 if (p < npoints_grid) {
745 const int max_ndims = 3;
746 const double thirteen_by_twelve = 13.0 / 12.0;
747 const double one_fourth = 1.0 / 4.0;
749 int bounds_inter[max_ndims], indexC[max_ndims], indexI[max_ndims];
750 int qm1L, qm2L, qm3L, qp1L, qp2L, qm1R, qm2R, qm3R, qp1R, qp2R;
751 double *ww1LF, *ww2LF, *ww3LF, *ww1RF, *ww2RF, *ww3RF;
752 double *ww1LU, *ww2LU, *ww3LU, *ww1RU, *ww2RU, *ww3RU;
757 ww1RF = w1 + 2*weno_size + offset;
758 ww2RF = w2 + 2*weno_size + offset;
759 ww3RF = w3 + 2*weno_size + offset;
760 ww1LU = w1 + weno_size + offset;
761 ww2LU = w2 + weno_size + offset;
762 ww3LU = w3 + weno_size + offset;
763 ww1RU = w1 + 2*weno_size + weno_size + offset;
764 ww2RU = w2 + 2*weno_size + weno_size + offset;
765 ww3RU = w3 + 2*weno_size + weno_size + offset;
767 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
768 _ArrayIndexnD_(ndims,p,bounds_inter,indexC,0);
769 _ArrayCopy1D_(indexC,indexI,ndims);
771 indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1L);
772 qm3L = qm1L - 2*stride;
773 qm2L = qm1L - stride;
774 qp1L = qm1L + stride;
775 qp2L = qm1L + 2*stride;
777 indexC[dir] = indexI[dir] ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1R);
778 qm3R = qm1R + 2*stride;
779 qm2R = qm1R + stride;
780 qp1R = qm1R - stride;
781 qp2R = qm1R - 2*stride;
783 for (int j=0; j <nvars; j++) {
784 /* Defining stencil points */
785 const double *m3LF, *m2LF, *m1LF, *p1LF, *p2LF;
791 const double *m3RF, *m2RF, *m1RF, *p1RF, *p2RF;
797 const double *m3LU, *m2LU, *m1LU, *p1LU, *p2LU;
803 const double *m3RU, *m2RU, *m1RU, *p1RU, *p2RU;
810 qm3L += npoints_local_wghosts;
811 qm2L += npoints_local_wghosts;
812 qm1L += npoints_local_wghosts;
813 qp1L += npoints_local_wghosts;
814 qp2L += npoints_local_wghosts;
816 qm3R += npoints_local_wghosts;
817 qm2R += npoints_local_wghosts;
818 qm1R += npoints_local_wghosts;
819 qp1R += npoints_local_wghosts;
820 qp2R += npoints_local_wghosts;
824 if ( (is_mpi_ip_zero && (indexI[dir] == 0 ))
825 || (is_mpi_ip_proc && (indexI[dir] == dim[dir])) ) {
826 /* Use WENO5 at the physical boundaries */
827 c1 = _WENO_OPTIMAL_WEIGHT_1_;
828 c2 = _WENO_OPTIMAL_WEIGHT_2_;
829 c3 = _WENO_OPTIMAL_WEIGHT_3_;
831 /* CRWENO5 at the interior points */
832 c1 = _CRWENO_OPTIMAL_WEIGHT_1_;
833 c2 = _CRWENO_OPTIMAL_WEIGHT_2_;
834 c3 = _CRWENO_OPTIMAL_WEIGHT_3_;
837 /* WENO5 and HCWENO5 */
838 c1 = _WENO_OPTIMAL_WEIGHT_1_;
839 c2 = _WENO_OPTIMAL_WEIGHT_2_;
840 c3 = _WENO_OPTIMAL_WEIGHT_3_;
843 /* calculate WENO weights */
844 _WENOWeights_v_YC_Scalar_((ww1LF+p),(ww2LF+p),(ww3LF+p),c1,c2,c3,m3LF,m2LF,m1LF,p1LF,p2LF,weno_eps,0);
845 _WENOWeights_v_YC_Scalar_((ww1RF+p),(ww2RF+p),(ww3RF+p),c1,c2,c3,m3RF,m2RF,m1RF,p1RF,p2RF,weno_eps,0);
846 _WENOWeights_v_YC_Scalar_((ww1LU+p),(ww2LU+p),(ww3LU+p),c1,c2,c3,m3LU,m2LU,m1LU,p1LU,p2LU,weno_eps,0);
847 _WENOWeights_v_YC_Scalar_((ww1RU+p),(ww2RU+p),(ww3RU+p),c1,c2,c3,m3RU,m2RU,m1RU,p1RU,p2RU,weno_eps,0);
856 Computes the nonlinear weights for the 5th order component-wise WENO-type schemes using the ESWENO formulation of
857 Yamaleev & Carpenter. Note that only the formulation for the nonlinear weights is adopted and implemented here, not
858 the ESWENO scheme as a whole.
860 \omega_k = \frac {a_k} {\sum_{j=1}^3 a_j },\ a_k = c_k \left( 1 + \frac{\tau_5}{\beta_k+\epsilon} \right)^p,\ k = 1,2,3,
862 where \f$c_k\f$ are the optimal weights, \f$p\f$ is hardcoded to \f$2\f$, and \f$\epsilon\f$ is an input parameter
863 (#WENOParameters::eps) (typically \f$10^{-6}\f$). The smoothness indicators \f$\beta_k\f$ are given by:
865 \beta_1 &=& \frac{13}{12} \left(f_{j-2}-2f_{j-1}+f_j\right)^2 + \frac{1}{4}\left(f_{j-2}-4f_{j-1}+3f_j\right)^2 \\
866 \beta_2 &=& \frac{13}{12} \left(f_{j-1}-2f_j+f_{j+1}\right)^2 + \frac{1}{4}\left(f_{j-1}-f_{j+1}\right)^2 \\
867 \beta_3 &=& \frac{13}{12} \left(f_j-2f_{j+1}+f_{j+2}\right)^2 + \frac{1}{4}\left(3f_j-4f_{j+1}+f_{j+2}\right)^2,
869 and \f$\tau_5 = \left( f_{j-2}-4f_{j-1}+6f_j-4f_{j+1}+f_{j+2} \right)^2\f$.
872 + This function computes the weights for the entire grid (for interpolation along a given spatial dimension \a dir ).
873 Thus, it loops over all grid lines along \a dir.
874 + The weights are computed for all components, for both the left- and right-biased interpolations, and for both the
875 flux function \f${\bf f}\left({\bf u}\right)\f$ and the solution \f$\bf u\f$. Thus, this approach of computing and
876 storing the weights is quite memory-intensive.
877 + The variable \a offset denotes the offset in the complete array from where the weights for interpolation along the
878 current interpolation dimension (\a dir ) are stored.
881 + Yamaleev, Carpenter, A systematic methodology for constructing high-order energy stable WENO schemes,
882 J. Comput. Phys., 2009. http://dx.doi.org/10.1016/j.jcp.2009.03.002
884 int gpuWENOFifthOrderCalculateWeightsYC(
885 double * __restrict__ fC, /*!< Array of cell-centered values of the function \f${\bf f}\left({\bf u}\right)\f$ */
886 double * __restrict__ uC, /*!< Array of cell-centered values of the solution \f${\bf u}\f$ */
887 double * __restrict__ x, /*!< Grid coordinates */
888 int dir, /*!< Spatial dimension along which to interpolation */
889 void *s, /*!< Object of type #HyPar containing solver-related variables */
890 void *m /*!< Object of type #MPIVariables containing MPI-related variables */
893 HyPar *solver = (HyPar*) s;
894 WENOParameters *weno = (WENOParameters*) solver->interp;
895 MPIVariables *mpi = (MPIVariables*) m;
897 int ghosts = solver->ghosts;
898 int ndims = solver->ndims;
899 int nvars = solver->nvars;
900 int *dim = solver->dim_local;
901 int *stride= solver->stride_with_ghosts;
903 /* calculate dimension offset */
904 int offset = weno->offset[dir];
905 int bounds_inter[ndims];
906 _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
907 int npoints_grid; _ArrayProduct1D_(bounds_inter,ndims,npoints_grid);
908 int nblocks = (npoints_grid-1) / GPU_THREADS_PER_BLOCK + 1;
910 int is_crweno = (strcmp(solver->spatial_scheme_hyp,_FIFTH_ORDER_CRWENO_) == 0) ? 1 : 0;
911 int is_mpi_ip_zero = (mpi->ip[dir] == 0) ? 1 : 0;
912 int is_mpi_ip_proc = (mpi->ip[dir] == mpi->iproc[dir]-1) ? 1 : 0;
914 #if defined(GPU_STAT)
915 cudaEvent_t startEvent, stopEvent;
916 float milliseconds = 0;
918 checkCuda( cudaEventCreate(&startEvent) );
919 checkCuda( cudaEventCreate(&stopEvent) );
921 int weno_memory_accessed = 12*npoints_grid*nvars*sizeof(double);
922 int fCuC_memory_accessed = 1;
923 for (int d=0; d<ndims; d++) {
924 if (d == dir) fCuC_memory_accessed *= (dim[d]+2*ghosts);
925 else fCuC_memory_accessed *= dim[d];
927 fCuC_memory_accessed *= nvars*sizeof(double);
929 checkCuda( cudaEventRecord(startEvent, 0) );
932 WENOFifthOrderCalculateWeightsYC_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
933 npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir],
934 is_crweno, is_mpi_ip_zero, is_mpi_ip_proc, weno->eps,
935 solver->gpu_dim_local, fC, uC, weno->w1, weno->w2, weno->w3
937 cudaDeviceSynchronize();
939 #if defined(GPU_STAT)
940 checkCuda( cudaEventRecord(stopEvent, 0) );
941 checkCuda( cudaEventSynchronize(stopEvent) );
942 checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
944 printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f stride = %d\n",
945 "WENOFifthOrderCalculateWeightsYC2", milliseconds*1e-3, dir,
946 (1e-6*(weno_memory_accessed+fCuC_memory_accessed)/milliseconds),
949 checkCuda( cudaEventDestroy(startEvent) );
950 checkCuda( cudaEventDestroy(stopEvent) );