#undef  _MINIMUM_GHOSTS_
#define _MINIMUM_GHOSTS_ 3

#ifdef CUDA_VAR_ORDERDING_AOS
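/* [Editorial note, hedged] This macro appears to select the array-of-structures (AoS)
   variant of the routine: the kernel below addresses component v of interface p as
   p*nvars + v, whereas the second kernel further down strides by
   npoints_local_wghosts per component (structure-of-arrays layout). */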
  int p = threadIdx.x + (blockDim.x * blockIdx.x);
  if (p < ngrid_points) {
    int qm1, qm2, qm3, qp1, qp2;
    const double *ww1, *ww2, *ww3;

    ww1 = w1 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
    ww2 = w2 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
    ww3 = w3 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
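    /* [Sketch, layout assumption] w1/w2/w3 appear to hold the precomputed WENO
       weights in four contiguous blocks of weno_size entries each, one block per
       (upwind sign, flux/solution flag) combination; the ternaries above simply
       pick the starting block, equivalent to:

         int block = (upw < 0 ? 2 : 0) + (uflag ? 1 : 0);
         ww1 = w1 + block*weno_size + offset;
    */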
    _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;

    indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);

    indexC[dir] = indexI[dir];   _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
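    /* [Hedged gap note] The two index computations above presumably sit in the
       upwind (upw > 0) and downwind branches respectively; by analogy with the
       second kernel below, the elided lines would set the remaining stencil
       indices by stepping along dir, e.g. for the upwind branch:

         qm3 = qm1 - 2*stride;  qm2 = qm1 - stride;
         qp1 = qm1 + stride;    qp2 = qm1 + 2*stride;
    */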
    const double *fm3, *fm2, *fm1, *fp1, *fp2;

    const double *cur_w1, *cur_w2, *cur_w3;
    cur_w1 = (ww1 + p*nvars);
    cur_w2 = (ww2 + p*nvars);
    cur_w3 = (ww3 + p*nvars);
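    /* [Hedged sketch of the elided reconstruction step] Component-wise 5th-order
       WENO combines three 3rd-order candidate interpolants with the precomputed
       nonlinear weights; the standard candidates, evaluated per component v,
       would look like:

         double f1 = (2*fm3[v] - 7*fm2[v] + 11*fm1[v]) / 6.0;
         double f2 = ( -fm2[v] + 5*fm1[v] +  2*fp1[v]) / 6.0;
         double f3 = (2*fm1[v] + 5*fp1[v] -    fp2[v]) / 6.0;
         fI[p*nvars+v] = cur_w1[v]*f1 + cur_w2[v]*f2 + cur_w3[v]*f3;
    */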
  int ghosts = solver->ghosts;
  int ndims  = solver->ndims;
  int nvars  = solver->nvars;

  int offset = weno->offset[dir];
  int bounds_inter[ndims];
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
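  /* [Assumption] npoints_grid and the launch configuration used below are not in
     this fragment; they are presumably derived from the interface bounds, along
     the lines of:

       int npoints_grid; _ArrayProduct1D_(bounds_inter, ndims, npoints_grid);
       int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
  */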
#if defined(GPU_STAT)
  cudaEvent_t start, stop;
  float milliseconds = 0;
  checkCuda( cudaEventCreate(&start) );
  checkCuda( cudaEventCreate(&stop) );

  int weno_memory_accessed = 3*npoints_grid*nvars*sizeof(double);
  int fI_memory_accessed   = npoints_grid*nvars*sizeof(double);
  int fC_memory_accessed   = 1;
  for (int d=0; d<ndims; d++) {
    if (d == dir) fC_memory_accessed *= (dim[d]+2*ghosts);
    else          fC_memory_accessed *= dim[d];
  }
  fC_memory_accessed *= nvars*sizeof(double);

  checkCuda( cudaEventRecord(start) );
  Interp1PrimFifthOrderWENO_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, ndims, dir, ghosts, nvars, weno->size, offset, stride[dir], upw, uflag,

  cudaDeviceSynchronize();
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stop) );
  checkCuda( cudaEventSynchronize(stop) );
  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );

  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
         "Interp1PrimFifthOrderWENO", milliseconds*1e-3, dir,
         (1e-6*(weno_memory_accessed+fI_memory_accessed+fC_memory_accessed))/milliseconds);

  checkCuda( cudaEventDestroy(start) );
  checkCuda( cudaEventDestroy(stop) );
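  /* [Note] The bandwidth figure printed above is bytes moved divided by elapsed
     time: 1e-6 * bytes / milliseconds equals GB/s. For example, 4e9 bytes in
     20 ms reports 1e-6 * 4e9 / 20 = 200 GB/s. */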
  int npoints_local_wghosts,

  int p = threadIdx.x + (blockDim.x * blockIdx.x);
  if (p < npoints_grid) {
    int qm1, qm2, qm3, qp1, qp2;
    const double *ww1, *ww2, *ww3;

    ww1 = w1 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
    ww2 = w2 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;
    ww3 = w3 + (upw < 0 ? 2*weno_size : 0) + (uflag ? weno_size : 0) + offset;

    _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
    indexC[dir] = indexI[dir]-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
    qm3 = qm1 - 2*stride;
    qp2 = qm1 + 2*stride;

    indexC[dir] = indexI[dir];   _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
    qm3 = qm1 + 2*stride;
    qp2 = qm1 - 2*stride;
    const double *fm3, *fm2, *fm1, *fp1, *fp2;

    for (int j = 0; j < nvars; j++) {
      qm3 += npoints_local_wghosts;
      qm2 += npoints_local_wghosts;
      qm1 += npoints_local_wghosts;
      qp1 += npoints_local_wghosts;
      qp2 += npoints_local_wghosts;

      fI[l] = (ww1+l)[0]*f1 + (ww2+l)[0]*f2 + (ww3+l)[0]*f3;
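      /* [Sketch, layout assumption] This variant stores the data structure-of-arrays:
         component j of grid index q lives at q + j*npoints_local_wghosts, which is why
         the five stencil indices advance by npoints_local_wghosts per component,
         replacing the p*nvars + v addressing of the AoS kernel above. A direct read
         in this layout would look like:

           // hypothetical: component j of the cell at grid index q
           double val = fC[q + j*npoints_local_wghosts];
      */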
  int ghosts = solver->ghosts;
  int ndims  = solver->ndims;
  int nvars  = solver->nvars;

  int offset = weno->offset[dir];
  int bounds_inter[ndims];
  _ArrayCopy1D_(dim,bounds_inter,ndims); bounds_inter[dir] += 1;
#if defined(GPU_STAT)
  cudaEvent_t start, stop;
  float milliseconds = 0;
  checkCuda( cudaEventCreate(&start) );
  checkCuda( cudaEventCreate(&stop) );

  int weno_memory_accessed = 3*npoints_grid*nvars*sizeof(double);
  int fI_memory_accessed   = npoints_grid*nvars*sizeof(double);
  int fC_memory_accessed   = 1;
  for (int d=0; d<ndims; d++) {
    if (d == dir) fC_memory_accessed *= (dim[d]+2*ghosts);
    else          fC_memory_accessed *= dim[d];
  }
  fC_memory_accessed *= nvars*sizeof(double);

  checkCuda( cudaEventRecord(start) );
  Interp1PrimFifthOrderWENO_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, solver->npoints_local_wghosts, ndims, dir, ghosts, nvars,
    weno->size, offset, stride[dir], upw, uflag,
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stop) );
  checkCuda( cudaEventSynchronize(stop) );
#endif

  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );

  printf("%-50s GPU time (secs) = %.6f dir = %d bandwidth (GB/s) = %6.2f\n",
         "Interp1PrimFifthOrderWENO2", milliseconds*1e-3, dir,
         (1e-6*(weno_memory_accessed+fI_memory_accessed+fC_memory_accessed))/milliseconds);

  checkCuda( cudaEventDestroy(start) );
  checkCuda( cudaEventDestroy(stop) );
Referenced functions and kernels:
  int gpuInterp1PrimFifthOrderWENO(double *, double *, double *, double *, int, int, void *, void *, int)
      5th order WENO reconstruction (component-wise) on a uniform grid
  __global__ void Interp1PrimFifthOrderWENO_kernel(int npoints_grid, int npoints_local_wghosts, int ndims, int dir, int ghosts, int nvars, int weno_size, int offset, int stride, int upw, int uflag, const int *dim, const double *fC, const double *w1, const double *w2, const double *w3, double *fI)

Referenced macros and variables:
  #define GPU_THREADS_PER_BLOCK
  #define _ArrayIndexnD_(N, index, imax, i, ghost)
  #define _ArrayIndex1D_(N, imax, i, ghost, index)
  #define _ArrayCopy1D_(x, y, size)
  #define _ArrayMultiply3Add1D_(x, a, b, c, d, e, f, size)
  #define _ArrayAXBYCZ_(w, a, x, b, y, c, z, size)
  #define _ArrayProduct1D_(x, size, p)
  int npoints_local_wghosts

Referenced headers and structures:
  Definitions for the functions computing the interpolated value of the primitive at the cell interface...
  MPI related function definitions.
  Some basic definitions and macros.
  Contains function definitions for common mathematical functions.
  Contains function definitions for common array operations on GPU.
  Contains structure definition for hypar.
  Structure containing all solver-specific variables and functions.
  Structure of variables/parameters needed by the WENO-type scheme.