/* Forward declarations of the static helper functions defined in this file */
static int ReconstructHyperbolic (double*, double*, double*, double*,
                                  int, void*, void*, double, int,
                                  int (*)(double*, double*, double*, double*,
                                          double*, double*, int, void*, double));
static int DefaultUpwinding      (double*, double*, double*, double*,
                                  double*, double*, int, void*, double);
/* Block-wide reduction of the boundary interface flux: shared-memory tree reduction, then warp shuffles. */
template <int block_size>
__global__
void StageBoundaryIntegral_kernel(
  int n, int nvars, int sign,
  const double * __restrict__ FluxI,
  double       * __restrict__ StageBoundaryIntegral )
{
  extern __shared__ double smem[];

  int tid       = threadIdx.x;
  int grid_size = block_size * gridDim.x;
  int i         = blockIdx.x * block_size + threadIdx.x;
  int j;
  double tid_sum[GPU_MAX_NVARS] = { 0 }; /* per-thread partial sums; GPU_MAX_NVARS: assumed compile-time bound on nvars (declaration not in this listing) */

  /* grid-stride accumulation into per-thread sums */
  while (i < n) {
    for (j=0; j<nvars; j++) tid_sum[j] += FluxI[i*nvars+j];
    i += grid_size;
  }
  for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j];
  __syncthreads();

  /* shared-memory tree reduction down to a single warp */
  if (block_size >= 512 && tid < 256) {
    for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j] = tid_sum[j] + smem[(tid+256)*nvars+j];
  }
  __syncthreads();
  if (block_size >= 256 && tid < 128) {
    for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j] = tid_sum[j] + smem[(tid+128)*nvars+j];
  }
  __syncthreads();
  if (block_size >= 128 && tid < 64) {
    for (j=0; j<nvars; j++) smem[tid*nvars+j] = tid_sum[j] = tid_sum[j] + smem[(tid+64)*nvars+j];
  }
  __syncthreads();

  /* final warp finishes with register shuffles */
  if (tid < 32) {
    if (block_size >= 64) {
      for (j=0; j<nvars; j++) tid_sum[j] += smem[(tid+32)*nvars+j];
    }
    for (int offset=16; offset>0; offset/=2) {
      for (j=0; j<nvars; j++) tid_sum[j] += __shfl_down_sync(0xffffffff, tid_sum[j], offset);
    }
  }

  if (tid == 0) {
    for (j=0; j<nvars; j++) StageBoundaryIntegral[j] = (sign == 1) ? tid_sum[j] : -tid_sum[j];
  }
  return;
}
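/* For reference, a serial sketch of what the kernel above computes (illustrative
   only; the function name is mine, not HyPar code). FluxI holds n boundary points
   with nvars interleaved components, as in the kernel. */
static void stage_boundary_integral_reference(int n, int nvars, int sign,
                                              const double *FluxI,
                                              double *StageBoundaryIntegral)
{
  for (int j = 0; j < nvars; j++) StageBoundaryIntegral[j] = 0.0;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < nvars; j++) {
      StageBoundaryIntegral[j] += (sign == 1) ? FluxI[i*nvars+j] : -FluxI[i*nvars+j];
    }
  }
}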
#ifdef CUDA_VAR_ORDERDING_AOS

/* 3D kernel, array-of-structures (AoS) ordering: add the flux difference along
   direction d to hyp at every grid point, and save the interface flux on the two
   physical boundaries normal to d for the stage boundary integral. */
__global__
void HyperbolicFunction_dim3_kernel(
  int npoints_grid, int d, int nvars, int ghosts, int offset,
  const int    * __restrict__ dim,
  const double * __restrict__ FluxI,
  const double * __restrict__ dxinv,
  double * __restrict__ hyp,
  double * __restrict__ FluxI_p1,
  double * __restrict__ FluxI_p2 )
{
  int tx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (tx < npoints_grid) {
    int p, p1, p2, b, v;
    int dim_interface[3];
    int index[3], index1[3], index2[3];

    /* this point's grid index, and the interface indices on its two faces along d */
    _ArrayIndexnD_(3, tx, dim, index, 0);
    _ArrayCopy1D_(dim, dim_interface, 3);
    dim_interface[d] += 1;
    _ArrayCopy1D_(index, index1, 3);
    _ArrayCopy1D_(index, index2, 3); index2[d]++;
    _ArrayIndex1D_(3, dim, index, ghosts, p);
    _ArrayIndex1D_(3, dim_interface, index1, 0, p1);
    _ArrayIndex1D_(3, dim_interface, index2, 0, p2);

    for (v=0; v<nvars; v++) hyp[nvars*p+v] += dxinv[offset+ghosts+index[d]]
                                              * (FluxI[nvars*p2+v]-FluxI[nvars*p1+v]);
    if (index[d] == 0 || index[d] == dim[d]-1) {
      /* linear index of this point within the boundary slice normal to d */
      if      (d == 0) b = index[1] + index[2]*dim[1];
      else if (d == 1) b = index[0] + index[2]*dim[0];
      else             b = index[0] + index[1]*dim[0];
      if (index[d] == 0)        { for (v=0; v<nvars; v++) FluxI_p1[b*nvars+v] = FluxI[nvars*p1+v]; }
      if (index[d] == dim[d]-1) { for (v=0; v<nvars; v++) FluxI_p2[b*nvars+v] = FluxI[nvars*p2+v]; }
    }
  }
  return;
}
/* 2D kernel, AoS ordering: same update as above in two space dimensions. */
__global__
void HyperbolicFunction_dim2_kernel(
  int npoints_grid, int d, int nvars, int ghosts, int offset,
  const int    * __restrict__ dim,
  const double * __restrict__ FluxI,
  const double * __restrict__ dxinv,
  double * __restrict__ hyp,
  double * __restrict__ FluxI_p1,
  double * __restrict__ FluxI_p2 )
{
  int tx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (tx < npoints_grid) {
    int p, p1, p2, b, v;
    int dim_interface[2];
    int index[2], index1[2], index2[2];

    _ArrayIndexnD_(2, tx, dim, index, 0);
    _ArrayCopy1D_(dim, dim_interface, 2);
    dim_interface[d] += 1;
    _ArrayCopy1D_(index, index1, 2);
    _ArrayCopy1D_(index, index2, 2); index2[d]++;
    _ArrayIndex1D_(2, dim, index, ghosts, p);
    _ArrayIndex1D_(2, dim_interface, index1, 0, p1);
    _ArrayIndex1D_(2, dim_interface, index2, 0, p2);

    for (v=0; v<nvars; v++) hyp[nvars*p+v] += dxinv[offset+ghosts+index[d]]
                                              * (FluxI[nvars*p2+v]-FluxI[nvars*p1+v]);
    if (index[d] == 0 || index[d] == dim[d]-1) {
      b = (d == 0) ? index[1] : index[0];
      if (index[d] == 0)        { for (v=0; v<nvars; v++) FluxI_p1[b*nvars+v] = FluxI[nvars*p1+v]; }
      if (index[d] == dim[d]-1) { for (v=0; v<nvars; v++) FluxI_p2[b*nvars+v] = FluxI[nvars*p2+v]; }
    }
  }
  return;
}
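/* The per-point update in the kernels above is the conservative flux-difference
   (flux divergence) along one grid direction. A one-dimensional, single-variable
   sketch of the same update (illustrative only; this helper is mine, not HyPar code):

     hyp[i] += (1/dx_i) * ( F[i+1/2] - F[i-1/2] )

   where FluxI has N+1 interface values for N grid points, interface i being the
   left face of cell i. */
static void flux_divergence_1d(int N, const double *dxinv, const double *FluxI, double *hyp)
{
  for (int i = 0; i < N; i++) {
    hyp[i] += dxinv[i] * (FluxI[i+1] - FluxI[i]);
  }
}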
/* Compute the hyperbolic term of the governing equations on the GPU
   (AoS variable ordering). */
int gpuHyperbolicFunction(
  double *hyp,     /* output: hyperbolic term                            */
  double *gpu_u,   /* solution (device array)                            */
  void   *s,       /* solver object of type HyPar                        */
  void   *m,       /* MPI object of type MPIVariables                    */
  double  t,       /* current simulation time                            */
  int     LimFlag, /* limiter flag passed through to ReconstructHyperbolic */
  int(*FluxFunction)(double*, double*, int, void*, double),                                       /* physics: flux function */
  int(*UpwindFunction)(double*, double*, double*, double*, double*, double*, int, void*, double)  /* physics: upwinding     */
)
{
  HyPar *solver = (HyPar*) s;

  double *gpu_FluxI = solver->fluxI;  /* interface flux      */
  double *gpu_FluxC = solver->fluxC;  /* cell-centered flux  */

  int ndims  = solver->ndims;
  int nvars  = solver->nvars;
  int ghosts = solver->ghosts;

  double *gpu_x = solver->gpu_x;

  double *gpu_FluxI_p1, *gpu_FluxI_p2;

  /* ... (remaining local declarations -- dim, size, d, offset, dxinv, work arrays --
         are omitted from this listing) ... */

  gpuMemset(hyp, 0, size*nvars*sizeof(double));

  /* ... */

  if (!FluxFunction) return(0); /* no flux function => nothing to compute */

  int npoints_grid = 1;
  for (d = 0; d < ndims; d++) npoints_grid *= dim[d];
  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  cudaEvent_t start, stop;
  float milliseconds = 0;

  checkCuda( cudaEventCreate(&start) );
  checkCuda( cudaEventCreate(&stop) );
#endif

  for (d = 0; d < ndims; d++) {

    /* cell-centered flux in direction d */
    FluxFunction(gpu_FluxC, gpu_u, d, solver, t);
#if defined(GPU_STAT)
    checkCuda( cudaEventRecord(start, 0) );
#endif

    if (ndims == 3) {
      HyperbolicFunction_dim3_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid, d, nvars, ghosts, offset,
        /* ... device arrays: dim, FluxI, dxinv (argument line omitted in this listing) ... */
        hyp, gpu_FluxI_p1, gpu_FluxI_p2
      );
    } else if (ndims == 2) {
      HyperbolicFunction_dim2_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid, d, nvars, ghosts, offset,
        /* ... device arrays: dim, FluxI, dxinv ... */
        hyp, gpu_FluxI_p1, gpu_FluxI_p2
      );
    } else {
      fprintf(stderr, "gpuHyperbolicFunction(): Not implemented for ndims = %d!\n", ndims);
      /* ... */
    }
    cudaDeviceSynchronize();

#if defined(GPU_STAT)
    checkCuda( cudaEventRecord(stop, 0) );
    checkCuda( cudaEventSynchronize(stop) );
    checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
    printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
           "HyperbolicFunction", milliseconds*1e-3, d,
           (1e-6*2*npoints_grid*nvars*sizeof(double))/milliseconds);

    checkCuda( cudaEventRecord(start, 0) );
#endif

    /* integrate the saved boundary fluxes on the two faces normal to d
       (kernel arguments -- boundary point count, nvars, sign, FluxI_p1/p2 and the
        destination buffer -- are omitted from this listing) */
    StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK,
                                   GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
      /* ... */ );
    StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK,
                                   GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
      /* ... */ );
    cudaDeviceSynchronize();

#if defined(GPU_STAT)
    checkCuda( cudaEventRecord(stop, 0) );
    checkCuda( cudaEventSynchronize(stop) );
    checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
    printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
           "StageBoundaryIntegral", milliseconds*1e-3, d,
           (1e-6*2*solver->gpu_npoints_boundary[d]*nvars*sizeof(double))/milliseconds);
#endif

    offset += dim[d] + 2*ghosts;
  }

  /* ... */

#if defined(GPU_STAT)
  checkCuda( cudaEventDestroy(start) );
  checkCuda( cudaEventDestroy(stop) );
#endif

  return(0);
}
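/* Note on the GPU_STAT output (my interpretation of the formula above, not text
   from HyPar): the printed bandwidth assumes the kernel moves roughly
   2 * npoints * nvars doubles (one read plus one write per value), so

     GB/s = (2 * npoints * nvars * sizeof(double)) bytes / (milliseconds * 1e-3 s) / 1e9
          = (1e-6 * 2 * npoints * nvars * sizeof(double)) / milliseconds,

   which is exactly the expression passed to printf. */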
/* Reconstruct the interface hyperbolic flux from the cell-centered flux
   (AoS ordering): interpolate left- and right-biased interface values, then
   apply the upwinding scheme. */
static int ReconstructHyperbolic(
  double *gpu_fluxI,  /* interface flux (output)    */
  double *gpu_fluxC,  /* cell-centered flux (input) */
  double *gpu_u,      /* cell-centered solution     */
  double *gpu_x,      /* grid coordinates           */
  int     dir,
  void   *s,
  void   *m,
  double  t,
  int     LimFlag,
  int(*UpwindFunction)(double*, double*, double*, double*, double*, double*, int, void*, double)
)
{
  HyPar        *solver = (HyPar*)        s;
  MPIVariables *mpi    = (MPIVariables*) m;

  double *gpu_uC    = NULL;
  double *gpu_uL    = solver->uL;
  double *gpu_uR    = solver->uR;
  double *gpu_fluxL = solver->fL;
  double *gpu_fluxR = solver->fR;

  /* ... (limiter setup for solution-dependent interpolation, omitted in this listing) ... */

  /* optional transformation of the solution before interpolation */
  if (solver->UFunction) {
    gpu_uC = solver->uC;
    solver->UFunction(gpu_uC, gpu_u, dir, solver, mpi, t);
  } else gpu_uC = gpu_u;

  /* ... (interpolation of fluxL, fluxR, uL, uR at the interfaces, omitted) ... */

  /* physics-specific upwinding if available, otherwise a simple average of the
     left- and right-biased interface fluxes */
  if (UpwindFunction) { UpwindFunction   (gpu_fluxI, gpu_fluxL, gpu_fluxR, gpu_uL, gpu_uR, gpu_u, dir, solver, t); }
  else                { DefaultUpwinding (gpu_fluxI, gpu_fluxL, gpu_fluxR, NULL, NULL, NULL, dir, solver, t); }

  return(0);
}
#else  /* structure-of-arrays (SoA) variable ordering */

/* 3D kernel, SoA ordering: same update as the AoS version, but component v of a
   grid point p is stored at p + v*npoints rather than p*nvars + v. */
__global__
void HyperbolicFunction_dim3_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int npoints_dim_interface,
  int d, int nvars, int ghosts, int offset,
  const int    * __restrict__ dim,
  const double * __restrict__ FluxI,
  const double * __restrict__ dxinv,
  double * __restrict__ hyp,
  double * __restrict__ FluxI_p1,
  double * __restrict__ FluxI_p2 )
{
  int tx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (tx < npoints_grid) {
    int p, p1, p2, b, v;
    int dim_interface[3];
    int index[3], index1[3], index2[3];

    _ArrayIndexnD_(3, tx, dim, index, 0);
    _ArrayCopy1D_(dim, dim_interface, 3);
    dim_interface[d] += 1;
    _ArrayCopy1D_(index, index1, 3);
    _ArrayCopy1D_(index, index2, 3); index2[d]++;
    _ArrayIndex1D_(3, dim, index, ghosts, p);
    _ArrayIndex1D_(3, dim_interface, index1, 0, p1);
    _ArrayIndex1D_(3, dim_interface, index2, 0, p2);

    for (v=0; v<nvars; v++) {
      hyp[p+v*npoints_local_wghosts] += dxinv[offset+ghosts+index[d]]
                                        * (FluxI[p2+v*npoints_dim_interface]-FluxI[p1+v*npoints_dim_interface]);
    }
    if (index[d] == 0 || index[d] == dim[d]-1) {
      if      (d == 0) b = index[1] + index[2]*dim[1];
      else if (d == 1) b = index[0] + index[2]*dim[0];
      else             b = index[0] + index[1]*dim[0];
      if (index[d] == 0)        { for (v=0; v<nvars; v++) FluxI_p1[b*nvars+v] = FluxI[p1+v*npoints_dim_interface]; }
      if (index[d] == dim[d]-1) { for (v=0; v<nvars; v++) FluxI_p2[b*nvars+v] = FluxI[p2+v*npoints_dim_interface]; }
    }
  }
  return;
}
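/* The AoS and SoA kernels above differ only in how component v of a grid point p
   is addressed. A small sketch of the two layouts (illustrative only; these helper
   functions are mine, not part of HyPar):

     AoS:  array[ p*nvars + v ]       -- the nvars components of a point are contiguous
     SoA:  array[ p + v*npoints ]     -- each component occupies a contiguous block

   With SoA, consecutive threads (consecutive p) touch consecutive doubles for a
   fixed v, which generally coalesces global-memory accesses better on the GPU. */
static __host__ __device__ double value_aos(const double *a, int p, int v, int nvars)   { return a[p*nvars + v]; }
static __host__ __device__ double value_soa(const double *a, int p, int v, int npoints) { return a[p + v*npoints]; }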
/* Compute the hyperbolic term of the governing equations on the GPU
   (SoA variable ordering). */
int gpuHyperbolicFunction(
  double *hyp,     /* output: hyperbolic term                            */
  double *u,       /* solution (device array)                            */
  void   *s,       /* solver object of type HyPar                        */
  void   *m,       /* MPI object of type MPIVariables                    */
  double  t,       /* current simulation time                            */
  int     LimFlag, /* limiter flag passed through to ReconstructHyperbolic */
  int(*FluxFunction)(double*, double*, int, void*, double),                                       /* physics: flux function */
  int(*UpwindFunction)(double*, double*, double*, double*, double*, double*, int, void*, double)  /* physics: upwinding     */
)
{
  HyPar *solver = (HyPar*) s;

  double *FluxI = solver->fluxI;  /* interface flux      */
  double *FluxC = solver->fluxC;  /* cell-centered flux  */

  int ndims  = solver->ndims;
  int nvars  = solver->nvars;
  int ghosts = solver->ghosts;

  double *x = solver->gpu_x;

  double *FluxI_p1, *FluxI_p2;

  /* ... (remaining local declarations -- dim, size, d, offset, dxinv, work arrays --
         are omitted from this listing) ... */

  gpuMemset(hyp, 0, size*nvars*sizeof(double));

  /* ... */

  if (!FluxFunction) return(0); /* no flux function => nothing to compute */

  int npoints_grid = 1;
  for (d = 0; d < ndims; d++) npoints_grid *= dim[d];
  int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  cudaEvent_t start, stop;
  float milliseconds = 0;

  checkCuda( cudaEventCreate(&start) );
  checkCuda( cudaEventCreate(&stop) );
#endif

  for (d = 0; d < ndims; d++) {
    /* number of grid interfaces normal to direction d */
    int npoints_dim_interface = 1;
    for (int i=0; i<ndims; i++) npoints_dim_interface *= (i==d) ? (dim[i]+1) : dim[i];

    /* cell-centered flux in direction d */
    FluxFunction(FluxC, u, d, solver, t);
#if defined(GPU_STAT)
    checkCuda( cudaEventRecord(start, 0) );
#endif

    if (ndims == 3) {
      HyperbolicFunction_dim3_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid, size, npoints_dim_interface, d, nvars, ghosts, offset,
        /* ... device arrays: dim, FluxI, dxinv (argument line omitted in this listing) ... */
        hyp, FluxI_p1, FluxI_p2
      );
    } else {
      fprintf(stderr, "gpuHyperbolicFunction(): Not implemented for ndims = %d!\n", ndims);
      /* ... */
    }
    cudaDeviceSynchronize();

#if defined(GPU_STAT)
    checkCuda( cudaEventRecord(stop, 0) );
    checkCuda( cudaEventSynchronize(stop) );
    checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
    printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
           "HyperbolicFunction", milliseconds*1e-3, d,
           (1e-6*2*npoints_grid*nvars*sizeof(double))/milliseconds);

    checkCuda( cudaEventRecord(start, 0) );
#endif

    /* integrate the saved boundary fluxes on the two faces normal to d
       (kernel arguments omitted from this listing) */
    StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK,
                                   GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
      /* ... */ );
    StageBoundaryIntegral_kernel<GPU_THREADS_PER_BLOCK><<<1, GPU_THREADS_PER_BLOCK,
                                   GPU_THREADS_PER_BLOCK*nvars*sizeof(double)>>>(
      /* ... */ );
    cudaDeviceSynchronize();
#if defined(GPU_STAT)
    checkCuda( cudaEventRecord(stop, 0) );
    checkCuda( cudaEventSynchronize(stop) );
    checkCuda( cudaEventElapsedTime(&milliseconds, start, stop) );
    printf("%-50s GPU time = %.6f dir = %d bandwidth (GB/s) = %.2f\n",
           "StageBoundaryIntegral", milliseconds*1e-3, d,
           (1e-6*2*solver->gpu_npoints_boundary[d]*nvars*sizeof(double))/milliseconds);
#endif

    offset += dim[d] + 2*ghosts;
  }

  /* ... */

#if defined(GPU_STAT)
  checkCuda( cudaEventDestroy(start) );
  checkCuda( cudaEventDestroy(stop) );
#endif

  return(0);
}
/* Reconstruct the interface hyperbolic flux from the cell-centered flux
   (SoA ordering). */
static int ReconstructHyperbolic(
  double *fluxI,  /* interface flux (output)    */
  double *fluxC,  /* cell-centered flux (input) */
  double *u,      /* cell-centered solution     */
  double *x,      /* grid coordinates           */
  int     dir,
  void   *s,
  void   *m,
  double  t,
  int     LimFlag,
  int(*UpwindFunction)(double*, double*, double*, double*, double*, double*, int, void*, double)
)
{
  HyPar        *solver = (HyPar*)        s;
  MPIVariables *mpi    = (MPIVariables*) m;

  double *uC    = NULL;
  double *uL    = solver->uL;
  double *uR    = solver->uR;
  double *fluxL = solver->fL;
  double *fluxR = solver->fR;

  /* ... (limiter setup for solution-dependent interpolation, omitted) ... */

  /* optional transformation of the solution before interpolation */
  if (solver->UFunction) {
    uC = solver->uC;
    solver->UFunction(uC, u, dir, solver, mpi, t);
  } else uC = u;

  /* ... (interpolation of fluxL, fluxR, uL, uR at the interfaces, omitted) ... */

  if (UpwindFunction) { UpwindFunction   (fluxI, fluxL, fluxR, uL, uR, u, dir, solver, t); }
  else                { DefaultUpwinding (fluxI, fluxL, fluxR, NULL, NULL, NULL, dir, solver, t); }

  return(0);
}

#endif /* CUDA_VAR_ORDERDING_AOS */
/* Default upwinding: interface flux = arithmetic mean of the left- and right-biased fluxes. */
static int DefaultUpwinding(double *fI, double *fL, double *fR,
                            double *uL, double *uR, double *u,
                            int dir, void *s, double t)
{
  HyPar *solver = (HyPar*) s;
  int   *dim    = solver->dim_local;
  int   ndims   = solver->ndims;
  int   nvars   = solver->nvars;

  int bounds_outer[ndims], bounds_inter[ndims];
  _ArrayCopy1D_(dim, bounds_outer, ndims); bounds_outer[dir] =  1;
  _ArrayCopy1D_(dim, bounds_inter, ndims); bounds_inter[dir] += 1;

  int done = 0; int index_outer[ndims], index_inter[ndims];
  _ArraySetValue_(index_outer, ndims, 0);
  while (!done) {  /* loop over all lines of interfaces along dir */
    _ArrayCopy1D_(index_outer, index_inter, ndims);
    for (index_inter[dir] = 0; index_inter[dir] < bounds_inter[dir]; index_inter[dir]++) {
      int p; _ArrayIndex1D_(ndims, bounds_inter, index_inter, 0, p);
      int v; for (v=0; v<nvars; v++) fI[nvars*p+v] = 0.5 * (fL[nvars*p+v]+fR[nvars*p+v]);
    }
    _ArrayIncrementIndex_(ndims, bounds_outer, index_outer, done);
  }

  return(0);
}
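/* The loop above reduces to a component-wise central interface flux,
   f_{j+1/2} = (fL_{j+1/2} + fR_{j+1/2}) / 2. A single-interface sketch
   (illustrative only; the function name is mine, not part of HyPar): */
static void central_interface_flux(int nvars, const double *fL, const double *fR, double *fI)
{
  for (int v = 0; v < nvars; v++) fI[v] = 0.5 * (fL[v] + fR[v]);
}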
Cross references (symbols used by this file, from the included HyPar headers: the
solver structure, which contains all solver-specific variables and functions; the
structure of MPI-related variables and the MPI function definitions; and the
common GPU array-operation definitions):

  #define GPU_THREADS_PER_BLOCK
  #define _ArrayCopy1D_(x, y, size)
  #define _ArraySetValue_(x, size, value)
  #define _ArrayIndex1D_(N, imax, i, ghost, index)
  #define _ArrayIndexnD_(N, index, imax, i, ghost)
  #define _ArrayIncrementIndex_(N, imax, i, done)

  void gpuMemset(void *, int, size_t)
  void gpuArrayBlockMultiply(double *, const double *, int, int)

  __global__ void StageBoundaryIntegral_kernel(int n, int nvars, int sign, const double *__restrict__ FluxI, double *__restrict__ StageBoundaryIntegral)
  __global__ void HyperbolicFunction_dim3_kernel(int npoints_grid, int npoints_local_wghosts, int npoints_dim_interface, int d, int nvars, int ghosts, int offset, const int *__restrict__ dim, const double *__restrict__ FluxI, const double *__restrict__ dxinv, double *__restrict__ hyp, double *__restrict__ FluxI_p1, double *__restrict__ FluxI_p2)
  int gpuHyperbolicFunction(double *hyp, double *u, void *s, void *m, double t, int LimFlag, int(*FluxFunction)(double *, double *, int, void *, double), int(*UpwindFunction)(double *, double *, double *, double *, double *, double *, int, void *, double))
  static int ReconstructHyperbolic(double *, double *, double *, double *, int, void *, void *, double, int, int(*)(double *, double *, double *, double *, double *, double *, int, void *, double))
  static int DefaultUpwinding(double *, double *, double *, double *, double *, double *, int, void *, double)

  Solver structure members referenced:
    int npoints_local_wghosts
    int StageBoundaryBuffer_size
    double *StageBoundaryBuffer
    double *StageBoundaryIntegral
    int gpu_npoints_boundary[3]
    int gpu_npoints_boundary_offset[3]
    int (*UFunction)(double *, double *, int, void *, void *, double)
    int (*SetInterpLimiterVar)(double *, double *, double *, int, void *, void *)
    int (*InterpolateInterfacesHyp)(double *, double *, double *, double *, int, int, void *, void *, int)