#ifdef CUDA_VAR_ORDERDING_AOS /* conserved variables stored as array of structures (AoS) */
__global__
void gpuNavierStokes3DUpwindRusanov_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int dir,
  int ghosts,
  double gamma,
  const int * __restrict__ dim,
  const double * __restrict__ grav_field_g,
  const double * __restrict__ fL,
  const double * __restrict__ fR,
  const double * __restrict__ uL,
  const double * __restrict__ uR,
  const double * __restrict__ u,
  double * __restrict__ fI
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    double udiff[_MODEL_NVARS_], uavg[_MODEL_NVARS_];
    int bounds_inter[_MODEL_NDIMS_], index_inter[_MODEL_NDIMS_], index[_MODEL_NDIMS_];

    bounds_inter[0] = dim[0]; bounds_inter[1] = dim[1]; bounds_inter[2] = dim[2]; bounds_inter[dir] += 1;

    /* cell indices left (pL) and right (pR) of this interface */
    _ArrayIndexnD_(_MODEL_NDIMS_, p, bounds_inter, index_inter, 0);
    _ArrayCopy1D_(index_inter, index, _MODEL_NDIMS_);
    int pL; index[dir]--; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index, ghosts, pL);
    int pR; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index_inter, ghosts, pR);
    int q = p * _MODEL_NVARS_;  /* AoS offset of this interface point */

    udiff[0] = 0.5 * (uR[q+0] - uL[q+0]);
    udiff[1] = 0.5 * (uR[q+1] - uL[q+1]);
    udiff[2] = 0.5 * (uR[q+2] - uL[q+2]);
    udiff[3] = 0.5 * (uR[q+3] - uL[q+3]);
    udiff[4] = 0.5 * (uR[q+4] - uL[q+4]);
    _NavierStokes3DRoeAverage_(uavg, _NavierStokes3D_stride_, (u+_MODEL_NVARS_*pL), (u+_MODEL_NVARS_*pR), gamma);

    double rho, vel[_MODEL_NDIMS_], E, P, c;

    _NavierStokes3DGetFlowVar_((u+_MODEL_NVARS_*pL), _NavierStokes3D_stride_, rho, vel[0], vel[1], vel[2], E, P, gamma);
    c = sqrt(gamma*P/rho);
    double alphaL = c + absolute(vel[dir]);

    _NavierStokes3DGetFlowVar_((u+_MODEL_NVARS_*pR), _NavierStokes3D_stride_, rho, vel[0], vel[1], vel[2], E, P, gamma);
    c = sqrt(gamma*P/rho);
    double alphaR = c + absolute(vel[dir]);

    _NavierStokes3DGetFlowVar_(uavg, _NavierStokes3D_stride_, rho, vel[0], vel[1], vel[2], E, P, gamma);
    c = sqrt(gamma*P/rho);
    double alphaavg = c + absolute(vel[dir]);

    double kappa = max(grav_field_g[pL], grav_field_g[pR]);
    double alpha = kappa * max3(alphaL, alphaR, alphaavg);
    fI[q+0] = 0.5*(fL[q+0]+fR[q+0]) - alpha*udiff[0];
    fI[q+1] = 0.5*(fL[q+1]+fR[q+1]) - alpha*udiff[1];
    fI[q+2] = 0.5*(fL[q+2]+fR[q+2]) - alpha*udiff[2];
    fI[q+3] = 0.5*(fL[q+3]+fR[q+3]) - alpha*udiff[3];
    fI[q+4] = 0.5*(fL[q+4]+fR[q+4]) - alpha*udiff[4];
  }
}
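
/* The five updates above are the Rusanov (local Lax-Friedrichs) flux,
     fI = 0.5*(fL + fR) - alpha * 0.5*(uR - uL),
   where alpha = kappa * max3(alphaL, alphaR, alphaavg) bounds the fastest
   local wave speed c + |v| over the left, right, and Roe-averaged states,
   and kappa = max(grav_field_g[pL], grav_field_g[pR]) scales the
   dissipation by the local gravity field. */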

extern "C" int gpuNavierStokes3DUpwindRusanov(
  double *fI, double *fL, double *fR, double *uL, double *uR, double *u,
  int dir, void *s, double t
)
{
  /* ... */
  gpuNavierStokes3DUpwindRusanov_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    /* ... */
  );
  cudaDeviceSynchronize();

  return 0;
}
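
/* A minimal sketch (an assumption, not part of the original listing) of
   how the nblocks used in the launches is typically derived: one thread
   per interface point, rounded up to a whole number of blocks. */
int nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;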

__global__
void gpuNavierStokes3DUpwindRoe_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int dir,
  int ghosts,
  double gamma,
  const int * __restrict__ dim,
  const double * __restrict__ grav_field_g,
  const double * __restrict__ fL,
  const double * __restrict__ fR,
  const double * __restrict__ uL,
  const double * __restrict__ uR,
  const double * __restrict__ u,
  double * __restrict__ fI
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    double udiff[_MODEL_NVARS_], uavg[_MODEL_NVARS_], udiss[_MODEL_NVARS_];
    double D[_MODEL_NVARS_*_MODEL_NVARS_], L[_MODEL_NVARS_*_MODEL_NVARS_],
           R[_MODEL_NVARS_*_MODEL_NVARS_], DL[_MODEL_NVARS_*_MODEL_NVARS_],
           modA[_MODEL_NVARS_*_MODEL_NVARS_];
    int bounds_inter[_MODEL_NDIMS_], index_inter[_MODEL_NDIMS_], index[_MODEL_NDIMS_], k;

    bounds_inter[0] = dim[0]; bounds_inter[1] = dim[1]; bounds_inter[2] = dim[2]; bounds_inter[dir] += 1;

    /* cell indices left (pL) and right (pR) of this interface */
    _ArrayIndexnD_(_MODEL_NDIMS_, p, bounds_inter, index_inter, 0);
    _ArrayCopy1D_(index_inter, index, _MODEL_NDIMS_);
    int pL; index[dir]--; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index, ghosts, pL);
    int pR; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index_inter, ghosts, pR);

    p *= _MODEL_NVARS_;  /* switch p to the AoS offset of this interface point */

    udiff[0] = 0.5 * (uR[p+0] - uL[p+0]);
    udiff[1] = 0.5 * (uR[p+1] - uL[p+1]);
    udiff[2] = 0.5 * (uR[p+2] - uL[p+2]);
    udiff[3] = 0.5 * (uR[p+3] - uL[p+3]);
    udiff[4] = 0.5 * (uR[p+4] - uL[p+4]);
    /* Roe-averaged state and eigen-decomposition of the flux Jacobian */
    _NavierStokes3DRoeAverage_(uavg, _NavierStokes3D_stride_, (u+_MODEL_NVARS_*pL), (u+_MODEL_NVARS_*pR), gamma);
    _NavierStokes3DEigenvalues_(uavg, _NavierStokes3D_stride_, D, gamma, dir);
    _NavierStokes3DLeftEigenvectors_(uavg, _NavierStokes3D_stride_, L, gamma, dir);
    _NavierStokes3DRightEigenvectors_(uavg, _NavierStokes3D_stride_, R, gamma, dir);

    /* entropy fix on the diagonal entries of the eigenvalue matrix D */
    double delta = 0.000001, delta2 = delta*delta;
    k=0;  D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=6;  D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=12; D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=18; D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=24; D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
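
    /* Where an eigenvalue is near zero, |D[k]| is replaced by the smooth
       parabola (D[k]^2 + delta^2)/(2*delta) >= delta/2, keeping the
       dissipation nonzero at sonic points (a Harten-style entropy fix).
       D is a 5x5 row-major matrix, so its diagonal sits at
       k = 0, 6, 12, 18, 24. */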
    /* dissipation udiss = R * |Lambda| * L * udiff */
    MatMult5(_MODEL_NVARS_, DL, D, L);
    MatMult5(_MODEL_NVARS_, modA, R, DL);
    MatVecMult5(_MODEL_NVARS_, udiss, modA, udiff);

    fI[p+0] = 0.5 * (fL[p+0]+fR[p+0]) - udiss[0];
    fI[p+1] = 0.5 * (fL[p+1]+fR[p+1]) - udiss[1];
    fI[p+2] = 0.5 * (fL[p+2]+fR[p+2]) - udiss[2];
    fI[p+3] = 0.5 * (fL[p+3]+fR[p+3]) - udiss[3];
    fI[p+4] = 0.5 * (fL[p+4]+fR[p+4]) - udiss[4];
  }
}
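
/* With udiff = 0.5*(uR - uL), the updates above assemble the Roe flux
     fI = 0.5*(fL + fR) - 0.5 * R|Lambda|L * (uR - uL),
   where modA = R * |Lambda| * L is the dissipation matrix built from the
   entropy-fixed eigenvalues. */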

extern "C" int gpuNavierStokes3DUpwindRoe(
  double * __restrict__ fI,
  double * __restrict__ fL,
  double * __restrict__ fR,
  double * __restrict__ uL,
  double * __restrict__ uR,
  double * __restrict__ u,
  int dir, void *s, double t
)
{
  /* ... */
  gpuNavierStokes3DUpwindRoe_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    /* ... */
  );
  cudaDeviceSynchronize();

  return 0;
}
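
/* A hedged sketch (not in the original listing): kernel launch errors are
   asynchronous, so a cudaGetLastError() check is often placed right after
   the synchronization above. Assumes <stdio.h> is included. */
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
  fprintf(stderr, "upwind kernel launch failed: %s\n", cudaGetErrorString(err));
}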

#else /* conserved variables stored as structure of arrays (SoA) */

__global__
void gpuNavierStokes3DUpwindRusanov_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int dir,
  int ghosts,
  double gamma,
  const int * __restrict__ dim,
  const double * __restrict__ grav_field_g,
  const double * __restrict__ fL,
  const double * __restrict__ fR,
  const double * __restrict__ uL,
  const double * __restrict__ uR,
  const double * __restrict__ u,
  double * __restrict__ fI
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    double udiff[_MODEL_NVARS_], uavg[_MODEL_NVARS_];
    int bounds_inter[_MODEL_NDIMS_], index_inter[_MODEL_NDIMS_], index[_MODEL_NDIMS_];

    bounds_inter[0] = dim[0]; bounds_inter[1] = dim[1]; bounds_inter[2] = dim[2]; bounds_inter[dir] += 1;
    /* cell indices left (pL) and right (pR) of this interface */
    _ArrayIndexnD_(_MODEL_NDIMS_, p, bounds_inter, index_inter, 0);
    _ArrayCopy1D_(index_inter, index, _MODEL_NDIMS_);
    int pL; index[dir]--; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index, ghosts, pL);
    int pR; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index_inter, ghosts, pR);

    udiff[0] = 0.5 * (uR[p+ 0] - uL[p+ 0]);
    udiff[1] = 0.5 * (uR[p+ npoints_grid] - uL[p+ npoints_grid]);
    udiff[2] = 0.5 * (uR[p+2*npoints_grid] - uL[p+2*npoints_grid]);
    udiff[3] = 0.5 * (uR[p+3*npoints_grid] - uL[p+3*npoints_grid]);
    udiff[4] = 0.5 * (uR[p+4*npoints_grid] - uL[p+4*npoints_grid]);
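
    /* In this SoA branch, component m of interface point p lives at offset
       p + m*npoints_grid (and at p + m*npoints_local_wghosts in the
       cell-centered array u), so consecutive threads read consecutive
       addresses and the loads above coalesce; the AoS branch instead packs
       a point's five variables contiguously at q = p*_MODEL_NVARS_. */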
    _NavierStokes3DRoeAverage_(uavg, npoints_local_wghosts, (u+pL), (u+pR), gamma);

    double rho, vel[_MODEL_NDIMS_], E, P, c;

    _NavierStokes3DGetFlowVar_((u+pL), npoints_local_wghosts, rho, vel[0], vel[1], vel[2], E, P, gamma);
    c = sqrt(gamma*P/rho);
    double alphaL = c + absolute(vel[dir]);

    _NavierStokes3DGetFlowVar_((u+pR), npoints_local_wghosts, rho, vel[0], vel[1], vel[2], E, P, gamma);
    c = sqrt(gamma*P/rho);
    double alphaR = c + absolute(vel[dir]);

    _NavierStokes3DGetFlowVar_(uavg, 1, rho, vel[0], vel[1], vel[2], E, P, gamma);
    c = sqrt(gamma*P/rho);
    double alphaavg = c + absolute(vel[dir]);

    double kappa = max(grav_field_g[pL], grav_field_g[pR]);
    double alpha = kappa * max3(alphaL, alphaR, alphaavg);
    fI[p+ 0] = 0.5*(fL[p+ 0]+fR[p+ 0]) - alpha*udiff[0];
    fI[p+ npoints_grid] = 0.5*(fL[p+ npoints_grid]+fR[p+ npoints_grid]) - alpha*udiff[1];
    fI[p+2*npoints_grid] = 0.5*(fL[p+2*npoints_grid]+fR[p+2*npoints_grid]) - alpha*udiff[2];
    fI[p+3*npoints_grid] = 0.5*(fL[p+3*npoints_grid]+fR[p+3*npoints_grid]) - alpha*udiff[3];
    fI[p+4*npoints_grid] = 0.5*(fL[p+4*npoints_grid]+fR[p+4*npoints_grid]) - alpha*udiff[4];
  }
}

extern "C" int gpuNavierStokes3DUpwindRusanov(
  double *fI, double *fL, double *fR, double *uL, double *uR, double *u,
  int dir, void *s, double t
)
{
  /* ... */
  gpuNavierStokes3DUpwindRusanov_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    /* ... */
  );
  cudaDeviceSynchronize();

  return 0;
}

__global__
void gpuNavierStokes3DUpwindRoe_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int dir,
  int ghosts,
  double gamma,
  const int * __restrict__ dim,
  const double * __restrict__ grav_field_g,
  const double * __restrict__ fL,
  const double * __restrict__ fR,
  const double * __restrict__ uL,
  const double * __restrict__ uR,
  const double * __restrict__ u,
  double * __restrict__ fI
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    double udiff[_MODEL_NVARS_], uavg[_MODEL_NVARS_], udiss[_MODEL_NVARS_];
    double D[_MODEL_NVARS_*_MODEL_NVARS_], L[_MODEL_NVARS_*_MODEL_NVARS_],
           R[_MODEL_NVARS_*_MODEL_NVARS_], DL[_MODEL_NVARS_*_MODEL_NVARS_],
           modA[_MODEL_NVARS_*_MODEL_NVARS_];
    int bounds_inter[_MODEL_NDIMS_], index_inter[_MODEL_NDIMS_], index[_MODEL_NDIMS_], k;

    bounds_inter[0] = dim[0]; bounds_inter[1] = dim[1]; bounds_inter[2] = dim[2]; bounds_inter[dir] += 1;
    /* cell indices left (pL) and right (pR) of this interface */
    _ArrayIndexnD_(_MODEL_NDIMS_, p, bounds_inter, index_inter, 0);
    _ArrayCopy1D_(index_inter, index, _MODEL_NDIMS_);
    int pL; index[dir]--; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index, ghosts, pL);
    int pR; _ArrayIndex1D_(_MODEL_NDIMS_, dim, index_inter, ghosts, pR);

    udiff[0] = 0.5 * (uR[p+0] - uL[p+0]);
    udiff[1] = 0.5 * (uR[p+ npoints_grid] - uL[p+ npoints_grid]);
    udiff[2] = 0.5 * (uR[p+2*npoints_grid] - uL[p+2*npoints_grid]);
    udiff[3] = 0.5 * (uR[p+3*npoints_grid] - uL[p+3*npoints_grid]);
    udiff[4] = 0.5 * (uR[p+4*npoints_grid] - uL[p+4*npoints_grid]);
    /* Roe-averaged state and eigen-decomposition of the flux Jacobian */
    _NavierStokes3DRoeAverage_(uavg, npoints_local_wghosts, (u+pL), (u+pR), gamma);
    _NavierStokes3DEigenvalues_(uavg, 1, D, gamma, dir);
    _NavierStokes3DLeftEigenvectors_(uavg, 1, L, gamma, dir);
    _NavierStokes3DRightEigenvectors_(uavg, 1, R, gamma, dir);

    /* entropy fix on the diagonal entries of the eigenvalue matrix D */
    double delta = 0.000001, delta2 = delta*delta;
    k=0;  D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=6;  D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=12; D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=18; D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    k=24; D[k] = (absolute(D[k]) < delta ? (D[k]*D[k]+delta2)/(2*delta) : absolute(D[k]));
    /* dissipation udiss = R * |Lambda| * L * udiff */
    MatMult5(_MODEL_NVARS_, DL, D, L);
    MatMult5(_MODEL_NVARS_, modA, R, DL);
    MatVecMult5(_MODEL_NVARS_, udiss, modA, udiff);

    fI[p+0] = 0.5 * (fL[p+0]+fR[p+0]) - udiss[0];
    fI[p+ npoints_grid] = 0.5 * (fL[p+ npoints_grid]+fR[p+ npoints_grid]) - udiss[1];
    fI[p+2*npoints_grid] = 0.5 * (fL[p+2*npoints_grid]+fR[p+2*npoints_grid]) - udiss[2];
    fI[p+3*npoints_grid] = 0.5 * (fL[p+3*npoints_grid]+fR[p+3*npoints_grid]) - udiss[3];
    fI[p+4*npoints_grid] = 0.5 * (fL[p+4*npoints_grid]+fR[p+4*npoints_grid]) - udiss[4];
  }
}

extern "C" int gpuNavierStokes3DUpwindRoe(
  double * __restrict__ fI,
  double * __restrict__ fL,
  double * __restrict__ fR,
  double * __restrict__ uL,
  double * __restrict__ uR,
  double * __restrict__ u,
  int dir, void *s, double t
)
{
  /* ... */

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds = 0;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  gpuNavierStokes3DUpwindRoe_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    /* ... */
  );

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
#endif

  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  /* cudaEventElapsedTime() reports milliseconds; convert to seconds */
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
  printf("%-50s GPU time (secs) = %.6f dir = %d\n",
         "NavierStokes3DUpwindRoe2", milliseconds*1e-3, dir);
  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
#endif

  return 0;
}

#endif /* CUDA_VAR_ORDERDING_AOS */
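
/* checkCuda() is not defined in this listing; a minimal sketch of the
   usual wrapper (an assumption, not necessarily the project's actual
   definition) would be: */
#include <stdio.h>

static inline cudaError_t checkCuda(cudaError_t result)
{
  /* print the error string, return the code unchanged so calls compose */
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
  }
  return result;
}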