#ifdef CUDA_VAR_ORDERDING_AOS
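/* Array-of-structures (AOS) variable ordering: the nvars solution components
   of each grid point are stored contiguously, so component v of grid point q
   is accessed as f[q*nvars+v]. */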
/* FirstDerivativeFourthOrderCentral_boundary_kernel (AOS): one thread per grid
   line along dir; applies one-sided/biased stencils at the first two and last
   two points of each ghost-extended line */
int j = threadIdx.x + (blockDim.x * blockIdx.x);

double one_twelve = 1.0/12.0;
_ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
_ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
_ArrayCopy1D_(index_outer,indexC,ndims);
/* leftmost point of the line: fourth-order one-sided stencil */
for (i = -ghosts; i < -ghosts+1; i++) {
  int qC, qp1, qp2, qp3, qp4;
  indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
  indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
  indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
  indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
  indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
  for (v=0; v<nvars; v++)
    Df[qC*nvars+v] = (-25*f[qC*nvars+v]+48*f[qp1*nvars+v]-36*f[qp2*nvars+v]+16*f[qp3*nvars+v]-3*f[qp4*nvars+v])*one_twelve;
}
/* second point of the line: fourth-order biased stencil */
for (i = -ghosts+1; i < -ghosts+2; i++) {
  int qC, qm1, qp1, qp2, qp3;
  indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
  indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
  indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
  indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
  indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
  for (v=0; v<nvars; v++)
    Df[qC*nvars+v] = (-3*f[qm1*nvars+v]-10*f[qC*nvars+v]+18*f[qp1*nvars+v]-6*f[qp2*nvars+v]+f[qp3*nvars+v])*one_twelve;
}
/* second-to-last point of the line: fourth-order biased stencil */
for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
  int qC, qm3, qm2, qm1, qp1;
  indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
  indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
  indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
  indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
  indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
  for (v=0; v<nvars; v++)
    Df[qC*nvars+v] = (-f[qm3*nvars+v]+6*f[qm2*nvars+v]-18*f[qm1*nvars+v]+10*f[qC*nvars+v]+3*f[qp1*nvars+v])*one_twelve;
}
/* last point of the line: fourth-order one-sided stencil */
for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
  int qC, qm4, qm3, qm2, qm1;
  indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
  indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
  indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
  indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
  indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
  for (v=0; v<nvars; v++)
    Df[qC*nvars+v] = (3*f[qm4*nvars+v]-16*f[qm3*nvars+v]+36*f[qm2*nvars+v]-48*f[qm1*nvars+v]+25*f[qC*nvars+v])*one_twelve;
}
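/* For reference: these are the standard fourth-order one-sided/biased
   difference formulas, applied with respect to the grid index (the caller
   is presumably responsible for dividing by the grid spacing). At the
   leftmost point, for example,
     Df_i = ( -25 f_i + 48 f_{i+1} - 36 f_{i+2} + 16 f_{i+3} - 3 f_{i+4} ) / 12 . */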
/* FirstDerivativeFourthOrderCentral_interior_kernel (AOS): one thread per
   interior point, i.e. per point not handled by the boundary kernel above */
int i = threadIdx.x + (blockDim.x * blockIdx.x);
if (i < ngrid_points) {

  double one_twelve = 1.0/12.0;

  int qC, qm1, qm2, qp1, qp2;

  /* split the flat index into the grid-line index j and the offset i
     along direction dir */
  j = i/(dim[dir] + 2*ghosts - 4);
  _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
  _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
  _ArrayCopy1D_(index_outer,indexC,ndims);

  i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2);
  indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
  indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
  indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
  indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
  indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);

  /* fourth-order central difference */
  for (v=0; v<nvars; v++)
    Df[qC*nvars+v] = (f[qm2*nvars+v]-8*f[qm1*nvars+v]+8*f[qp1*nvars+v]-f[qp2*nvars+v])*one_twelve;
}
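/* For reference: the interior update is the standard fourth-order central
   difference, Df_i = ( f_{i-2} - 8 f_{i-1} + 8 f_{i+1} - f_{i+2} ) / 12 . */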
/* host driver (AOS version): gpuFirstDerivativeFourthOrderCentral() */
int ghosts = solver->ghosts;
int ndims  = solver->ndims;
int nvars  = solver->nvars;

if ((!f) || (!Df)) {
  fprintf(stderr,"Error in FirstDerivativeFourthOrder(): input arrays not allocated.\n");
  return(1);
}
int bounds_outer[ndims];
_ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer); /* number of grid lines along dir */
int nblocks = (N_outer-1)/GPU_THREADS_PER_BLOCK + 1;
#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds = 0;
  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif
FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
  N_outer, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
);
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
#endif
/* interior points: the full ghost-extended line minus the 4 end points
   handled by the boundary kernel */
int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif
FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
  npoints_grid, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
);
cudaDeviceSynchronize();
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);
#endif
__global__ void FirstDerivativeFourthOrderCentral_boundary_kernel(
  int N_outer,
  int npoints_local_wghosts,
  int ghosts,
  int ndims,
  int nvars,
  int dir,
  const int *dim,
  const double *f,
  double *Df
)
{
  int j = threadIdx.x + (blockDim.x * blockIdx.x); /* one thread per grid line along dir */
  if (j < N_outer) {

    double one_twelve = 1.0/12.0;

    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);
    /* leftmost point of the line: fourth-order one-sided stencil */
    for (i = -ghosts; i < -ghosts+1; i++) {
      int qC, qp1, qp2, qp3, qp4;
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
      for (v=0; v<nvars; v++) {
        Df[qC+v*npoints_local_wghosts] = (-25*f[qC+v*npoints_local_wghosts]+48*f[qp1+v*npoints_local_wghosts]-36*f[qp2+v*npoints_local_wghosts]+16*f[qp3+v*npoints_local_wghosts]-3*f[qp4+v*npoints_local_wghosts])*one_twelve;
      }
    }
    /* second point of the line: fourth-order biased stencil */
    for (i = -ghosts+1; i < -ghosts+2; i++) {
      int qC, qm1, qp1, qp2, qp3;
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (-3*f[qm1+v*npoints_local_wghosts]-10*f[qC+v*npoints_local_wghosts]+18*f[qp1+v*npoints_local_wghosts]-6*f[qp2+v*npoints_local_wghosts]+f[qp3+v*npoints_local_wghosts])*one_twelve;
    }
    /* second-to-last point of the line: fourth-order biased stencil */
    for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
      int qC, qm3, qm2, qm1, qp1;
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (-f[qm3+v*npoints_local_wghosts]+6*f[qm2+v*npoints_local_wghosts]-18*f[qm1+v*npoints_local_wghosts]+10*f[qC+v*npoints_local_wghosts]+3*f[qp1+v*npoints_local_wghosts])*one_twelve;
    }
    /* last point of the line: fourth-order one-sided stencil */
    for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
      int qC, qm4, qm3, qm2, qm1;
      indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (3*f[qm4+v*npoints_local_wghosts]-16*f[qm3+v*npoints_local_wghosts]+36*f[qm2+v*npoints_local_wghosts]-48*f[qm1+v*npoints_local_wghosts]+25*f[qC+v*npoints_local_wghosts])*one_twelve;
    }
  }
}
__global__ void FirstDerivativeFourthOrderCentral_interior_kernel(
  int ngrid_points,
  int npoints_local_wghosts,
  int ghosts,
  int ndims,
  int nvars,
  int dir,
  const int *dim,
  const double *f,
  double *Df
)
{
  int i = threadIdx.x + (blockDim.x * blockIdx.x); /* one thread per interior point */
  if (i < ngrid_points) {

    double one_twelve = 1.0/12.0;

    int qC, qm1, qm2, qp1, qp2;

    /* split the flat index into the grid-line index j and the offset i
       along direction dir */
    j = i/(dim[dir] + 2*ghosts - 4);
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2);
    indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
    indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
    indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
    indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
    indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);

    /* fourth-order central difference */
    for (v=0; v<nvars; v++)
      Df[qC+v*npoints_local_wghosts] = (f[qm2+v*npoints_local_wghosts]-8*f[qm1+v*npoints_local_wghosts]+8*f[qp1+v*npoints_local_wghosts]-f[qp2+v*npoints_local_wghosts])*one_twelve;
  }
}
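/* Note: with SOA ordering, consecutive threads access consecutive memory
   locations for each fixed component v, which favors coalesced global
   memory access; this is presumably the motivation for supporting both
   orderings. */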
/* host driver (SOA version): gpuFirstDerivativeFourthOrderCentral() */
int ghosts = solver->ghosts;
int ndims  = solver->ndims;
int nvars  = solver->nvars;

if ((!f) || (!Df)) {
  fprintf(stderr,"Error in FirstDerivativeFourthOrder(): input arrays not allocated.\n");
  return(1);
}
int bounds_outer[ndims];
_ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer); /* number of grid lines along dir */
int nblocks = (N_outer-1)/GPU_THREADS_PER_BLOCK + 1;
#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds = 0;
  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif
FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
  N_outer, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir,
  solver->gpu_dim_local, f, Df
);
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
#endif
/* interior points: the full ghost-extended line minus the 4 end points
   handled by the boundary kernel */
int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif
FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
  npoints_grid, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir,
  solver->gpu_dim_local, f, Df
);
cudaDeviceSynchronize();
#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);
#endif