1 /*! @file FirstDerivativeFourthOrder_GPU.cu
3 @brief GPU implementation of fourth order finite-difference approximation to the first derivative
6 #include <firstderivative.h>
7 #include <arrayfunctions.h>
11 typedef MPIVariables MPIContext;
12 typedef HyPar SolverContext;
14 #ifdef CUDA_VAR_ORDERDING_AOS
16 /*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
/* AoS layout: component v of grid point q is stored at f[q*nvars+v].
   Each thread owns one 1D grid line along dimension "dir" and fills the
   two left-most and two right-most points of that line (including the
   outermost ghost points) with biased fourth-order stencils; the interior
   points are handled by the companion _interior_ kernel.
   NOTE(review): the parameter list and the "j < ngrid_points" bounds guard
   appear elided from this dump -- confirm the guard exists in the full file. */
18 void FirstDerivativeFourthOrderCentral_boundary_kernel(
/* flatten thread/block ids into the outer (grid-line) index */
29   int j = threadIdx.x + (blockDim.x * blockIdx.x);
/* common stencil scale; data is double, so a double literal is correct here */
31   double one_twelve = 1.0/12.0;
/* indexC walks along the line; bounds_outer collapses the "dir" dimension
   so that j enumerates lines rather than points */
34   int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
35   _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
36   _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
37   _ArrayCopy1D_(index_outer,indexC,ndims);
/* i = -ghosts (left-most point): fully one-sided forward stencil,
   (1/12)(-25 f_i + 48 f_{i+1} - 36 f_{i+2} + 16 f_{i+3} - 3 f_{i+4}) */
40   for (i = -ghosts; i < -ghosts+1; i++) {
41     int qC, qp1, qp2, qp3, qp4;
42     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
43     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
44     indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
45     indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
46     indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
47     for (v=0; v<nvars; v++)
48       Df[qC*nvars+v] = (-25*f[qC*nvars+v]+48*f[qp1*nvars+v]-36*f[qp2*nvars+v]+16*f[qp3*nvars+v]-3*f[qp4*nvars+v])*one_twelve;
/* i = -ghosts+1: one-point-biased stencil,
   (1/12)(-3 f_{i-1} - 10 f_i + 18 f_{i+1} - 6 f_{i+2} + f_{i+3}) */
50   for (i = -ghosts+1; i < -ghosts+2; i++) {
51     int qC, qm1, qp1, qp2, qp3;
52     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
53     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
54     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
55     indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
56     indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
57     for (v=0; v<nvars; v++)
58       Df[qC*nvars+v] = (-3*f[qm1*nvars+v]-10*f[qC*nvars+v]+18*f[qp1*nvars+v]-6*f[qp2*nvars+v]+f[qp3*nvars+v])*one_twelve;
/* i = dim[dir]+ghosts-2: mirror of the i = -ghosts+1 stencil (backward-biased) */
61   for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
62     int qC, qm3, qm2, qm1, qp1;
63     indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
64     indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
65     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
66     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
67     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
68     for (v=0; v<nvars; v++)
69       Df[qC*nvars+v] = (-f[qm3*nvars+v]+6*f[qm2*nvars+v]-18*f[qm1*nvars+v]+10*f[qC*nvars+v]+3*f[qp1*nvars+v])*one_twelve;
/* i = dim[dir]+ghosts-1 (right-most point): fully one-sided backward stencil
   (coefficients are the sign-reversed mirror of the i = -ghosts stencil) */
71   for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
72     int qC, qm4, qm3, qm2, qm1;
73     indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
74     indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
75     indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
76     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
77     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
78     for (v=0; v<nvars; v++)
79       Df[qC*nvars+v] = (3*f[qm4*nvars+v]-16*f[qm3*nvars+v]+36*f[qm2*nvars+v]-48*f[qm1*nvars+v]+25*f[qC*nvars+v])*one_twelve;
86 /*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
/* AoS layout: component v of grid point q is stored at f[q*nvars+v].
   One thread per interior point: computes the standard fourth-order central
   stencil (1/12)(f_{i-2} - 8 f_{i-1} + 8 f_{i+1} - f_{i+2}) at every point
   i in [-ghosts+2, dim[dir]+ghosts-3] of every grid line along "dir".
   The four boundary points of each line are handled by the _boundary_ kernel. */
88 void FirstDerivativeFourthOrderCentral_interior_kernel(
99   int i = threadIdx.x + (blockDim.x * blockIdx.x);
/* guard against the partially-filled last block */
100   if (i < ngrid_points) {
102     double one_twelve = 1.0/12.0;
105     int qC, qm1, qm2, qp1, qp2;
106     int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
/* each line contributes (dim[dir] + 2*ghosts - 4) interior points; split the
   flat thread index into line index j and position along the line */
108     j = i/(dim[dir] + 2*ghosts - 4); /* Outer index */
109     _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
110     _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
111     _ArrayCopy1D_(index_outer,indexC,ndims);
113     //i += (-ghosts + 2);
/* remap the in-line offset to the actual grid index, first interior point at -ghosts+2 */
114     i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2);
115     indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
116     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
117     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
118     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
119     indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
120     for (v=0; v<nvars; v++)
121       Df[qC*nvars+v] = (f[qm2*nvars+v]-8*f[qm1*nvars+v]+8*f[qp1*nvars+v]-f[qp2*nvars+v])*one_twelve;
126 /*! Computes the fourth-order finite-difference approximation to the first derivative
127 (\b Note: not divided by the grid spacing):
129     \left(\partial f\right)_i = \left\{ \begin{array}{ll} \frac{1}{12}\left(-25f_i+48f_{i+1}-36f_{i+2}+16f_{i+3}-3f_{i+4}\right) & i=-g \\ \frac{1}{12}\left(-3f_{i-1}-10f_i+18f_{i+1}-6f_{i+2}+f_{i+3}\right) & i = -g+1 \\ \frac{1}{12}\left( f_{i-2}-8f_{i-1}+8f_{i+1}-f_{i+2} \right) & -g+2 \leq i \leq N+g-3 \\ \frac{1}{12}\left( -f_{i-3}+6f_{i-2}-18f_{i-1}+10f_i+3f_{i+1}\right) & i = N+g-2 \\ \frac{1}{12}\left( 3f_{i-4}-16f_{i-3}+36f_{i-2}-48f_{i-1}+25f_i \right) & i = N+g-1 \end{array}\right.
131 where \f$i\f$ is the grid index along the spatial dimension of the derivative, \f$g\f$ is the number of ghost points, and \f$N\f$ is the number of grid points (not including the ghost points) in the spatial dimension of the derivative.
134 + The first derivative is computed at the grid points or the cell centers.
135 + The first derivative is computed at the ghost points too. Thus, biased schemes are used
136 at and near the boundaries.
137 + \b Df and \b f are 1D arrays containing the function and its computed derivatives on a multi-
138 dimensional grid. The derivative along the specified dimension \b dir is computed by looping
139 through all grid lines along \b dir.
141 \sa FirstDerivativeFourthOrderCentral()
/* Host-side driver (AoS variant): launches the boundary kernel (one thread
   per grid line) followed by the interior kernel (one thread per interior
   point). NOTE(review): "bias" is accepted for interface compatibility but
   a central scheme has no bias -- it appears unused in the visible code. */
143 int gpuFirstDerivativeFourthOrderCentral(
144                                           double *Df,  /*!< Array to hold the computed first derivative (with ghost points) */
145                                           double *f,   /*!< Array containing the grid point function values whose first
146                                                             derivative is to be computed (with ghost points) */
147                                           int dir,     /*!< The spatial dimension along which the derivative is computed */
148                                           int bias,    /*!< Forward or backward differencing for non-central
149                                                             finite-difference schemes (-1: backward, 1: forward)*/
150                                           void *s,     /*!< Solver object of type #SolverContext */
151                                           void *m      /*!< MPI object of type #MPIContext */
154   SolverContext *solver = (SolverContext*) s;
156   int ghosts = solver->ghosts;
157   int ndims  = solver->ndims;
158   int nvars  = solver->nvars;
159   int *dim   = solver->dim_local;
/* NOTE(review): message says "FirstDerivativeFourthOrder()" but this is the
   GPU wrapper -- consider naming the actual function in the error text. */
162     fprintf(stderr, "Error in FirstDerivativeFourthOrder(): input arrays not allocated.\n");
166   /* create index and bounds for the outer loop, i.e., to loop over all 1D lines along
168   int bounds_outer[ndims];
169   _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
/* number of 1D grid lines along "dir"; ceil-divide into thread blocks */
170   int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer);
171   int nblocks = (N_outer - 1) / GPU_THREADS_PER_BLOCK + 1;
173 #if defined(GPU_STAT)
174   cudaEvent_t startEvent, stopEvent;
177   checkCuda( cudaEventCreate(&startEvent) );
178   checkCuda( cudaEventCreate(&stopEvent) );
180   checkCuda( cudaEventRecord(startEvent, 0) );
/* biased stencils at/near the boundaries of every line.
   NOTE(review): no cudaGetLastError() after the launch is visible in this
   dump -- launch-configuration errors would go undetected; confirm. */
183   FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
184     N_outer, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
187 #if defined(GPU_STAT)
188   checkCuda( cudaEventRecord(stopEvent, 0) );
189   checkCuda( cudaEventSynchronize(stopEvent) );
190   checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
192   printf("%-50s GPU time (secs) = %.6f\n",
193          "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
/* (dim[dir] + 2*ghosts - 4) interior points per line, one thread per point */
196   int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
197   nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
199 #if defined(GPU_STAT)
200   checkCuda( cudaEventRecord(startEvent, 0) );
203   FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
204     npoints_grid, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
/* full-device sync so Df is complete before the caller uses it;
   return value is unchecked here (errors surface at later checked calls) */
206   cudaDeviceSynchronize();
208 #if defined(GPU_STAT)
209   checkCuda( cudaEventRecord(stopEvent, 0) );
210   checkCuda( cudaEventSynchronize(stopEvent) );
211   checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
213   printf("%-50s GPU time (secs) = %.6f\n",
214          "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);
221 /*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
/* Non-AoS layout: component v of grid point q is stored at
   f[q + v*npoints_local_wghosts] (components stored as separate planes).
   Otherwise identical to the AoS boundary kernel: one thread per 1D grid
   line along "dir", filling the two left-most and two right-most points
   with biased fourth-order stencils.
   NOTE(review): the remaining parameters and the "j < ngrid_points" guard
   appear elided from this dump -- confirm the guard exists in the full file. */
223 void FirstDerivativeFourthOrderCentral_boundary_kernel(
225   int npoints_local_wghosts,
/* flatten thread/block ids into the outer (grid-line) index */
235   int j = threadIdx.x + (blockDim.x * blockIdx.x);
237   double one_twelve = 1.0/12.0;
240   int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
241   _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
242   _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
243   _ArrayCopy1D_(index_outer,indexC,ndims);
/* i = -ghosts (left-most point): fully one-sided forward stencil */
246   for (i = -ghosts; i < -ghosts+1; i++) {
247     int qC, qp1, qp2, qp3, qp4;
248     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
249     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
250     indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
251     indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
252     indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
253     for (v=0; v<nvars; v++) {
254       Df[qC+v*npoints_local_wghosts] = (-25*f[qC+v*npoints_local_wghosts]+48*f[qp1+v*npoints_local_wghosts]-36*f[qp2+v*npoints_local_wghosts]+16*f[qp3+v*npoints_local_wghosts]-3*f[qp4+v*npoints_local_wghosts])*one_twelve;
/* i = -ghosts+1: one-point-biased stencil */
257   for (i = -ghosts+1; i < -ghosts+2; i++) {
258     int qC, qm1, qp1, qp2, qp3;
259     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
260     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
261     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
262     indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
263     indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
264     for (v=0; v<nvars; v++)
265       Df[qC+v*npoints_local_wghosts] = (-3*f[qm1+v*npoints_local_wghosts]-10*f[qC+v*npoints_local_wghosts]+18*f[qp1+v*npoints_local_wghosts]-6*f[qp2+v*npoints_local_wghosts]+f[qp3+v*npoints_local_wghosts])*one_twelve;
/* i = dim[dir]+ghosts-2: backward-biased mirror stencil */
268   for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
269     int qC, qm3, qm2, qm1, qp1;
270     indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
271     indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
272     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
273     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
274     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
275     for (v=0; v<nvars; v++)
276       Df[qC+v*npoints_local_wghosts] = (-f[qm3+v*npoints_local_wghosts]+6*f[qm2+v*npoints_local_wghosts]-18*f[qm1+v*npoints_local_wghosts]+10*f[qC+v*npoints_local_wghosts]+3*f[qp1+v*npoints_local_wghosts])*one_twelve;
/* i = dim[dir]+ghosts-1 (right-most point): fully one-sided backward stencil */
278   for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
279     int qC, qm4, qm3, qm2, qm1;
280     indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
281     indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
282     indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
283     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
284     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
285     for (v=0; v<nvars; v++)
286       Df[qC+v*npoints_local_wghosts] = (3*f[qm4+v*npoints_local_wghosts]-16*f[qm3+v*npoints_local_wghosts]+36*f[qm2+v*npoints_local_wghosts]-48*f[qm1+v*npoints_local_wghosts]+25*f[qC+v*npoints_local_wghosts])*one_twelve;
293 /*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
/* Non-AoS layout: component v of grid point q is stored at
   f[q + v*npoints_local_wghosts]. One thread per interior point: applies
   the fourth-order central stencil
   (1/12)(f_{i-2} - 8 f_{i-1} + 8 f_{i+1} - f_{i+2})
   at every i in [-ghosts+2, dim[dir]+ghosts-3] of every line along "dir". */
295 void FirstDerivativeFourthOrderCentral_interior_kernel(
297   int npoints_local_wghosts,
307   int i = threadIdx.x + (blockDim.x * blockIdx.x);
/* guard against the partially-filled last block */
308   if (i < ngrid_points) {
310     double one_twelve = 1.0/12.0;
313     int qC, qm1, qm2, qp1, qp2;
314     int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
/* split the flat thread index into line index j and in-line offset */
316     j = i/(dim[dir] + 2*ghosts - 4); /* Outer index */
317     _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
318     _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
319     _ArrayCopy1D_(index_outer,indexC,ndims);
321     //i += (-ghosts + 2);
/* remap the in-line offset to the actual grid index, first interior point at -ghosts+2 */
322     i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2);
323     indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
324     indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
325     indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
326     indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
327     indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
328     for (v=0; v<nvars; v++)
329       Df[qC+v*npoints_local_wghosts] = (f[qm2+v*npoints_local_wghosts]-8*f[qm1+v*npoints_local_wghosts]+8*f[qp1+v*npoints_local_wghosts]-f[qp2+v*npoints_local_wghosts])*one_twelve;
334 /*! Computes the fourth-order finite-difference approximation to the first derivative
335 (\b Note: not divided by the grid spacing):
337     \left(\partial f\right)_i = \left\{ \begin{array}{ll} \frac{1}{12}\left(-25f_i+48f_{i+1}-36f_{i+2}+16f_{i+3}-3f_{i+4}\right) & i=-g \\ \frac{1}{12}\left(-3f_{i-1}-10f_i+18f_{i+1}-6f_{i+2}+f_{i+3}\right) & i = -g+1 \\ \frac{1}{12}\left( f_{i-2}-8f_{i-1}+8f_{i+1}-f_{i+2} \right) & -g+2 \leq i \leq N+g-3 \\ \frac{1}{12}\left( -f_{i-3}+6f_{i-2}-18f_{i-1}+10f_i+3f_{i+1}\right) & i = N+g-2 \\ \frac{1}{12}\left( 3f_{i-4}-16f_{i-3}+36f_{i-2}-48f_{i-1}+25f_i \right) & i = N+g-1 \end{array}\right.
339 where \f$i\f$ is the grid index along the spatial dimension of the derivative, \f$g\f$ is the number of ghost points, and \f$N\f$ is the number of grid points (not including the ghost points) in the spatial dimension of the derivative.
342 + The first derivative is computed at the grid points or the cell centers.
343 + The first derivative is computed at the ghost points too. Thus, biased schemes are used
344 at and near the boundaries.
345 + \b Df and \b f are 1D arrays containing the function and its computed derivatives on a multi-
346 dimensional grid. The derivative along the specified dimension \b dir is computed by looping
347 through all grid lines along \b dir.
349 \sa FirstDerivativeFourthOrderCentral(), gpuFirstDerivativeFourthOrderCentral()
/* Host-side driver (non-AoS variant): identical launch structure to the AoS
   driver but passes solver->npoints_local_wghosts so the kernels can address
   the per-component data planes. NOTE(review): "bias" is accepted for
   interface compatibility but a central scheme has no bias -- it appears
   unused in the visible code. */
351 int gpuFirstDerivativeFourthOrderCentral(
352                                           double *Df,  /*!< Array to hold the computed first derivative (with ghost points) */
353                                           double *f,   /*!< Array containing the grid point function values whose first
354                                                             derivative is to be computed (with ghost points) */
355                                           int dir,     /*!< The spatial dimension along which the derivative is computed */
356                                           int bias,    /*!< Forward or backward differencing for non-central
357                                                             finite-difference schemes (-1: backward, 1: forward)*/
358                                           void *s,     /*!< Solver object of type #SolverContext */
359                                           void *m      /*!< MPI object of type #MPIContext */
362   SolverContext *solver = (SolverContext*) s;
364   int ghosts = solver->ghosts;
365   int ndims  = solver->ndims;
366   int nvars  = solver->nvars;
367   int *dim   = solver->dim_local;
/* NOTE(review): message says "FirstDerivativeFourthOrder()" but this is the
   GPU wrapper -- consider naming the actual function in the error text. */
370     fprintf(stderr, "Error in FirstDerivativeFourthOrder(): input arrays not allocated.\n");
374   /* create index and bounds for the outer loop, i.e., to loop over all 1D lines along
376   int bounds_outer[ndims];
377   _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
/* number of 1D grid lines along "dir"; ceil-divide into thread blocks */
378   int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer);
379   int nblocks = (N_outer - 1) / GPU_THREADS_PER_BLOCK + 1;
381 #if defined(GPU_STAT)
382   cudaEvent_t startEvent, stopEvent;
385   checkCuda( cudaEventCreate(&startEvent) );
386   checkCuda( cudaEventCreate(&stopEvent) );
388   checkCuda( cudaEventRecord(startEvent, 0) );
/* biased stencils at/near the boundaries of every line.
   NOTE(review): no cudaGetLastError() after the launch is visible in this
   dump -- launch-configuration errors would go undetected; confirm. */
391   FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
392     N_outer, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
395 #if defined(GPU_STAT)
396   checkCuda( cudaEventRecord(stopEvent, 0) );
397   checkCuda( cudaEventSynchronize(stopEvent) );
398   checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
400   printf("%-50s GPU time (secs) = %.6f\n",
401          "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
/* (dim[dir] + 2*ghosts - 4) interior points per line, one thread per point */
404   int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
405   nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;
407 #if defined(GPU_STAT)
408   checkCuda( cudaEventRecord(startEvent, 0) );
411   FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
412     npoints_grid, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
/* full-device sync so Df is complete before the caller uses it;
   return value is unchecked here (errors surface at later checked calls) */
414   cudaDeviceSynchronize();
416 #if defined(GPU_STAT)
417   checkCuda( cudaEventRecord(stopEvent, 0) );
418   checkCuda( cudaEventSynchronize(stopEvent) );
419   checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
421   printf("%-50s GPU time (secs) = %.6f\n",
422          "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);