HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
FirstDerivativeFourthOrder_GPU.cu
/*! @file FirstDerivativeFourthOrder_GPU.cu
    @author Youngdae Kim
    @brief GPU implementation of the fourth-order finite-difference approximation to the first derivative
*/
#include <basic_gpu.h>
#include <firstderivative.h>
#include <arrayfunctions.h>

#include <mpivars.h>
#include <hypar.h>
typedef MPIVariables MPIContext;
typedef HyPar SolverContext;

#ifdef CUDA_VAR_ORDERDING_AOS

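/* Data-layout note: the CUDA_VAR_ORDERDING_AOS macro selects between two builds of the same
   operator. In this branch the solution is stored in array-of-structures (AoS) order, so
   component v of grid point q is addressed as f[q*nvars+v]; the #else branch below uses the
   structure-of-arrays (SoA) addressing f[q + v*npoints_local_wghosts]. A minimal sketch of
   the AoS addressing, assuming nvars = 3 (the names rho, u, p are purely illustrative):

     double rho = f[q*nvars + 0];
     double u   = f[q*nvars + 1];
     double p   = f[q*nvars + 2];
*/
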
/*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
__global__
void FirstDerivativeFourthOrderCentral_boundary_kernel(
  int N_outer,
  int ghosts,
  int ndims,
  int nvars,
  int dir,
  const int *dim,
  const double *f,
  double *Df
)
{
  int j = threadIdx.x + (blockDim.x * blockIdx.x);
  if (j < N_outer) {
    double one_twelve = 1.0/12.0;

    int i, v;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    /* left boundary */
    for (i = -ghosts; i < -ghosts+1; i++) {
      int qC, qp1, qp2, qp3, qp4;
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (-25*f[qC*nvars+v]+48*f[qp1*nvars+v]-36*f[qp2*nvars+v]+16*f[qp3*nvars+v]-3*f[qp4*nvars+v])*one_twelve;
    }
    for (i = -ghosts+1; i < -ghosts+2; i++) {
      int qC, qm1, qp1, qp2, qp3;
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (-3*f[qm1*nvars+v]-10*f[qC*nvars+v]+18*f[qp1*nvars+v]-6*f[qp2*nvars+v]+f[qp3*nvars+v])*one_twelve;
    }
    /* right boundary */
    for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
      int qC, qm3, qm2, qm1, qp1;
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (-f[qm3*nvars+v]+6*f[qm2*nvars+v]-18*f[qm1*nvars+v]+10*f[qC*nvars+v]+3*f[qp1*nvars+v])*one_twelve;
    }
    for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
      int qC, qm4, qm3, qm2, qm1;
      indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (3*f[qm4*nvars+v]-16*f[qm3*nvars+v]+36*f[qm2*nvars+v]-48*f[qm1*nvars+v]+25*f[qC*nvars+v])*one_twelve;
    }
  }

  return;
}

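/* A worked check of the left-boundary stencil above (illustrative only): for the linear
   function f_i = i*h, the one-sided formula at the leftmost point returns h times the exact
   derivative, consistent with the operator not being divided by the grid spacing:

     ( -25*(0*h) + 48*(1*h) - 36*(2*h) + 16*(3*h) - 3*(4*h) ) / 12
       = ( 48 - 72 + 48 - 12 ) * h / 12
       = h
*/
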
/*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
__global__
void FirstDerivativeFourthOrderCentral_interior_kernel(
  int ngrid_points,
  int ghosts,
  int ndims,
  int nvars,
  int dir,
  const int *dim,
  const double *f,
  double *Df
)
{
  int i = threadIdx.x + (blockDim.x * blockIdx.x);
  if (i < ngrid_points) {
    /* interior */
    double one_twelve = 1.0/12.0;

    int j, v;
    int qC, qm1, qm2, qp1, qp2;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];

    j = i/(dim[dir] + 2*ghosts - 4); /* outer index of the grid line */
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2); /* offset along the line */
    indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
    indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
    indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
    indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
    indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
    for (v=0; v<nvars; v++)
      Df[qC*nvars+v] = (f[qm2*nvars+v]-8*f[qm1*nvars+v]+8*f[qp1*nvars+v]-f[qp2*nvars+v])*one_twelve;
  }
  return;
}
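
/* Index-mapping note for the interior kernel above: each global thread index i is split into
   an outer grid-line index j = i / (dim[dir] + 2*ghosts - 4) and an offset along that line,
   (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2), so the interior points
   i = -ghosts+2, ..., dim[dir]+ghosts-3 of every line are each handled by exactly one thread.
   A small worked example, assuming dim[dir] = 8 and ghosts = 3: every line has
   8 + 2*3 - 4 = 10 interior points, and thread i = 23 maps to line j = 23/10 = 2 and to the
   point (23 % 10) + (-3 + 2) = 2 on that line. */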

/*! Computes the fourth-order finite-difference approximation to the first derivative
    (\b Note: not divided by the grid spacing):
    \f{equation}{
      \left(\partial f\right)_i = \left\{ \begin{array}{ll} \frac{1}{12}\left(-25f_i+48f_{i+1}-36f_{i+2}+16f_{i+3}-3f_{i+4}\right) & i=-g \\ \frac{1}{12}\left(-3f_{i-1}-10f_i+18f_{i+1}-6f_{i+2}+f_{i+3}\right) & i = -g+1 \\ \frac{1}{12}\left( f_{i-2}-8f_{i-1}+8f_{i+1}-f_{i+2} \right) & -g+2 \leq i \leq N+g-3 \\ \frac{1}{12}\left( -f_{i-3}+6f_{i-2}-18f_{i-1}+10f_i+3f_{i+1}\right) & i = N+g-2 \\ \frac{1}{12}\left( 3f_{i-4}-16f_{i-3}+36f_{i-2}-48f_{i-1}+25f_i \right) & i = N+g-1 \end{array}\right.
    \f}
    where \f$i\f$ is the grid index along the spatial dimension of the derivative, \f$g\f$ is the number of ghost points, and \f$N\f$ is the number of grid points (not including the ghost points) in the spatial dimension of the derivative.
    \n\n
    Notes:
    + The first derivative is computed at the grid points or the cell centers.
    + The first derivative is computed at the ghost points too. Thus, biased schemes are used
      at and near the boundaries.
    + \b Df and \b f are 1D arrays containing the function and its computed derivatives on a
      multi-dimensional grid. The derivative along the specified dimension \b dir is computed
      by looping through all grid lines along \b dir.

    \sa FirstDerivativeFourthOrderCentral()
*/
int gpuFirstDerivativeFourthOrderCentral(
  double *Df,   /*!< Array to hold the computed first derivative (with ghost points) */
  double *f,    /*!< Array containing the grid point function values whose first
                     derivative is to be computed (with ghost points) */
  int    dir,   /*!< The spatial dimension along which the derivative is computed */
  int    bias,  /*!< Forward or backward differencing for non-central
                     finite-difference schemes (-1: backward, 1: forward) */
  void   *s,    /*!< Solver object of type #SolverContext */
  void   *m     /*!< MPI object of type #MPIContext */
)
{
  SolverContext *solver = (SolverContext*) s;

  int ghosts = solver->ghosts;
  int ndims  = solver->ndims;
  int nvars  = solver->nvars;
  int *dim   = solver->dim_local;

  if ((!Df) || (!f)) {
    fprintf(stderr, "Error in gpuFirstDerivativeFourthOrderCentral(): input arrays not allocated.\n");
    return(1);
  }

  /* create index and bounds for the outer loop, i.e., to loop over all 1D lines along
     dimension "dir" */
  int bounds_outer[ndims];
  _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
  int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer);
  int nblocks = (N_outer - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    N_outer, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
#endif

  int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
  nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);
#endif
  return(0);
}

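/* Launch-configuration note: the boundary kernel above is launched with one thread per grid
   line along "dir" (N_outer threads in total), while the interior kernel gets one thread per
   interior point (N_outer*(dim[dir] + 2*ghosts - 4) threads). The block count
   (n - 1)/GPU_THREADS_PER_BLOCK + 1 is integer ceiling division; e.g., assuming (purely for
   illustration) GPU_THREADS_PER_BLOCK = 256 and n = 1000 threads, this gives
   999/256 + 1 = 3 + 1 = 4 blocks. */
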
#else

/*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
__global__
void FirstDerivativeFourthOrderCentral_boundary_kernel(
  int N_outer,
  int npoints_local_wghosts,
  int ghosts,
  int ndims,
  int nvars,
  int dir,
  const int *dim,
  const double *f,
  double *Df
)
{
  int j = threadIdx.x + (blockDim.x * blockIdx.x);
  if (j < N_outer) {
    double one_twelve = 1.0/12.0;

    int i, v;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    /* left boundary */
    for (i = -ghosts; i < -ghosts+1; i++) {
      int qC, qp1, qp2, qp3, qp4;
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
      for (v=0; v<nvars; v++) {
        Df[qC+v*npoints_local_wghosts] = (-25*f[qC+v*npoints_local_wghosts]+48*f[qp1+v*npoints_local_wghosts]-36*f[qp2+v*npoints_local_wghosts]+16*f[qp3+v*npoints_local_wghosts]-3*f[qp4+v*npoints_local_wghosts])*one_twelve;
      }
    }
    for (i = -ghosts+1; i < -ghosts+2; i++) {
      int qC, qm1, qp1, qp2, qp3;
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (-3*f[qm1+v*npoints_local_wghosts]-10*f[qC+v*npoints_local_wghosts]+18*f[qp1+v*npoints_local_wghosts]-6*f[qp2+v*npoints_local_wghosts]+f[qp3+v*npoints_local_wghosts])*one_twelve;
    }
    /* right boundary */
    for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
      int qC, qm3, qm2, qm1, qp1;
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (-f[qm3+v*npoints_local_wghosts]+6*f[qm2+v*npoints_local_wghosts]-18*f[qm1+v*npoints_local_wghosts]+10*f[qC+v*npoints_local_wghosts]+3*f[qp1+v*npoints_local_wghosts])*one_twelve;
    }
    for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
      int qC, qm4, qm3, qm2, qm1;
      indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (3*f[qm4+v*npoints_local_wghosts]-16*f[qm3+v*npoints_local_wghosts]+36*f[qm2+v*npoints_local_wghosts]-48*f[qm1+v*npoints_local_wghosts]+25*f[qC+v*npoints_local_wghosts])*one_twelve;
    }
  }

  return;
}

/*! Kernel for gpuFirstDerivativeFourthOrderCentral() */
__global__
void FirstDerivativeFourthOrderCentral_interior_kernel(
  int ngrid_points,
  int npoints_local_wghosts,
  int ghosts,
  int ndims,
  int nvars,
  int dir,
  const int *dim,
  const double *f,
  double *Df
)
{
  int i = threadIdx.x + (blockDim.x * blockIdx.x);
  if (i < ngrid_points) {
    /* interior */
    double one_twelve = 1.0/12.0;

    int j, v;
    int qC, qm1, qm2, qp1, qp2;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];

    j = i/(dim[dir] + 2*ghosts - 4); /* outer index of the grid line */
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2); /* offset along the line */
    indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
    indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
    indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
    indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
    indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
    for (v=0; v<nvars; v++)
      Df[qC+v*npoints_local_wghosts] = (f[qm2+v*npoints_local_wghosts]-8*f[qm1+v*npoints_local_wghosts]+8*f[qp1+v*npoints_local_wghosts]-f[qp2+v*npoints_local_wghosts])*one_twelve;
  }
  return;
}
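
/* Layout note for this branch: the kernels above address component v of point q as
   f[q + v*npoints_local_wghosts], i.e. each solution component occupies its own contiguous
   block of npoints_local_wghosts entries, instead of the interleaved f[q*nvars+v] addressing
   used in the AoS branch. A minimal sketch, assuming nvars = 2 and writing np for
   npoints_local_wghosts:

     f: [ component 0 at points 0 .. np-1 | component 1 at points 0 .. np-1 ]
     Df[qC + v*np] = ( f[qm2 + v*np] - ... ) * one_twelve;   // same stencil, SoA addressing
*/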

/*! Computes the fourth-order finite-difference approximation to the first derivative
    (\b Note: not divided by the grid spacing):
    \f{equation}{
      \left(\partial f\right)_i = \left\{ \begin{array}{ll} \frac{1}{12}\left(-25f_i+48f_{i+1}-36f_{i+2}+16f_{i+3}-3f_{i+4}\right) & i=-g \\ \frac{1}{12}\left(-3f_{i-1}-10f_i+18f_{i+1}-6f_{i+2}+f_{i+3}\right) & i = -g+1 \\ \frac{1}{12}\left( f_{i-2}-8f_{i-1}+8f_{i+1}-f_{i+2} \right) & -g+2 \leq i \leq N+g-3 \\ \frac{1}{12}\left( -f_{i-3}+6f_{i-2}-18f_{i-1}+10f_i+3f_{i+1}\right) & i = N+g-2 \\ \frac{1}{12}\left( 3f_{i-4}-16f_{i-3}+36f_{i-2}-48f_{i-1}+25f_i \right) & i = N+g-1 \end{array}\right.
    \f}
    where \f$i\f$ is the grid index along the spatial dimension of the derivative, \f$g\f$ is the number of ghost points, and \f$N\f$ is the number of grid points (not including the ghost points) in the spatial dimension of the derivative.
    \n\n
    Notes:
    + The first derivative is computed at the grid points or the cell centers.
    + The first derivative is computed at the ghost points too. Thus, biased schemes are used
      at and near the boundaries.
    + \b Df and \b f are 1D arrays containing the function and its computed derivatives on a
      multi-dimensional grid. The derivative along the specified dimension \b dir is computed
      by looping through all grid lines along \b dir.

    \sa FirstDerivativeFourthOrderCentral(), gpuFirstDerivativeFourthOrderCentral()
*/
int gpuFirstDerivativeFourthOrderCentral(
  double *Df,   /*!< Array to hold the computed first derivative (with ghost points) */
  double *f,    /*!< Array containing the grid point function values whose first
                     derivative is to be computed (with ghost points) */
  int    dir,   /*!< The spatial dimension along which the derivative is computed */
  int    bias,  /*!< Forward or backward differencing for non-central
                     finite-difference schemes (-1: backward, 1: forward) */
  void   *s,    /*!< Solver object of type #SolverContext */
  void   *m     /*!< MPI object of type #MPIContext */
)
{
  SolverContext *solver = (SolverContext*) s;

  int ghosts = solver->ghosts;
  int ndims  = solver->ndims;
  int nvars  = solver->nvars;
  int *dim   = solver->dim_local;

  if ((!Df) || (!f)) {
    fprintf(stderr, "Error in gpuFirstDerivativeFourthOrderCentral(): input arrays not allocated.\n");
    return(1);
  }

  /* create index and bounds for the outer loop, i.e., to loop over all 1D lines along
     dimension "dir" */
  int bounds_outer[ndims];
  _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
  int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer);
  int nblocks = (N_outer - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    N_outer, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
#endif

  int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
  nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);
#endif

  return(0);
}

#endif
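
/* A minimal host-side usage sketch (assumptions: "solver" is a fully initialized HyPar object,
   "mpi" the corresponding MPIVariables object, and "f_dev", "Df_dev" are device arrays of size
   solver.npoints_local_wghosts*solver.nvars stored in the layout matching the active
   CUDA_VAR_ORDERDING_AOS setting; the variable names are purely illustrative):

     int ierr = gpuFirstDerivativeFourthOrderCentral(
                  Df_dev,    // output: undivided first derivative, including ghost points
                  f_dev,     // input : function values, including ghost points
                  0,         // dir   : differentiate along the first spatial dimension
                  1,         // bias  : not used by this central scheme
                  &solver,   // solver object of type HyPar (SolverContext)
                  &mpi );    // MPI object of type MPIVariables (MPIContext)
     if (ierr) return(ierr);

   The result must still be divided by the grid spacing along "dir" to obtain the actual
   first derivative. */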