HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
FirstDerivativeFourthOrder_GPU.cu
Go to the documentation of this file.
1 
5 #include <basic_gpu.h>
6 #include <firstderivative.h>
7 #include <arrayfunctions.h>
8 
9 #include <mpivars.h>
10 #include <hypar.h>
13 
14 #ifdef CUDA_VAR_ORDERDING_AOS
15 
/*! \brief Kernel: fourth-order one-sided first derivatives at the boundary
    points of each 1D line along dimension \a dir (array-of-structures layout).

    One thread handles one 1D grid line. It applies fourth-order biased
    stencils at the two leftmost and two rightmost points of the line
    (ghost points included); interior points are handled by the companion
    interior kernel. The derivative is with respect to the grid index
    (no grid-spacing division here).

    NOTE(review): declarator restored from this file's symbol index — the
    original line was lost in doc extraction; confirm against repository.
*/
__global__
void FirstDerivativeFourthOrderCentral_boundary_kernel(
  int N_outer,      /*!< number of 1D lines along dimension dir */
  int ghosts,       /*!< number of ghost points per dimension */
  int ndims,        /*!< number of spatial dimensions */
  int nvars,        /*!< number of vector components per grid point */
  int dir,          /*!< dimension along which to compute the derivative */
  const int *dim,   /*!< local grid size (without ghosts) in each dimension */
  const double *f,  /*!< input array (with ghosts), AoS layout */
  double *Df        /*!< output derivative array (with ghosts), AoS layout */
)
{
  int j = threadIdx.x + (blockDim.x * blockIdx.x);
  if (j < N_outer) {
    double one_twelve = 1.0/12.0;

    int i, v;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
    /* bounds for the outer loop over 1D lines: collapse dimension dir */
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    /* left boundary: fourth-order forward-biased stencils */
    for (i = -ghosts; i < -ghosts+1; i++) {
      int qC, qp1, qp2, qp3, qp4;
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (-25*f[qC*nvars+v]+48*f[qp1*nvars+v]-36*f[qp2*nvars+v]+16*f[qp3*nvars+v]-3*f[qp4*nvars+v])*one_twelve;
    }
    for (i = -ghosts+1; i < -ghosts+2; i++) {
      int qC, qm1, qp1, qp2, qp3;
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (-3*f[qm1*nvars+v]-10*f[qC*nvars+v]+18*f[qp1*nvars+v]-6*f[qp2*nvars+v]+f[qp3*nvars+v])*one_twelve;
    }
    /* right boundary: fourth-order backward-biased stencils */
    for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
      int qC, qm3, qm2, qm1, qp1;
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (-f[qm3*nvars+v]+6*f[qm2*nvars+v]-18*f[qm1*nvars+v]+10*f[qC*nvars+v]+3*f[qp1*nvars+v])*one_twelve;
    }
    for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
      int qC, qm4, qm3, qm2, qm1;
      indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      for (v=0; v<nvars; v++)
        Df[qC*nvars+v] = (3*f[qm4*nvars+v]-16*f[qm3*nvars+v]+36*f[qm2*nvars+v]-48*f[qm1*nvars+v]+25*f[qC*nvars+v])*one_twelve;
    }
  }

  return;
}
85 
/*! \brief Kernel: fourth-order central first derivative at the interior
    points of all 1D lines along dimension \a dir (array-of-structures layout).

    One thread handles one interior grid point. "Interior" here means all
    points except the two outermost on each side of a line, i.e.
    dim[dir] + 2*ghosts - 4 points per line; the excluded points are
    handled by the companion boundary kernel.

    NOTE(review): declarator restored from this file's symbol index — the
    original line was lost in doc extraction; confirm against repository.
*/
__global__
void FirstDerivativeFourthOrderCentral_interior_kernel(
  int ngrid_points, /*!< total number of interior points to process */
  int ghosts,       /*!< number of ghost points per dimension */
  int ndims,        /*!< number of spatial dimensions */
  int nvars,        /*!< number of vector components per grid point */
  int dir,          /*!< dimension along which to compute the derivative */
  const int *dim,   /*!< local grid size (without ghosts) in each dimension */
  const double *f,  /*!< input array (with ghosts), AoS layout */
  double *Df        /*!< output derivative array (with ghosts), AoS layout */
)
{
  int i = threadIdx.x + (blockDim.x * blockIdx.x);
  if (i < ngrid_points) {
    /* interior */
    double one_twelve = 1.0/12.0;

    int j, v;
    int qC, qm1, qm2, qp1, qp2;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];

    j = i/(dim[dir] + 2*ghosts - 4); /* Outer index */
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    /* position along the line, shifted so the first interior point
       is at index -ghosts+2 */
    i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2);
    indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
    indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
    indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
    indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
    indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
    for (v=0; v<nvars; v++)
      Df[qC*nvars+v] = (f[qm2*nvars+v]-8*f[qm1*nvars+v]+8*f[qp1*nvars+v]-f[qp2*nvars+v])*one_twelve;
  }
  return;
}
125 
/*! \brief Computes the fourth-order finite-difference first derivative
    (with respect to the grid index, i.e., no grid-spacing division) of
    \a f along dimension \a dir on the GPU, array-of-structures layout.

    Launches the boundary kernel (one thread per 1D line) followed by the
    interior kernel (one thread per interior point), then synchronizes.

    \return 0 on success, 1 if the input arrays are not allocated.

    NOTE(review): declarator restored from this file's symbol index — the
    original line was lost in doc extraction; confirm against repository.
*/
int gpuFirstDerivativeFourthOrderCentral(
  double *Df, /*!< output: array to hold the computed derivative (GPU) */
  double *f,  /*!< input: function values with ghost points (GPU) */
  int dir,    /*!< dimension along which to compute the derivative */
  int bias,   /*!< upwind bias (unused: scheme is central) */
  void *s,    /*!< solver context of type #SolverContext */
  void *m     /*!< MPI context (unused here) */
)
{
  SolverContext *solver = (SolverContext*) s;

  int ghosts = solver->ghosts;
  int ndims  = solver->ndims;
  int nvars  = solver->nvars;
  int *dim   = solver->dim_local;

  if ((!Df) || (!f)) {
    fprintf(stderr, "Error in FirstDerivativeFourthOrder(): input arrays not allocated.\n");
    return(1);
  }

  /* create index and bounds for the outer loop, i.e., to loop over all 1D lines along
     dimension "dir" */
  int bounds_outer[ndims];
  _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
  int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer);
  int nblocks = (N_outer - 1) / GPU_THREADS_PER_BLOCK + 1; /* ceil-div */

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    N_outer, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
#endif

  /* each line contributes dim[dir]+2*ghosts-4 interior points */
  int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
  nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );
  /* NOTE(review): return value of cudaDeviceSynchronize() is not checked;
     an in-kernel fault would go unnoticed here. */
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);
#endif
  return(0);
}
218 
219 #else
220 
/*! \brief Kernel: fourth-order one-sided first derivatives at the boundary
    points of each 1D line along dimension \a dir (structure-of-arrays
    layout: component v of point q is stored at q + v*npoints_local_wghosts).

    One thread handles one 1D grid line; stencils are identical to the
    AoS variant, only the indexing differs.

    NOTE(review): declarator restored from this file's symbol index — the
    original line was lost in doc extraction; confirm against repository.
*/
__global__
void FirstDerivativeFourthOrderCentral_boundary_kernel(
  int N_outer,               /*!< number of 1D lines along dimension dir */
  int npoints_local_wghosts, /*!< total local grid points including ghosts (SoA component stride) */
  int ghosts,                /*!< number of ghost points per dimension */
  int ndims,                 /*!< number of spatial dimensions */
  int nvars,                 /*!< number of vector components per grid point */
  int dir,                   /*!< dimension along which to compute the derivative */
  const int *dim,            /*!< local grid size (without ghosts) in each dimension */
  const double *f,           /*!< input array (with ghosts), SoA layout */
  double *Df                 /*!< output derivative array (with ghosts), SoA layout */
)
{
  int j = threadIdx.x + (blockDim.x * blockIdx.x);
  if (j < N_outer) {
    double one_twelve = 1.0/12.0;

    int i, v;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];
    /* bounds for the outer loop over 1D lines: collapse dimension dir */
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    /* left boundary: fourth-order forward-biased stencils */
    for (i = -ghosts; i < -ghosts+1; i++) {
      int qC, qp1, qp2, qp3, qp4;
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      indexC[dir] = i+4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp4);
      for (v=0; v<nvars; v++) {
        Df[qC+v*npoints_local_wghosts] = (-25*f[qC+v*npoints_local_wghosts]+48*f[qp1+v*npoints_local_wghosts]-36*f[qp2+v*npoints_local_wghosts]+16*f[qp3+v*npoints_local_wghosts]-3*f[qp4+v*npoints_local_wghosts])*one_twelve;
      }
    }
    for (i = -ghosts+1; i < -ghosts+2; i++) {
      int qC, qm1, qp1, qp2, qp3;
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
      indexC[dir] = i+3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp3);
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (-3*f[qm1+v*npoints_local_wghosts]-10*f[qC+v*npoints_local_wghosts]+18*f[qp1+v*npoints_local_wghosts]-6*f[qp2+v*npoints_local_wghosts]+f[qp3+v*npoints_local_wghosts])*one_twelve;
    }
    /* right boundary: fourth-order backward-biased stencils */
    for (i = dim[dir]+ghosts-2; i < dim[dir]+ghosts-1; i++) {
      int qC, qm3, qm2, qm1, qp1;
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (-f[qm3+v*npoints_local_wghosts]+6*f[qm2+v*npoints_local_wghosts]-18*f[qm1+v*npoints_local_wghosts]+10*f[qC+v*npoints_local_wghosts]+3*f[qp1+v*npoints_local_wghosts])*one_twelve;
    }
    for (i = dim[dir]+ghosts-1; i < dim[dir]+ghosts; i++) {
      int qC, qm4, qm3, qm2, qm1;
      indexC[dir] = i-4; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm4);
      indexC[dir] = i-3; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm3);
      indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
      indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
      indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
      for (v=0; v<nvars; v++)
        Df[qC+v*npoints_local_wghosts] = (3*f[qm4+v*npoints_local_wghosts]-16*f[qm3+v*npoints_local_wghosts]+36*f[qm2+v*npoints_local_wghosts]-48*f[qm1+v*npoints_local_wghosts]+25*f[qC+v*npoints_local_wghosts])*one_twelve;
    }
  }

  return;
}
292 
/*! \brief Kernel: fourth-order central first derivative at the interior
    points of all 1D lines along dimension \a dir (structure-of-arrays
    layout: component v of point q is stored at q + v*npoints_local_wghosts).

    One thread handles one interior grid point; there are
    dim[dir] + 2*ghosts - 4 interior points per line, the rest being
    covered by the boundary kernel.

    NOTE(review): declarator restored from this file's symbol index — the
    original line was lost in doc extraction; confirm against repository.
*/
__global__
void FirstDerivativeFourthOrderCentral_interior_kernel(
  int ngrid_points,          /*!< total number of interior points to process */
  int npoints_local_wghosts, /*!< total local grid points including ghosts (SoA component stride) */
  int ghosts,                /*!< number of ghost points per dimension */
  int ndims,                 /*!< number of spatial dimensions */
  int nvars,                 /*!< number of vector components per grid point */
  int dir,                   /*!< dimension along which to compute the derivative */
  const int *dim,            /*!< local grid size (without ghosts) in each dimension */
  const double *f,           /*!< input array (with ghosts), SoA layout */
  double *Df                 /*!< output derivative array (with ghosts), SoA layout */
)
{
  int i = threadIdx.x + (blockDim.x * blockIdx.x);
  if (i < ngrid_points) {
    /* interior */
    double one_twelve = 1.0/12.0;

    int j, v;
    int qC, qm1, qm2, qp1, qp2;
    int indexC[GPU_MAX_NDIMS], index_outer[GPU_MAX_NDIMS], bounds_outer[GPU_MAX_NDIMS];

    j = i/(dim[dir] + 2*ghosts - 4); /* Outer index */
    _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
    _ArrayIndexnD_(ndims,j,bounds_outer,index_outer,0);
    _ArrayCopy1D_(index_outer,indexC,ndims);

    /* position along the line, shifted so the first interior point
       is at index -ghosts+2 */
    i = (i % (dim[dir] + 2*ghosts - 4)) + (-ghosts + 2);
    indexC[dir] = i-2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm2);
    indexC[dir] = i-1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qm1);
    indexC[dir] = i  ; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qC );
    indexC[dir] = i+1; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp1);
    indexC[dir] = i+2; _ArrayIndex1D_(ndims,dim,indexC,ghosts,qp2);
    for (v=0; v<nvars; v++)
      Df[qC+v*npoints_local_wghosts] = (f[qm2+v*npoints_local_wghosts]-8*f[qm1+v*npoints_local_wghosts]+8*f[qp1+v*npoints_local_wghosts]-f[qp2+v*npoints_local_wghosts])*one_twelve;
  }
  return;
}
333 
/*! \brief Computes the fourth-order finite-difference first derivative
    (with respect to the grid index, i.e., no grid-spacing division) of
    \a f along dimension \a dir on the GPU, structure-of-arrays layout.

    Launches the boundary kernel (one thread per 1D line) followed by the
    interior kernel (one thread per interior point), then synchronizes.

    \return 0 on success, 1 if the input arrays are not allocated.

    NOTE(review): declarator restored from this file's symbol index — the
    original line was lost in doc extraction; confirm against repository.
*/
int gpuFirstDerivativeFourthOrderCentral(
  double *Df, /*!< output: array to hold the computed derivative (GPU) */
  double *f,  /*!< input: function values with ghost points (GPU) */
  int dir,    /*!< dimension along which to compute the derivative */
  int bias,   /*!< upwind bias (unused: scheme is central) */
  void *s,    /*!< solver context of type #SolverContext */
  void *m     /*!< MPI context (unused here) */
)
{
  SolverContext *solver = (SolverContext*) s;

  int ghosts = solver->ghosts;
  int ndims  = solver->ndims;
  int nvars  = solver->nvars;
  int *dim   = solver->dim_local;

  if ((!Df) || (!f)) {
    fprintf(stderr, "Error in FirstDerivativeFourthOrder(): input arrays not allocated.\n");
    return(1);
  }

  /* create index and bounds for the outer loop, i.e., to loop over all 1D lines along
     dimension "dir" */
  int bounds_outer[ndims];
  _ArrayCopy1D_(dim,bounds_outer,ndims); bounds_outer[dir] = 1;
  int N_outer; _ArrayProduct1D_(bounds_outer,ndims,N_outer);
  int nblocks = (N_outer - 1) / GPU_THREADS_PER_BLOCK + 1; /* ceil-div */

#if defined(GPU_STAT)
  cudaEvent_t startEvent, stopEvent;
  float milliseconds;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_boundary_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    N_outer, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_boundary", milliseconds*1e-3);
#endif

  /* each line contributes dim[dir]+2*ghosts-4 interior points */
  int npoints_grid = N_outer*(dim[dir] + 2*ghosts - 4);
  nblocks = (npoints_grid - 1) / GPU_THREADS_PER_BLOCK + 1;

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(startEvent, 0) );
#endif

  FirstDerivativeFourthOrderCentral_interior_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
    npoints_grid, solver->npoints_local_wghosts, ghosts, ndims, nvars, dir, solver->gpu_dim_local, f, Df
  );
  /* NOTE(review): return value of cudaDeviceSynchronize() is not checked;
     an in-kernel fault would go unnoticed here. */
  cudaDeviceSynchronize();

#if defined(GPU_STAT)
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

  printf("%-50s GPU time (secs) = %.6f\n",
         "FirstDerivativeFourthOrderCentral_interior", milliseconds*1e-3);
#endif

  return(0);
}
427 
428 #endif
int npoints_local_wghosts
Definition: hypar.h:42
HyPar SolverContext
Definitions for the functions computing the first derivative.
#define _ArrayIndexnD_(N, index, imax, i, ghost)
int * dim_local
Definition: hypar.h:37
MPI related function definitions.
#define GPU_THREADS_PER_BLOCK
Definition: basic_gpu.h:7
#define GPU_MAX_NDIMS
Definition: basic_gpu.h:8
int ghosts
Definition: hypar.h:52
#define _ArrayIndex1D_(N, imax, i, ghost, index)
__global__ void FirstDerivativeFourthOrderCentral_boundary_kernel(int N_outer, int npoints_local_wghosts, int ghosts, int ndims, int nvars, int dir, const int *dim, const double *f, double *Df)
int * gpu_dim_local
Definition: hypar.h:455
#define _ArrayCopy1D_(x, y, size)
int nvars
Definition: hypar.h:29
Contains structure definition for hypar.
int ndims
Definition: hypar.h:26
Contains macros and function definitions for common array operations.
__global__ void FirstDerivativeFourthOrderCentral_interior_kernel(int ngrid_points, int npoints_local_wghosts, int ghosts, int ndims, int nvars, int dir, const int *dim, const double *f, double *Df)
#define _ArrayProduct1D_(x, size, p)
int gpuFirstDerivativeFourthOrderCentral(double *, double *, int, int, void *, void *)
Structure of MPI-related variables.
MPIVariables MPIContext
Structure containing all solver-specific variables and functions.
Definition: hypar.h:23