HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
MPIExchangeBoundariesnD_GPU.cu
/*! @file MPIExchangeBoundariesnD_GPU.cu
    @brief Exchange data across MPI ranks and fill ghost points for an n-dimensional array (GPU version)
    @author Youngdae Kim
*/

#include <basic_gpu.h>
#include <arrayfunctions_gpu.h>
#include <mpivars.h>

#ifdef CUDA_VAR_ORDERDING_AOS

/*! Kernel for gpuMPIExchangeBoundariesnD(): pack the ghosts-deep slab of interior
    points adjacent to the left boundary along dimension d into the left-face block
    of the send buffer (AoS solution layout). */
__global__
void gpuFillSendBufLeft_kernel(
  int npoints_grid,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0);
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((var+nvars*p1),(sendbuf+2*d*stride+nvars*p2),nvars);
  }

  return;
}
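
/* Worked example (illustrative, 2D case): for d = 0, dim = {nx, ny}, and g ghost
   layers, the kernel above runs with npoints_grid = g*ny active threads. Thread p
   is unflattened to a point of the g-by-ny slab next to the left boundary; p1 is
   that point's 1D location in var (whose extents include the ghost layers), p2 its
   1D location within the packed slab, and the point's nvars components are copied
   into the left-face block of sendbuf. */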

/*! Kernel for gpuMPIExchangeBoundariesnD(): pack the ghosts-deep slab of interior
    points adjacent to the right boundary along dimension d into the right-face
    block of the send buffer (AoS solution layout). */
__global__
void gpuFillSendBufRight_kernel(
  int npoints_grid,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d]-ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((var+nvars*p1),(sendbuf+(2*d+1)*stride+nvars*p2),nvars);
  }

  return;
}

/*! Kernel for gpuMPIExchangeBoundariesnD(): unpack the left-face block of the
    receive buffer into the ghost points beyond the left boundary along
    dimension d (AoS solution layout). */
__global__
void gpuFillRecvBufLeft_kernel(
  int npoints_grid,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = -ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((recvbuf+2*d*stride+nvars*p2),(var+nvars*p1),nvars);
  }

  return;
}

/*! Kernel for gpuMPIExchangeBoundariesnD(): unpack the right-face block of the
    receive buffer into the ghost points beyond the right boundary along
    dimension d (AoS solution layout). */
__global__
void gpuFillRecvBufRight_kernel(
  int npoints_grid,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d];
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((recvbuf+(2*d+1)*stride+nvars*p2),(var+nvars*p1),nvars);
  }

  return;
}
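
/* Layout of the GPU send/receive buffers assumed by the kernels above: each buffer
   holds 2*ndims face blocks of stride = mpi->maxbuf doubles each; the block at
   offset 2*d*stride holds the left face along dimension d, and the block at offset
   (2*d+1)*stride the right face. */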

/*!
  Exchange data across MPI ranks and fill in the ghost points of an n-dimensional
  array (where n is the total number of spatial dimensions), with the array and the
  send/receive buffers resident on the GPU. For each spatial dimension, the data to
  be sent is packed into the send buffer on the device, exchanged with the
  neighboring ranks through non-blocking MPI calls, and the received data is
  unpacked into the ghost points on the device.

  Consider a two-dimensional problem, partitioned on 21 MPI ranks as follows:
  @image html mpi_ranks.png
  @image latex mpi_ranks.eps width=0.9\textwidth
  and consider rank 9.

  + Along dimension 0, rank 9 exchanges data with ranks 8 and 10, and fills in its
    ghost points on those faces.
  + Along dimension 1, rank 9 exchanges data with ranks 2 and 16, and fills in its
    ghost points on those faces.
*/
extern "C" int gpuMPIExchangeBoundariesnD(
  int ndims,
  int nvars,
  const int * __restrict__ dim,
  int ghosts,
  void *m,
  double * __restrict__ var
)
{
#ifndef serial
  MPIVariables *mpi = (MPIVariables*) m;
  int d;

  int *ip      = mpi->ip;
  int *iproc   = mpi->iproc;
  int *bcflag  = mpi->bcperiodic;
  int *cpu_dim = mpi->cpu_dim;

  int neighbor_rank[2*ndims], nip[ndims], bounds[ndims];
  MPI_Request rcvreq[2*ndims], sndreq[2*ndims];
  for (d=0; d<2*ndims; d++) rcvreq[d] = sndreq[d] = MPI_REQUEST_NULL;

  /* each process has 2*ndims neighbors (except at non-periodic physical boundaries) */
  /* calculate the rank of these neighbors (-1 -> none) */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == 0) nip[d] = iproc[d]-1;
    else            nip[d]--;
    if ((ip[d] == 0) && (!bcflag[d])) neighbor_rank[2*d] = -1;
    else                              neighbor_rank[2*d] = MPIRank1D(ndims,iproc,nip);
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == (iproc[d]-1)) nip[d] = 0;
    else                       nip[d]++;
    if ((ip[d] == (iproc[d]-1)) && (!bcflag[d])) neighbor_rank[2*d+1] = -1;
    else                                         neighbor_rank[2*d+1] = MPIRank1D(ndims,iproc,nip);
  }

  /* calculate dimensions of each of the send-receive regions */
  double *sendbuf = mpi->gpu_sendbuf;
  double *recvbuf = mpi->gpu_recvbuf;
  int stride = mpi->maxbuf;
  int bufdim[ndims];
  for (d = 0; d < ndims; d++) {
    bufdim[d] = 1;
    int i;
    for (i = 0; i < ndims; i++) {
      if (i == d) bufdim[d] *= ghosts;
      else        bufdim[d] *= cpu_dim[i];
    }
  }

  /* post the receive requests */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d ] != -1) {
      MPI_Irecv(&recvbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d ],1630,
                mpi->world,&rcvreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Irecv(&recvbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1631,
                mpi->world,&rcvreq[2*d+1]);
    }
  }

  /* copy the data to be sent into the send buffers */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
  }
  cudaDeviceSynchronize();

  /* send the data */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d ] != -1) {
      MPI_Isend(&sendbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d ],1631,
                mpi->world,&sndreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Isend(&sendbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1630,
                mpi->world,&sndreq[2*d+1]);
    }
  }

  /* wait until all the data has been received */
  MPI_Status status_arr[2*ndims];
  MPI_Waitall(2*ndims,rcvreq,status_arr);
  /* copy the received data into the ghost points */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
  }
  cudaDeviceSynchronize();
  /* wait until the send requests are complete so the send buffers can be safely reused */
  MPI_Waitall(2*ndims,sndreq,status_arr);
#endif
  return 0;
}
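
/* A minimal usage sketch (not part of the original solver; the "example_" helper
   and the gpu_dim_local/gpu_u names are hypothetical): exchange the ghost points
   of a 3-component 2D solution resident on the device. gpu_dim_local is assumed
   to be the device-resident copy of the local grid dimensions, and mpi a fully
   initialized MPIVariables object. */
#if 0
static int example_exchange(MPIVariables *mpi, const int *gpu_dim_local,
                            int ghosts, double *gpu_u)
{
  return gpuMPIExchangeBoundariesnD( 2,             /* ndims */
                                     3,             /* nvars */
                                     gpu_dim_local, /* device copy of dim */
                                     ghosts,
                                     (void*) mpi,
                                     gpu_u );
}
#endif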

#else

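/* In this branch the solution is stored on the GPU in a structure-of-arrays (SoA)
   layout: component v of grid point p lives at var[p + v*npoints_local_wghosts],
   instead of var[nvars*p + v] as in the array-of-structures (AoS) branch above.
   The kernels therefore take the total number of local grid points (including
   ghosts) as an extra argument and replace _ArrayCopy1D_ with an explicitly
   strided loop over the components. */
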
/*! Kernel for gpuMPIExchangeBoundariesnD(): pack the ghosts-deep slab of interior
    points adjacent to the left boundary along dimension d into the left-face block
    of the send buffer (SoA solution layout). */
__global__
void gpuFillSendBufLeft_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0);
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      sendbuf[2*d*stride+nvars*p2+v] = var[p1+v*npoints_local_wghosts];
    }
    /* AoS equivalent: _ArrayCopy1D_((var+nvars*p1),(sendbuf+2*d*stride+nvars*p2),nvars); */
  }

  return;
}

/*! Kernel for gpuMPIExchangeBoundariesnD(): pack the ghosts-deep slab of interior
    points adjacent to the right boundary along dimension d into the right-face
    block of the send buffer (SoA solution layout). */
__global__
void gpuFillSendBufRight_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d]-ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      sendbuf[(2*d+1)*stride+nvars*p2+v] = var[p1+v*npoints_local_wghosts];
    }
    /* AoS equivalent: _ArrayCopy1D_((var+nvars*p1),(sendbuf+(2*d+1)*stride+nvars*p2),nvars); */
  }

  return;
}

/*! Kernel for gpuMPIExchangeBoundariesnD(): unpack the left-face block of the
    receive buffer into the ghost points beyond the left boundary along
    dimension d (SoA solution layout). */
__global__
void gpuFillRecvBufLeft_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = -ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      var[p1+v*npoints_local_wghosts] = recvbuf[2*d*stride+nvars*p2+v];
    }
    /* AoS equivalent: _ArrayCopy1D_((recvbuf+2*d*stride+nvars*p2),(var+nvars*p1),nvars); */
  }

  return;
}

/*! Kernel for gpuMPIExchangeBoundariesnD(): unpack the right-face block of the
    receive buffer into the ghost points beyond the right boundary along
    dimension d (SoA solution layout). */
__global__
void gpuFillRecvBufRight_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d,
  int ndims,
  int nvars,
  int ghosts,
  int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;

    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d];
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      var[p1+v*npoints_local_wghosts] = recvbuf[(2*d+1)*stride+nvars*p2+v];
    }
    /* AoS equivalent: _ArrayCopy1D_((recvbuf+(2*d+1)*stride+nvars*p2),(var+nvars*p1),nvars); */
  }

  return;
}
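
/* Note: npoints_local_wghosts in the kernels above is the total number of local
   grid points including ghosts, i.e., the component stride of the SoA solution
   array; the host function below computes it as the product of
   (cpu_dim[d] + 2*ghosts) over all dimensions and passes it as `size`. */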

/*!
  Exchange data across MPI ranks and fill in the ghost points of an n-dimensional
  array (where n is the total number of spatial dimensions), with the array and the
  send/receive buffers resident on the GPU. For each spatial dimension, the data to
  be sent is packed into the send buffer on the device, exchanged with the
  neighboring ranks through non-blocking MPI calls, and the received data is
  unpacked into the ghost points on the device.

  Consider a two-dimensional problem, partitioned on 21 MPI ranks as follows:
  @image html mpi_ranks.png
  @image latex mpi_ranks.eps width=0.9\textwidth
  and consider rank 9.

  + Along dimension 0, rank 9 exchanges data with ranks 8 and 10, and fills in its
    ghost points on those faces.
  + Along dimension 1, rank 9 exchanges data with ranks 2 and 16, and fills in its
    ghost points on those faces.
*/
extern "C" int gpuMPIExchangeBoundariesnD(
  int ndims,
  int nvars,
  const int * __restrict__ dim,
  int ghosts,
  void *m,
  double * __restrict__ var
)
{
#ifndef serial

  MPIVariables *mpi = (MPIVariables*) m;
  int d;

  int *ip      = mpi->ip;
  int *iproc   = mpi->iproc;
  int *bcflag  = mpi->bcperiodic;
  int *cpu_dim = mpi->cpu_dim;
  /* total number of local grid points, including ghosts */
  int size = 1; for (d=0; d<ndims; d++) size *= (cpu_dim[d]+2*ghosts);

  int neighbor_rank[2*ndims], nip[ndims], bounds[ndims];
  MPI_Request rcvreq[2*ndims], sndreq[2*ndims];
  for (d=0; d<2*ndims; d++) rcvreq[d] = sndreq[d] = MPI_REQUEST_NULL;

  /* each process has 2*ndims neighbors (except at non-periodic physical boundaries) */
  /* calculate the rank of these neighbors (-1 -> none) */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == 0) nip[d] = iproc[d]-1;
    else            nip[d]--;
    if ((ip[d] == 0) && (!bcflag[d])) neighbor_rank[2*d] = -1;
    else                              neighbor_rank[2*d] = MPIRank1D(ndims,iproc,nip);
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == (iproc[d]-1)) nip[d] = 0;
    else                       nip[d]++;
    if ((ip[d] == (iproc[d]-1)) && (!bcflag[d])) neighbor_rank[2*d+1] = -1;
    else                                         neighbor_rank[2*d+1] = MPIRank1D(ndims,iproc,nip);
  }

  /* calculate dimensions of each of the send-receive regions */
  double *sendbuf = mpi->gpu_sendbuf;
  double *recvbuf = mpi->gpu_recvbuf;
  int stride = mpi->maxbuf;
  int bufdim[ndims];
  for (d = 0; d < ndims; d++) {
    bufdim[d] = 1;
    int i;
    for (i = 0; i < ndims; i++) {
      if (i == d) bufdim[d] *= ghosts;
      else        bufdim[d] *= cpu_dim[i];
    }
  }

  /* post the receive requests */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d ] != -1) {
      MPI_Irecv(&recvbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d ],1630,
                mpi->world,&rcvreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Irecv(&recvbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1631,
                mpi->world,&rcvreq[2*d+1]);
    }
  }

  /* copy the data to be sent into the send buffers */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
  }
  cudaDeviceSynchronize();

  /* send the data */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d ] != -1) {
      MPI_Isend(&sendbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d ],1631,
                mpi->world,&sndreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Isend(&sendbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1630,
                mpi->world,&sndreq[2*d+1]);
    }
  }

  /* wait until all the data has been received */
  MPI_Waitall(2*ndims,rcvreq,MPI_STATUSES_IGNORE);

  /* copy the received data into the ghost points */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
  }
  cudaDeviceSynchronize();

  /* wait until the send requests are complete so the send buffers can be safely reused */
  MPI_Waitall(2*ndims,sndreq,MPI_STATUSES_IGNORE);

#endif
  return 0;
}

#endif /* CUDA_VAR_ORDERDING_AOS */