/*! @file MPIExchangeBoundariesnD_GPU.cu
    @brief Exchange data and fill ghost points for an n-dimensional array
*/

#include <basic_gpu.h>          /* GPU_MAX_NDIMS, GPU_THREADS_PER_BLOCK */
#include <arrayfunctions_gpu.h>
#include <mpivars.h>            /* MPIVariables, MPIRank1D() */

#ifdef CUDA_VAR_ORDERDING_AOS

/*! Kernel function for gpuMPIExchangeBoundariesnD(): fill the send buffer for the left face along dimension d (array-of-structures variable ordering) */
__global__
void gpuFillSendBufLeft_kernel(
  int npoints_grid, int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0);
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((var+nvars*p1),(sendbuf+2*d*stride+nvars*p2),nvars);
  }
  return;
}
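
/* Layout of the send/receive buffers used by the kernels in this file (as implied by the
   offsets 2*d*stride and (2*d+1)*stride): both gpu_sendbuf and gpu_recvbuf are partitioned
   into 2*ndims contiguous segments of stride (= mpi->maxbuf) doubles each; segment 2*d
   holds the data exchanged across the left face along dimension d, and segment 2*d+1 the
   data for the right face. For example (numbers are illustrative only), with ndims = 2 and
   stride = 4096:

     segment 0: sendbuf[0     .. 4095 ]  -> left  neighbor along dimension 0
     segment 1: sendbuf[4096  .. 8191 ]  -> right neighbor along dimension 0
     segment 2: sendbuf[8192  .. 12287]  -> left  neighbor along dimension 1
     segment 3: sendbuf[12288 .. 16383]  -> right neighbor along dimension 1
*/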
/*! Kernel function for gpuMPIExchangeBoundariesnD(): fill the send buffer for the right face along dimension d (array-of-structures variable ordering) */
__global__
void gpuFillSendBufRight_kernel(
  int npoints_grid, int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d]-ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((var+nvars*p1),(sendbuf+(2*d+1)*stride+nvars*p2),nvars);
  }
  return;
}
/*! Kernel function for gpuMPIExchangeBoundariesnD(): copy the received data into the left-face ghost points along dimension d (array-of-structures variable ordering) */
__global__
void gpuFillRecvBufLeft_kernel(
  int npoints_grid, int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = -ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((recvbuf+2*d*stride+nvars*p2),(var+nvars*p1),nvars);
  }
  return;
}
/*! Kernel function for gpuMPIExchangeBoundariesnD(): copy the received data into the right-face ghost points along dimension d (array-of-structures variable ordering) */
__global__
void gpuFillRecvBufRight_kernel(
  int npoints_grid, int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d];
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    _ArrayCopy1D_((recvbuf+(2*d+1)*stride+nvars*p2),(var+nvars*p1),nvars);
  }
  return;
}
/*!
  Exchange data across MPI ranks and fill in the ghost points of an n-dimensional array
  (where n is the number of spatial dimensions). The array is stored contiguously in GPU
  memory with \a ghosts layers of ghost points along each dimension. Along every spatial
  dimension, each rank sends the layers of points adjacent to its left and right subdomain
  boundaries to the corresponding neighboring ranks (if they exist), and fills its own
  ghost points with the data it receives. If a physical boundary is periodic, the exchange
  wraps around to the rank at the other end of the domain.

  Consider a two-dimensional problem, partitioned on 21 MPI ranks as follows:
  @image html mpi_ranks.png
  @image latex mpi_ranks.eps width=0.9\textwidth

  Rank 9 will then exchange data with ranks 8 and 10 along dimension 0, and with ranks 2
  and 16 along dimension 1, and fill in its ghost points accordingly.
*/
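/* A minimal usage sketch (illustrative only -- the solver object and member names shown
   here, e.g. "solver", "gpu_dim_local", and "gpu_u", are assumptions, not part of this file):

     HyPar        *solver = ...;
     MPIVariables *mpi    = ...;

     // exchange boundary data for the solution array stored on the GPU
     gpuMPIExchangeBoundariesnD( solver->ndims,
                                 solver->nvars,
                                 solver->gpu_dim_local,
                                 solver->ghosts,
                                 mpi,
                                 solver->gpu_u );
*/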
extern "C" int gpuMPIExchangeBoundariesnD(
  int ndims,                      /*!< Number of spatial dimensions */
  int nvars,                      /*!< Number of variables (vector components) per grid point */
  const int * __restrict__ dim,   /*!< Local dimensions of the array, without ghosts (device memory) */
  int ghosts,                     /*!< Number of ghost points */
  void *m,                        /*!< MPI object of type #MPIVariables */
  double * __restrict__ var       /*!< The array whose ghost points are to be filled (device memory) */
)
{
  MPIVariables *mpi = (MPIVariables*) m;
  int d, i;

  int *ip      = mpi->ip;
  int *iproc   = mpi->iproc;
  int *bcflag  = mpi->bcperiodic;
  int *cpu_dim = mpi->cpu_dim;

  int neighbor_rank[2*ndims], nip[ndims], bounds[ndims];
  MPI_Request rcvreq[2*ndims], sndreq[2*ndims];
  for (d=0; d<2*ndims; d++) rcvreq[d] = sndreq[d] = MPI_REQUEST_NULL;

  /* each process has 2*ndims neighbors (except at non-periodic physical boundaries) */
  /* calculate the rank of these neighbors (-1 -> none)                              */
  for (d = 0; d < ndims; d++) {
    /* left neighbor along dimension d (wraps around for periodic boundaries) */
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == 0) nip[d] = iproc[d]-1;
    else            nip[d]--;
    if ((ip[d] == 0) && (!bcflag[d])) neighbor_rank[2*d] = -1;
    else                              neighbor_rank[2*d] = MPIRank1D(ndims,iproc,nip);
    /* right neighbor along dimension d (wraps around for periodic boundaries) */
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == (iproc[d]-1)) nip[d] = 0;
    else                       nip[d]++;
    if ((ip[d] == (iproc[d]-1)) && (!bcflag[d])) neighbor_rank[2*d+1] = -1;
    else                                         neighbor_rank[2*d+1] = MPIRank1D(ndims,iproc,nip);
  }

  /* calculate dimensions of each of the send-receive regions */
  double *sendbuf = mpi->gpu_sendbuf;
  double *recvbuf = mpi->gpu_recvbuf;
  int stride = mpi->maxbuf;
  int bufdim[ndims];
  for (d = 0; d < ndims; d++) {
    bufdim[d] = 1;
    for (i = 0; i < ndims; i++) {
      if (i == d) bufdim[d] *= ghosts;
      else        bufdim[d] *= cpu_dim[i];
    }
  }
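
  /* For example (numbers are illustrative only): on a rank with a 2D local domain of
     cpu_dim = {100, 50} and ghosts = 3, the face regions exchanged along each dimension
     contain bufdim[0] = 3*50 = 150 and bufdim[1] = 100*3 = 300 grid points, i.e.
     bufdim[d]*nvars doubles per message. Each message must fit within one segment of
     size stride = mpi->maxbuf in the send/receive buffers. */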

  /* post the receive requests */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d] != -1) {
      MPI_Irecv(&recvbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d],1630,
                mpi->world,&rcvreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Irecv(&recvbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1631,
                mpi->world,&rcvreq[2*d+1]);
    }
  }

  /* copy the data to be sent into the send buffers */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
  }
  cudaDeviceSynchronize();
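  /* Note: the synchronization above ensures the buffer-filling kernels have completed before
     MPI_Isend reads from sendbuf. Since sendbuf and recvbuf point to GPU memory
     (mpi->gpu_sendbuf / mpi->gpu_recvbuf), passing them directly to MPI_Isend/MPI_Irecv
     assumes a CUDA-aware MPI implementation. */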

  /* send the data */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d] != -1) {
      MPI_Isend(&sendbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d],1631,
                mpi->world,&sndreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Isend(&sendbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1630,
                mpi->world,&sndreq[2*d+1]);
    }
  }

  /* wait until the data has been received */
  MPI_Status status_arr[2*ndims];
  MPI_Waitall(2*ndims,rcvreq,status_arr);

  /* copy the received data into the ghost points */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(npoints_grid,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
  }
  cudaDeviceSynchronize();

  /* wait until the send requests are complete before returning */
  MPI_Waitall(2*ndims,sndreq,status_arr);

  return 0;
}

#else
/*! Kernel function for gpuMPIExchangeBoundariesnD(): fill the send buffer for the left face along dimension d (structure-of-arrays variable ordering) */
__global__
void gpuFillSendBufLeft_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0);
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      sendbuf[2*d*stride+nvars*p2+v] = var[p1+v*npoints_local_wghosts];
    }
    //_ArrayCopy1D_((var+nvars*p1),(sendbuf+2*d*stride+nvars*p2),nvars);
  }
  return;
}
/*! Kernel function for gpuMPIExchangeBoundariesnD(): fill the send buffer for the right face along dimension d (structure-of-arrays variable ordering) */
__global__
void gpuFillSendBufRight_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ var,
  double * __restrict__ sendbuf
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d]-ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      sendbuf[(2*d+1)*stride+nvars*p2+v] = var[p1+v*npoints_local_wghosts];
    }
    //_ArrayCopy1D_((var+nvars*p1),(sendbuf+(2*d+1)*stride+nvars*p2),nvars);
  }
  return;
}
/*! Kernel function for gpuMPIExchangeBoundariesnD(): copy the received data into the left-face ghost points along dimension d (structure-of-arrays variable ordering) */
__global__
void gpuFillRecvBufLeft_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = -ghosts;
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      var[p1+v*npoints_local_wghosts] = recvbuf[2*d*stride+nvars*p2+v];
    }
    //_ArrayCopy1D_((recvbuf+2*d*stride+nvars*p2),(var+nvars*p1),nvars);
  }
  return;
}
/*! Kernel function for gpuMPIExchangeBoundariesnD(): copy the received data into the right-face ghost points along dimension d (structure-of-arrays variable ordering) */
__global__
void gpuFillRecvBufRight_kernel(
  int npoints_grid,
  int npoints_local_wghosts,
  int d, int ndims, int nvars, int ghosts, int stride,
  const int * __restrict__ dim,
  const double * __restrict__ recvbuf,
  double * __restrict__ var
)
{
  int p = blockDim.x * blockIdx.x + threadIdx.x;

  if (p < npoints_grid) {
    int index[GPU_MAX_NDIMS], bounds[GPU_MAX_NDIMS], offset[GPU_MAX_NDIMS];
    int p1, p2;
    _ArrayCopy1D_(dim,bounds,ndims); bounds[d] = ghosts;
    _ArraySetValue_(offset,ndims,0); offset[d] = dim[d];
    _ArrayIndexnD_(ndims,p,bounds,index,0);
    _ArrayIndex1DWO_(ndims,dim,index,offset,ghosts,p1);
    _ArrayIndex1D_(ndims,bounds,index,0,p2);
    for (int v=0; v<nvars; v++) {
      var[p1+v*npoints_local_wghosts] = recvbuf[(2*d+1)*stride+nvars*p2+v];
    }
    //_ArrayCopy1D_((recvbuf+(2*d+1)*stride+nvars*p2),(var+nvars*p1),nvars);
  }
  return;
}
/*!
  Exchange data across MPI ranks and fill in the ghost points of an n-dimensional array
  (where n is the number of spatial dimensions). The array is stored contiguously in GPU
  memory with \a ghosts layers of ghost points along each dimension. Along every spatial
  dimension, each rank sends the layers of points adjacent to its left and right subdomain
  boundaries to the corresponding neighboring ranks (if they exist), and fills its own
  ghost points with the data it receives. If a physical boundary is periodic, the exchange
  wraps around to the rank at the other end of the domain. This implementation (compiled
  when CUDA_VAR_ORDERDING_AOS is not defined) handles the structure-of-arrays ordering of
  the solution vector in GPU memory.

  Consider a two-dimensional problem, partitioned on 21 MPI ranks as follows:
  @image html mpi_ranks.png
  @image latex mpi_ranks.eps width=0.9\textwidth

  Rank 9 will then exchange data with ranks 8 and 10 along dimension 0, and with ranks 2
  and 16 along dimension 1, and fill in its ghost points accordingly.
*/
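/* A note on the indexing used in this branch (illustrative): with the array-of-structures
   (AoS) ordering used in the first half of this file, the v-th component at grid point p1
   is var[nvars*p1+v]; with the structure-of-arrays (SoA) ordering handled here, it is
   var[p1+v*npoints_local_wghosts], where npoints_local_wghosts is the number of local grid
   points including ghosts. This is why the kernels above carry the extra
   npoints_local_wghosts argument and replace _ArrayCopy1D_ with an explicit loop over v. */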
extern "C" int gpuMPIExchangeBoundariesnD(
  int ndims,                      /*!< Number of spatial dimensions */
  int nvars,                      /*!< Number of variables (vector components) per grid point */
  const int * __restrict__ dim,   /*!< Local dimensions of the array, without ghosts (device memory) */
  int ghosts,                     /*!< Number of ghost points */
  void *m,                        /*!< MPI object of type #MPIVariables */
  double * __restrict__ var       /*!< The array whose ghost points are to be filled (device memory) */
)
{
  MPIVariables *mpi = (MPIVariables*) m;
  int d, i;

  int *ip      = mpi->ip;
  int *iproc   = mpi->iproc;
  int *bcflag  = mpi->bcperiodic;
  int *cpu_dim = mpi->cpu_dim;
  /* number of local grid points, including ghost points */
  int size = 1; for (d=0; d<ndims; d++) size *= (cpu_dim[d]+2*ghosts);

  int neighbor_rank[2*ndims], nip[ndims], bounds[ndims];
  MPI_Request rcvreq[2*ndims], sndreq[2*ndims];
  for (d=0; d<2*ndims; d++) rcvreq[d] = sndreq[d] = MPI_REQUEST_NULL;

  /* each process has 2*ndims neighbors (except at non-periodic physical boundaries) */
  /* calculate the rank of these neighbors (-1 -> none)                              */
  for (d = 0; d < ndims; d++) {
    /* left neighbor along dimension d (wraps around for periodic boundaries) */
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == 0) nip[d] = iproc[d]-1;
    else            nip[d]--;
    if ((ip[d] == 0) && (!bcflag[d])) neighbor_rank[2*d] = -1;
    else                              neighbor_rank[2*d] = MPIRank1D(ndims,iproc,nip);
    /* right neighbor along dimension d (wraps around for periodic boundaries) */
    _ArrayCopy1D_(ip,nip,ndims);
    if (ip[d] == (iproc[d]-1)) nip[d] = 0;
    else                       nip[d]++;
    if ((ip[d] == (iproc[d]-1)) && (!bcflag[d])) neighbor_rank[2*d+1] = -1;
    else                                         neighbor_rank[2*d+1] = MPIRank1D(ndims,iproc,nip);
  }

  /* calculate dimensions of each of the send-receive regions */
  double *sendbuf = mpi->gpu_sendbuf;
  double *recvbuf = mpi->gpu_recvbuf;
  int stride = mpi->maxbuf;
  int bufdim[ndims];
  for (d = 0; d < ndims; d++) {
    bufdim[d] = 1;
    for (i = 0; i < ndims; i++) {
      if (i == d) bufdim[d] *= ghosts;
      else        bufdim[d] *= cpu_dim[i];
    }
  }

  /* post the receive requests */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d] != -1) {
      MPI_Irecv(&recvbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d],1630,
                mpi->world,&rcvreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Irecv(&recvbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1631,
                mpi->world,&rcvreq[2*d+1]);
    }
  }

  /* copy the data to be sent into the send buffers */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillSendBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,var,sendbuf);
    }
  }
  cudaDeviceSynchronize();

  /* send the data */
  for (d = 0; d < ndims; d++) {
    if (neighbor_rank[2*d] != -1) {
      MPI_Isend(&sendbuf[2*d*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d],1631,
                mpi->world,&sndreq[2*d]);
    }
    if (neighbor_rank[2*d+1] != -1) {
      MPI_Isend(&sendbuf[(2*d+1)*stride],bufdim[d]*nvars,MPI_DOUBLE,neighbor_rank[2*d+1],1630,
                mpi->world,&sndreq[2*d+1]);
    }
  }

  /* wait until the data has been received */
  MPI_Waitall(2*ndims,rcvreq,MPI_STATUSES_IGNORE);

  /* copy the received data into the ghost points */
  for (d = 0; d < ndims; d++) {
    _ArrayCopy1D_(cpu_dim,bounds,ndims); bounds[d] = ghosts;
    if (neighbor_rank[2*d] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufLeft_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
    if (neighbor_rank[2*d+1] != -1) {
      int npoints_grid = 1; for (int i=0; i<ndims; i++) npoints_grid *= bounds[i];
      int nblocks = (npoints_grid-1)/GPU_THREADS_PER_BLOCK + 1;
      gpuFillRecvBufRight_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
        npoints_grid,size,d,ndims,nvars,ghosts,stride,dim,recvbuf,var);
    }
  }
  cudaDeviceSynchronize();

  /* wait until the send requests are complete before returning */
  MPI_Waitall(2*ndims,sndreq,MPI_STATUSES_IGNORE);

  return 0;
}

#endif