1 /*! @file BCPeriodic_GPU.cu
3 @brief GPU implementations of periodic boundary conditions
6 #include <arrayfunctions_gpu.h>
8 #include <boundaryconditions.h>
10 #ifdef CUDA_VAR_ORDERDING_AOS
12 /*! Kernel of gpuBCPeriodicU() */
/* NOTE(review): this excerpt is fragmented -- the __global__ qualifier, part of
   the parameter list (presumably npoints_bounds, face, ndims, dim, ghosts,
   nvars; TODO confirm against the full file), the declarations of p1/p2, the
   "if (face == 1) {" branch opener, and the closing braces fall on lines not
   visible here. Comments below describe only what the visible lines show. */
14 void BCPeriodicU_kernel(
16 int npoints_local_wghosts,
22 const int * __restrict__ bounds,
23 const int * __restrict__ size,
24 const int * __restrict__ boundary_is,
25 double * __restrict__ phi
/* One thread per boundary grid point: flat 1-D global thread index. */
28 int p = blockDim.x * blockIdx.x + threadIdx.x;
/* Guard: the launch grid may over-cover the boundary extent. */
30 if (p < npoints_bounds) {
32 int index1[GPU_MAX_NDIMS], index2[GPU_MAX_NDIMS];
/* Unflatten p into an n-D index (index1) within the boundary bounds, then
   start index2 as a copy of it. */
34 _ArrayIndexnD_(ndims,p,bounds,index1,0);
35 _ArrayCopy1D_(index1,index2,ndims);
/* Shift index2 along "dim" to the periodic image at the far end of the
   domain; p1 = flat index of the ghost point (offset by boundary_is),
   p2 = flat index of the interior source point -- presumably, based on the
   macro names; verify against arrayfunctions_gpu.h. */
37 index2[dim] = index1[dim] + size[dim]-ghosts;
38 _ArrayIndex1DWO_(ndims,size,index1,boundary_is,ghosts,p1);
39 _ArrayIndex1D_(ndims,size,index2,ghosts,p2);
40 } else if (face == -1) {
41 _ArrayIndex1DWO_(ndims,size,index1,boundary_is,ghosts,p1);
42 _ArrayIndex1D_(ndims,size,index1,ghosts,p2);
/* AoS layout: the nvars components of a grid point are contiguous, so copy
   nvars doubles from source point p2 into ghost point p1. */
44 _ArrayCopy1D_((phi+nvars*p2),(phi+nvars*p1),nvars);
49 /*! Applies periodic boundary conditions: Implemented by copying the solution
50 from the other end of the domain into the physical boundary ghost points.
52 **Note**: This function only acts if the number of processors is 1 along
53 the spatial dimension this boundary corresponds to. If there are more than 1
54 processors along this dimension, periodicity is handled by MPIExchangeBoundariesnD()
55 to minimize communication.
/* NOTE(review): fragmented excerpt -- the closing of the Doxygen comment
   above, the declaration of nblocks, the #if defined(GPU_STAT) guards around
   the timing code (present in the second copy of this function below), the
   kernel-launch closing parenthesis, and the function's closing braces and
   return statement are on lines not visible here. */
57 extern "C" int gpuBCPeriodicU(
58 void * __restrict__ b, /*!< Boundary object of type #DomainBoundary */
59 void * __restrict__ m, /*!< MPI object of type #MPIVariables */
60 int ndims, /*!< Number of spatial dimensions */
61 int nvars, /*!< Number of variables/DoFs per grid point */
62 int * __restrict__ size, /*!< Integer array with the number of grid points in
63 each spatial dimensions */
64 int ghosts, /*!< Number of ghost points */
65 double * __restrict__ phi, /*!< The solution array on which to apply the boundary condition */
66 double waqt /*!< Current solution time */
69 DomainBoundary *boundary = (DomainBoundary*) b;
70 MPIVariables *mpi = (MPIVariables*) m;
/* Spatial dimension this boundary lies along, and which side (face). */
72 int dim = boundary->dim;
73 int face = boundary->face;
/* Act only when this boundary is on this rank AND there is a single MPI rank
   along "dim"; otherwise MPIExchangeBoundariesnD() handles periodicity. */
75 if ((boundary->on_this_proc) && (mpi->iproc[dim] == 1)) {
/* Ceiling division: enough blocks for one thread per boundary point. */
77 nblocks = (boundary->gpu_npoints_bounds-1) / GPU_THREADS_PER_BLOCK + 1;
/* Timing instrumentation via CUDA events (presumably compiled only under
   GPU_STAT, as in the second copy of this function -- the #if lines are not
   visible in this excerpt). */
80 cudaEvent_t startEvent, stopEvent;
81 float milliseconds = 0;
/* Bytes touched: one read + one write of nvars doubles per boundary point. */
83 int memory_accessed = 2*boundary->gpu_npoints_bounds*nvars*sizeof(double);
85 checkCuda( cudaEventCreate(&startEvent));
86 checkCuda( cudaEventCreate(&stopEvent));
88 checkCuda( cudaEventRecord(startEvent, 0) );
/* Launch the AoS periodic-BC copy kernel on the default stream. */
91 BCPeriodicU_kernel<<<nblocks,GPU_THREADS_PER_BLOCK>>>(boundary->gpu_npoints_bounds,
92 boundary->gpu_npoints_local_wghosts, face, ndims, dim, ghosts, nvars,
93 boundary->gpu_bounds, size, boundary->gpu_is, phi
/* NOTE(review): no cudaGetLastError()/checkCuda after the launch or around
   this synchronize -- launch-configuration and kernel errors would go
   unreported here. */
95 cudaDeviceSynchronize();
98 checkCuda( cudaEventRecord(stopEvent, 0) );
99 checkCuda( cudaEventSynchronize(stopEvent) );
100 checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
/* milliseconds*1e-3 converts to seconds; 1e-6*bytes/ms yields GB/s. */
102 printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
103 "BCPeriodicU", milliseconds*1e-3,
104 (1e-6*(memory_accessed)/milliseconds));
113 /*! Kernel of gpuBCPeriodicU() */
/* NOTE(review): fragmented excerpt (structure-of-arrays variant, presumably
   the #else branch of CUDA_VAR_ORDERDING_AOS -- the #else itself is not
   visible). The __global__ qualifier, part of the parameter list, the p1/p2
   declarations, the "if (face == 1) {" opener, the per-component copy
   statement inside the loop below (presumably phi[p1] = phi[p2]; TODO
   confirm), and the closing braces are on lines not visible here. */
115 void BCPeriodicU_kernel(
117 int npoints_local_wghosts,
123 const int * __restrict__ bounds,
124 const int * __restrict__ size,
125 const int * __restrict__ boundary_is,
126 double * __restrict__ phi
/* One thread per boundary grid point: flat 1-D global thread index. */
129 int p = blockDim.x * blockIdx.x + threadIdx.x;
/* Guard: the launch grid may over-cover the boundary extent. */
131 if (p < npoints_bounds) {
133 int index1[GPU_MAX_NDIMS], index2[GPU_MAX_NDIMS];
/* Unflatten p into an n-D index (index1) within the boundary bounds, then
   start index2 as a copy of it. */
135 _ArrayIndexnD_(ndims,p,bounds,index1,0);
136 _ArrayCopy1D_(index1,index2,ndims);
/* Shift index2 along "dim" to the periodic image at the far end of the
   domain; p1 = ghost point, p2 = interior source point -- presumably, based
   on the macro names; verify against arrayfunctions_gpu.h. */
138 index2[dim] = index1[dim] + size[dim]-ghosts;
139 _ArrayIndex1DWO_(ndims,size,index1,boundary_is,ghosts,p1);
140 _ArrayIndex1D_(ndims,size,index2,ghosts,p2);
141 } else if (face == -1) {
142 _ArrayIndex1DWO_(ndims,size,index1,boundary_is,ghosts,p1);
143 _ArrayIndex1D_(ndims,size,index1,ghosts,p2);
/* SoA layout: successive components of one grid point are strided by the
   total local point count (with ghosts), hence the index increments. */
146 for (int j=0; j<nvars; j++) {
148 p1 += npoints_local_wghosts;
149 p2 += npoints_local_wghosts;
155 /*! Applies periodic boundary conditions: Implemented by copying the solution
156 from the other end of the domain into the physical boundary ghost points.
158 **Note**: This function only acts if the number of processors is 1 along
159 the spatial dimension this boundary corresponds to. If there are more than 1
160 processors along this dimension, periodicity is handled by MPIExchangeBoundariesnD()
161 to minimize communication.
162 \sa BCPeriodicU(), gpuBCPeriodicU() */
/* NOTE(review): fragmented excerpt -- this second definition of
   gpuBCPeriodicU() (SoA variant) presumably lives in the #else branch of
   CUDA_VAR_ORDERDING_AOS (the #else/#endif are not visible); its declaration
   of nblocks, the kernel-launch closing parenthesis, the closing #endif of
   GPU_STAT, and the function's closing braces / return statement fall past or
   between the visible lines. */
163 extern "C" int gpuBCPeriodicU(
164 void * __restrict__ b, /*!< Boundary object of type #DomainBoundary */
165 void * __restrict__ m, /*!< MPI object of type #MPIVariables */
166 int ndims, /*!< Number of spatial dimensions */
167 int nvars, /*!< Number of variables/DoFs per grid point */
168 int * __restrict__ size, /*!< Integer array with the number of grid points in
169 each spatial dimensions */
170 int ghosts, /*!< Number of ghost points */
171 double * __restrict__ phi, /*!< The solution array on which to apply the boundary condition */
172 double waqt /*!< Current solution time */
175 DomainBoundary *boundary = (DomainBoundary*) b;
176 MPIVariables *mpi = (MPIVariables*) m;
/* Spatial dimension this boundary lies along, and which side (face). */
178 int dim = boundary->dim;
179 int face = boundary->face;
/* Act only when this boundary is on this rank AND there is a single MPI rank
   along "dim"; otherwise MPIExchangeBoundariesnD() handles periodicity. */
181 if ((boundary->on_this_proc) && (mpi->iproc[dim] == 1)) {
/* Ceiling division: enough blocks for one thread per boundary point. */
183 nblocks = (boundary->gpu_npoints_bounds-1) / GPU_THREADS_PER_BLOCK + 1;
185 #if defined(GPU_STAT)
/* CUDA-event timing, compiled only when GPU_STAT is defined. */
186 cudaEvent_t startEvent, stopEvent;
187 float milliseconds = 0;
/* Bytes touched: one read + one write of nvars doubles per boundary point. */
189 int memory_accessed = 2*boundary->gpu_npoints_bounds*nvars*sizeof(double);
191 checkCuda(cudaEventCreate(&startEvent));
192 checkCuda(cudaEventCreate(&stopEvent));
194 checkCuda(cudaEventRecord(startEvent, 0));
/* Launch the SoA periodic-BC copy kernel on the default stream. */
197 BCPeriodicU_kernel<<<nblocks,GPU_THREADS_PER_BLOCK>>>(boundary->gpu_npoints_bounds,
198 boundary->gpu_npoints_local_wghosts, face, ndims, dim, ghosts, nvars,
199 boundary->gpu_bounds, size, boundary->gpu_is, phi
/* NOTE(review): no cudaGetLastError()/checkCuda after the launch or around
   this synchronize -- launch-configuration and kernel errors would go
   unreported here. */
201 cudaDeviceSynchronize();
203 #if defined(GPU_STAT)
204 checkCuda(cudaEventRecord(stopEvent, 0));
205 checkCuda(cudaEventSynchronize(stopEvent));
206 checkCuda(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));
/* milliseconds*1e-3 converts to seconds; 1e-6*bytes/ms yields GB/s.
   Label "BCPeriodicU2" distinguishes this SoA variant from the AoS one. */
208 printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
209 "BCPeriodicU2", milliseconds*1e-3,
210 (1e-6*(memory_accessed)/milliseconds));