HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
ArrayImplementations_GPU.cu
Go to the documentation of this file.
1 /*! @file ArrayImplementations_GPU.cu
2  @author Youngdae Kim
3  @brief GPU implementations of array functions.
4 */
5 
6 #include <assert.h>
7 #include <basic_gpu.h>
8 #include <arrayfunctions_gpu.h>
9 
10 /*! Element-wise copy \a y = \a x, where \a x, \a y are 1-dimensional arrays
11  of length \a size. \sa #_ArrayCopy1D_ */
12 __global__
13 void ArrayCopy1D_kernel( const double* x, /*!< copy-from array*/
14  double* y, /*!< copy-to array */
15  int n /*!< size of array */ )
16 {
17  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
18  if (tx < n) y[tx] = x[tx];
19  return;
20 }
21 
22 /*! Set all elements of a 1-dimensional array \a x (any datatype)
23  of length \a size to a scalar \a value. \sa #_ArraySetValue_ */
24 __global__
25 void ArraySetValue_kernel( double* x, /*!< array*/
26  int n, /*!< size of array */
27  double value /*!< scalar value */ )
28 {
29  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
30  if (tx < n) x[tx] = value;
31  return;
32 }
33 
34 /*! \sa #_ArrayAXPY_
35 
36  Element-wise AXPY \a y = \a a \a x + \a y,
37  where \a a is a scalar, and \a x, \a y, \a z are
38  1-dimensional arrays of length \a size. */
39 __global__
40 void ArrayAXPY_kernel( const double* x, /*!< x */
41  double a, /*!< a */
42  double* y, /*!< y */
43  int n /*!< size of array */ )
44 {
45  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
46  if (tx < n) y[tx] += a*x[tx];
47  return;
48 }
49 
50 /*! \sa #_ArrayBlockMultiply_
51 
52  Given two arrays: \a x of size \a n*bs, and \a a of size \a n, this function
53  implements: \a x[i][j] *= \a a[i] where \a i = 1,...,\a n, j = 1,...,\a bs,
54  and \a x is stored as a 1D array in row-major format, i.e., \a x[i][j] = \a x[i*bs+j]. */
55 __global__
56 void ArrayBlockMultiply_kernel( double* x, /*!< x */
57  const double* a, /*!< a */
58  int n, /*!< size of array */
59  int bs /*!< block size */)
60 {
61  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
62  if (tx < n) {
63  for (int i = 0; i < bs; i++) x[tx*bs + i] *= a[tx];
64  }
65 }
66 
67 /*! Alternative implementation of #_ArrayCopy1D_ */
68 __global__
69 void ArrayCopy1DNewScheme_kernel( const double* __restrict__ src, /*!< source array */
70  double* __restrict__ dest, /*!< destination array */
71  int npoints, /*!< number of points */
72  int nvars /*!< number of components */ )
73 {
74  int p = blockDim.x * blockIdx.x + threadIdx.x;
75  if (p < npoints) {
76  for (int v=0; v<nvars; v++) {
77  dest[p+v*npoints] = src[p*nvars+v];
78  }
79  }
80  return;
81 }
82 
83 /*! Set device */
84 void gpuSetDevice(int device /*!< device */)
85 {
86  cudaSetDevice(device);
87  cudaError_t err = cudaGetLastError();
88  if (err != cudaSuccess) {
89  fprintf(stderr,"Error in gpuSetDevice(): device=%d error message=\"%s\"\n", device, cudaGetErrorString(err));
90  }
91 }
92 
93 /*! GPU memory copy */
94 void gpuMemcpy( void* dest, /*!< destination */
95  const void* src, /*!< source */
96  size_t count,/*!< count */
97  enum gpuMemcpyKind kind /*!< kind of copy */ )
98 {
99  switch (kind) {
100  case gpuMemcpyHostToDevice:
101  checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyHostToDevice) );
102  break;
103  case gpuMemcpyDeviceToHost:
104  checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToHost) );
105  break;
106  case gpuMemcpyDeviceToDevice:
107  checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToDevice) );
108  break;
109  default:
110  fprintf(stderr, "Error: invalid gpuMemcpyKind: %d\n", kind);
111  assert(0);
112  break;
113  }
114  return;
115 }
116 
117 /*! Allocate memory */
118 void gpuMalloc( void** devPtr, /*!< pointer to memory */
119  size_t size /*!< size of memory */ )
120 {
121  cudaMalloc(devPtr, size);
122  cudaError_t err = cudaGetLastError();
123  if (err != cudaSuccess) {
124  fprintf( stderr,"Error in gpuMalloc(): size=%d, error message=\"%s\"\n", size,
125  cudaGetErrorString(err) );
126  }
127  return;
128 }
129 
130 /*! Set value */
131 void gpuMemset( void* devPtr, /*!< Pointer to memory */
132  int value, /*!< value to set */
133  size_t count /*!< size of data */ )
134 {
135  checkCuda( cudaMemset(devPtr, value, count) );
136  return;
137 }
138 
139 /*! deallocate memory */
140 void gpuFree(void *devPtr /*!< Pointer to memory */)
141 {
142  checkCuda( cudaFree(devPtr) );
143  return;
144 }
145 
146 /*! Element-wise copy \a y = \a x, where \a x, \a y are 1-dimensional arrays
147  of length \a size. \sa #_ArrayCopy1D_, ArrayCopy1D_kernel() */
148 void gpuArrayCopy1D( const double* x, /*!< copy-from array*/
149  double* y, /*!< copy-to array */
150  int n /*!< size of array */ )
151 {
152  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
153  ArrayCopy1D_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, y, n);
154  cudaDeviceSynchronize();
155  return;
156 }
157 
158 /*! Set all elements of a 1-dimensional array \a x (any datatype)
159  of length \a size to a scalar \a value.
160  \sa #_ArraySetValue_, ArraySetValue_kernel() */
161 void gpuArraySetValue( double* devPtr, /*!< array*/
162  int n, /*!< size of array */
163  double value /*!< scalar value */ )
164 {
165  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
166  ArraySetValue_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(devPtr, n, value);
167  cudaDeviceSynchronize();
168  return;
169 }
170 
171 /*! \sa #_ArrayAXPY_, ArrayAXPY_kernel()
172 
173  Element-wise AXPY \a y = \a a \a x + \a y,
174  where \a a is a scalar, and \a x, \a y, \a z are
175  1-dimensional arrays of length \a size.
176 */
177 void gpuArrayAXPY( const double* x, /*!< x */
178  double a, /*!< a */
179  double* y, /*!< y */
180  int n /*!< size of array */ )
181 {
182  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
183  ArrayAXPY_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, y, n);
184  cudaDeviceSynchronize();
185  return;
186 }
187 
188 /*! \sa #_ArrayBlockMultiply_, ArrayBlockMultiply_kernel()
189 
190  Given two arrays: \a x of size \a n*bs, and \a a of size \a n, this function
191  implements: \a x[i][j] *= \a a[i] where \a i = 1,...,\a n, j = 1,...,\a bs,
192  and \a x is stored as a 1D array in row-major format, i.e., \a x[i][j] = \a x[i*bs+j]. */
193 void gpuArrayBlockMultiply( double* x, /*!< x */
194  const double* a, /*!< a */
195  int n, /*!< size of array */
196  int bs /*!< block size */ )
197 {
198  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
199  ArrayBlockMultiply_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, n, bs);
200  cudaDeviceSynchronize();
201  return;
202 }
203 
204 /*! Returns the sum-of-squares of the elements in an n-D array (useful for L_2 norm)
205  \sa ArraySumSquarenD() */
206 double gpuArraySumSquarenD(int nvars, /*!< number of elements at one array location,
207  can be > 1 for systems of equations */
208  int ndims, /*!< number of dimensions */
209  int *dim, /*!< integer array of size in each dimension */
210  int ghosts, /*!< number of ghost points in the array x */
211  int *index, /*!< pre-allocated (by the calling function)
212  integer array of size ndims */
213  double *x /*!< the array */
214  )
215 {
216  double sum = 0;
217  printf("gpuArraySumSquarenD hasn't been implemented, yet.\n");
218  exit(0);
219  return (sum);
220 }
221 
222 /*! Alternative implementation of #_ArrayCopy1D_ */
223 void gpuArrayCopy1DNewScheme( const double* src, /*!< source array */
224  double* dest, /*!< destination array */
225  int npoints, /*!< number of points */
226  int nvars /*!< number of components */ )
227 {
228  int nblocks = (npoints-1) / GPU_THREADS_PER_BLOCK + 1;
229  ArrayCopy1DNewScheme_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(src, dest, npoints, nvars);
230  cudaDeviceSynchronize();
231  return;
232 }
233 
234 /*! Check if two arrays are equal, if not, report the difference */
235 /*! Check if two arrays are equal, if not, report the difference */
236 void gpuArrayCheckEqual( const char* msg, /*!< message */
237  const double* var_adj, /*!< array */
238  const double* var_sep, /*!< array */
239  int npoints /*!< size of array */ )
240 {
241  double *h_var_adj = (double *) malloc(npoints*sizeof(double));
242  double *h_var_sep = (double *) malloc(npoints*sizeof(double));
243 
244  gpuMemcpy(h_var_adj, var_adj, npoints*sizeof(double), gpuMemcpyDeviceToHost);
245  gpuMemcpy(h_var_sep, var_sep, npoints*sizeof(double), gpuMemcpyDeviceToHost);
246 
247  double max_err = 0.0;
248  for (int j=0; j<npoints; j++) {
249  if (h_var_sep[j] != h_var_adj[j]) {
250  max_err = max(max_err, fabs(h_var_sep[j]-h_var_adj[j]));
251  }
252  }
253 
254  free(h_var_adj);
255  free(h_var_sep);
256 
257  if (max_err > 1e-10) {
258  printf("gpuArrayCheckEqual: %-30s max_err = %e\n", msg, max_err);
259  exit(0);
260  }
261  return;
262 }
263