1 /*! @file ArrayImplementations_GPU.cu
3 @brief GPU implementations of array functions.
8 #include <arrayfunctions_gpu.h>
10 /*! Element-wise copy \a y = \a x, where \a x, \a y are 1-dimensional arrays
11 of length \a size. \sa #_ArrayCopy1D_ */
13 void ArrayCopy1D_kernel( const double* x, /*!< copy-from array*/
14 double* y, /*!< copy-to array */
15 int n /*!< size of array */ )
17 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
18 if (tx < n) y[tx] = x[tx];
22 /*! Set all elements of a 1-dimensional array \a x (any datatype)
23 of length \a size to a scalar \a value. \sa #_ArraySetValue_ */
25 void ArraySetValue_kernel( double* x, /*!< array*/
26 int n, /*!< size of array */
27 double value /*!< scalar value */ )
29 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
30 if (tx < n) x[tx] = value;
36 Element-wise AXPY \a y = \a a \a x + \a y,
37 where \a a is a scalar, and \a x, \a y, \a z are
38 1-dimensional arrays of length \a size. */
40 void ArrayAXPY_kernel( const double* x, /*!< x */
43 int n /*!< size of array */ )
45 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
46 if (tx < n) y[tx] += a*x[tx];
50 /*! \sa #_ArrayBlockMultiply_
52 Given two arrays: \a x of size \a n*bs, and \a a of size \a n, this function
53 implements: \a x[i][j] *= \a a[i] where \a i = 1,...,\a n, j = 1,...,\a bs,
54 and \a x is stored as a 1D array in row-major format, i.e., \a x[i][j] = \a x[i*bs+j]. */
56 void ArrayBlockMultiply_kernel( double* x, /*!< x */
57 const double* a, /*!< a */
58 int n, /*!< size of array */
59 int bs /*!< block size */)
61 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
63 for (int i = 0; i < bs; i++) x[tx*bs + i] *= a[tx];
67 /*! Alternative implementation of #_ArrayCopy1D_ */
69 void ArrayCopy1DNewScheme_kernel( const double* __restrict__ src, /*!< source array */
70 double* __restrict__ dest, /*!< destination array */
71 int npoints, /*!< number of points */
72 int nvars /*!< number of components */ )
74 int p = blockDim.x * blockIdx.x + threadIdx.x;
76 for (int v=0; v<nvars; v++) {
77 dest[p+v*npoints] = src[p*nvars+v];
84 void gpuSetDevice(int device /*!< device */)
86 cudaSetDevice(device);
87 cudaError_t err = cudaGetLastError();
88 if (err != cudaSuccess) {
89 fprintf(stderr,"Error in gpuSetDevice(): device=%d error message=\"%s\"\n", device, cudaGetErrorString(err));
93 /*! GPU memory copy */
94 void gpuMemcpy( void* dest, /*!< destination */
95 const void* src, /*!< source */
96 size_t count,/*!< count */
97 enum gpuMemcpyKind kind /*!< kind of copy */ )
100 case gpuMemcpyHostToDevice:
101 checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyHostToDevice) );
103 case gpuMemcpyDeviceToHost:
104 checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToHost) );
106 case gpuMemcpyDeviceToDevice:
107 checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToDevice) );
110 fprintf(stderr, "Error: invalid gpuMemcpyKind: %d\n", kind);
117 /*! Allocate memory */
118 void gpuMalloc( void** devPtr, /*!< pointer to memory */
119 size_t size /*!< size of memory */ )
121 cudaMalloc(devPtr, size);
122 cudaError_t err = cudaGetLastError();
123 if (err != cudaSuccess) {
124 fprintf( stderr,"Error in gpuMalloc(): size=%d, error message=\"%s\"\n", size,
125 cudaGetErrorString(err) );
131 void gpuMemset( void* devPtr, /*!< Pointer to memory */
132 int value, /*!< value to set */
133 size_t count /*!< size of data */ )
135 checkCuda( cudaMemset(devPtr, value, count) );
139 /*! deallocate memory */
140 void gpuFree(void *devPtr /*!< Pointer to memory */)
142 checkCuda( cudaFree(devPtr) );
146 /*! Element-wise copy \a y = \a x, where \a x, \a y are 1-dimensional arrays
147 of length \a size. \sa #_ArrayCopy1D_, ArrayCopy1D_kernel() */
148 void gpuArrayCopy1D( const double* x, /*!< copy-from array*/
149 double* y, /*!< copy-to array */
150 int n /*!< size of array */ )
152 int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
153 ArrayCopy1D_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, y, n);
154 cudaDeviceSynchronize();
158 /*! Set all elements of a 1-dimensional array \a x (any datatype)
159 of length \a size to a scalar \a value.
160 \sa #_ArraySetValue_, ArraySetValue_kernel() */
161 void gpuArraySetValue( double* devPtr, /*!< array*/
162 int n, /*!< size of array */
163 double value /*!< scalar value */ )
165 int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
166 ArraySetValue_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(devPtr, n, value);
167 cudaDeviceSynchronize();
171 /*! \sa #_ArrayAXPY_, ArrayAXPY_kernel()
173 Element-wise AXPY \a y = \a a \a x + \a y,
174 where \a a is a scalar, and \a x, \a y, \a z are
175 1-dimensional arrays of length \a size.
177 void gpuArrayAXPY( const double* x, /*!< x */
180 int n /*!< size of array */ )
182 int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
183 ArrayAXPY_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, y, n);
184 cudaDeviceSynchronize();
188 /*! \sa #_ArrayBlockMultiply_, ArrayBlockMultiply_kernel()
190 Given two arrays: \a x of size \a n*bs, and \a a of size \a n, this function
191 implements: \a x[i][j] *= \a a[i] where \a i = 1,...,\a n, j = 1,...,\a bs,
192 and \a x is stored as a 1D array in row-major format, i.e., \a x[i][j] = \a x[i*bs+j]. */
193 void gpuArrayBlockMultiply( double* x, /*!< x */
194 const double* a, /*!< a */
195 int n, /*!< size of array */
196 int bs /*!< block size */ )
198 int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
199 ArrayBlockMultiply_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, n, bs);
200 cudaDeviceSynchronize();
204 /*! Returns the sum-of-squares of the elements in an n-D array (useful for L_2 norm)
205 \sa ArraySumSquarenD() */
206 double gpuArraySumSquarenD(int nvars, /*!< number of elements at one array location,
207 can be > 1 for systems of equations */
208 int ndims, /*!< number of dimensions */
209 int *dim, /*!< integer array of size in each dimension */
210 int ghosts, /*!< number of ghost points in the array x */
211 int *index, /*!< pre-allocated (by the calling function)
212 integer array of size ndims */
213 double *x /*!< the array */
217 printf("gpuArraySumSquarenD hasn't been implemented, yet.\n");
222 /*! Alternative implementation of #_ArrayCopy1D_ */
223 void gpuArrayCopy1DNewScheme( const double* src, /*!< source array */
224 double* dest, /*!< destination array */
225 int npoints, /*!< number of points */
226 int nvars /*!< number of components */ )
228 int nblocks = (npoints-1) / GPU_THREADS_PER_BLOCK + 1;
229 ArrayCopy1DNewScheme_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(src, dest, npoints, nvars);
230 cudaDeviceSynchronize();
234 /*! Check if two arrays are equal, if not, report the difference */
235 /*! Check if two arrays are equal, if not, report the difference */
236 void gpuArrayCheckEqual( const char* msg, /*!< message */
237 const double* var_adj, /*!< array */
238 const double* var_sep, /*!< array */
239 int npoints /*!< size of array */ )
241 double *h_var_adj = (double *) malloc(npoints*sizeof(double));
242 double *h_var_sep = (double *) malloc(npoints*sizeof(double));
244 gpuMemcpy(h_var_adj, var_adj, npoints*sizeof(double), gpuMemcpyDeviceToHost);
245 gpuMemcpy(h_var_sep, var_sep, npoints*sizeof(double), gpuMemcpyDeviceToHost);
247 double max_err = 0.0;
248 for (int j=0; j<npoints; j++) {
249 if (h_var_sep[j] != h_var_adj[j]) {
250 max_err = max(max_err, fabs(h_var_sep[j]-h_var_adj[j]));
257 if (max_err > 1e-10) {
258 printf("gpuArrayCheckEqual: %-30s max_err = %e\n", msg, max_err);