HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
ArrayImplementations_GPU.cu
Go to the documentation of this file.
1 
6 #include <assert.h>
7 #include <basic_gpu.h>
8 #include <arrayfunctions_gpu.h>
9 
12 __global__
13 void ArrayCopy1D_kernel( const double* x,
14  double* y,
15  int n )
16 {
17  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
18  if (tx < n) y[tx] = x[tx];
19  return;
20 }
21 
24 __global__
25 void ArraySetValue_kernel( double* x,
26  int n,
27  double value )
28 {
29  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
30  if (tx < n) x[tx] = value;
31  return;
32 }
33 
39 __global__
40 void ArrayAXPY_kernel( const double* x,
41  double a,
42  double* y,
43  int n )
44 {
45  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
46  if (tx < n) y[tx] += a*x[tx];
47  return;
48 }
49 
55 __global__
56 void ArrayBlockMultiply_kernel( double* x,
57  const double* a,
58  int n,
59  int bs )
60 {
61  int tx = threadIdx.x + (blockIdx.x * blockDim.x);
62  if (tx < n) {
63  for (int i = 0; i < bs; i++) x[tx*bs + i] *= a[tx];
64  }
65 }
66 
68 __global__
69 void ArrayCopy1DNewScheme_kernel( const double* __restrict__ src,
70  double* __restrict__ dest,
71  int npoints,
72  int nvars )
73 {
74  int p = blockDim.x * blockIdx.x + threadIdx.x;
75  if (p < npoints) {
76  for (int v=0; v<nvars; v++) {
77  dest[p+v*npoints] = src[p*nvars+v];
78  }
79  }
80  return;
81 }
82 
84 void gpuSetDevice(int device )
85 {
86  cudaSetDevice(device);
87  cudaError_t err = cudaGetLastError();
88  if (err != cudaSuccess) {
89  fprintf(stderr,"Error in gpuSetDevice(): device=%d error message=\"%s\"\n", device, cudaGetErrorString(err));
90  }
91 }
92 
94 void gpuMemcpy( void* dest,
95  const void* src,
96  size_t count,
97  enum gpuMemcpyKind kind )
98 {
99  switch (kind) {
101  checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyHostToDevice) );
102  break;
104  checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToHost) );
105  break;
107  checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToDevice) );
108  break;
109  default:
110  fprintf(stderr, "Error: invalid gpuMemcpyKind: %d\n", kind);
111  assert(0);
112  break;
113  }
114  return;
115 }
116 
118 void gpuMalloc( void** devPtr,
119  size_t size )
120 {
121  cudaMalloc(devPtr, size);
122  cudaError_t err = cudaGetLastError();
123  if (err != cudaSuccess) {
124  fprintf( stderr,"Error in gpuMalloc(): size=%d, error message=\"%s\"\n", size,
125  cudaGetErrorString(err) );
126  }
127  return;
128 }
129 
131 void gpuMemset( void* devPtr,
132  int value,
133  size_t count )
134 {
135  checkCuda( cudaMemset(devPtr, value, count) );
136  return;
137 }
138 
140 void gpuFree(void *devPtr )
141 {
142  checkCuda( cudaFree(devPtr) );
143  return;
144 }
145 
148 void gpuArrayCopy1D( const double* x,
149  double* y,
150  int n )
151 {
152  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
153  ArrayCopy1D_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, y, n);
154  cudaDeviceSynchronize();
155  return;
156 }
157 
161 void gpuArraySetValue( double* devPtr,
162  int n,
163  double value )
164 {
165  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
166  ArraySetValue_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(devPtr, n, value);
167  cudaDeviceSynchronize();
168  return;
169 }
170 
177 void gpuArrayAXPY( const double* x,
178  double a,
179  double* y,
180  int n )
181 {
182  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
183  ArrayAXPY_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, y, n);
184  cudaDeviceSynchronize();
185  return;
186 }
187 
193 void gpuArrayBlockMultiply( double* x,
194  const double* a,
195  int n,
196  int bs )
197 {
198  int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
199  ArrayBlockMultiply_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, n, bs);
200  cudaDeviceSynchronize();
201  return;
202 }
203 
206 double gpuArraySumSquarenD(int nvars,
208  int ndims,
209  int *dim,
210  int ghosts,
211  int *index,
213  double *x
214  )
215 {
216  double sum = 0;
217  printf("gpuArraySumSquarenD hasn't been implemented, yet.\n");
218  exit(0);
219  return (sum);
220 }
221 
223 void gpuArrayCopy1DNewScheme( const double* src,
224  double* dest,
225  int npoints,
226  int nvars )
227 {
228  int nblocks = (npoints-1) / GPU_THREADS_PER_BLOCK + 1;
229  ArrayCopy1DNewScheme_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(src, dest, npoints, nvars);
230  cudaDeviceSynchronize();
231  return;
232 }
233 
236 void gpuArrayCheckEqual( const char* msg,
237  const double* var_adj,
238  const double* var_sep,
239  int npoints )
240 {
241  double *h_var_adj = (double *) malloc(npoints*sizeof(double));
242  double *h_var_sep = (double *) malloc(npoints*sizeof(double));
243 
244  gpuMemcpy(h_var_adj, var_adj, npoints*sizeof(double), gpuMemcpyDeviceToHost);
245  gpuMemcpy(h_var_sep, var_sep, npoints*sizeof(double), gpuMemcpyDeviceToHost);
246 
247  double max_err = 0.0;
248  for (int j=0; j<npoints; j++) {
249  if (h_var_sep[j] != h_var_adj[j]) {
250  max_err = max(max_err, fabs(h_var_sep[j]-h_var_adj[j]));
251  }
252  }
253 
254  free(h_var_adj);
255  free(h_var_sep);
256 
257  if (max_err > 1e-10) {
258  printf("gpuArrayCheckEqual: %-30s max_err = %e\n", msg, max_err);
259  exit(0);
260  }
261  return;
262 }
263 
void gpuArrayAXPY(const double *, double, double *, int)
__global__ void ArrayCopy1DNewScheme_kernel(const double *__restrict__ src, double *__restrict__ dest, int npoints, int nvars)
void gpuArrayBlockMultiply(double *, const double *, int, int)
void gpuMemcpy(void *, const void *, size_t, enum gpuMemcpyKind)
void gpuArraySetValue(double *, int, double)
#define GPU_THREADS_PER_BLOCK
Definition: basic_gpu.h:7
__global__ void ArraySetValue_kernel(double *x, int n, double value)
__global__ void ArrayBlockMultiply_kernel(double *x, const double *a, int n, int bs)
void gpuArrayCopy1DNewScheme(const double *, double *, int, int)
double gpuArraySumSquarenD(int, int, int *, int, int *, double *)
void gpuFree(void *)
Contains function definitions for common array operations on GPU.
long sum(const std::vector< int > &a_iv)
Definition: std_vec_ops.h:18
void gpuMemset(void *, int, size_t)
void gpuMalloc(void **, size_t)
__global__ void ArrayAXPY_kernel(const double *x, double a, double *y, int n)
void gpuSetDevice(int device)
void gpuArrayCheckEqual(const char *, const double *, const double *, int, int)
__global__ void ArrayCopy1D_kernel(const double *x, double *y, int n)
gpuMemcpyKind
#define max(a, b)
Definition: math_ops.h:18
void gpuArrayCopy1D(const double *, double *, int)