17 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
18 if (tx < n) y[tx] = x[tx];
29 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
30 if (tx < n) x[tx] = value;
45 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
46 if (tx < n) y[tx] += a*x[tx];
61 int tx = threadIdx.x + (blockIdx.x * blockDim.x);
63 for (
int i = 0; i < bs; i++) x[tx*bs + i] *= a[tx];
70 double* __restrict__ dest,
74 int p = blockDim.x * blockIdx.x + threadIdx.x;
76 for (
int v=0; v<nvars; v++) {
77 dest[p+v*npoints] = src[p*nvars+v];
86 cudaSetDevice(device);
87 cudaError_t err = cudaGetLastError();
88 if (err != cudaSuccess) {
89 fprintf(stderr,
"Error in gpuSetDevice(): device=%d error message=\"%s\"\n", device, cudaGetErrorString(err));
101 checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyHostToDevice) );
104 checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToHost) );
107 checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToDevice) );
110 fprintf(stderr,
"Error: invalid gpuMemcpyKind: %d\n", kind);
121 cudaMalloc(devPtr, size);
122 cudaError_t err = cudaGetLastError();
123 if (err != cudaSuccess) {
124 fprintf( stderr,
"Error in gpuMalloc(): size=%d, error message=\"%s\"\n", size,
125 cudaGetErrorString(err) );
135 checkCuda( cudaMemset(devPtr, value, count) );
142 checkCuda( cudaFree(devPtr) );
153 ArrayCopy1D_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, y, n);
154 cudaDeviceSynchronize();
166 ArraySetValue_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(devPtr, n, value);
167 cudaDeviceSynchronize();
183 ArrayAXPY_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, y, n);
184 cudaDeviceSynchronize();
199 ArrayBlockMultiply_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, n, bs);
200 cudaDeviceSynchronize();
217 printf(
"gpuArraySumSquarenD hasn't been implemented, yet.\n");
229 ArrayCopy1DNewScheme_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(src, dest, npoints, nvars);
230 cudaDeviceSynchronize();
237 const double* var_adj,
238 const double* var_sep,
241 double *h_var_adj = (
double *) malloc(npoints*
sizeof(
double));
242 double *h_var_sep = (
double *) malloc(npoints*
sizeof(
double));
247 double max_err = 0.0;
248 for (
int j=0; j<npoints; j++) {
249 if (h_var_sep[j] != h_var_adj[j]) {
250 max_err =
max(max_err, fabs(h_var_sep[j]-h_var_adj[j]));
257 if (max_err > 1e-10) {
258 printf(
"gpuArrayCheckEqual: %-30s max_err = %e\n", msg, max_err);
void gpuArrayAXPY(const double *, double, double *, int)
__global__ void ArrayCopy1DNewScheme_kernel(const double *__restrict__ src, double *__restrict__ dest, int npoints, int nvars)
void gpuArrayBlockMultiply(double *, const double *, int, int)
void gpuMemcpy(void *, const void *, size_t, enum gpuMemcpyKind)
void gpuArraySetValue(double *, int, double)
#define GPU_THREADS_PER_BLOCK
__global__ void ArraySetValue_kernel(double *x, int n, double value)
__global__ void ArrayBlockMultiply_kernel(double *x, const double *a, int n, int bs)
void gpuArrayCopy1DNewScheme(const double *, double *, int, int)
double gpuArraySumSquarenD(int, int, int *, int, int *, double *)
Contains function definitions for common array operations on GPU.
long sum(const std::vector< int > &a_iv)
void gpuMemset(void *, int, size_t)
void gpuMalloc(void **, size_t)
__global__ void ArrayAXPY_kernel(const double *x, double a, double *y, int n)
void gpuSetDevice(int device)
void gpuArrayCheckEqual(const char *, const double *, const double *, int, int)
__global__ void ArrayCopy1D_kernel(const double *x, double *y, int n)
void gpuArrayCopy1D(const double *, double *, int)