GPU implementations of array functions. More...

#include <assert.h>
#include <basic_gpu.h>
#include <arrayfunctions_gpu.h>

Functions
__global__ void	ArrayCopy1D_kernel (const double *x, double *y, int n)

__global__ void	ArraySetValue_kernel (double *x, int n, double value)

__global__ void	ArrayAXPY_kernel (const double *x, double a, double *y, int n)

__global__ void	ArrayBlockMultiply_kernel (double *x, const double *a, int n, int bs)

__global__ void	ArrayCopy1DNewScheme_kernel (const double *__restrict__ src, double *__restrict__ dest, int npoints, int nvars)

void	gpuSetDevice (int device)

void	gpuMemcpy (void *dest, const void *src, size_t count, enum gpuMemcpyKind kind)

void	gpuMalloc (void **devPtr, size_t size)

void	gpuMemset (void *devPtr, int value, size_t count)

void	gpuFree (void *devPtr)

void	gpuArrayCopy1D (const double *x, double *y, int n)

void	gpuArraySetValue (double *devPtr, int n, double value)

void	gpuArrayAXPY (const double *x, double a, double *y, int n)

void	gpuArrayBlockMultiply (double *x, const double *a, int n, int bs)

double	gpuArraySumSquarenD (int nvars, int ndims, int *dim, int ghosts, int *index, double *x)

void	gpuArrayCopy1DNewScheme (const double *src, double *dest, int npoints, int nvars)

void	gpuArrayCheckEqual (const char *msg, const double *var_adj, const double *var_sep, int npoints)

Detailed Description

GPU implementations of array functions.

Author: Youngdae Kim

Definition in file ArrayImplementations_GPU.cu.

Function Documentation

__global__ void ArrayCopy1D_kernel	(	const double *	x,
		double *	y,
		int	n
	)

Element-wise copy y = x, where x, y are 1-dimensional arrays of length n.

See Also: _ArrayCopy1D_

Parameters

x	copy-from array
y	copy-to array
n	size of array

Definition at line 13 of file ArrayImplementations_GPU.cu.

 {
     int tx = threadIdx.x + (blockIdx.x * blockDim.x);
     if (tx < n) y[tx] = x[tx];
     return;
 }

__global__ void ArraySetValue_kernel	(	double *	x,
		int	n,
		double	value
	)

Set all elements of a 1-dimensional array x (of type double) of length n to a scalar value.

See Also: _ArraySetValue_

Parameters

x	array
n	size of array
value	scalar value

Definition at line 25 of file ArrayImplementations_GPU.cu.

 {
     int tx = threadIdx.x + (blockIdx.x * blockDim.x);
     if (tx < n) x[tx] = value;
     return;
 }

__global__ void ArrayAXPY_kernel	(	const double *	x,
		double	a,
		double *	y,
		int	n
	)

See Also: _ArrayAXPY_

Element-wise AXPY y = a x + y, where a is a scalar, and x, y are 1-dimensional arrays of length n.

Parameters

x	x
a	a
y	y
n	size of array

Definition at line 40 of file ArrayImplementations_GPU.cu.

 {
     int tx = threadIdx.x + (blockIdx.x * blockDim.x);
     if (tx < n) y[tx] += a*x[tx];
     return;
 }

__global__ void ArrayBlockMultiply_kernel	(	double *	x,
		const double *	a,
		int	n,
		int	bs
	)

See Also: _ArrayBlockMultiply_

Given two arrays: x of size n*bs, and a of size n, this function implements: x[i][j] *= a[i] where i = 0,...,n-1, j = 0,...,bs-1, and x is stored as a 1D array in row-major format, i.e., x[i][j] = x[i*bs+j].

Parameters

x	x
a	a
n	size of array
bs	block size

Definition at line 56 of file ArrayImplementations_GPU.cu.

 {
     int tx = threadIdx.x + (blockIdx.x * blockDim.x);
     if (tx < n) {
         for (int i = 0; i < bs; i++) x[tx*bs + i] *= a[tx];
     }
 }

__global__ void ArrayCopy1DNewScheme_kernel	(	const double *__restrict__	src,
		double *__restrict__	dest,
		int	npoints,
		int	nvars
	)

Alternative implementation of _ArrayCopy1D_

Parameters

src	source array
dest	destination array
npoints	number of points
nvars	number of components

Definition at line 69 of file ArrayImplementations_GPU.cu.

 {
     int p = blockDim.x * blockIdx.x + threadIdx.x;
     if (p < npoints) {
         for (int v=0; v<nvars; v++) {
             dest[p+v*npoints] = src[p*nvars+v];
         }
     }
     return;
 }

void gpuSetDevice ( int device )

Set device

Parameters

device device

Definition at line 84 of file ArrayImplementations_GPU.cu.

 {
     cudaSetDevice(device);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
       fprintf(stderr,"Error in gpuSetDevice(): device=%d error message=\"%s\"\n", device, cudaGetErrorString(err));
     }
 }

void gpuMemcpy	(	void *	dest,
		const void *	src,
		size_t	count,
		enum gpuMemcpyKind	kind
	)

GPU memory copy

Parameters

dest	destination
src	source
count	count
kind	kind of copy

Definition at line 94 of file ArrayImplementations_GPU.cu.

 {
     switch (kind) {
         case gpuMemcpyHostToDevice:
             checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyHostToDevice) );
             break;
         case gpuMemcpyDeviceToHost:
             checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToHost) );
             break;
         case gpuMemcpyDeviceToDevice:
             checkCuda( cudaMemcpy(dest, src, count, cudaMemcpyDeviceToDevice) );
             break;
         default:
             fprintf(stderr, "Error: invalid gpuMemcpyKind: %d\n", kind);
             assert(0);
             break;
     }
     return;
 }

void gpuMalloc	(	void **	devPtr,
		size_t	size
	)

Allocate memory

Parameters

devPtr	pointer to memory
size	size of memory

Definition at line 118 of file ArrayImplementations_GPU.cu.

 {
     cudaMalloc(devPtr, size);
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
       fprintf(  stderr,"Error in gpuMalloc(): size=%d, error message=\"%s\"\n", size, 
                 cudaGetErrorString(err) );
     }
     return;
 }

void gpuMemset	(	void *	devPtr,
		int	value,
		size_t	count
	)

Set value

Parameters

devPtr	Pointer to memory
value	value to set
count	size of data

Definition at line 131 of file ArrayImplementations_GPU.cu.

 {
     checkCuda( cudaMemset(devPtr, value, count) );
     return;
 }

void gpuFree ( void * devPtr )

deallocate memory

Parameters

devPtr Pointer to memory

Definition at line 140 of file ArrayImplementations_GPU.cu.

 {
     checkCuda( cudaFree(devPtr) );
     return;
 }

void gpuArrayCopy1D	(	const double *	x,
		double *	y,
		int	n
	)

Element-wise copy y = x, where x, y are 1-dimensional arrays of length n.

See Also: _ArrayCopy1D_, ArrayCopy1D_kernel()

Parameters

x	copy-from array
y	copy-to array
n	size of array

Definition at line 148 of file ArrayImplementations_GPU.cu.

 {
     int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
     ArrayCopy1D_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, y, n);
     cudaDeviceSynchronize();
     return;
 }

void gpuArraySetValue	(	double *	devPtr,
		int	n,
		double	value
	)

Set all elements of a 1-dimensional array devPtr (of type double) of length n to a scalar value.

See Also: _ArraySetValue_, ArraySetValue_kernel()

Parameters

devPtr	array
n	size of array
value	scalar value

Definition at line 161 of file ArrayImplementations_GPU.cu.

 {
     int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
     ArraySetValue_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(devPtr, n, value);
     cudaDeviceSynchronize();
     return;
 }

void gpuArrayAXPY	(	const double *	x,
		double	a,
		double *	y,
		int	n
	)

See Also: _ArrayAXPY_, ArrayAXPY_kernel()

Element-wise AXPY y = a x + y, where a is a scalar, and x, y are 1-dimensional arrays of length n.

Parameters

x	x
a	a
y	y
n	size of array

Definition at line 177 of file ArrayImplementations_GPU.cu.

 {
     int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
     ArrayAXPY_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, y, n);
     cudaDeviceSynchronize();
     return;
 }

void gpuArrayBlockMultiply	(	double *	x,
		const double *	a,
		int	n,
		int	bs
	)

See Also: _ArrayBlockMultiply_, ArrayBlockMultiply_kernel()

Given two arrays: x of size n*bs, and a of size n, this function implements: x[i][j] *= a[i] where i = 0,...,n-1, j = 0,...,bs-1, and x is stored as a 1D array in row-major format, i.e., x[i][j] = x[i*bs+j].

Parameters

x	x
a	a
n	size of array
bs	block size

Definition at line 193 of file ArrayImplementations_GPU.cu.

 {
     int nblocks = (n - 1) / GPU_THREADS_PER_BLOCK + 1;
     ArrayBlockMultiply_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(x, a, n, bs);
     cudaDeviceSynchronize();
     return;
 }

double gpuArraySumSquarenD	(	int	nvars,
		int	ndims,
		int *	dim,
		int	ghosts,
		int *	index,
		double *	x
	)

Returns the sum-of-squares of the elements in an n-D array (useful for L_2 norm)

See Also: ArraySumSquarenD()

Parameters

nvars	number of elements at one array location, can be > 1 for systems of equations
ndims	number of dimensions
dim	integer array of size in each dimension
ghosts	number of ghost points in the array x
index	pre-allocated (by the calling function) integer array of size ndims
x	the array

Definition at line 206 of file ArrayImplementations_GPU.cu.

 {
     double sum = 0;
     printf("gpuArraySumSquarenD hasn't been implemented, yet.\n");
     exit(0);
     return (sum);
 }

void gpuArrayCopy1DNewScheme	(	const double *	src,
		double *	dest,
		int	npoints,
		int	nvars
	)

Alternative implementation of _ArrayCopy1D_

Parameters

src	source array
dest	destination array
npoints	number of points
nvars	number of components

Definition at line 223 of file ArrayImplementations_GPU.cu.

 {
     int nblocks = (npoints-1) / GPU_THREADS_PER_BLOCK + 1;
     ArrayCopy1DNewScheme_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(src, dest, npoints, nvars);
     cudaDeviceSynchronize();
     return;
 }

void gpuArrayCheckEqual	(	const char *	msg,
		const double *	var_adj,
		const double *	var_sep,
		int	npoints
	)

Check if two arrays are equal, if not, report the difference

Parameters

msg	message
var_adj	array
var_sep	array
npoints	size of array

Definition at line 236 of file ArrayImplementations_GPU.cu.

 {
     double *h_var_adj = (double *) malloc(npoints*sizeof(double));
     double *h_var_sep = (double *) malloc(npoints*sizeof(double));
 
     gpuMemcpy(h_var_adj, var_adj, npoints*sizeof(double), gpuMemcpyDeviceToHost);
     gpuMemcpy(h_var_sep, var_sep, npoints*sizeof(double), gpuMemcpyDeviceToHost);
 
     double max_err = 0.0;
     for (int j=0; j<npoints; j++) {
         if (h_var_sep[j] != h_var_adj[j]) {
             max_err = max(max_err, fabs(h_var_sep[j]-h_var_adj[j]));
         }
     }
 
     free(h_var_adj);
     free(h_var_sep);
 
     if (max_err > 1e-10) {
         printf("gpuArrayCheckEqual: %-30s max_err = %e\n", msg, max_err);
         exit(0);
     }
     return;
 }

Functions

Detailed Description

Function Documentation