  /* excerpt from int Initialize(void *s, int nsims): partition the domain
     and allocate the solver's data arrays */

#if defined(HAVE_CUDA)
  /* ... */
#endif

  if (!simobj[0].mpi.rank) printf("Partitioning domain and allocating data arrays.\n");

  for (n = 0; n < nsims; n++) {
    simobj[n].mpi.ip            = (int*) calloc (simobj[n].solver.ndims, sizeof(int));
    simobj[n].mpi.is            = (int*) calloc (simobj[n].solver.ndims, sizeof(int));
    simobj[n].mpi.ie            = (int*) calloc (simobj[n].solver.ndims, sizeof(int));
    simobj[n].mpi.bcperiodic    = (int*) calloc (simobj[n].solver.ndims, sizeof(int));
    simobj[n].solver.dim_local  = (int*) calloc (simobj[n].solver.ndims, sizeof(int));
    simobj[n].solver.isPeriodic = (int*) calloc (simobj[n].solver.ndims, sizeof(int));
#if defined(HAVE_CUDA)
    /* ... */
#endif

    /* check that the processor layout is consistent with the number of MPI ranks */
    int total_proc = 1;
    for (i = 0; i < simobj[n].solver.ndims; i++) total_proc *= simobj[n].mpi.iproc[i];
    if (simobj[n].mpi.nproc != total_proc) {
      fprintf(stderr, "Error on rank %d: total number of processes is not consistent ", simobj[n].mpi.rank);
      fprintf(stderr, "with number of processes along each dimension.\n");
      if (nsims > 1) fprintf(stderr, "for domain %d.\n", n);
      fprintf(stderr, "mpiexec was called with %d processes, ", simobj[n].mpi.nproc);
      fprintf(stderr, "total number of processes from \"solver.inp\" is %d.\n", total_proc);
      /* ... */
    }
    /* closing argument of a longer per-dimension partitioning call,
       presumably MPIPartition1D() */
    /* ... */ simobj[n].mpi.ip[i] );

    /* ... */
    for (i = 0; i < simobj[n].solver.ndims; i++) simobj[n].mpi.bcperiodic[i] = 0;

    /* ... */
      simobj[n].mpi.ip[i] = 0;
    /* ... */
      simobj[n].mpi.is[i] = 0;
    /* ... */

    int accu1 = 1, accu2 = 1;
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      /* ... */
      gpuMemcpy( simobj[n].solver.gpu_dim_local, /* ... */ );
      /* ... */
    }
#endif

    /* solution array: nvars components per grid point */
    simobj[n].solver.u = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      /* ... */
    }
#endif
    /* extra work arrays needed when time integration is handled by PETSc */
    if (simobj[n].solver.use_petscTS) {
      simobj[n].solver.u0     = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.uref   = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.rhsref = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.rhs    = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
    }
    /* ... */
    simobj[n].solver.hyp    = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
    simobj[n].solver.par    = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
    simobj[n].solver.source = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
    simobj[n].solver.iblank = (double*) calloc (size, sizeof(double));
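    /* The names above presumably follow the solver's term splitting: hyp and
       par hold the discretized hyperbolic and parabolic terms and source the
       source term (nvars components per point), while iblank is a per-point
       blanking array (it is set to 1 at the end of this routine). */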
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      /* ... */
    } else {
#endif
      simobj[n].solver.hyp    = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.par    = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.source = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
#if defined(HAVE_CUDA)
    }
#endif

    simobj[n].solver.iblank = (double*) calloc (size, sizeof(double));
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      gpuMalloc((void**)&simobj[n].solver.gpu_iblank, size*sizeof(double));
      gpuMemset(simobj[n].solver.gpu_iblank, 0, size*sizeof(double));
    }
#endif
    /* grid coordinate and inverse-spacing arrays */
    simobj[n].solver.x     = (double*) calloc (size, sizeof(double));
    simobj[n].solver.dxinv = (double*) calloc (size, sizeof(double));
    /* ... */
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      gpuMalloc((void**)&simobj[n].solver.gpu_x, size*sizeof(double));
      gpuMalloc((void**)&simobj[n].solver.gpu_dxinv, size*sizeof(double));
      gpuMemset(simobj[n].solver.gpu_x, 0, size*sizeof(double));
      gpuMemset(simobj[n].solver.gpu_dxinv, 0, size*sizeof(double));
    }
#endif
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      /* ... */
    } else {
#endif
      /* cell-centered work arrays: solution, flux, and two derivative arrays */
      simobj[n].solver.uC     = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.fluxC  = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.Deriv1 = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
      simobj[n].solver.Deriv2 = (double*) calloc (simobj[n].solver.nvars*size, sizeof(double));
#if defined(HAVE_CUDA)
    }
#endif

    /* size for the interface arrays */
    size = 1; for (i = 0; i < simobj[n].solver.ndims; i++) size *= (simobj[n].solver.dim_local[i]+1);
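    /* A note on the size just computed: with dim_local = {4, 3}, say,
       prod(dim_local[i]+1) = 5*4 = 20, while the number of interfaces along
       one dimension is (4+1)*3 = 15 or 4*(3+1) = 16. The product therefore
       appears to serve as a bound large enough for the interface arrays
       along any single dimension, so the same buffers can be reused for
       each direction. */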
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      gpuMalloc((void**)&simobj[n].solver.fluxI, size*sizeof(double));
      gpuMalloc((void**)&simobj[n].solver.uL, size*sizeof(double));
      gpuMalloc((void**)&simobj[n].solver.uR, size*sizeof(double));
      gpuMalloc((void**)&simobj[n].solver.fL, size*sizeof(double));
      gpuMalloc((void**)&simobj[n].solver.fR, size*sizeof(double));
      gpuMemset(simobj[n].solver.fluxI, 0, size*sizeof(double));
      gpuMemset(simobj[n].solver.uL, 0, size*sizeof(double));
      gpuMemset(simobj[n].solver.uR, 0, size*sizeof(double));
      gpuMemset(simobj[n].solver.fL, 0, size*sizeof(double));
      gpuMemset(simobj[n].solver.fR, 0, size*sizeof(double));
    } else {
#endif
      simobj[n].solver.fluxI = (double*) calloc (size, sizeof(double));
      simobj[n].solver.uL    = (double*) calloc (size, sizeof(double));
      simobj[n].solver.uR    = (double*) calloc (size, sizeof(double));
      simobj[n].solver.fL    = (double*) calloc (size, sizeof(double));
      simobj[n].solver.fR    = (double*) calloc (size, sizeof(double));
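      /* fluxI presumably holds the interface flux, with uL/uR and fL/fR the
         left- and right-biased interface values of the solution and flux
         used by the reconstruction/upwinding steps. */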
#if defined(HAVE_CUDA)
    }
#endif

    /* size the MPI send/receive buffers for ghost-point exchanges */
    /* ... */
    for (d = 0; d < simobj[n].solver.ndims; d++) {
      bufdim[d] = 1;
      for (i = 0; i < simobj[n].solver.ndims; i++) {
        if (i == d) bufdim[d] *= simobj[n].solver.ghosts;
        else        bufdim[d] *= simobj[n].solver.dim_local[i];
      }
      if (bufdim[d] > maxbuf) maxbuf = bufdim[d];
    }
    maxbuf *= (simobj[n].solver.nvars*simobj[n].solver.ndims);
    simobj[n].mpi.maxbuf = maxbuf;
    simobj[n].mpi.sendbuf = (double*) calloc (2*simobj[n].solver.ndims*maxbuf, sizeof(double));
    simobj[n].mpi.recvbuf = (double*) calloc (2*simobj[n].solver.ndims*maxbuf, sizeof(double));
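    /* One maxbuf-sized slot is reserved for each of the two faces (low and
       high side) in every dimension, hence the factor 2*ndims; maxbuf itself
       is the largest per-face ghost layer scaled by nvars (and, as computed
       above, by ndims), so the same pair of buffers can serve any exchange. */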
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      simobj[n].mpi.cpu_dim = (int*) calloc (simobj[n].solver.ndims, sizeof(int));
      _ArrayCopy1D_(simobj[n].solver.dim_local, simobj[n].mpi.cpu_dim, simobj[n].solver.ndims);
      gpuMalloc((void**)&simobj[n].mpi.gpu_sendbuf, 2*simobj[n].solver.ndims*simobj[n].mpi.maxbuf*sizeof(double));
      gpuMalloc((void**)&simobj[n].mpi.gpu_recvbuf, 2*simobj[n].solver.ndims*simobj[n].mpi.maxbuf*sizeof(double));
      gpuMemset(simobj[n].mpi.gpu_sendbuf, 0, 2*simobj[n].solver.ndims*simobj[n].mpi.maxbuf*sizeof(double));
      gpuMemset(simobj[n].mpi.gpu_recvbuf, 0, 2*simobj[n].solver.ndims*simobj[n].mpi.maxbuf*sizeof(double));
    }
#endif
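    /* cpu_dim appears to keep a host-side copy of the local dimensions
       (via _ArrayCopy1D_) for the MPI exchange logic, while the mirrored
       gpu_sendbuf/gpu_recvbuf hold the ghost-point data on the device. */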
    /* volume and boundary integrals of the conserved variables, and the
       conservation error */
    simobj[n].solver.VolumeIntegral        = (double*) calloc (simobj[n].solver.nvars, sizeof(double));
    simobj[n].solver.VolumeIntegralInitial = (double*) calloc (simobj[n].solver.nvars, sizeof(double));
    simobj[n].solver.TotalBoundaryIntegral = (double*) calloc (simobj[n].solver.nvars, sizeof(double));
    simobj[n].solver.ConservationError     = (double*) calloc (simobj[n].solver.nvars, sizeof(double));
    for (i = 0; i < simobj[n].solver.nvars; i++) simobj[n].solver.ConservationError[i] = -1;
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      /* count boundary (face) points of the local domain for the GPU
         stage-boundary buffer */
      int total_offset = 0;
      for (d = 0; d < simobj[n].solver.ndims; d++) {
        simobj[n].solver.gpu_npoints_boundary_offset[d] = total_offset;
        simobj[n].solver.gpu_npoints_boundary[d] = 1;
        for (i = 0; i < simobj[n].solver.ndims; i++) {
          if (i != d) simobj[n].solver.gpu_npoints_boundary[d] *= simobj[n].solver.dim_local[i];
        }
        total_offset += 2*simobj[n].solver.gpu_npoints_boundary[d];
      }
      simobj[n].solver.StageBoundaryBuffer_size = (total_offset*simobj[n].solver.nvars);
      gpuMalloc((void**)&simobj[n].solver.StageBoundaryBuffer, simobj[n].solver.StageBoundaryBuffer_size*sizeof(double));
      gpuMemset(simobj[n].solver.StageBoundaryBuffer, 0, simobj[n].solver.StageBoundaryBuffer_size*sizeof(double));
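      /* Each dimension d contributes two faces with
         prod_{i != d} dim_local[i] points apiece; total_offset accumulates
         these, so StageBoundaryBuffer provides nvars values for every
         boundary point of the local domain. */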
      size = 2*simobj[n].solver.ndims*simobj[n].solver.nvars;
      gpuMalloc((void**)&simobj[n].solver.StageBoundaryIntegral, size*sizeof(double));
      gpuMalloc((void**)&simobj[n].solver.StepBoundaryIntegral, size*sizeof(double));
      gpuMemset(simobj[n].solver.StageBoundaryIntegral, 0, size*sizeof(double));
      gpuMemset(simobj[n].solver.StepBoundaryIntegral, 0, size*sizeof(double));
    } else {
#endif
      simobj[n].solver.StageBoundaryIntegral = (double*) calloc (2*simobj[n].solver.ndims*simobj[n].solver.nvars, sizeof(double));
      simobj[n].solver.StepBoundaryIntegral  = (double*) calloc (2*simobj[n].solver.ndims*simobj[n].solver.nvars, sizeof(double));
#if defined(HAVE_CUDA)
    }
#endif

    /* initialize the function-call counters */
    simobj[n].solver.count_hyp
      = simobj[n].solver.count_par
      = simobj[n].solver.count_sou
      /* ... */
      = 0;
    simobj[n].solver.count_RHSFunction
      = simobj[n].solver.count_IFunction
      = simobj[n].solver.count_IJacobian
      = simobj[n].solver.count_IJacFunction
      = 0;
    /* mark all grid points as valid */
    _ArraySetValue_(simobj[n].solver.iblank, simobj[n].solver.npoints_local_wghosts, 1);
#if defined(HAVE_CUDA)
    if (simobj[n].solver.use_gpu) {
      gpuArraySetValue(simobj[n].solver.gpu_iblank, simobj[n].solver.npoints_local_wghosts, 1.0);
    }
#endif
    /* ... */

  } /* end of loop over simulation domains */
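The nvars*size allocations above assume that size already counts the ghost
points surrounding the local sub-domain (compare npoints_local_wghosts). As a
rough, self-contained sketch of that sizing (the function and variable names
below are illustrative, not taken from the solver), assuming a uniform number
of ghost points on each side of every dimension:

  #include <stdio.h>

  /* number of local grid points including ghost points */
  static long points_with_ghosts(const int *dim_local, int ndims, int ghosts)
  {
    long n = 1;
    for (int i = 0; i < ndims; i++) n *= (long)(dim_local[i] + 2*ghosts);
    return n;
  }

  int main(void)
  {
    int  dim_local[2] = {64, 32};   /* hypothetical local domain */
    int  ghosts = 3, nvars = 4;     /* hypothetical ghost width and variable count */
    long size = points_with_ghosts(dim_local, 2, ghosts);
    /* (64+6)*(32+6) = 2660 points; 4*2660 = 10640 doubles in the solution array */
    printf("points with ghosts = %ld, solution array length = %ld\n",
           size, (long)nvars*size);
    return 0;
  }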