HyPar  1.0
Finite-Difference Hyperbolic-Parabolic PDE Solver on Cartesian Grids
NavierStokes3DParabolicFunction_GPU.cu
Go to the documentation of this file.
1 /*! @file NavierStokes3DParabolicFunction_GPU.cu
2  @author Youngdae Kim
3  @brief Compute the viscous terms for the 3D Navier Stokes equations
4 */
5 #include <basic_gpu.h>
6 #include <arrayfunctions_gpu.h>
7 #include <mathfunctions.h>
8 #include <physicalmodels/navierstokes3d.h>
9 #include <mpivars.h>
10 #include <hypar.h>
11 
12 #ifdef CUDA_VAR_ORDERDING_AOS
13 
14 /*! Kernel function */
15 __global__
16 void gpuNavierStokes3DParabolicFunction_Q_kernel(
17  int npoints_grid,
18  double gamma,
19  const double * __restrict__ u,
20  double * __restrict__ Q
21 )
22 {
23  int p = blockDim.x * blockIdx.x + threadIdx.x;
24 
25  if (p < npoints_grid) {
26  double energy, pressure;
27  p *= _MODEL_NVARS_;
28  _NavierStokes3DGetFlowVar_(
29  (u+p),
30  _NavierStokes3D_stride_,
31  Q[p+0],
32  Q[p+1],
33  Q[p+2],
34  Q[p+3],
35  energy,
36  pressure,
37  gamma
38  );
39  Q[p+4] = gamma*pressure/Q[p+0]; /* temperature */
40  }
41  return;
42 }
43 
44 /*! Kernel function */
45 __global__
46 void gpuNavierStokes3DParabolicFunction_QDeriv_kernel(
47  int npoints_grid,
48  int ghosts,
49  const int * __restrict__ dim,
50  const double * __restrict__ dxinv,
51  double * __restrict__ QDerivX,
52  double * __restrict__ QDerivY,
53  double * __restrict__ QDerivZ
54 )
55 {
56  int p = blockDim.x * blockIdx.x + threadIdx.x;
57 
58  if (p < npoints_grid) {
59  int index[3];
60  double dxinv_val, dyinv_val, dzinv_val;
61 
62  _ArrayIndexnD_(_MODEL_NDIMS_,p,dim,index,ghosts); p *= _MODEL_NVARS_;
63  _GetCoordinate_(_XDIR_,index[_XDIR_],dim,ghosts,dxinv,dxinv_val);
64  _GetCoordinate_(_YDIR_,index[_YDIR_],dim,ghosts,dxinv,dyinv_val);
65  _GetCoordinate_(_ZDIR_,index[_ZDIR_],dim,ghosts,dxinv,dzinv_val);
66  _ArrayScale1D_((QDerivX+p),dxinv_val,_MODEL_NVARS_);
67  _ArrayScale1D_((QDerivY+p),dyinv_val,_MODEL_NVARS_);
68  _ArrayScale1D_((QDerivZ+p),dzinv_val,_MODEL_NVARS_);
69  }
70  return;
71 }
72 
73 /*! Kernel function */
74 __global__
75 void gpuNavierStokes3DParabolicFunction_Xdir_kernel(
76  int npoints_grid,
77  double two_third,
78  double inv_gamma_m1,
79  double inv_Re,
80  double inv_Pr,
81  const double * __restrict__ Q,
82  const double * __restrict__ QDerivX,
83  const double * __restrict__ QDerivY,
84  const double * __restrict__ QDerivZ,
85  double * __restrict__ FViscous
86 )
87 {
88  int p = blockDim.x * blockIdx.x + threadIdx.x;
89 
90  if (p < npoints_grid) {
91  double uvel, vvel, wvel, T, Tx,
92  ux, uy, uz, vx, vy, wx, wz;
93 
94  p *= _MODEL_NVARS_;
95 
96  uvel = (Q+p)[1];
97  vvel = (Q+p)[2];
98  wvel = (Q+p)[3];
99  T = (Q+p)[4];
100  Tx = (QDerivX+p)[4];
101  ux = (QDerivX+p)[1];
102  vx = (QDerivX+p)[2];
103  wx = (QDerivX+p)[3];
104  uy = (QDerivY+p)[1];
105  vy = (QDerivY+p)[2];
106  uz = (QDerivZ+p)[1];
107  wz = (QDerivZ+p)[3];
108 
109  /* calculate viscosity coeff based on Sutherland's law */
110  double mu = raiseto(T, 0.76);
111  double tau_xx, tau_xy, tau_xz, qx;
112  tau_xx = two_third * (mu*inv_Re) * (2*ux - vy - wz);
113  tau_xy = (mu*inv_Re) * (uy + vx);
114  tau_xz = (mu*inv_Re) * (uz + wx);
115  qx = ( mu*inv_Re * inv_gamma_m1 * inv_Pr ) * Tx;
116 
117  (FViscous+p)[0] = 0.0;
118  (FViscous+p)[1] = tau_xx;
119  (FViscous+p)[2] = tau_xy;
120  (FViscous+p)[3] = tau_xz;
121  (FViscous+p)[4] = uvel*tau_xx + vvel*tau_xy + wvel*tau_xz + qx;
122  }
123  return;
124 }
125 
126 /*! Kernel function */
127 __global__
128 void gpuNavierStokes3DParabolicFunction_Ydir_kernel(
129  int npoints_grid,
130  double two_third,
131  double inv_gamma_m1,
132  double inv_Re,
133  double inv_Pr,
134  const double * __restrict__ Q,
135  const double * __restrict__ QDerivX,
136  const double * __restrict__ QDerivY,
137  const double * __restrict__ QDerivZ,
138  double * __restrict__ FViscous
139 )
140 {
141  int p = blockDim.x * blockIdx.x + threadIdx.x;
142  if (p < npoints_grid) {
143  double uvel, vvel, wvel, T, Ty,
144  ux, uy, vx, vy, vz, wy, wz;
145 
146  p *= _MODEL_NVARS_;
147 
148  uvel = (Q+p)[1];
149  vvel = (Q+p)[2];
150  wvel = (Q+p)[3];
151  T = (Q+p)[4];
152  Ty = (QDerivY+p)[4];
153  ux = (QDerivX+p)[1];
154  vx = (QDerivX+p)[2];
155  uy = (QDerivY+p)[1];
156  vy = (QDerivY+p)[2];
157  wy = (QDerivY+p)[3];
158  vz = (QDerivZ+p)[2];
159  wz = (QDerivZ+p)[3];
160 
161  /* calculate viscosity coeff based on Sutherland's law */
162  double mu = raiseto(T, 0.76);
163 
164  double tau_yx, tau_yy, tau_yz, qy;
165  tau_yx = (mu*inv_Re) * (uy + vx);
166  tau_yy = two_third * (mu*inv_Re) * (-ux + 2*vy - wz);
167  tau_yz = (mu*inv_Re) * (vz + wy);
168  qy = ( mu*inv_Re * inv_gamma_m1 * inv_Pr ) * Ty;
169 
170  (FViscous+p)[0] = 0.0;
171  (FViscous+p)[1] = tau_yx;
172  (FViscous+p)[2] = tau_yy;
173  (FViscous+p)[3] = tau_yz;
174  (FViscous+p)[4] = uvel*tau_yx + vvel*tau_yy + wvel*tau_yz + qy;
175  }
176  return;
177 }
178 
179 /*! Kernel function */
180 __global__
181 void gpuNavierStokes3DParabolicFunction_Zdir_kernel(
182  int npoints_grid,
183  double two_third,
184  double inv_gamma_m1,
185  double inv_Re,
186  double inv_Pr,
187  const double * __restrict__ Q,
188  const double * __restrict__ QDerivX,
189  const double * __restrict__ QDerivY,
190  const double * __restrict__ QDerivZ,
191  double * __restrict__ FViscous
192 )
193 {
194  int p = blockDim.x * blockIdx.x + threadIdx.x;
195  if (p < npoints_grid) {
196  double uvel, vvel, wvel, T, Tz,
197  ux, uz, vy, vz, wx, wy, wz;
198 
199  p *= _MODEL_NVARS_;
200 
201  uvel = (Q+p)[1];
202  vvel = (Q+p)[2];
203  wvel = (Q+p)[3];
204  T = (Q+p)[4];
205  Tz = (QDerivZ+p)[4];
206  ux = (QDerivX+p)[1];
207  wx = (QDerivX+p)[3];
208  vy = (QDerivY+p)[2];
209  wy = (QDerivY+p)[3];
210  uz = (QDerivZ+p)[1];
211  vz = (QDerivZ+p)[2];
212  wz = (QDerivZ+p)[3];
213 
214  /* calculate viscosity coeff based on Sutherland's law */
215  double mu = raiseto(T,0.76);
216 
217  double tau_zx, tau_zy, tau_zz, qz;
218  tau_zx = (mu*inv_Re) * (uz + wx);
219  tau_zy = (mu*inv_Re) * (vz + wy);
220  tau_zz = two_third * (mu*inv_Re) * (-ux - vy + 2*wz);
221  qz = ( mu*inv_Re * inv_gamma_m1 * inv_Pr ) * Tz;
222 
223  (FViscous+p)[0] = 0.0;
224  (FViscous+p)[1] = tau_zx;
225  (FViscous+p)[2] = tau_zy;
226  (FViscous+p)[3] = tau_zz;
227  (FViscous+p)[4] = uvel*tau_zx + vvel*tau_zy + wvel*tau_zz + qz;
228  }
229  return;
230 }
231 
232 /*! Kernel function */
233 __global__
234 void gpuNavierStokes3DParabolicFunction_par_kernel(
235  int npoints_grid,
236  int ghosts,
237  int dir,
238  const int * __restrict__ dim,
239  const double * __restrict__ dxinv,
240  const double * __restrict__ FDeriv,
241  double * __restrict__ par
242 )
243 {
244  int p = blockDim.x * blockIdx.x + threadIdx.x;
245 
246  if (p < npoints_grid) {
247  int index[3];
248  double dxinv_val;
249 
250  _ArrayIndexnD_(_MODEL_NDIMS_,p,dim,index,0);
251  _ArrayIndex1D_(_MODEL_NDIMS_,dim,index,ghosts,p); p *= _MODEL_NVARS_;
252  _GetCoordinate_(dir,index[dir],dim,ghosts,dxinv,dxinv_val);
253 
254  #pragma unroll
255  for (int v=0; v<_MODEL_NVARS_; v++) (par+p)[v] += (dxinv_val * (FDeriv+p)[v] );
256  }
257  return;
258 }
259 
260 /*!
261  Compute the viscous terms in the 3D Navier Stokes equations: this function computes
262  the following:
263  \f{equation}{
264  \frac {\partial} {\partial x} \left[\begin{array}{c} 0 \\ \tau_{xx} \\ \tau_{yx} \\ \tau_{zx} \\ u \tau_{xx} + v \tau_{yx} + w \tau_{zx} - q_x \end{array}\right]
265  + \frac {\partial} {\partial y} \left[\begin{array}{c} 0 \\ \tau_{xy} \\ \tau_{yy} \\ \tau_{zy} \\ u \tau_{xy} + v \tau_{yy} + w \tau_{zy} - q_y \end{array}\right]
266  + \frac {\partial} {\partial z} \left[\begin{array}{c} 0 \\ \tau_{xz} \\ \tau_{yz} \\ \tau_{zz} \\ u \tau_{xz} + v \tau_{yz} + w \tau_{zz} - q_z \end{array}\right]
267  \f}
268  where
269  \f{align}{
270  \tau_{xx} &= \frac{2}{3}\left(\frac{\mu}{Re}\right)\left(2\frac{\partial u}{\partial x} - \frac{\partial v}{\partial y} - \frac{\partial w}{\partial z}\right),\\
271  \tau_{xy} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial y} + \frac{\partial v}{\partial x}\right),\\
272  \tau_{xz} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial z} + \frac{\partial w}{\partial x}\right),\\
273  \tau_{yx} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial y} + \frac{\partial v}{\partial x}\right),\\
274  \tau_{yy} &= \frac{2}{3}\left(\frac{\mu}{Re}\right)\left(-\frac{\partial u}{\partial x} +2\frac{\partial v}{\partial y} - \frac{\partial w}{\partial z}\right),\\
275  \tau_{yz} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial w}{\partial y} + \frac{\partial v}{\partial z}\right),\\
276  \tau_{zx} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial z} + \frac{\partial w}{\partial x}\right),\\
277  \tau_{zy} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial v}{\partial z} + \frac{\partial w}{\partial y}\right),\\
278  \tau_{zz} &= \frac{2}{3}\left(\frac{\mu}{Re}\right)\left(-\frac{\partial u}{\partial x} - \frac{\partial v}{\partial y} + 2\frac{\partial v}{\partial y}\right),\\
279  q_x &= -\frac{mu}{\left(\gamma-1\right)Re Pr}\frac{\partial T}{\partial x}, \\
280  q_y &= -\frac{mu}{\left(\gamma-1\right)Re Pr}\frac{\partial T}{\partial y}, \\
281  q_z &= -\frac{mu}{\left(\gamma-1\right)Re Pr}\frac{\partial T}{\partial z}
282  \f}
283  and the temperature is \f$T = \gamma p/\rho\f$. \f$Re\f$ and \f$Pr\f$ are the Reynolds and Prandtl numbers, respectively. Note that this function
284  computes the entire parabolic term, and thus bypasses HyPar's parabolic function calculation interfaces. NavierStokes3DInitialize() assigns this
285  function to #HyPar::ParabolicFunction.
286  \n\n
287  Reference:
288  + Tannehill, Anderson and Pletcher, Computational Fluid Mechanics and Heat Transfer,
289  Chapter 5, Section 5.1.7 (However, the non-dimensional velocity and the Reynolds
290  number is based on speed of sound, instead of the freestream velocity).
291 */
292 extern "C" int gpuNavierStokes3DParabolicFunction(
293  double * __restrict__ par, /*!< Array to hold the computed viscous terms */
294  double * __restrict__ u, /*!< Solution vector array */
295  void * __restrict__ s, /*!< Solver object of type #HyPar */
296  void * __restrict__ m, /*!< MPI object of type #MPIVariables */
297  double t /*!< Current simulation time */
298 )
299 {
300  HyPar *solver = (HyPar*) s;
301  MPIVariables *mpi = (MPIVariables*) m;
302  NavierStokes3D *physics = (NavierStokes3D*) solver->physics;
303 
304  int ghosts = solver->ghosts;
305  int *dim = solver->gpu_dim_local;
306  int size = solver->npoints_local_wghosts;
307 
308  gpuMemset(par, 0, size*_MODEL_NVARS_*sizeof(double));
309  if (physics->Re <= 0) return (0); /* inviscid flow */
310  solver->count_par++;
311 
312  static double two_third = 2.0/3.0;
313  double inv_gamma_m1 = 1.0 / (physics->gamma-1.0);
314  double inv_Re = 1.0 / physics->Re;
315  double inv_Pr = 1.0 / physics->Pr;
316 
317  double *Q = physics->gpu_Q;
318  double *QDerivX = physics->gpu_QDerivX;
319  double *QDerivY = physics->gpu_QDerivY;
320  double *QDerivZ = physics->gpu_QDerivZ;
321  double *dxinv = solver->gpu_dxinv;
322 
323  int nblocks = (size-1)/GPU_THREADS_PER_BLOCK + 1;
324  int nblocks_par = (solver->npoints_local-1)/GPU_THREADS_PER_BLOCK + 1;
325 
326 #if defined(GPU_STAT)
327  cudaEvent_t startEvent, stopEvent;
328  float milliseconds;
329 
330  checkCuda( cudaEventCreate(&startEvent) );
331  checkCuda( cudaEventCreate(&stopEvent) );
332  checkCuda( cudaEventRecord(startEvent, 0) );
333 #endif
334 
335  gpuNavierStokes3DParabolicFunction_Q_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
336  size, physics->gamma, u, Q
337  );
338 #if defined(GPU_STAT)
339  checkCuda( cudaEventRecord(stopEvent, 0) );
340  checkCuda( cudaEventSynchronize(stopEvent) );
341 #endif
342 
343  cudaDeviceSynchronize();
344 
345 #if defined(GPU_STAT)
346  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
347 
348  int memory_accessed = size*_MODEL_NVARS_*sizeof(double);
349  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
350  "NavierStokes3DParabolicFunction_Q", milliseconds*1e-3,
351  (1e-6*(memory_accessed)/milliseconds));
352 #endif
353 
354  solver->FirstDerivativePar(QDerivX,Q,_XDIR_,1,solver,mpi);
355  solver->FirstDerivativePar(QDerivY,Q,_YDIR_,1,solver,mpi);
356  solver->FirstDerivativePar(QDerivZ,Q,_ZDIR_,1,solver,mpi);
357 
358  gpuMPIExchangeBoundariesnD(_MODEL_NDIMS_,_MODEL_NVARS_,solver->gpu_dim_local,
359  solver->ghosts,mpi,QDerivX);
360  gpuMPIExchangeBoundariesnD(_MODEL_NDIMS_,_MODEL_NVARS_,solver->gpu_dim_local,
361  solver->ghosts,mpi,QDerivY);
362  gpuMPIExchangeBoundariesnD(_MODEL_NDIMS_,_MODEL_NVARS_,solver->gpu_dim_local,
363  solver->ghosts,mpi,QDerivY);
364 
365 #if defined(GPU_STAT)
366  checkCuda( cudaEventRecord(startEvent, 0) );
367 #endif
368 
369  gpuNavierStokes3DParabolicFunction_QDeriv_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
370  size, ghosts, dim, dxinv, QDerivX, QDerivY, QDerivZ
371  );
372 #if defined(GPU_STAT)
373  checkCuda( cudaEventRecord(stopEvent, 0) );
374  checkCuda( cudaEventSynchronize(stopEvent) );
375 #endif
376 
377  cudaDeviceSynchronize();
378 
379 #if defined(GPU_STAT)
380  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
381 
382  memory_accessed = (3*size*_MODEL_NVARS_ + solver->size_x)*sizeof(double);
383  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
384  "NavierStokes3DParabolicFunction_QDeriv", milliseconds*1e-3,
385  (1e-6*(memory_accessed)/milliseconds));
386 #endif
387 
388  double *FViscous = physics->gpu_FViscous;
389  double *FDeriv = physics->gpu_FDeriv;
390  gpuMemset(FViscous, 0, size*_MODEL_NVARS_*sizeof(double));
391  gpuMemset(FDeriv, 0, size*_MODEL_NVARS_*sizeof(double));
392 
393  /* Along X */
394 #if defined(GPU_STAT)
395  checkCuda( cudaEventRecord(startEvent, 0) );
396 #endif
397 
398  gpuNavierStokes3DParabolicFunction_Xdir_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
399  size, two_third, inv_gamma_m1, inv_Re, inv_Pr, Q,
400  QDerivX, QDerivY, QDerivZ, FViscous
401  );
402  cudaDeviceSynchronize();
403 
404 #if defined(GPU_STAT)
405  checkCuda( cudaEventRecord(stopEvent, 0) );
406  checkCuda( cudaEventSynchronize(stopEvent) );
407  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
408 
409  memory_accessed = 17*size*sizeof(double);
410  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
411  "NavierStokes3DParabolicFunction_Xdir", milliseconds*1e-3,
412  (1e-6*(memory_accessed)/milliseconds));
413 #endif
414 
415  solver->FirstDerivativePar(FDeriv,FViscous,_XDIR_,1,solver,mpi);
416 
417 #if defined(GPU_STAT)
418  checkCuda( cudaEventRecord(startEvent, 0) );
419 #endif
420 
421  gpuNavierStokes3DParabolicFunction_par_kernel<<<nblocks_par, GPU_THREADS_PER_BLOCK>>>(
422  solver->npoints_local, ghosts, _XDIR_, dim, dxinv, FDeriv, par
423  );
424  cudaDeviceSynchronize();
425 
426 #if defined(GPU_STAT)
427  checkCuda( cudaEventRecord(stopEvent, 0) );
428  checkCuda( cudaEventSynchronize(stopEvent) );
429  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
430 
431  memory_accessed = (solver->npoints_local + 2*solver->npoints_local*_MODEL_NVARS_)*sizeof(double);
432  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
433  "NavierStokes3DParabolicFunction_par", milliseconds*1e-3,
434  (1e-6*(memory_accessed)/milliseconds));
435 #endif
436 
437  /* Along Y */
438 #if defined(GPU_STAT)
439  checkCuda( cudaEventRecord(startEvent, 0) );
440 #endif
441 
442  gpuNavierStokes3DParabolicFunction_Ydir_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
443  size, two_third, inv_gamma_m1, inv_Re, inv_Pr, Q,
444  QDerivX, QDerivY, QDerivZ, FViscous
445  );
446  cudaDeviceSynchronize();
447 
448 #if defined(GPU_STAT)
449  checkCuda( cudaEventRecord(stopEvent, 0) );
450  checkCuda( cudaEventSynchronize(stopEvent) );
451  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
452  //cudaDeviceSynchronize();
453  memory_accessed = 17*size*sizeof(double);
454  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
455  "NavierStokes3DParabolicFunction_Ydir", milliseconds*1e-3,
456  (1e-6*(memory_accessed)/milliseconds));
457 #endif
458 
459  solver->FirstDerivativePar(FDeriv,FViscous,_YDIR_,1,solver,mpi);
460  gpuNavierStokes3DParabolicFunction_par_kernel<<<nblocks_par, GPU_THREADS_PER_BLOCK>>>(
461  solver->npoints_local, ghosts, _YDIR_, dim, dxinv, FDeriv, par
462  );
463  cudaDeviceSynchronize();
464 
465  /* Along Z */
466 #if defined(GPU_STAT)
467  checkCuda( cudaEventRecord(startEvent, 0) );
468 #endif
469 
470  gpuNavierStokes3DParabolicFunction_Zdir_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
471  size, two_third, inv_gamma_m1, inv_Re, inv_Pr, Q,
472  QDerivX, QDerivY, QDerivZ, FViscous
473  );
474  cudaDeviceSynchronize();
475 
476 #if defined(GPU_STAT)
477  checkCuda( cudaEventRecord(stopEvent, 0) );
478  checkCuda( cudaEventSynchronize(stopEvent) );
479  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
480 
481  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
482  "NavierStokes3DParabolicFunction_Zdir", milliseconds*1e-3,
483  (1e-6*(memory_accessed)/milliseconds));
484 #endif
485 
486  solver->FirstDerivativePar(FDeriv,FViscous,_ZDIR_,1,solver,mpi);
487  gpuNavierStokes3DParabolicFunction_par_kernel<<<nblocks_par, GPU_THREADS_PER_BLOCK>>>(
488  solver->npoints_local, ghosts, _ZDIR_, dim, dxinv, FDeriv, par
489  );
490  cudaDeviceSynchronize();
491 
492 #if defined(GPU_STAT)
493  checkCuda( cudaEventDestroy(startEvent) );
494  checkCuda( cudaEventDestroy(stopEvent) );
495 #endif
496 
497  if (solver->flag_ib) gpuArrayBlockMultiply(par,solver->gpu_iblank,size,_MODEL_NVARS_);
498 
499  return (0);
500 }
501 
502 #else
503 
504 /*! Kernel function */
505 __global__
506 void gpuNavierStokes3DParabolicFunction_Q_kernel(
507  int npoints_grid,
508  double gamma,
509  const double * __restrict__ u,
510  double * __restrict__ Q
511 )
512 {
513  int p = blockDim.x * blockIdx.x + threadIdx.x;
514 
515  if (p < npoints_grid) {
516  double energy, pressure;
517  _NavierStokes3DGetFlowVar_(
518  (u+p),
519  npoints_grid,
520  Q[p],
521  Q[p+ npoints_grid],
522  Q[p+2*npoints_grid],
523  Q[p+3*npoints_grid],
524  energy,
525  pressure,
526  gamma
527  );
528  Q[p+4*npoints_grid] = gamma*pressure/Q[p+0]; /* temperature */
529  }
530  return;
531 }
532 
533 /*! Kernel function */
534 __global__
535 void gpuNavierStokes3DParabolicFunction_QDeriv_kernel(
536  int npoints_grid,
537  int ghosts,
538  const int * __restrict__ dim,
539  const double * __restrict__ dxinv,
540  double * __restrict__ QDerivX,
541  double * __restrict__ QDerivY,
542  double * __restrict__ QDerivZ
543 )
544 {
545  int p = blockDim.x * blockIdx.x + threadIdx.x;
546 
547  if (p < npoints_grid) {
548  int index[_MODEL_NDIMS_];
549  double dxinv_val, dyinv_val, dzinv_val;
550 
551  _ArrayIndexnD_(_MODEL_NDIMS_,p,dim,index,ghosts);
552  _GetCoordinate_(_XDIR_,index[_XDIR_],dim,ghosts,dxinv,dxinv_val);
553  _GetCoordinate_(_YDIR_,index[_YDIR_],dim,ghosts,dxinv,dyinv_val);
554  _GetCoordinate_(_ZDIR_,index[_ZDIR_],dim,ghosts,dxinv,dzinv_val);
555 
556  #pragma unroll
557  for (int j=0; j<_MODEL_NVARS_; j++) {
558  QDerivX[p] *= dxinv_val;
559  QDerivY[p] *= dyinv_val;
560  QDerivZ[p] *= dzinv_val;
561  p += npoints_grid;
562  }
563  }
564  return;
565 }
566 
567 /*! Kernel function */
568 __global__
569 void gpuNavierStokes3DParabolicFunction_Xdir_kernel(
570  int npoints_grid,
571  double two_third,
572  double inv_gamma_m1,
573  double inv_Re,
574  double inv_Pr,
575  const double * __restrict__ Q,
576  const double * __restrict__ QDerivX,
577  const double * __restrict__ QDerivY,
578  const double * __restrict__ QDerivZ,
579  double * __restrict__ FViscous
580 )
581 {
582  int p = blockDim.x * blockIdx.x + threadIdx.x;
583 
584  if (p < npoints_grid) {
585  double uvel, vvel, wvel, T, Tx,
586  ux, uy, uz, vx, vy, wx, wz;
587 
588  uvel = (Q+p+ npoints_grid)[0];
589  vvel = (Q+p+2*npoints_grid)[0];
590  wvel = (Q+p+3*npoints_grid)[0];
591  T = (Q+p+4*npoints_grid)[0];
592  Tx = (QDerivX+p+4*npoints_grid)[0];
593  ux = (QDerivX+p+ npoints_grid)[0];
594  vx = (QDerivX+p+2*npoints_grid)[0];
595  wx = (QDerivX+p+3*npoints_grid)[0];
596  uy = (QDerivY+p+ npoints_grid)[0];
597  vy = (QDerivY+p+2*npoints_grid)[0];
598  uz = (QDerivZ+p+ npoints_grid)[0];
599  wz = (QDerivZ+p+3*npoints_grid)[0];
600 
601  /* calculate viscosity coeff based on Sutherland's law */
602  double mu = raiseto(T, 0.76);
603  double tau_xx, tau_xy, tau_xz, qx;
604  tau_xx = two_third * (mu*inv_Re) * (2*ux - vy - wz);
605  tau_xy = (mu*inv_Re) * (uy + vx);
606  tau_xz = (mu*inv_Re) * (uz + wx);
607  qx = ( mu*inv_Re * inv_gamma_m1 * inv_Pr ) * Tx;
608 
609  (FViscous+p)[0] = 0.0;
610  (FViscous+p+ npoints_grid)[0] = tau_xx;
611  (FViscous+p+2*npoints_grid)[0] = tau_xy;
612  (FViscous+p+3*npoints_grid)[0] = tau_xz;
613  (FViscous+p+4*npoints_grid)[0] = uvel*tau_xx + vvel*tau_xy + wvel*tau_xz + qx;
614  }
615  return;
616 }
617 
618 /*! Kernel function */
619 __global__
620 void gpuNavierStokes3DParabolicFunction_Ydir_kernel(
621  int npoints_grid,
622  double two_third,
623  double inv_gamma_m1,
624  double inv_Re,
625  double inv_Pr,
626  const double * __restrict__ Q,
627  const double * __restrict__ QDerivX,
628  const double * __restrict__ QDerivY,
629  const double * __restrict__ QDerivZ,
630  double * __restrict__ FViscous
631 )
632 {
633  int p = blockDim.x * blockIdx.x + threadIdx.x;
634  if (p < npoints_grid) {
635  double uvel, vvel, wvel, T, Ty,
636  ux, uy, vx, vy, vz, wy, wz;
637 
638  uvel = (Q+p+ npoints_grid)[0];
639  vvel = (Q+p+2*npoints_grid)[0];
640  wvel = (Q+p+3*npoints_grid)[0];
641  T = (Q+p+4*npoints_grid)[0];
642  Ty = (QDerivY+p+4*npoints_grid)[0];
643  ux = (QDerivX+p+ npoints_grid)[0];
644  vx = (QDerivX+p+2*npoints_grid)[0];
645  uy = (QDerivY+p+ npoints_grid)[0];
646  vy = (QDerivY+p+2*npoints_grid)[0];
647  wy = (QDerivY+p+3*npoints_grid)[0];
648  vz = (QDerivZ+p+2*npoints_grid)[0];
649  wz = (QDerivZ+p+3*npoints_grid)[0];
650 
651  /* calculate viscosity coeff based on Sutherland's law */
652  double mu = raiseto(T, 0.76);
653 
654  double tau_yx, tau_yy, tau_yz, qy;
655  tau_yx = (mu*inv_Re) * (uy + vx);
656  tau_yy = two_third * (mu*inv_Re) * (-ux + 2*vy - wz);
657  tau_yz = (mu*inv_Re) * (vz + wy);
658  qy = ( mu*inv_Re * inv_gamma_m1 * inv_Pr ) * Ty;
659 
660  (FViscous+p)[0] = 0.0;
661  (FViscous+p+ npoints_grid)[0] = tau_yx;
662  (FViscous+p+2*npoints_grid)[0] = tau_yy;
663  (FViscous+p+3*npoints_grid)[0] = tau_yz;
664  (FViscous+p+4*npoints_grid)[0] = uvel*tau_yx + vvel*tau_yy + wvel*tau_yz + qy;
665  }
666  return;
667 }
668 
669 /*! Kernel function */
670 __global__
671 void gpuNavierStokes3DParabolicFunction_Zdir_kernel(
672  int npoints_grid,
673  double two_third,
674  double inv_gamma_m1,
675  double inv_Re,
676  double inv_Pr,
677  const double * __restrict__ Q,
678  const double * __restrict__ QDerivX,
679  const double * __restrict__ QDerivY,
680  const double * __restrict__ QDerivZ,
681  double * __restrict__ FViscous
682 )
683 {
684  int p = blockDim.x * blockIdx.x + threadIdx.x;
685  if (p < npoints_grid) {
686  double uvel, vvel, wvel, T, Tz,
687  ux, uz, vy, vz, wx, wy, wz;
688 
689  uvel = (Q+p+ npoints_grid)[0];
690  vvel = (Q+p+2*npoints_grid)[0];
691  wvel = (Q+p+3*npoints_grid)[0];
692  T = (Q+p+4*npoints_grid)[0];
693  Tz = (QDerivZ+p+4*npoints_grid)[0];
694  ux = (QDerivX+p+ npoints_grid)[0];
695  wx = (QDerivX+p+3*npoints_grid)[0];
696  vy = (QDerivY+p+2*npoints_grid)[0];
697  wy = (QDerivY+p+3*npoints_grid)[0];
698  uz = (QDerivZ+p+ npoints_grid)[0];
699  vz = (QDerivZ+p+2*npoints_grid)[0];
700  wz = (QDerivZ+p+3*npoints_grid)[0];
701 
702  /* calculate viscosity coeff based on Sutherland's law */
703  double mu = raiseto(T,0.76);
704 
705  double tau_zx, tau_zy, tau_zz, qz;
706  tau_zx = (mu*inv_Re) * (uz + wx);
707  tau_zy = (mu*inv_Re) * (vz + wy);
708  tau_zz = two_third * (mu*inv_Re) * (-ux - vy + 2*wz);
709  qz = ( mu*inv_Re * inv_gamma_m1 * inv_Pr ) * Tz;
710 
711  (FViscous+p)[0] = 0.0;
712  (FViscous+p+ npoints_grid)[0] = tau_zx;
713  (FViscous+p+2*npoints_grid)[0] = tau_zy;
714  (FViscous+p+3*npoints_grid)[0] = tau_zz;
715  (FViscous+p+4*npoints_grid)[0] = uvel*tau_zx + vvel*tau_zy + wvel*tau_zz + qz;
716  }
717  return;
718 }
719 
720 /*! Kernel function */
721 __global__
722 void gpuNavierStokes3DParabolicFunction_par_kernel(
723  int npoints_grid,
724  int npoints_local_wghosts,
725  int ghosts,
726  int dir,
727  const int * __restrict__ dim,
728  const double * __restrict__ dxinv,
729  const double * __restrict__ FDeriv,
730  double * __restrict__ par
731 )
732 {
733  int p = blockDim.x * blockIdx.x + threadIdx.x;
734 
735  if (p < npoints_grid) {
736  int index[3];
737  double dxinv_val;
738 
739  _ArrayIndexnD_(_MODEL_NDIMS_,p,dim,index,0);
740  _ArrayIndex1D_(_MODEL_NDIMS_,dim,index,ghosts,p);
741  _GetCoordinate_(dir,index[dir],dim,ghosts,dxinv,dxinv_val);
742 
743  #pragma unroll
744  for (int v=0; v<_MODEL_NVARS_; v++) {
745  (par+p+v*npoints_local_wghosts)[0] += (dxinv_val * (FDeriv+p+v*npoints_local_wghosts)[0]);
746  }
747  }
748  return;
749 }
750 
751 /*!
752  Compute the viscous terms in the 3D Navier Stokes equations: this function computes
753  the following:
754  \f{equation}{
755  \frac {\partial} {\partial x} \left[\begin{array}{c} 0 \\ \tau_{xx} \\ \tau_{yx} \\ \tau_{zx} \\ u \tau_{xx} + v \tau_{yx} + w \tau_{zx} - q_x \end{array}\right]
756  + \frac {\partial} {\partial y} \left[\begin{array}{c} 0 \\ \tau_{xy} \\ \tau_{yy} \\ \tau_{zy} \\ u \tau_{xy} + v \tau_{yy} + w \tau_{zy} - q_y \end{array}\right]
757  + \frac {\partial} {\partial z} \left[\begin{array}{c} 0 \\ \tau_{xz} \\ \tau_{yz} \\ \tau_{zz} \\ u \tau_{xz} + v \tau_{yz} + w \tau_{zz} - q_z \end{array}\right]
758  \f}
759  where
760  \f{align}{
761  \tau_{xx} &= \frac{2}{3}\left(\frac{\mu}{Re}\right)\left(2\frac{\partial u}{\partial x} - \frac{\partial v}{\partial y} - \frac{\partial w}{\partial z}\right),\\
762  \tau_{xy} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial y} + \frac{\partial v}{\partial x}\right),\\
763  \tau_{xz} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial z} + \frac{\partial w}{\partial x}\right),\\
764  \tau_{yx} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial y} + \frac{\partial v}{\partial x}\right),\\
765  \tau_{yy} &= \frac{2}{3}\left(\frac{\mu}{Re}\right)\left(-\frac{\partial u}{\partial x} +2\frac{\partial v}{\partial y} - \frac{\partial w}{\partial z}\right),\\
766  \tau_{yz} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial w}{\partial y} + \frac{\partial v}{\partial z}\right),\\
767  \tau_{zx} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial u}{\partial z} + \frac{\partial w}{\partial x}\right),\\
768  \tau_{zy} &= \left(\frac{\mu}{Re}\right)\left(\frac{\partial v}{\partial z} + \frac{\partial w}{\partial y}\right),\\
769  \tau_{zz} &= \frac{2}{3}\left(\frac{\mu}{Re}\right)\left(-\frac{\partial u}{\partial x} - \frac{\partial v}{\partial y} + 2\frac{\partial v}{\partial y}\right),\\
770  q_x &= -\frac{mu}{\left(\gamma-1\right)Re Pr}\frac{\partial T}{\partial x}, \\
771  q_y &= -\frac{mu}{\left(\gamma-1\right)Re Pr}\frac{\partial T}{\partial y}, \\
772  q_z &= -\frac{mu}{\left(\gamma-1\right)Re Pr}\frac{\partial T}{\partial z}
773  \f}
774  and the temperature is \f$T = \gamma p/\rho\f$. \f$Re\f$ and \f$Pr\f$ are the Reynolds and Prandtl numbers, respectively. Note that this function
775  computes the entire parabolic term, and thus bypasses HyPar's parabolic function calculation interfaces. NavierStokes3DInitialize() assigns this
776  function to #HyPar::ParabolicFunction.
777  \n\n
778  Reference:
779  + Tannehill, Anderson and Pletcher, Computational Fluid Mechanics and Heat Transfer,
780  Chapter 5, Section 5.1.7 (However, the non-dimensional velocity and the Reynolds
781  number is based on speed of sound, instead of the freestream velocity).
782 */
783 extern "C" int gpuNavierStokes3DParabolicFunction(
784  double * __restrict__ par, /*!< Array to hold the computed viscous terms */
785  double * __restrict__ u, /*!< Solution vector array */
786  void * __restrict__ s, /*!< Solver object of type #HyPar */
787  void * __restrict__ m, /*!< MPI object of type #MPIVariables */
788  double t /*!< Current simulation time */
789 )
790 {
791  HyPar *solver = (HyPar*) s;
792  MPIVariables *mpi = (MPIVariables*) m;
793  NavierStokes3D *physics = (NavierStokes3D*) solver->physics;
794 
795  int ghosts = solver->ghosts;
796  int *dim = solver->gpu_dim_local;
797  int size = solver->npoints_local_wghosts;
798 
799  gpuMemset(par, 0, size*_MODEL_NVARS_*sizeof(double));
800  if (physics->Re <= 0) return (0); /* inviscid flow */
801  solver->count_par++;
802 
803  static double two_third = 2.0/3.0;
804  double inv_gamma_m1 = 1.0 / (physics->gamma-1.0);
805  double inv_Re = 1.0 / physics->Re;
806  double inv_Pr = 1.0 / physics->Pr;
807 
808  double *Q = physics->gpu_Q;
809  double *QDerivX = physics->gpu_QDerivX;
810  double *QDerivY = physics->gpu_QDerivY;
811  double *QDerivZ = physics->gpu_QDerivZ;
812  double *dxinv = solver->gpu_dxinv;
813 
814  int nblocks = (size-1)/GPU_THREADS_PER_BLOCK + 1;
815  int nblocks_par = (solver->npoints_local-1)/GPU_THREADS_PER_BLOCK + 1;
816 
817 #if defined(GPU_STAT)
818  cudaEvent_t startEvent, stopEvent;
819  float milliseconds;
820 
821  checkCuda( cudaEventCreate(&startEvent) );
822  checkCuda( cudaEventCreate(&stopEvent) );
823 
824  checkCuda( cudaEventRecord(startEvent, 0) );
825 #endif
826 
827  gpuNavierStokes3DParabolicFunction_Q_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
828  size, physics->gamma, u, Q
829  );
830  cudaDeviceSynchronize();
831 
832 #if defined(GPU_STAT)
833  checkCuda( cudaEventRecord(stopEvent, 0) );
834  checkCuda( cudaEventSynchronize(stopEvent) );
835  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
836 
837  int memory_accessed = size*_MODEL_NVARS_*sizeof(double);
838  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
839  "NavierStokes3DParabolicFunction_Q", milliseconds*1e-3,
840  (1e-6*(memory_accessed)/milliseconds));
841 #endif
842 
843  solver->FirstDerivativePar(QDerivX,Q,_XDIR_,1,solver,mpi);
844  solver->FirstDerivativePar(QDerivY,Q,_YDIR_,1,solver,mpi);
845  solver->FirstDerivativePar(QDerivZ,Q,_ZDIR_,1,solver,mpi);
846 
847  gpuMPIExchangeBoundariesnD(_MODEL_NDIMS_,_MODEL_NVARS_,solver->gpu_dim_local,
848  solver->ghosts,mpi,QDerivX);
849  gpuMPIExchangeBoundariesnD(_MODEL_NDIMS_,_MODEL_NVARS_,solver->gpu_dim_local,
850  solver->ghosts,mpi,QDerivY);
851  gpuMPIExchangeBoundariesnD(_MODEL_NDIMS_,_MODEL_NVARS_,solver->gpu_dim_local,
852  solver->ghosts,mpi,QDerivY);
853 
854 #if defined(GPU_STAT)
855  checkCuda( cudaEventRecord(startEvent, 0) );
856 #endif
857 
858  gpuNavierStokes3DParabolicFunction_QDeriv_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
859  size, ghosts, dim, dxinv, QDerivX, QDerivY, QDerivZ
860  );
861  cudaDeviceSynchronize();
862 
863 #if defined(GPU_STAT)
864  checkCuda( cudaEventRecord(stopEvent, 0) );
865  checkCuda( cudaEventSynchronize(stopEvent) );
866  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
867 
868  memory_accessed = (3*size*_MODEL_NVARS_ + solver->size_x)*sizeof(double);
869  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
870  "NavierStokes3DParabolicFunction_QDeriv", milliseconds*1e-3,
871  (1e-6*(memory_accessed)/milliseconds));
872 #endif
873 
874  double *FViscous = physics->gpu_FViscous;
875  double *FDeriv = physics->gpu_FDeriv;
876  gpuMemset(FViscous, 0, size*_MODEL_NVARS_*sizeof(double));
877  gpuMemset(FDeriv, 0, size*_MODEL_NVARS_*sizeof(double));
878 
879  /* Along X */
880 
881 #if defined(GPU_STAT)
882  checkCuda( cudaEventRecord(startEvent, 0) );
883 #endif
884 
885  gpuNavierStokes3DParabolicFunction_Xdir_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
886  size, two_third, inv_gamma_m1, inv_Re, inv_Pr, Q,
887  QDerivX, QDerivY, QDerivZ, FViscous
888  );
889  cudaDeviceSynchronize();
890 
891 #if defined(GPU_STAT)
892  checkCuda( cudaEventRecord(stopEvent, 0) );
893  checkCuda( cudaEventSynchronize(stopEvent) );
894  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
895 
896  memory_accessed = 17*size*sizeof(double);
897  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
898  "NavierStokes3DParabolicFunction_Xdir", milliseconds*1e-3,
899  (1e-6*(memory_accessed)/milliseconds));
900 #endif
901 
902  solver->FirstDerivativePar(FDeriv,FViscous,_XDIR_,1,solver,mpi);
903 
904 #if defined(GPU_STAT)
905  checkCuda( cudaEventRecord(startEvent, 0) );
906 #endif
907 
908  gpuNavierStokes3DParabolicFunction_par_kernel<<<nblocks_par, GPU_THREADS_PER_BLOCK>>>(
909  solver->npoints_local, size, ghosts, _XDIR_, dim, dxinv, FDeriv, par
910  );
911  cudaDeviceSynchronize();
912 
913 #if defined(GPU_STAT)
914  checkCuda( cudaEventRecord(stopEvent, 0) );
915  checkCuda( cudaEventSynchronize(stopEvent) );
916  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
917 
918  memory_accessed = (solver->npoints_local + 2*solver->npoints_local*_MODEL_NVARS_)*sizeof(double);
919  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
920  "NavierStokes3DParabolicFunction_par", milliseconds*1e-3,
921  (1e-6*(memory_accessed)/milliseconds));
922 #endif
923 
924  /* Along Y */
925 
926 #if defined(GPU_STAT)
927  checkCuda( cudaEventRecord(startEvent, 0) );
928 #endif
929 
930  gpuNavierStokes3DParabolicFunction_Ydir_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
931  size, two_third, inv_gamma_m1, inv_Re, inv_Pr, Q,
932  QDerivX, QDerivY, QDerivZ, FViscous
933  );
934  cudaDeviceSynchronize();
935 
936 #if defined(GPU_STAT)
937  checkCuda( cudaEventRecord(stopEvent, 0) );
938  checkCuda( cudaEventSynchronize(stopEvent) );
939  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
940 
941  memory_accessed = 17*size*sizeof(double);
942  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
943  "NavierStokes3DParabolicFunction_Ydir", milliseconds*1e-3,
944  (1e-6*(memory_accessed)/milliseconds));
945 #endif
946 
947  solver->FirstDerivativePar(FDeriv,FViscous,_YDIR_,1,solver,mpi);
948  gpuNavierStokes3DParabolicFunction_par_kernel<<<nblocks_par, GPU_THREADS_PER_BLOCK>>>(
949  solver->npoints_local, size, ghosts, _YDIR_, dim, dxinv, FDeriv, par
950  );
951  cudaDeviceSynchronize();
952 
953  /* Along Z */
954 #if defined(GPU_STAT)
955  checkCuda( cudaEventRecord(startEvent, 0) );
956 #endif
957 
958  gpuNavierStokes3DParabolicFunction_Zdir_kernel<<<nblocks, GPU_THREADS_PER_BLOCK>>>(
959  size, two_third, inv_gamma_m1, inv_Re, inv_Pr, Q,
960  QDerivX, QDerivY, QDerivZ, FViscous
961  );
962  cudaDeviceSynchronize();
963 
964 #if defined(GPU_STAT)
965  checkCuda( cudaEventRecord(stopEvent, 0) );
966  checkCuda( cudaEventSynchronize(stopEvent) );
967  checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
968 
969  memory_accessed = 17*size*sizeof(double);
970  printf("%-50s GPU time (secs) = %.6f bandwidth (GB/s) = %6.2f\n",
971  "NavierStokes3DParabolicFunction_Zdir", milliseconds*1e-3,
972  (1e-6*(memory_accessed)/milliseconds));
973 #endif
974 
975  solver->FirstDerivativePar(FDeriv,FViscous,_ZDIR_,1,solver,mpi);
976  gpuNavierStokes3DParabolicFunction_par_kernel<<<nblocks_par, GPU_THREADS_PER_BLOCK>>>(
977  solver->npoints_local, size, ghosts, _ZDIR_, dim, dxinv, FDeriv, par
978  );
979  cudaDeviceSynchronize();
980 
981 #if defined(GPU_STAT)
982  checkCuda( cudaEventDestroy(startEvent) );
983  checkCuda( cudaEventDestroy(stopEvent) );
984 #endif
985 
986  if (solver->flag_ib) gpuArrayBlockMultiply(par,solver->gpu_iblank,size,_MODEL_NVARS_);
987 
988  return (0);
989 }
990 
991 #endif