! See copyright notice in the COPYRIGHT file.
! ****************************************************************************** !
!> author: Manuel Hasert
!! author: Kannan Masilamani
!! author: Jiaxing Qi
!! Routines and parameter definitions for the standard D3Q19 model
?? include 'header/lbm_macros.inc'
?? include 'header/lbm_interfaceMacros.inc'
?? include 'header/lbm_d3q19Macros.inc'
module mus_d3q19_module

  ! include treelm modules
  use env_module,            only: rk
  use tem_compileconf_module, only: vlen
  use tem_varSys_module,     only: tem_varSys_type, tem_varSys_op_type
  use tem_param_module,      only: div1_3, div1_6, div1_36, div1_8, div3_4h, &
    &                              cs2inv, cs4inv, t2cs2inv,t2cs4inv, rho0
  use tem_dyn_array_module,  only: PositionOfVal

  ! include musubi modules
  use mus_field_prop_module,    only: mus_field_prop_type
  use mus_scheme_layout_module, only: mus_scheme_layout_type
  use mus_scheme_type_module,   only: mus_scheme_type
  use mus_param_module,         only: mus_param_type

  implicit none

  private

  public :: bgk_advRel_d3q19
  public :: bgk_advRel_d3q19_les

  public :: bgk_advRel_d3q19_incomp
  public :: bgk_advRel_d3q19_incomp_les

  public :: bgk_advRel_d3q19_nNwtn_PL
  public :: bgk_advRel_d3q19_nNwtn_CY
  public :: bgk_advRel_d3q19_nNwtn_CS
  public :: bgk_advRel_d3q19_incomp_nNwtn_CY
  public :: bgk_advRel_d3q19_incomp_nNwtn_PL
  public :: bgk_advRel_d3q19_incomp_nNwtn

  public :: trt_advRel_d3q19
  public :: trt_advRel_d3q19_incomp
  public :: trt_advRel_d3q19_les

  public :: bgk_d3q19_block

  ! =============================================================================
  ! D3Q19 flow model
  ! =============================================================================
  !> Definition of the discrete velocity set

  ! integer,parameter :: block = 32
  integer,parameter :: QQ   = 19  !< number of pdf directions

  integer,parameter :: qN00 = 1   !< west             x-
  integer,parameter :: q0N0 = 2   !< south            y-
  integer,parameter :: q00N = 3   !< bottom           z-
  integer,parameter :: q100 = 4   !< east             x+
  integer,parameter :: q010 = 5   !< north            y+
  integer,parameter :: q001 = 6   !< top              z+
  integer,parameter :: q0NN = 7   !<                  z-,y-
  integer,parameter :: q0N1 = 8   !<                  z+,y-
  integer,parameter :: q01N = 9   !<                  z-,y+
  integer,parameter :: q011 = 10  !<                  z+,y+
  integer,parameter :: qN0N = 11  !<                  x-,z-
  integer,parameter :: q10N = 12  !<                  x+,z-
  integer,parameter :: qN01 = 13  !<                  x-,z+
  integer,parameter :: q101 = 14  !<                  x+,z+
  integer,parameter :: qNN0 = 15  !<                  y-,x-
  integer,parameter :: qN10 = 16  !<                  y+,x-
  integer,parameter :: q1N0 = 17  !<                  y-,x+
  integer,parameter :: q110 = 18  !<                  y+,x+
  integer,parameter :: q000 = 19  !< rest density is last

contains

! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK.
  !! 
  !! \[ f_\alpha(x_i+e_{\alpha,i},t+1) = 
  !! f_\alpha(x_i,t) - \omega(f_\alpha(x_i,t)-f^{eq}_{\alpha}(x_i,t)) \]
  !!
  !! The number of floating point operation in this routine is 160 roughly.
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19 )
?? copy :: compute_routineParams
?? IF (SOA) THEN
    ! ---------------------------------------------------------------------------
    integer       :: iElem, minElem, maxElem, ii, iLink
    real(kind=rk) ::      fN00(params%block)
    real(kind=rk) ::      f0N0(params%block)
    real(kind=rk) ::      f00N(params%block)
    real(kind=rk) ::      f100(params%block)
    real(kind=rk) ::      f010(params%block)
    real(kind=rk) ::      f001(params%block)
    real(kind=rk) ::      f0NN(params%block)
    real(kind=rk) ::      f0N1(params%block)
    real(kind=rk) ::      f01N(params%block)
    real(kind=rk) ::      f011(params%block)
    real(kind=rk) ::      fN0N(params%block)
    real(kind=rk) ::      f10N(params%block)
    real(kind=rk) ::      fN01(params%block)
    real(kind=rk) ::      f101(params%block)
    real(kind=rk) ::      fNN0(params%block)
    real(kind=rk) ::      fN10(params%block)
    real(kind=rk) ::      f1N0(params%block)
    real(kind=rk) ::      f110(params%block)
    real(kind=rk) ::      f000(params%block)
    real(kind=rk) :: usqn_o1(1:params%block),&
      &                  rho(1:params%block),&
      &                  u_x(1:params%block),&
      &                  u_y(1:params%block),&
      &                  u_z(1:params%block)
    real(kind=rk) :: usq, cmpl_o, omega, omega_1_36, inv_rho
    real(kind=rk) :: coeff_1, coeff_2, sum1, sum2, ui, fac
    ! ---------------------------------------------------------------------------

    ! some global variables
    omega = fieldProp(1)%fluid%omLvl( level )
    cmpl_o  = 1._rk - omega
    coeff_1 = div1_8 * omega
    coeff_2 = coeff_1 * 2._rk
    omega_1_36 = omega * div1_36

!$omp do schedule(static)

    !NEC$ ivdep
    !DIR$ IVDEP
    nodeloop: do minElem = 1, nSolve, params%block

      ii = 0
      maxElem = min( minElem + params%block - 1, nSolve )

      !DIR$ IVDEP
      do iElem = minElem, maxElem
        ii = ii + 1

        ! First load all local values into temp array
        fN00(ii) = inState( neigh((qN00-1)*nElems+iElem) )
        f0N0(ii) = inState( neigh((q0N0-1)*nElems+iElem) )
        f00N(ii) = inState( neigh((q00N-1)*nElems+iElem) )
        f100(ii) = inState( neigh((q100-1)*nElems+iElem) )
        f010(ii) = inState( neigh((q010-1)*nElems+iElem) )
        f001(ii) = inState( neigh((q001-1)*nElems+iElem) )
        f0NN(ii) = inState( neigh((q0NN-1)*nElems+iElem) )
        f0N1(ii) = inState( neigh((q0N1-1)*nElems+iElem) )
        f01N(ii) = inState( neigh((q01N-1)*nElems+iElem) )
        f011(ii) = inState( neigh((q011-1)*nElems+iElem) )
        fN0N(ii) = inState( neigh((qN0N-1)*nElems+iElem) )
        f10N(ii) = inState( neigh((q10N-1)*nElems+iElem) )
        fN01(ii) = inState( neigh((qN01-1)*nElems+iElem) )
        f101(ii) = inState( neigh((q101-1)*nElems+iElem) )
        fNN0(ii) = inState( neigh((qNN0-1)*nElems+iElem) )
        fN10(ii) = inState( neigh((qN10-1)*nElems+iElem) )
        f1N0(ii) = inState( neigh((q1N0-1)*nElems+iElem) )
        f110(ii) = inState( neigh((q110-1)*nElems+iElem) )
        f000(ii) = inState( neigh((q000-1)*nElems+iElem) )

        rho(ii) =   f000(ii) &
                  + f110(ii) + f010(ii) + fN10(ii) + fN00(ii)  &
                  + fNN0(ii) + f0N0(ii) + f1N0(ii) + f100(ii)  &
                  + f001(ii) + f101(ii) + f011(ii) + fN01(ii) &
                  + f0N1(ii) + f00N(ii) + f10N(ii) + f01N(ii) &
                  + fN0N(ii) + f0NN(ii)
        inv_rho = 1.0_rk / rho(ii)

        u_x(ii)  =   f100(ii) + f10N(ii) + f101(ii) + f1N0(ii) + f110(ii) &
          &        - fN00(ii) - fN0N(ii) - fN01(ii) - fNN0(ii) - fN10(ii)
        u_y(ii)  =   f010(ii) + f01N(ii) + f011(ii) + fN10(ii) + f110(ii) &
          &        - f0N0(ii) - f0NN(ii) - f0N1(ii) - fNN0(ii) - f1N0(ii)
        u_z(ii)  =   f001(ii) + f0N1(ii) + f011(ii) + fN01(ii) + f101(ii) &
          &        - f00N(ii) - f0NN(ii) - f01N(ii) - fN0N(ii) - f10N(ii)
        u_x(ii)  = u_x(ii) * inv_rho
        u_y(ii)  = u_y(ii) * inv_rho
        u_z(ii)  = u_z(ii) * inv_rho

        ! square velocity and derived constants
        usq  = u_x(ii)*u_x(ii) + u_y(ii)*u_y(ii) + u_z(ii)*u_z(ii)
        usqn_o1(ii) = div1_36 * (1._rk - 1.5_rk * usq) * rho(ii) * omega

      end do ! iElem
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q000-1)*nElems+minElem, (q000-1)*nElems+maxElem
        ii=ii+1
        outState(iLink) = f000(ii)*cmpl_o + 12._rk*usqn_o1(ii)
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q110-1)*nElems+minElem, (q110-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f110(ii) * cmpl_o + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qNN0-1)*nElems+minElem, (qNN0-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fNN0(ii) * cmpl_o - sum1 + sum2
      end do
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN10-1)*nElems+minElem, (qN10-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fN10(ii) *cmpl_o+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q1N0-1)*nElems+minElem, (q1N0-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f1N0(ii) *cmpl_o-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q101-1)*nElems+minElem, (q101-1)*nElems+maxElem
        ii=ii+1
        ui      =  u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f101(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN0N-1)*nElems+minElem, (qN0N-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN0N(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN01-1)*nElems+minElem, (qN01-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN01(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q10N-1)*nElems+minElem, (q10N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f10N(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q011-1)*nElems+minElem, (q011-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_y(ii) + u_z(ii)
        fac  = coeff_1 * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink)=f011(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0NN-1)*nElems+minElem, (q0NN-1)*nElems+maxElem
        ii=ii+1
       ui    =  u_y(ii) + u_z(ii)
       fac  = coeff_1 * ui * rho(ii)
       sum1 = fac * div3_4h
       sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0NN(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N1-1)*nElems+minElem, (q0N1-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = coeff_1 * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0N1(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q01N-1)*nElems+minElem, (q01N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = coeff_1 * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f01N(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q010-1)*nElems+minElem, (q010-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_y(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f010(ii) *cmpl_o+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N0-1)*nElems+minElem, (q0N0-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_y(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f0N0(ii) *cmpl_o-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN00-1)*nElems+minElem, (qN00-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_x(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = fN00(ii) * cmpl_o - sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q100-1)*nElems+minElem, (q100-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_x(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f100(ii) * cmpl_o + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q00N-1)*nElems+minElem, (q00N-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_z(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f00N(ii) * cmpl_o - sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q001-1)*nElems+minElem, (q001-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_z(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f001(ii) *cmpl_o+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

    enddo nodeloop
!$omp end do nowait

?? ELSE
    ! ---------------------------------------------------------------------------
    integer :: iElem, nScalars
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2
    real(kind=rk) :: omega_2, cmpl_o, omega
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &              sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1,        &
      &              sum11_2, sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------

    ! nElems = size(neigh)/QQ
    nScalars = varSys%nScalars
    ! read the relaxation parameter omega for the current level
    omega   = fieldProp(1)%fluid%omLvl( level )
    ! pre-calculate partial collision constants
    omega_2 = 2._rk * omega
    cmpl_o  = 1._rk - omega

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

      ! inverse local density
      inv_rho = 1._rk / rho

?? copy :: ux_d3q19( u_x, f, inv_rho )
?? copy :: uy_d3q19( u_y, f, inv_rho )
?? copy :: uz_d3q19( u_z, f, inv_rho )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      usqn = div1_36 * (1._rk - 1.5_rk * usq) * rho

      ! f = (1-w) * f + w * fEq
      outState(?SAVE?(q000,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                     f000*cmpl_o+omega*rho*(div1_3-0.5_rk*usq)

      coeff_1 = div1_8 * omega * rho

      usqn_o1 = omega * usqn

      ui1     =  u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fNN0 *cmpl_o-sum1_1 +sum1_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f1N0 *cmpl_o-sum3_1 +sum3_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fN0N*cmpl_o-sum10_1+sum10_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f10N*cmpl_o-sum12_1+sum12_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f0NN*cmpl_o-sum11_1+sum11_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f01N*cmpl_o-sum13_1+sum13_2

      coeff_2 = div1_8 * omega_2 * rho
      usqn_o2 = omega_2 * usqn

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f0N0 *cmpl_o-sum2_1 +sum2_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f100 *cmpl_o+sum4_1 +sum4_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f001 *cmpl_o+sum9_1 +sum9_2
      outState(?SAVE?(q00N,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f00N *cmpl_o-sum9_1 +sum9_2

    enddo nodeloop
!$omp end do nowait
?? END IF

  end subroutine bgk_advRel_d3q19
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK and 
  !! Smagorinsky Subgrid-scale model for large eddy simulations (LES).
  !!
  !! This implementation follows the BGK model for LES described by Hou et al.
  !! in Hou, S., Sterling, J., Chen, S., & Doolen, G. D. (1994). 
  !! A lattice Boltzmann subgrid model for high Reynolds number flows.
  !! arXiv preprint comp-gas/9401004.
  !! 
?? copy :: compute_routineHeader( bgk_advRel_d3q19_LES )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem, iDir
    real(kind=rk) :: pdfTmp(19)
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    ! derived constants
    real(kind=rk) :: usq, ucx
    real(kind=rk) :: fEq(19) !< equilibrium distribution
    real(kind=rk) :: nEq(19) !< non-equilibrium part
    real(kind=rk) :: q_sum, q_bgk(3,3)
    real(kind=rk) :: tau0, tau_turb, om_turb, smag_const
    ! ---------------------------------------------------------------------------

    smag_const =         fieldProp(1)%fluid%les%c_s
    tau0       = 1._rk / fieldProp(1)%fluid%omLvl( level )

    !$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1,nSolve
      ! First load all local values into temp array
      ! Generic! PUSH+PULL is possible
      pdfTmp( qN00 ) = inState(?FETCH?( qN00, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q0N0 ) = inState(?FETCH?( q0N0, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q00N ) = inState(?FETCH?( q00N, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q100 ) = inState(?FETCH?( q100, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q010 ) = inState(?FETCH?( q010, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q001 ) = inState(?FETCH?( q001, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q0NN ) = inState(?FETCH?( q0NN, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q0N1 ) = inState(?FETCH?( q0N1, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q01N ) = inState(?FETCH?( q01N, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q011 ) = inState(?FETCH?( q011, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( qN0N ) = inState(?FETCH?( qN0N, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q10N ) = inState(?FETCH?( q10N, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( qN01 ) = inState(?FETCH?( qN01, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q101 ) = inState(?FETCH?( q101, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( qNN0 ) = inState(?FETCH?( qNN0, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( qN10 ) = inState(?FETCH?( qN10, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q1N0 ) = inState(?FETCH?( q1N0, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q110 ) = inState(?FETCH?( q110, 1, iElem, 19, 19, nElems,neigh ))
      pdfTmp( q000 ) = inState(?FETCH?( q000, 1, iElem, 19, 19, nElems,neigh ))

      ! local density
      rho = pdfTmp(q110) + pdfTmp(q010) + pdfTmp(qN10)                         &
        & + pdfTmp(qN00) + pdfTmp(qNN0) + pdfTmp(q0N0)                         &
        & + pdfTmp(q1N0) + pdfTmp(q100) + pdfTmp(q001)                         &
        & + pdfTmp(q101) + pdfTmp(q011) + pdfTmp(qN01)                         &
        & + pdfTmp(q0N1) + pdfTmp(q00N) + pdfTmp(q10N)                         &
        & + pdfTmp(q01N) + pdfTmp(qN0N) + pdfTmp(q0NN)                         &
        & + pdfTmp(q000)

      ! inverse local density
      inv_rho = 1._rk / rho

      ! local x-, y- and z-velocity
      u_x = pdfTmp(q110) + pdfTmp(q1N0) + pdfTmp(q100)                         &
        & + pdfTmp(q101) + pdfTmp(q10N)                                        &
        & - pdfTmp(qN10) - pdfTmp(qN00) - pdfTmp(qNN0)                         &
        & - pdfTmp(qN01) - pdfTmp(qN0N)

      u_y = pdfTmp(q110) + pdfTmp(q010)  + pdfTmp(qN10)                        &
        & + pdfTmp(q011) + pdfTmp(q01N)                                        &
        & - pdfTmp(qNN0) - pdfTmp(q0N0 ) - pdfTmp(q1N0)                        &
        & - pdfTmp(q0N1) - pdfTmp(q0NN)

      u_z = pdfTmp(q001) + pdfTmp(q101) + pdfTmp(q011)                         &
        & + pdfTmp(qN01) + pdfTmp(q0N1)                                        &
        & - pdfTmp(q00N) - pdfTmp(q10N) - pdfTmp(q01N)                         &
        & - pdfTmp(qN0N) - pdfTmp(q0NN)

      ! transfer moments to velocities
      u_x = u_x * inv_rho
      u_y = u_y * inv_rho
      u_z = u_z * inv_rho

      ! Calculate the equilibrium distribution function
      !fEq(:) = getEquilibrium( density = rho, velocity = (/u_x, u_y, u_z /), layout = layout)
      ! square of velocity
      usq = u_x*u_x + u_y*u_y + u_z*u_z
      do iDir = 1, QQ

        ! velocity times lattice unit velocity
        ucx =   dble( layout%fStencil%cxDir(1,iDir)) * u_x       &
          &   + dble( layout%fStencil%cxDir(2,iDir)) * u_y       &
          &   + dble( layout%fStencil%cxDir(3,iDir)) * u_z

        ! calculate equilibrium density
        fEq( iDir ) = layout%weight( iDir ) * rho * ( 1.d0 + ucx*cs2inv        &
          &         + ucx*ucx*t2cs4inv - usq*t2cs2inv )

      enddo

      ! Calculate the non-equilibrium part
      nEq(:) = pdfTmp(:) - fEq(:)

      ! Now calculate the stress tensor part
      q_bgk(1,1) = sum( real(layout%fStencil%cxDir( 1, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(2,1) = sum( real(layout%fStencil%cxDir( 2, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(1,2) = q_bgk(2,1)
      q_bgk(2,2) = sum( real(layout%fStencil%cxDir( 2, :)                    &
        &                   *layout%fStencil%cxDir( 2, :),kind=rk)*nEq(:) )
      q_bgk(3,3) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 3, :),kind=rk)*nEq(:) )
      q_bgk(3,1) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(1,3) = q_bgk(3,1)
      q_bgk(3,2) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 2, :),kind=rk)*nEq(:) )
      q_bgk(2,3) = q_bgk(3,2)

      q_sum = sqrt(2._rk*(q_bgk(1,1)*q_bgk(1,1)                                &
        &               + q_bgk(2,1)*q_bgk(2,1)                                &
        &               + q_bgk(1,2)*q_bgk(1,2)                                &
        &               + q_bgk(2,2)*q_bgk(2,2)                                &
        &               + q_bgk(3,1)*q_bgk(3,1)                                &
        &               + q_bgk(3,2)*q_bgk(3,2)                                &
        &               + q_bgk(1,3)*q_bgk(1,3)                                &
        &               + q_bgk(2,3)*q_bgk(2,3)                                &
        &               + q_bgk(3,3)*q_bgk(3,3) ) )

      ! and the turbulent part of the viscosity
      tau_turb = 0.5_rk *(sqrt(tau0*tau0 + 2._rk*smag_const                    &
        &              *smag_const/rho*cs4inv*q_sum)-tau0)
      om_turb = 1._rk/(tau0+tau_turb)

      ! relaxation
      outState(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      pdfTmp( qN00 ) - om_turb*(pdfTmp( qN00 ) - fEq( qN00 ))
      outState(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      pdfTmp( q0N0 ) - om_turb*(pdfTmp( q0N0 ) - fEq( q0N0 ))
      outState(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      pdfTmp( q00N ) - om_turb*(pdfTmp( q00N ) - fEq( q00N ))
      outState(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      pdfTmp( q100 ) - om_turb*(pdfTmp( q100 ) - fEq( q100 ))
      outState(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      pdfTmp( q010 ) - om_turb*(pdfTmp( q010 ) - fEq( q010 ))
      outState(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      pdfTmp( q001 ) - om_turb*(pdfTmp( q001 ) - fEq( q001 ))
      outState(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q0NN ) - om_turb*(pdfTmp( q0NN ) - fEq( q0NN ))
      outState(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q0N1 ) - om_turb*(pdfTmp( q0N1 ) - fEq( q0N1 ))
      outState(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q01N ) - om_turb*(pdfTmp( q01N ) - fEq( q01N ))
      outState(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q011 ) - om_turb*(pdfTmp( q011 ) - fEq( q011 ))
      outState(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( qN0N ) - om_turb*(pdfTmp( qN0N ) - fEq( qN0N ))
      outState(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q10N ) - om_turb*(pdfTmp( q10N ) - fEq( q10N ))
      outState(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( qN01 ) - om_turb*(pdfTmp( qN01 ) - fEq( qN01 ))
      outState(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q101 ) - om_turb*(pdfTmp( q101 ) - fEq( q101 ))
      outState(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( qNN0 ) - om_turb*(pdfTmp( qNN0 ) - fEq( qNN0 ))
      outState(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( qN10 ) - om_turb*(pdfTmp( qN10 ) - fEq( qN10 ))
      outState(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q1N0 ) - om_turb*(pdfTmp( q1N0 ) - fEq( q1N0 ))
      outState(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      pdfTmp( q110 ) - om_turb*(pdfTmp( q110 ) - fEq( q110 ))
      outState(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh)) =                             &
        &      pdfTmp( q000 ) - om_turb*(pdfTmp( q000 ) - fEq( q000 ))
    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_les
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK for 
  !! incompressible lbm model
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_incomp )
?? copy :: compute_routineParams
?? IF (SOA) THEN
    ! ---------------------------------------------------------------------------
    integer       :: iElem, minElem, maxElem, ii, iLink
    real(kind=rk) ::      fN00(params%block)
    real(kind=rk) ::      f0N0(params%block)
    real(kind=rk) ::      f00N(params%block)
    real(kind=rk) ::      f100(params%block)
    real(kind=rk) ::      f010(params%block)
    real(kind=rk) ::      f001(params%block)
    real(kind=rk) ::      f0NN(params%block)
    real(kind=rk) ::      f0N1(params%block)
    real(kind=rk) ::      f01N(params%block)
    real(kind=rk) ::      f011(params%block)
    real(kind=rk) ::      fN0N(params%block)
    real(kind=rk) ::      f10N(params%block)
    real(kind=rk) ::      fN01(params%block)
    real(kind=rk) ::      f101(params%block)
    real(kind=rk) ::      fNN0(params%block)
    real(kind=rk) ::      fN10(params%block)
    real(kind=rk) ::      f1N0(params%block)
    real(kind=rk) ::      f110(params%block)
    real(kind=rk) ::      f000(params%block)
    real(kind=rk) :: usqn_o1(1:params%block),&
      &                  u_x(1:params%block),&
      &                  u_y(1:params%block),&
      &                  u_z(1:params%block)
    real(kind=rk) :: rho, usq
    real(kind=rk) :: cmpl_o, omega, omega_1_36
    real(kind=rk) :: coeff_1, coeff_2, sum1, sum2, ui, fac
    ! ---------------------------------------------------------------------------

    ! some global variables
    omega = fieldProp(1)%fluid%omLvl( level )
    cmpl_o  = 1._rk - omega
    coeff_1 = div1_8 * omega
    coeff_2 = coeff_1 * 2._rk
    omega_1_36 = omega * div1_36

!$omp do schedule(static)

    !NEC$ ivdep
    !DIR$ IVDEP
    nodeloop: do minElem = 1, nSolve, params%block

      ii = 0
      maxElem = min( minElem + params%block - 1, nSolve )

      !DIR$ ivdep
      do iElem = minElem, maxElem
        ii = ii + 1

        ! First load all local values into temp array
        fN00(ii) = inState( neigh((qN00-1)*nElems+iElem) )
        f0N0(ii) = inState( neigh((q0N0-1)*nElems+iElem) )
        f00N(ii) = inState( neigh((q00N-1)*nElems+iElem) )
        f100(ii) = inState( neigh((q100-1)*nElems+iElem) )
        f010(ii) = inState( neigh((q010-1)*nElems+iElem) )
        f001(ii) = inState( neigh((q001-1)*nElems+iElem) )
        f0NN(ii) = inState( neigh((q0NN-1)*nElems+iElem) )
        f0N1(ii) = inState( neigh((q0N1-1)*nElems+iElem) )
        f01N(ii) = inState( neigh((q01N-1)*nElems+iElem) )
        f011(ii) = inState( neigh((q011-1)*nElems+iElem) )
        fN0N(ii) = inState( neigh((qN0N-1)*nElems+iElem) )
        f10N(ii) = inState( neigh((q10N-1)*nElems+iElem) )
        fN01(ii) = inState( neigh((qN01-1)*nElems+iElem) )
        f101(ii) = inState( neigh((q101-1)*nElems+iElem) )
        fNN0(ii) = inState( neigh((qNN0-1)*nElems+iElem) )
        fN10(ii) = inState( neigh((qN10-1)*nElems+iElem) )
        f1N0(ii) = inState( neigh((q1N0-1)*nElems+iElem) )
        f110(ii) = inState( neigh((q110-1)*nElems+iElem) )
        f000(ii) = inState( neigh((q000-1)*nElems+iElem) )

        rho =   f000(ii) &
              + f110(ii) + f010(ii) + fN10(ii) + fN00(ii)  &
              + fNN0(ii) + f0N0(ii) + f1N0(ii) + f100(ii)  &
              + f001(ii) + f101(ii) + f011(ii) + fN01(ii) &
              + f0N1(ii) + f00N(ii) + f10N(ii) + f01N(ii) &
              + fN0N(ii) + f0NN(ii)

        u_x(ii)  = f100(ii) + f10N(ii) + f101(ii) + f1N0(ii) + f110(ii) - fN00(ii) - fN0N(ii) - fN01(ii) - fNN0(ii) - fN10(ii)
        u_y(ii)  = f010(ii) + f01N(ii) + f011(ii) + fN10(ii) + f110(ii) - f0N0(ii) - f0NN(ii) - f0N1(ii) - fNN0(ii) - f1N0(ii)
        u_z(ii)  = f001(ii) + f0N1(ii) + f011(ii) + fN01(ii) + f101(ii) - f00N(ii) - f0NN(ii) - f01N(ii) - fN0N(ii) - f10N(ii)

        ! square velocity and derived constants
        usq  = u_x(ii)*u_x(ii) + u_y(ii)*u_y(ii) + u_z(ii)*u_z(ii)
        usqn_o1(ii) = omega_1_36 * ( rho - 1.5d0 * usq )

        ! @todo: also write this in a link loop?
        ! outState(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh)) = f000(ii)*cmpl_o + 12d0*usqn_o1(ii)

      end do ! iElem
      call obscure_setzero(ii)

      !NEC$ ivdep
?? copy :: dir_vector
      do iLink = (q000-1)*nElems+minElem, (q000-1)*nElems+maxElem
        ii=ii+1
        outState(iLink) = f000(ii)*cmpl_o + 12d0*usqn_o1(ii)
      end do ! iLink
      call obscure_setzero(ii)

      !NEC$ ivdep
?? copy :: dir_vector
      do iLink = (q110-1)*nElems+minElem, (q110-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f110(ii) * cmpl_o + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qNN0-1)*nElems+minElem, (qNN0-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fNN0(ii) * cmpl_o - sum1 + sum2
      end do
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN10-1)*nElems+minElem, (qN10-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fN10(ii) *cmpl_o+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q1N0-1)*nElems+minElem, (q1N0-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = coeff_1 * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f1N0(ii) *cmpl_o-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q101-1)*nElems+minElem, (q101-1)*nElems+maxElem
        ii=ii+1
        ui      =  u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f101(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN0N-1)*nElems+minElem, (qN0N-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN0N(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN01-1)*nElems+minElem, (qN01-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN01(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q10N-1)*nElems+minElem, (q10N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = coeff_1 * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f10N(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q011-1)*nElems+minElem, (q011-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_y(ii) + u_z(ii)
        fac  = coeff_1 * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink)=f011(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0NN-1)*nElems+minElem, (q0NN-1)*nElems+maxElem
        ii=ii+1
       ui    =  u_y(ii) + u_z(ii)
       fac  = coeff_1 * ui
       sum1 = fac * div3_4h
       sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0NN(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N1-1)*nElems+minElem, (q0N1-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = coeff_1 * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0N1(ii)*cmpl_o+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q01N-1)*nElems+minElem, (q01N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = coeff_1 * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f01N(ii)*cmpl_o-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q010-1)*nElems+minElem, (q010-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_y(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f010(ii) *cmpl_o+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N0-1)*nElems+minElem, (q0N0-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_y(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f0N0(ii) *cmpl_o-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN00-1)*nElems+minElem, (qN00-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_x(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = fN00(ii) * cmpl_o - sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q100-1)*nElems+minElem, (q100-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_x(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f100(ii) * cmpl_o + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q00N-1)*nElems+minElem, (q00N-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_z(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f00N(ii) * cmpl_o - sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q001-1)*nElems+minElem, (q001-1)*nElems+maxElem
        ii=ii+1
        fac   = coeff_2 * u_z(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f001(ii) *cmpl_o+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

    enddo nodeloop
!$omp end do nowait

?? ELSE !! SOA == FALSE
    ! ---------------------------------------------------------------------------
    integer :: iElem
?? copy :: pdfTmp19( f )
    real(kind=rk) rho, u_x, u_y, u_z, usq
    real(kind=rk) usqn_o1, usqn_o2
    real(kind=rk) omega_2, cmpl_o, omega
    real(kind=rk) coeff_1, coeff_2
    real(kind=rk) ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,   &
      &           fac_13
    real(kind=rk) sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,      &
      &           sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1, sum11_2,  &
      &           sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------

    omega = fieldProp(1)%fluid%omLvl( level )
    omega_2 = 2._rk * omega
    cmpl_o  = 1._rk - omega

!$omp do schedule(static)

    !NEC$ ivdep
?? copy :: dir_novec
    nodeloop: do iElem=1,nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

      ! local x-, y- and z-velocity
?? copy :: ux_d3q19( u_x, f, 1._rk )
?? copy :: uy_d3q19( u_y, f, 1._rk )
?? copy :: uz_d3q19( u_z, f, 1._rk )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      ! usqn = div1_36 * (rho - 1.5_rk * usq * rho0 )
      usqn_o1 = omega * div1_36 * ( rho - 1.5d0 * usq )

      outState(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh))=f000*cmpl_o + 12d0*usqn_o1

      coeff_1 = div1_8 * omega

      ui1     =  u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh))=f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh))=fNN0 *cmpl_o-sum1_1 +sum1_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh))=fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh))=f1N0 *cmpl_o-sum3_1 +sum3_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh))=f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh))=fN0N*cmpl_o-sum10_1+sum10_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh))=fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh))=f10N*cmpl_o-sum12_1+sum12_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh))=f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh))=f0NN*cmpl_o-sum11_1+sum11_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh))=f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh))=f01N*cmpl_o-sum13_1+sum13_2

      coeff_2 = div1_8 * omega_2
      usqn_o2 = 2d0 * usqn_o1

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh ))=f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh ))=f0N0 *cmpl_o-sum2_1 +sum2_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh ))=fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh ))=f100 *cmpl_o+sum4_1 +sum4_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh ))=f001 *cmpl_o+sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh ))=f00N *cmpl_o-sum9_1 +sum9_2

    enddo nodeloop
!$omp end do nowait
?? END IF

  end subroutine bgk_advRel_d3q19_incomp
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK incompressible 
  !! model and Smagorinsky Subgrid-scale model for large eddy simulations (LES.
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_incomp_les )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem,iDir
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: inv_rho0 ! inverse local density
    ! derived constants
    real(kind=rk) :: usq, ucx
    real(kind=rk) :: fEq(19) ! equilibrium distribution
    real(kind=rk) :: nEq(19) ! non-equilibrium part
    real(kind=rk) :: q_sum, q_bgk(3,3), omega
    real(kind=rk) :: tau0, tau_turb, om_turb, smag_const
    ! ---------------------------------------------------------------------------

    omega   = fieldProp(1)%fluid%omLvl( level )
    smag_const = fieldProp(1)%fluid%les%c_s
    tau0 = 1._rk/omega
    inv_rho0 = 1._rk / rho0

    !$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

?? copy :: ux_d3q19( u_x, f, inv_rho0 )
?? copy :: uy_d3q19( u_y, f, inv_rho0 )
?? copy :: uz_d3q19( u_z, f, inv_rho0 )

      ! Calculate the equilibrium distribution function using
      ! incompressible formulation
      ! square of velocity
      usq = u_x*u_x + u_y*u_y + u_z*u_z
      do iDir = 1, layout%fStencil%QQ

        ! velocity times lattice unit velocity
        ucx =   real( layout%fStencil%cxDir( 1, iDir ), kind=rk) * u_x       &
          &   + real( layout%fStencil%cxDir( 2, iDir ), kind=rk) * u_y       &
          &   + real( layout%fStencil%cxDir( 3, iDir ), kind=rk) * u_z

        ! calculate equilibrium density
        fEq( iDir ) = layout%weight( iDir ) * ( rho + rho0 * ( ucx*cs2inv      &
          &         + ucx*ucx*t2cs4inv - usq*t2cs2inv ) )

      enddo

      ! Calculate the non-equilibrium part
      nEq( 1) = fN00 - fEq( 1)
      nEq( 2) = f0N0 - fEq( 2)
      nEq( 3) = f00N - fEq( 3)
      nEq( 4) = f100 - fEq( 4)
      nEq( 5) = f010 - fEq( 5)
      nEq( 6) = f001 - fEq( 6)
      nEq( 7) = f0NN - fEq( 7)
      nEq( 8) = f0N1 - fEq( 8)
      nEq( 9) = f01N - fEq( 9)
      nEq(10) = f011 - fEq(10)
      nEq(11) = fN0N - fEq(11)
      nEq(12) = f10N - fEq(12)
      nEq(13) = fN01 - fEq(13)
      nEq(14) = f101 - fEq(14)
      nEq(15) = fNN0 - fEq(15)
      nEq(16) = fN10 - fEq(16)
      nEq(17) = f1N0 - fEq(17)
      nEq(18) = f110 - fEq(18)
      nEq(19) = f000 - fEq(19)

      ! Now calculate the stress tensor part
      q_bgk      = 0._rk
      q_bgk(1,1) = sum( real(layout%fStencil%cxDir( 1, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(2,1) = sum( real(layout%fStencil%cxDir( 2, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(1,2) = q_bgk(2,1)
      q_bgk(2,2) = sum( real(layout%fStencil%cxDir( 2, :)                    &
        &                   *layout%fStencil%cxDir( 2, :),kind=rk)*nEq(:) )
      q_bgk(3,3) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 3, :),kind=rk)*nEq(:) )
      q_bgk(3,1) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(1,3) = q_bgk(3,1)
      q_bgk(3,2) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 2, :),kind=rk)*nEq(:) )
      q_bgk(2,3) = q_bgk(3,2)

      q_sum = sqrt(2._rk*(q_bgk(1,1)*q_bgk(1,1)                                &
        &               + q_bgk(2,1)*q_bgk(2,1)                                &
        &               + q_bgk(1,2)*q_bgk(1,2)                                &
        &               + q_bgk(2,2)*q_bgk(2,2)                                &
        &               + q_bgk(3,1)*q_bgk(3,1)                                &
        &               + q_bgk(3,2)*q_bgk(3,2)                                &
        &               + q_bgk(1,3)*q_bgk(1,3)                                &
        &               + q_bgk(2,3)*q_bgk(2,3)                                &
        &               + q_bgk(3,3)*q_bgk(3,3) ) )
      ! and the turbulent part of the viscosity
      tau_turb = 0.5_rk *(sqrt(tau0*tau0 + 2._rk*smag_const                    &
        &              *smag_const/rho0*cs4inv*q_sum)-tau0)

      om_turb = 1._rk/(tau0+tau_turb)

      ! relaxation
      outState(?SAVE?( q000,1,iElem,QQ,QQ, nElems,neigh)) =                           &
        &      f000 - om_turb*(f000 - fEq(q000))
      outState(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f110 - om_turb*(f110 - fEq(q110))
      outState(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      fNN0 - om_turb*(fNN0 - fEq(qNN0))
      outState(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      fN10 - om_turb*(fN10 - fEq(qN10))
      outState(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f1N0 - om_turb*(f1N0 - fEq(q1N0))
      outState(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f101 - om_turb*(f101 - fEq(q101))
      outState(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      fN0N - om_turb*(fN0N - fEq(qN0N))
      outState(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      fN01 - om_turb*(fN01 - fEq(qN01))
      outState(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f10N - om_turb*(f10N - fEq(q10N))
      outState(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f011 - om_turb*(f011 - fEq(q011))
      outState(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f0NN - om_turb*(f0NN - fEq(q0NN))
      outState(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f0N1 - om_turb*(f0N1 - fEq(q0N1))
      outState(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh))  =                            &
        &      f01N - om_turb*(f01N - fEq(q01N))
      outState(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      f010 - om_turb*(f010 - fEq(q010))
      outState(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      f0N0 - om_turb*(f0N0 - fEq(q0N0))
      outState(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      f100 - om_turb*(f100 - fEq(q100))
      outState(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      fN00 - om_turb*(fN00 - fEq(qN00))
      outState(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      f00N - om_turb*(f00N - fEq(q00N))
      outState(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh )) =                            &
        &      f001 - om_turb*(f001 - fEq(q001))

    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_incomp_les
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with TRT collision 
  !! operator
  !! In TRT, there are two relaxation parameters one can choose.
  !! They have a relationship, which is so-called magic number:
  !! Lambda = ( 1/omegaP - 1/2 ) * ( 1/omegaN - 1/2 )
  !! Different value of Lambda results different error:
  !! Lambda = 1/4 is the best stability for the LBE. As well, this number gives
  !! the solution for the steady-state case dependant only on the equilibirium
  !! funciton.
  !! Lambda = 1/12 removes the third-order advection error
  !! Lambda = 1/6 removes fourth-order diffusion errors
  !! Lambda = 3/16 gives exact location of bounce-back walls for the Poiseuille
  !! flow.
  !! omegaP is usually fixed by viscosity, another one is fixed through the
  !! above magic number combination.
  !!
?? copy :: compute_routineHeader( trt_advRel_d3q19 )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
?? IF (SOA) THEN
    integer       :: iElem, minElem, maxElem, ii, iLink
    real(kind=rk) ::      fN00(params%block)
    real(kind=rk) ::      f0N0(params%block)
    real(kind=rk) ::      f00N(params%block)
    real(kind=rk) ::      f100(params%block)
    real(kind=rk) ::      f010(params%block)
    real(kind=rk) ::      f001(params%block)
    real(kind=rk) ::      f0NN(params%block)
    real(kind=rk) ::      f0N1(params%block)
    real(kind=rk) ::      f01N(params%block)
    real(kind=rk) ::      f011(params%block)
    real(kind=rk) ::      fN0N(params%block)
    real(kind=rk) ::      f10N(params%block)
    real(kind=rk) ::      fN01(params%block)
    real(kind=rk) ::      f101(params%block)
    real(kind=rk) ::      fNN0(params%block)
    real(kind=rk) ::      fN10(params%block)
    real(kind=rk) ::      f1N0(params%block)
    real(kind=rk) ::      f110(params%block)
    real(kind=rk) feq_common(1:params%block),&
      &                  rho(1:params%block),&
      &                  u_x(1:params%block),&
      &                  u_y(1:params%block),&
      &                  u_z(1:params%block)
    real(kind=rk) :: f000, inv_rho
    real(kind=rk) :: usq
    real(kind=rk) :: asym, sym
    real(kind=rk) :: t1x2,t2x2,fac1,fac2
    ! derived constants
    real(kind=rk) :: cmpl_o, omega, omega_h, asym_omega_h
    real(kind=rk) :: ui, fac
    real(kind=rk), parameter :: t1x2_0 = 1.d0/18.d0 * 2.d0
    real(kind=rk), parameter :: t2x2_0 = 1.d0/36.d0 * 2.d0
    ! ---------------------------------------------------------------------------

    ! some global variables
    omega  = fieldProp(1)%fluid%omLvl( level )
    cmpl_o  = 1._rk - omega

    omega_h      = 0.5d0 * omega
    asym_omega_h = 1.0_rk / ( 1.0_rk + 4.0_rk * fieldProp(1)%fluid%lambda / ( 2.0_rk/omega - 1.0_rk ) )
    ! asym_omega_h = 0.5d0 * 8.d0 * (2.d0-omega)/(8.d0-omega)

!$omp do schedule(static)

    !NEC$ ivdep
    !DIR$ IVDEP
    nodeloop: do minElem = 1, nSolve, params%block

      ii = 0
      maxElem = min( minElem + params%block - 1, nSolve )

?? copy :: dir_NOVEC
      do iElem = minElem, maxElem
        ii = ii + 1

        ! First load all local values into temp array
        fN00(ii) = inState( neigh((qN00-1)*nElems+iElem) )
        f0N0(ii) = inState( neigh((q0N0-1)*nElems+iElem) )
        f00N(ii) = inState( neigh((q00N-1)*nElems+iElem) )
        f100(ii) = inState( neigh((q100-1)*nElems+iElem) )
        f010(ii) = inState( neigh((q010-1)*nElems+iElem) )
        f001(ii) = inState( neigh((q001-1)*nElems+iElem) )
        f0NN(ii) = inState( neigh((q0NN-1)*nElems+iElem) )
        f0N1(ii) = inState( neigh((q0N1-1)*nElems+iElem) )
        f01N(ii) = inState( neigh((q01N-1)*nElems+iElem) )
        f011(ii) = inState( neigh((q011-1)*nElems+iElem) )
        fN0N(ii) = inState( neigh((qN0N-1)*nElems+iElem) )
        f10N(ii) = inState( neigh((q10N-1)*nElems+iElem) )
        fN01(ii) = inState( neigh((qN01-1)*nElems+iElem) )
        f101(ii) = inState( neigh((q101-1)*nElems+iElem) )
        fNN0(ii) = inState( neigh((qNN0-1)*nElems+iElem) )
        fN10(ii) = inState( neigh((qN10-1)*nElems+iElem) )
        f1N0(ii) = inState( neigh((q1N0-1)*nElems+iElem) )
        f110(ii) = inState( neigh((q110-1)*nElems+iElem) )
        f000     = inState( neigh((q000-1)*nElems+iElem) )

        rho(ii) =   f000     &
                  + f110(ii) + f010(ii) + fN10(ii) + fN00(ii)  &
                  + fNN0(ii) + f0N0(ii) + f1N0(ii) + f100(ii)  &
                  + f001(ii) + f101(ii) + f011(ii) + fN01(ii) &
                  + f0N1(ii) + f00N(ii) + f10N(ii) + f01N(ii) &
                  + fN0N(ii) + f0NN(ii)
        inv_rho = 1._rk / rho(ii)

        u_x(ii) = f100(ii) + f10N(ii) + f101(ii) + f1N0(ii) + f110(ii) - fN00(ii) - fN0N(ii) - fN01(ii) - fNN0(ii) - fN10(ii)
        u_y(ii) = f010(ii) + f01N(ii) + f011(ii) + fN10(ii) + f110(ii) - f0N0(ii) - f0NN(ii) - f0N1(ii) - fNN0(ii) - f1N0(ii)
        u_z(ii) = f001(ii) + f0N1(ii) + f011(ii) + fN01(ii) + f101(ii) - f00N(ii) - f0NN(ii) - f01N(ii) - fN0N(ii) - f10N(ii)
        u_x(ii) = u_x(ii) * inv_rho
        u_y(ii) = u_y(ii) * inv_rho
        u_z(ii) = u_z(ii) * inv_rho

        ! square velocity and derived constants
        usq  = u_x(ii)*u_x(ii) + u_y(ii)*u_y(ii) + u_z(ii)*u_z(ii)
        feq_common(ii) = 1.d0 - 1.5d0 * usq

        outState( (q000-1)*nElems+iElem ) = &
          &                            f000*cmpl_o + omega*div1_3*rho(ii)*feq_common(ii)

      end do ! iElem
      call obscure_setzero(ii)

      ! 110 and NN0 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q110-1)*nElems+minElem, (q110-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_y(ii)
        sym  =      omega_h * ( f110(ii) + fNN0(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f110(ii) - fNN0(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f110(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qNN0-1)*nElems+minElem, (qNN0-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_y(ii)
        sym  =      omega_h * ( f110(ii) + fNN0(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f110(ii) - fNN0(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fNN0(ii) - sym + asym
      end do
      call obscure_setzero(ii)
      ! 110 and NN0 ----------------------------------------------------------------

      ! 1N0 and N10 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q1N0-1)*nElems+minElem, (q1N0-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_y(ii)
        sym  =      omega_h * ( f1N0(ii) + fN10(ii) - fac2*ui*ui -t2x2*feq_common(ii))
        asym = asym_omega_h * ( f1N0(ii) - fN10(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f1N0(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN10-1)*nElems+minElem, (qN10-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_y(ii)
        sym  =      omega_h * ( f1N0(ii) + fN10(ii) - fac2*ui*ui -t2x2*feq_common(ii))
        asym = asym_omega_h * ( f1N0(ii) - fN10(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fN10(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! ----------------------------------------------------------------

      ! 101 and N0N ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q101-1)*nElems+minElem, (q101-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_z(ii)
        sym  =      omega_h * ( f101(ii) + fN0N(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f101(ii) - fN0N(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f101(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN0N-1)*nElems+minElem, (qN0N-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_z(ii)
        sym  =      omega_h * ( f101(ii) + fN0N(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f101(ii) - fN0N(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fN0N(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 101 and N0N ----------------------------------------------------------------

      ! 10N and 10N ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q10N-1)*nElems+minElem, (q10N-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_z(ii)
        sym  =      omega_h * ( f10N(ii) + fN01(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f10N(ii) - fN01(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f10N(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN01-1)*nElems+minElem, (qN01-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_z(ii)
        sym  =      omega_h * ( f10N(ii) + fN01(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f10N(ii) - fN01(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fN01(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 10N and 10N ----------------------------------------------------------------

      ! 011 and 0NN ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q011-1)*nElems+minElem, (q011-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) + u_z(ii)
        sym  =      omega_h * ( f011(ii) + f0NN(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f011(ii) - f0NN(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f011(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0NN-1)*nElems+minElem, (q0NN-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) + u_z(ii)
        sym  =      omega_h * ( f011(ii) + f0NN(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f011(ii) - f0NN(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f0NN(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 011 and 0NN ----------------------------------------------------------------

      ! 01N and 0N1 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q01N-1)*nElems+minElem, (q01N-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) - u_z(ii)
        sym  =      omega_h * ( f01N(ii) + f0N1(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f01N(ii) - f0N1(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f01N(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N1-1)*nElems+minElem, (q0N1-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) - u_z(ii)
        sym  =      omega_h * ( f01N(ii) + f0N1(ii) - fac2*ui*ui -t2x2*feq_common(ii) )
        asym = asym_omega_h * ( f01N(ii) - f0N1(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f0N1(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 01N and 0N1 ----------------------------------------------------------------

      ! 100 and N00 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q100-1)*nElems+minElem, (q100-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h * ( f100(ii) + fN00(ii) - fac1*u_x(ii)*u_x(ii)-t1x2*feq_common(ii))
        asym = asym_omega_h * ( f100(ii) - fN00(ii) - 3.d0*t1x2*u_x(ii) )
        outState(iLink) = f100(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN00-1)*nElems+minElem, (qN00-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h * ( f100(ii) + fN00(ii) - fac1*u_x(ii)*u_x(ii)-t1x2*feq_common(ii))
        asym = asym_omega_h * ( f100(ii) - fN00(ii) - 3.d0*t1x2*u_x(ii) )
        outState(iLink) = fN00(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 100 and N00 ----------------------------------------------------------------

      ! 010 and 0N0 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q010-1)*nElems+minElem, (q010-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h * ( f010(ii) + f0N0(ii) - fac1*u_y(ii)*u_y(ii)-t1x2*feq_common(ii))
        asym = asym_omega_h * ( f010(ii) - f0N0(ii) - 3.d0*t1x2*u_y(ii) )
        outState(iLink) = f010(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N0-1)*nElems+minElem, (q0N0-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h * ( f010(ii) + f0N0(ii) - fac1*u_y(ii)*u_y(ii)-t1x2*feq_common(ii))
        asym = asym_omega_h * ( f010(ii) - f0N0(ii) - 3.d0*t1x2*u_y(ii) )
        outState(iLink) = f0N0(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 010 and 0N0 ----------------------------------------------------------------

      ! 001 and 00N ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q001-1)*nElems+minElem, (q001-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h * ( f001(ii) + f00N(ii) - fac1*u_z(ii)*u_z(ii)-t1x2*feq_common(ii))
        asym = asym_omega_h * ( f001(ii) - f00N(ii) - 3.d0*t1x2*u_z(ii) )
        outState(iLink) = f001(ii) - sym - asym
      end do !iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q00N-1)*nElems+minElem, (q00N-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h * ( f001(ii) + f00N(ii) - fac1*u_z(ii)*u_z(ii)-t1x2*feq_common(ii))
        asym = asym_omega_h * ( f001(ii) - f00N(ii) - 3.d0*t1x2*u_z(ii) )
        outState(iLink) = f00N(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 001 and 00N ----------------------------------------------------------------

    enddo nodeloop
!$omp end do nowait

?? ELSE
    ! ---------------------------------------------------------------------------
    integer :: iElem
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho, inv_rho
    real(kind=rk) :: u_x, u_y, u_z, usq
    real(kind=rk) cmpl_o, omega, omega_h, asym_omega_h
    real(kind=rk) ui
    real(kind=rk) :: asym, sym, feq_common
    real(kind=rk) :: t1x2,t2x2,fac1,fac2
    real(kind=rk), parameter :: t1x2_0 = 1.d0/18.d0 * 2.d0
    real(kind=rk), parameter :: t2x2_0 = 1.d0/36.d0 * 2.d0
    ! ---------------------------------------------------------------------------

    omega   = fieldProp(1)%fluid%omLvl( level )
    cmpl_o  = 1._rk - omega
    omega_h      = 0.5d0 * omega
    ! asym_omega_h = 0.5d0 * 8.d0 * (2.d0-omega)/(8.d0-omega)
    asym_omega_h = 1.0_rk / ( 1.0_rk + fieldProp(1)%fluid%lambda / ( 2.0_rk/omega - 1.0_rk ) )

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem=1,nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

      ! inverse local density
      inv_rho = 1._rk / rho

?? copy :: ux_d3q19( u_x, f, inv_rho )
?? copy :: uy_d3q19( u_y, f, inv_rho )
?? copy :: uz_d3q19( u_z, f, inv_rho )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      feq_common = 1.d0 - 1.5d0 * usq

      ! let's start the relaxation process
      outstate(?SAVE?(q000, 1, iElem, QQ, QQ, nElems,neigh)) =                               &
        &                            f000*cmpl_o + omega*div1_3*rho*feq_common

      t2x2 = t2x2_0 * rho
      fac2 = t2x2 * t2cs4inv !inv2csq2

      ui   = u_x + u_y
      sym  =      omega_h * ( f110 + fNN0 - fac2*ui*ui -t2x2*feq_common )
      asym = asym_omega_h * ( f110 - fNN0 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q110, 1, iElem, QQ, QQ, nElems,neigh)) = f110 - sym - asym
      outstate(?SAVE?(qNN0, 1, iElem, QQ, QQ, nElems,neigh)) = fNN0 - sym + asym

      ui   = u_x - u_y
      sym  =      omega_h * ( f1N0 + fN10 - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f1N0 - fN10 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q1N0, 1, iElem, QQ, QQ, nElems,neigh)) = f1N0 - sym - asym
      outstate(?SAVE?(qN10, 1, iElem, QQ, QQ, nElems,neigh)) = fN10 - sym + asym

      ui   = u_x + u_z
      sym  =      omega_h * ( f101 + fN0N - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f101 - fN0N - 3.d0*t2x2*ui )
      outstate(?SAVE?(q101, 1, iElem, QQ, QQ, nElems,neigh)) = f101 - sym - asym
      outstate(?SAVE?(qN0N, 1, iElem, QQ, QQ, nElems,neigh)) = fN0N - sym + asym

      ui   = u_x - u_z
      sym  =      omega_h * ( f10N + fN01 - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f10N - fN01 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q10N, 1, iElem, QQ, QQ, nElems,neigh)) = f10N - sym - asym
      outstate(?SAVE?(qN01, 1, iElem, QQ, QQ, nElems,neigh)) = fN01 - sym + asym

      ui   = u_y + u_z
      sym  =      omega_h * ( f011 + f0NN - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f011 - f0NN - 3.d0*t2x2*ui )
      outstate(?SAVE?(q011, 1, iElem, QQ, QQ, nElems,neigh)) = f011 - sym - asym
      outstate(?SAVE?(q0NN, 1, iElem, QQ, QQ, nElems,neigh)) = f0NN - sym + asym

      ui   = u_y - u_z
      sym  =      omega_h * ( f01N + f0N1 - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f01N - f0N1 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q01N, 1, iElem, QQ, QQ, nElems,neigh)) = f01N - sym - asym
      outstate(?SAVE?(q0N1, 1, iElem, QQ, QQ, nElems,neigh)) = f0N1 - sym + asym

      t1x2 = t1x2_0 * rho
      fac1 = t1x2 * t2cs4inv !inv2csq2

      sym  =      omega_h * ( f100 + fN00 - fac1*u_x*u_x-t1x2*feq_common)
      asym = asym_omega_h * ( f100 - fN00 - 3.d0*t1x2*u_x )
      outstate(?SAVE?(q100, 1, iElem, QQ, QQ, nElems,neigh)) = f100 - sym - asym
      outstate(?SAVE?(qN00, 1, iElem, QQ, QQ, nElems,neigh)) = fN00 - sym + asym

      sym  =      omega_h * ( f010 + f0N0 - fac1*u_y*u_y-t1x2*feq_common)
      asym = asym_omega_h * ( f010 - f0N0 - 3.d0*t1x2*u_y )
      outstate(?SAVE?(q010, 1, iElem, QQ, QQ, nElems,neigh)) = f010 - sym - asym
      outstate(?SAVE?(q0N0, 1, iElem, QQ, QQ, nElems,neigh)) = f0N0 - sym + asym

      sym  =      omega_h * ( f001 + f00N - fac1*u_z*u_z-t1x2*feq_common)
      asym = asym_omega_h * ( f001 - f00N - 3.d0*t1x2*u_z )
      outstate(?SAVE?(q001, 1, iElem, QQ, QQ, nElems,neigh)) = f001 - sym - asym
      outstate(?SAVE?(q00N, 1, iElem, QQ, QQ, nElems,neigh)) = f00N - sym + asym
    enddo nodeloop
!$omp end do

?? ENDIF

  end subroutine trt_advRel_d3q19
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with TRT collision
  !! operator
  !!
?? copy :: compute_routineHeader( trt_advRel_d3q19_les )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem, iDir
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     !< local density
    real(kind=rk) :: inv_rho !< inverse local density
    real(kind=rk) :: u_x     !< local x-velocity
    real(kind=rk) :: u_y     !< local y-velocity
    real(kind=rk) :: u_z     !< local z-velocity
    real(kind=rk) :: usq     !< square velocity
    ! derived constants
    real(kind=rk) :: ucx
    real(kind=rk) :: omega, om, omega_h, asym_omega_h
    real(kind=rk) :: ui
    real(kind=rk) :: asym, sym, feq_common
    ! in case of the default model, the actual values depend on density
    real(kind=rk) ::  t0,t1x2,t2x2,fac1,fac2

    ! common prefactors for calculating the equilibrium parts (D3Q19 model)
    real(kind=rk), parameter :: t0_0 = 1.d0/3.d0
    real(kind=rk), parameter :: t1x2_0 = 1.d0/18.d0 * 2.d0
    real(kind=rk), parameter :: t2x2_0 = 1.d0/36.d0 * 2.d0
    ! Variables required for LES
    real(kind=rk) :: fEq(19) !< equilibrium distribution
    real(kind=rk) :: nEq(19) !< non-equilibrium part
    real(kind=rk) :: q_sum, q_bgk(3,3)
    real(kind=rk) :: tau0, tau_turb, om_turb, smag_const
    real(kind=rk) :: cs2_dt2_18
    ! ---------------------------------------------------------------------------
    ! nElems = size(neigh)/QQ
    ! read the relaxation parameter omega for the current level
    omega   = fieldProp(1)%fluid%omLvl( level )
    smag_const = fieldProp(1)%fluid%les%c_s
    tau0 = 1._rk/omega
    cs2_dt2_18 = 18._rk &
      &    * fieldProp(1)%fluid%les%c_s * fieldProp(1)%fluid%les%c_s &
      &    * fieldProp(1)%fluid%dtLvl(level) * fieldProp(1)%fluid%dtLvl(level)


!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

      ! inverse local density
      inv_rho = 1._rk / rho

?? copy :: ux_d3q19( u_x, f, inv_rho )
?? copy :: uy_d3q19( u_y, f, inv_rho )
?? copy :: uz_d3q19( u_z, f, inv_rho )

      ! Calculate the equilibrium distribution function
      ! square of velocity
      usq = u_x*u_x + u_y*u_y + u_z*u_z
      do iDir = 1, layout%fStencil%QQ

        ! velocity times lattice unit velocity
        ucx =   real( layout%fStencil%cxDir( 1, iDir ), kind=rk) * u_x       &
            & + real( layout%fStencil%cxDir( 2, iDir ), kind=rk) * u_y       &
            & + real( layout%fStencil%cxDir( 3, iDir ), kind=rk) * u_z

        ! calculate equilibrium density
        fEq( iDir ) = layout%weight( iDir ) * rho * ( 1.d0 + ucx*cs2inv        &
          &         + ucx*ucx*t2cs4inv - usq*t2cs2inv )

      enddo

      ! Calculate the non-equilibrium part
      nEq( 1) = fN00 - fEq( 1)
      nEq( 2) = f0N0 - fEq( 2)
      nEq( 3) = f00N - fEq( 3)
      nEq( 4) = f100 - fEq( 4)
      nEq( 5) = f010 - fEq( 5)
      nEq( 6) = f001 - fEq( 6)
      nEq( 7) = f0NN - fEq( 7)
      nEq( 8) = f0N1 - fEq( 8)
      nEq( 9) = f01N - fEq( 9)
      nEq(10) = f011 - fEq(10)
      nEq(11) = fN0N - fEq(11)
      nEq(12) = f10N - fEq(12)
      nEq(13) = fN01 - fEq(13)
      nEq(14) = f101 - fEq(14)
      nEq(15) = fNN0 - fEq(15)
      nEq(16) = fN10 - fEq(16)
      nEq(17) = f1N0 - fEq(17)
      nEq(18) = f110 - fEq(18)
      nEq(19) = f000 - fEq(19)

      ! Now calculate the stress tensor part
      q_bgk      = 0._rk
      q_bgk(1,1) = sum( real(layout%fStencil%cxDir( 1, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(2,1) = sum( real(layout%fStencil%cxDir( 2, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(1,2) = q_bgk(2,1)
      q_bgk(2,2) = sum( real(layout%fStencil%cxDir( 2, :)                    &
        &                   *layout%fStencil%cxDir( 2, :),kind=rk)*nEq(:) )
      q_bgk(3,3) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 3, :),kind=rk)*nEq(:) )
      q_bgk(3,1) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 1, :),kind=rk)*nEq(:) )
      q_bgk(1,3) = q_bgk(3,1)
      q_bgk(3,2) = sum( real(layout%fStencil%cxDir( 3, :)                    &
        &                   *layout%fStencil%cxDir( 2, :),kind=rk)*nEq(:) )
      q_bgk(2,3) = q_bgk(3,2)

      q_sum = sqrt(2._rk*(q_bgk(1,1)*q_bgk(1,1)                                &
        &               + q_bgk(2,1)*q_bgk(2,1)                                &
        &               + q_bgk(1,2)*q_bgk(1,2)                                &
        &               + q_bgk(2,2)*q_bgk(2,2)                                &
        &               + q_bgk(3,1)*q_bgk(3,1)                                &
        &               + q_bgk(3,2)*q_bgk(3,2)                                &
        &               + q_bgk(1,3)*q_bgk(1,3)                                &
        &               + q_bgk(2,3)*q_bgk(2,3)                                &
        &               + q_bgk(3,3)*q_bgk(3,3) ) )

      ! and the turbulent part of the viscosity
!KM!      tau_turb = 0.5_rk *(sqrt(tau0*tau0 + 2._rk*smag_const                    &
!KM!        &              *smag_const/rho*cs4inv*q_sum)-tau0)
!KM!      om_turb = 1._rk/(tau0+tau_turb)
      tau_turb = 0.5_rk * ( tau0 + sqrt(tau0*tau0 + q_sum * cs2_dt2_18) )
      om_turb = 1._rk/(tau_turb)
      om = om_turb

      ! some global variables
      omega_h      = 0.5d0 * om
      asym_omega_h = 0.5d0 * 8.d0 * (2.d0-om)/(8.d0-om)

      ! square velocity and derived constants
      feq_common = 1.d0  - 1.5d0 * usq
      t0 = t0_0 * rho

      ! let's start the relaxation process
      outstate(?SAVE?(q000, 1, iElem, QQ, QQ, nElems,neigh)) =                               &
        &                            f000*(1.d0-om) + om*t0*feq_common

      t2x2 = t2x2_0 * rho
      fac2 = t2x2 * t2cs4inv !inv2csq2

      ui   = u_x + u_y
      sym  = omega_h*(f110 + fNN0 - fac2*ui*ui-t2x2*feq_common)
      asym = asym_omega_h*( f110 - fNN0 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q110, 1, iElem, QQ, QQ, nElems,neigh)) = f110 - sym - asym
      outstate(?SAVE?(qNN0, 1, iElem, QQ, QQ, nElems,neigh)) = fNN0 - sym + asym

      ui   = u_x - u_y
      sym  = omega_h*(f1N0 + fN10 - fac2*ui*ui-t2x2*feq_common)
      asym = asym_omega_h*( f1N0 - fN10 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q1N0, 1, iElem, QQ, QQ, nElems,neigh)) = f1N0 - sym - asym
      outstate(?SAVE?(qN10, 1, iElem, QQ, QQ, nElems,neigh)) = fN10 - sym + asym

      ui   = u_x + u_z
      sym  = omega_h*(f101 + fN0N - fac2*ui*ui-t2x2*feq_common)
      asym = asym_omega_h*( f101 - fN0N - 3.d0*t2x2*ui )
      outstate(?SAVE?(q101, 1, iElem, QQ, QQ, nElems,neigh)) = f101 - sym - asym
      outstate(?SAVE?(qN0N, 1, iElem, QQ, QQ, nElems,neigh)) = fN0N - sym + asym

      ui   = u_x - u_z
      sym  = omega_h*(f10N + fN01 - fac2*ui*ui-t2x2*feq_common)
      asym = asym_omega_h*( f10N - fN01 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q10N, 1, iElem, QQ, QQ, nElems,neigh)) = f10N - sym - asym
      outstate(?SAVE?(qN01, 1, iElem, QQ, QQ, nElems,neigh)) = fN01 - sym + asym

      ui   = u_y + u_z
      sym  = omega_h*(f011 + f0NN - fac2*ui*ui-t2x2*feq_common)
      asym = asym_omega_h*( f011 - f0NN - 3.d0*t2x2*ui )
      outstate(?SAVE?(q011, 1, iElem, QQ, QQ, nElems,neigh)) = f011 - sym - asym
      outstate(?SAVE?(q0NN, 1, iElem, QQ, QQ, nElems,neigh)) = f0NN - sym + asym

      ui   = u_y - u_z
      sym  = omega_h*(f01N + f0N1 - fac2*ui*ui-t2x2*feq_common)
      asym = asym_omega_h*( f01N - f0N1 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q01N, 1, iElem, QQ, QQ, nElems,neigh)) = f01N - sym - asym
      outstate(?SAVE?(q0N1, 1, iElem, QQ, QQ, nElems,neigh)) = f0N1 - sym + asym

      t1x2 = t1x2_0 * rho
      fac1 = t1x2 * t2cs4inv !inv2csq2

      ui   = u_y
      sym  = omega_h*(f010 + f0N0 - fac1*ui*ui-t1x2*feq_common)
      asym = asym_omega_h*( f010 - f0N0 - 3.d0*t1x2*ui )
      outstate(?SAVE?(q010, 1, iElem, QQ, QQ, nElems,neigh)) = f010 - sym - asym
      outstate(?SAVE?(q0N0, 1, iElem, QQ, QQ, nElems,neigh)) = f0N0 - sym + asym

      ui   = u_x
      sym  = omega_h*(f100 + fN00 - fac1*ui*ui-t1x2*feq_common)
      asym = asym_omega_h*( f100 - fN00 - 3.d0*t1x2*ui )
      outstate(?SAVE?(q100, 1, iElem, QQ, QQ, nElems,neigh)) = f100 - sym - asym
      outstate(?SAVE?(qN00, 1, iElem, QQ, QQ, nElems,neigh)) = fN00 - sym + asym

      ui   = u_z
      sym  = omega_h*(f001 + f00N - fac1*ui*ui-t1x2*feq_common)
      asym = asym_omega_h*( f001 - f00N - 3.d0*t1x2*ui )
      outstate(?SAVE?(q001, 1, iElem, QQ, QQ, nElems,neigh)) = f001 - sym - asym
      outstate(?SAVE?(q00N, 1, iElem, QQ, QQ, nElems,neigh)) = f00N - sym + asym
    enddo nodeloop

  end subroutine trt_advRel_d3q19_les
! ****************************************************************************** !

! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with TRT collision
  !! operator.
  !!
  !! Incompressible model
  !!
?? copy :: compute_routineHeader( trt_advRel_d3q19_incomp )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho, u_x, u_y, u_z, usq
    real(kind=rk) :: om, omega_h, asym_omega_h
    real(kind=rk) :: ui
    real(kind=rk) :: asym, sym, feq_common, t1_feq, t2_feq
    real(kind=rk), parameter :: t1x2 = 1.d0/ 9.d0
    real(kind=rk), parameter :: t2x2 = 1.d0/18.d0
    real(kind=rk), parameter :: fac1 = t1x2*t2cs4inv !inv2csq2
    real(kind=rk), parameter :: fac2 = t2x2*t2cs4inv !inv2csq2
    ! ---------------------------------------------------------------------------

    om   = fieldProp(1)%fluid%omLvl( level )
    omega_h      = 0.5d0 * om
    asym_omega_h = 0.5d0 * 8.d0 * (2.d0-om)/(8.d0-om)

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho,f )

?? copy :: ux_d3q19( u_x, f, 1._rk )
?? copy :: uy_d3q19( u_y, f, 1._rk )
?? copy :: uz_d3q19( u_z, f, 1._rk )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      feq_common = rho - 1.5d0 * usq

      ! let's start the relaxation process
      outstate(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh)) = f000*(1.d0-om) + om*div1_3*feq_common

      ! t2x2 = 1.d0/18.d0
      t2_feq = t2x2*feq_common

      ui   = u_x + u_y
      sym  =      omega_h*( f110 + fNN0 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f110 - fNN0 - div1_6*ui )
      outstate(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh)) = f110 - sym - asym
      outstate(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh)) = fNN0 - sym + asym

      ui   = u_x - u_y
      sym  =      omega_h*( f1N0 + fN10 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f1N0 - fN10 - div1_6*ui )
      outstate(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh)) = f1N0 - sym - asym
      outstate(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh)) = fN10 - sym + asym

      ui   = u_x + u_z
      sym  =      omega_h*( f101 + fN0N - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f101 - fN0N - div1_6*ui )
      outstate(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh)) = f101 - sym - asym
      outstate(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh)) = fN0N - sym + asym

      ui   = u_x - u_z
      sym  =      omega_h*( f10N + fN01 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f10N - fN01 - div1_6*ui )
      outstate(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh)) = f10N - sym - asym
      outstate(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh)) = fN01 - sym + asym

      ui   = u_y + u_z
      sym  =      omega_h*( f011 + f0NN - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f011 - f0NN - div1_6*ui )
      outstate(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh)) = f011 - sym - asym
      outstate(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh)) = f0NN - sym + asym

      ui   = u_y - u_z
      sym  =      omega_h*( f01N + f0N1 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f01N - f0N1 - div1_6*ui )
      outstate(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh)) = f01N - sym - asym
      outstate(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh)) = f0N1 - sym + asym

      ! t1x2 = 1.d0/ 9.d0
      t1_feq = t1x2*feq_common

      ! ui   = u_y
      sym  =      omega_h*( f010 + f0N0 - fac1*u_y*u_y-t1_feq)
      asym = asym_omega_h*( f010 - f0N0 - div1_3*u_y )
      outstate(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh)) = f010 - sym - asym
      outstate(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh)) = f0N0 - sym + asym

      ! ui   = u_x
      sym  =      omega_h*( f100 + fN00 - fac1*u_x*u_x-t1_feq)
      asym = asym_omega_h*( f100 - fN00 - div1_3*u_x )
      outstate(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh)) = f100 - sym - asym
      outstate(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh)) = fN00 - sym + asym

      ! ui   = u_z
      sym  =      omega_h*( f001 + f00N - fac1*u_z*u_z-t1_feq)
      asym = asym_omega_h*( f001 - f00N - div1_3*u_z )
      outstate(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh)) = f001 - sym - asym
      outstate(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh)) = f00N - sym + asym
    enddo nodeloop

  end subroutine trt_advRel_d3q19_incomp
! ****************************************************************************** !

! ****************************************************************************** !
  !> author: Jiaxing Qi
  !! Advection relaxation routine for the D3Q19 model with BGK and nonNewtonian
  !! power-law (PL) model. \n
  !! After BGK scheme, strain rate is calculated from nonEquilibilirium.
  !! Then shear rate is computed by strain rate. Next, viscosity is updated
  !! according to nonNewtonian model. Finally viscosity is converted into omega
  !! and updated.
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_nNwtn_PL )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    real(kind=rk) :: fEq000
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2
    real(kind=rk) :: omega_2, cmpl_o, omega, inv_omg_2, coeffSR
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &              sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1,        &
      &              sum11_2, sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------
    ! parameters for non-Newtonian
    integer :: nScalars, omegaStatePos
    real(kind=rk) :: fNeq_NE_SW, fNeq_NW_SE, fNeq_BW_TE, &
      &              fNeq_BE_TW, fNeq_BS_TN, fNeq_BN_TS
    real(kind=rk) :: strainRate(6)
    real(kind=rk) :: shearRate ! second invariant of strainRate
    real(kind=rk) :: nuLB
    real(kind=rk) :: convSR
    real(kind=rk) :: convVisc
    real(kind=rk) :: k, n_1 ! parameters for power-law
    ! ---------------------------------------------------------------------------

    ! omega pos in state
    omegaStatePos = QQ + 1

    nScalars = varSys%nScalars

    ! conversion factor for Shear Rate (or Strain Rate)
    convSR = params%physics%fac(level)%strainRate

    ! conversion factor for kinematic viscosity
    convVisc = params%physics%fac(level)%visc

    ! get nonNewtonian power law parameter
    k   = fieldProp(1)%fluid%nNwtn%PL%k &
      &   / params%physics%rho0 / convVisc
    n_1 = fieldProp(1)%fluid%nNwtn%PL%n_1

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve

      ! read the relaxation parameter omega for the current element
      omega   = inState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) )
      inv_omg_2 = 2._rk / omega

      ! pre-calculate partial collision constants
      omega_2 = 2._rk * omega
      cmpl_o  = 1._rk - omega

      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

      ! inverse local density
      inv_rho = 1._rk / rho
      coeffSR = -1.5_rk * omega * convSR * inv_rho

?? copy :: ux_d3q19( u_x, f, inv_rho )
?? copy :: uy_d3q19( u_y, f, inv_rho )
?? copy :: uz_d3q19( u_z, f, inv_rho )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      usqn = div1_36 * (1._rk - 1.5_rk * usq) * rho
      fEq000 = rho*(div1_3-0.5_rk*usq)

      ! cmpl_o = 1 - omega
      ! f = (1-omega) * f + omega * fEq
      outState(?SAVE?(q000,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                f000*cmpl_o + omega*fEq000

      coeff_1 = div1_8 * omega * rho

      usqn_o1 = omega * usqn

      ui1     = u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f110*cmpl_o +sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   fNN0*cmpl_o -sum1_1 +sum1_2

      ! cmpl_o = 1 - omega
      ! f = (1-omega) * f + omega * fEq
      ! the term (omg*fEq) is already embedded in (sum1_1,sum1_2)
      ! so we devide (omg*fEq) by omg to get fNeq.
      ! This also applied to all other links.
      ! fNeq_NE + fNeq_SW
      fNeq_NE_SW = f110 + fNN0 - sum1_2*inv_omg_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f1N0 *cmpl_o-sum3_1 +sum3_2

      fNeq_NW_SE = fN10 + f1N0 - sum3_2*inv_omg_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   fN0N*cmpl_o-sum10_1+sum10_2

      fNeq_BW_TE = fN0N + f101 - sum10_2*inv_omg_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f10N*cmpl_o-sum12_1+sum12_2

      fNeq_BE_TW = f10N + fN01 - sum12_2*inv_omg_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f0NN*cmpl_o-sum11_1+sum11_2

      fNeq_BS_TN = f0NN + f011 - sum11_2*inv_omg_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,nScalars,nElems,neigh)) =                           &
        &                                   f01N*cmpl_o-sum13_1+sum13_2

      fNeq_BN_TS = f01N + f0N1 - sum13_2*inv_omg_2

      coeff_2 = div1_8 * omega_2 * rho
      usqn_o2 = omega_2 * usqn

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,nScalars,nElems,neigh )) =                          &
        &                                   f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,nScalars,nElems,neigh )) =                          &
        &                                   f0N0 *cmpl_o-sum2_1 +sum2_2

      strainRate(2) = f010 + f0N0 - sum2_2*inv_omg_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,nScalars,nElems,neigh )) =                          &
        &                                   fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,nScalars,nElems,neigh )) =                          &
        &                                   f100 *cmpl_o+sum4_1 +sum4_2

      strainRate(1) = fN00 + f100 - sum4_2*inv_omg_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,nScalars,nElems,neigh )) =                          &
        &                                   f001*cmpl_o +sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,nScalars,nElems,neigh )) =                          &
        &                                   f00N*cmpl_o -sum9_1 +sum9_2

      strainRate(3) = f001 + f00N - sum9_2*inv_omg_2

      ! Calculate strain rate and convert physics unit
      ! s1 = (W+E + (SW+NE + (SE+NW + (BW+TE + (BE+TW
      strainRate(1) =   strainRate(1) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BW_TE + fNeq_BE_TW

      ! s2 = (S+N + (NE+SW + (NW+SE + (BS+TN + (BN+TS
      strainRate(2) =   strainRate(2) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BS_TN + fNeq_BN_TS

      ! s3 = (B+T + (BS+TN + (BN+TS + (BW+TE + (BE+TW
      strainRate(3) =   strainRate(3) &
        &             + fNeq_BS_TN + fNeq_BN_TS + fNeq_BW_TE + fNeq_BE_TW

      ! s4 = (SW+NE - (NW+SE
      strainRate(4) = fNeq_NE_SW - fNeq_NW_SE

      ! s5 = (BS+TN - (TS+BN
      strainRate(5) = fNeq_BS_TN - fNeq_BN_TS

      ! s6 = (BW+TE - (BE+TW
      strainRate(6) = fNeq_BW_TE - fNeq_BE_TW

      ! coeffSR can NOT be moved into the following SQRT
      strainRate = strainRate * coeffSR

      !! Calculate physical shear rate
      shearRate = 2._rk * sqrt( sum( strainRate * strainRate ) )

      ! the exponential operation is quite expensive.
      ! it decreases performance about 3-4 MLUPS on SHU
      ! calculate kinematic visc by POWER-LAW model
      ! from paper: LBMs for non-newtonian flows, page 803
      ! k is actually k / rho0 / convVisc
      nuLB = ( shearRate ** n_1 ) * k

      ! calc omage and save state vector
outState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) ) = 1._rk / ( 0.5_rk + 3._rk*nuLB )

    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_nNwtn_PL
! ****************************************************************************** !

! ****************************************************************************** !
  !> author: Jiaxing Qi
  !! Advection relaxation routine for the D3Q19 model with BGK and nonNewtonian
  !! Carreau-Yasuda (CY) model. \n
  !! After BGK scheme, strain rate is calculated from nonEquilibilirium.
  !! Then shear rate is computed by strain rate. Next, viscosity is updated
  !! according to nonNewtonian model. Finally viscosity is converted into omega
  !! and updated.
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_nNwtn_CY )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    real(kind=rk) :: fEq000
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2
    real(kind=rk) :: omega_2, cmpl_o, omega, inv_omg_2, coeffSR
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &              sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1,        &
      &              sum11_2, sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------
    ! parameters for non-Newtonian
    integer :: nScalars, omegaStatePos
    real(kind=rk) :: fNeq_NE_SW, fNeq_NW_SE, fNeq_BW_TE, &
      &              fNeq_BE_TW, fNeq_BS_TN, fNeq_BN_TS
    real(kind=rk) :: strainRate(6)
    real(kind=rk) :: shearRate ! second invariant of strainRate
    real(kind=rk) :: nuLB, t
    real(kind=rk) :: convSR
    real(kind=rk) :: convVisc
    real(kind=rk) :: a, n_1_a, lambda, v0_vInf, vInf ! parameters for CY model
    ! ---------------------------------------------------------------------------

    ! omega pos in state
    omegaStatePos = QQ + 1

    nScalars = varSys%nScalars

    ! conversion factor for Shear Rate (or Strain Rate)
    convSR = params%physics%fac(level)%strainRate

    ! conversion factor for kinematic viscosity
    convVisc = params%physics%fac(level)%visc

    ! get nonNewtonian power law parameter
    a       = fieldProp(1)%fluid%nNwtn%CY%a
    n_1_a   = fieldProp(1)%fluid%nNwtn%CY%n_1_a
    lambda  = fieldProp(1)%fluid%nNwtn%CY%lambda * 2._rk
    vInf    = fieldProp(1)%fluid%nNwtn%CY%viscInf
    v0_vInf = fieldProp(1)%fluid%nNwtn%CY%visc0 - vInf

    vInf = vInf / params%physics%rho0 / convVisc
    v0_vInf = v0_vInf / params%physics%rho0 / convVisc

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve

      ! read the relaxation parameter omega for the current element
      omega   = inState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) )
      inv_omg_2 = 2._rk / omega

      ! pre-calculate partial collision constants
      omega_2 = 2._rk * omega
      cmpl_o  = 1._rk - omega

      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

      ! inverse local density
      inv_rho = 1._rk / rho
      coeffSR = -1.5_rk * omega * convSR * inv_rho

      ! transfer moments to velocities
?? copy :: ux_d3q19( u_x, f, inv_rho )
?? copy :: uy_d3q19( u_y, f, inv_rho )
?? copy :: uz_d3q19( u_z, f, inv_rho )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      usqn = div1_36 * (1._rk - 1.5_rk * usq) * rho
      fEq000 = rho*(div1_3-0.5_rk*usq)

      ! f = (1-omega) * f + omega * fEq
      outState(?SAVE?(q000,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                f000*cmpl_o + omega*fEq000

      coeff_1 = div1_8 * omega * rho

      usqn_o1 = omega * usqn

      ui1     = u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fNN0 *cmpl_o-sum1_1 +sum1_2

      fNeq_NE_SW = f110 + fNN0 - sum1_2*inv_omg_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f1N0 *cmpl_o-sum3_1 +sum3_2

      fNeq_NW_SE = fN10 + f1N0 - sum3_2*inv_omg_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fN0N*cmpl_o-sum10_1+sum10_2

      fNeq_BW_TE = fN0N + f101 - sum10_2*inv_omg_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f10N*cmpl_o-sum12_1+sum12_2

      fNeq_BE_TW = f10N + fN01 - sum12_2*inv_omg_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                  f011*cmpl_o +sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                  f0NN*cmpl_o -sum11_1+sum11_2

      fNeq_BS_TN = f0NN + f011 - sum11_2*inv_omg_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                  f0N1*cmpl_o +sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                  f01N*cmpl_o -sum13_1+sum13_2

      fNeq_BN_TS = f01N + f0N1 - sum13_2*inv_omg_2

      coeff_2 = div1_8 * omega_2 * rho
      usqn_o2 = omega_2 * usqn

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f010*cmpl_o +sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f0N0*cmpl_o -sum2_1 +sum2_2

      strainRate(2) = f010 + f0N0 - sum2_2*inv_omg_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   fN00*cmpl_o -sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f100*cmpl_o +sum4_1 +sum4_2

      strainRate(1) = fN00 + f100 - sum4_2*inv_omg_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f001*cmpl_o +sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f00N*cmpl_o -sum9_1 +sum9_2

      strainRate(3) = f001 + f00N - sum9_2*inv_omg_2

      ! s1 = (W+E + (SW+NE + (SE+NW + (BW+TE + (BE+TW
      strainRate(1) =   strainRate(1) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BW_TE + fNeq_BE_TW

      ! s2 = (S+N + (NE+SW + (NW+SE + (BS+TN + (BN+TS
      strainRate(2) =   strainRate(2) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BS_TN + fNeq_BN_TS

      ! s3 = (B+T + (BS+TN + (BN+TS + (BW+TE + (BE+TW
      strainRate(3) =   strainRate(3) &
        &             + fNeq_BS_TN + fNeq_BN_TS + fNeq_BW_TE + fNeq_BE_TW

      ! s4 = (SW+NE - (NW+SE
      strainRate(4) = fNeq_NE_SW - fNeq_NW_SE

      ! s5 = (BS+TN - (TS+BN
      strainRate(5) = fNeq_BS_TN - fNeq_BN_TS

      ! s6 = (BW+TE - (BE+TW
      strainRate(6) = fNeq_BW_TE - fNeq_BE_TW

      ! convert physics unit
      strainRate = strainRate * coeffSR

      !! Calculate physical shear rate
      shearRate = sqrt( sum( strainRate * strainRate ) )

      ! calculate dynamic visc by Carreau-Yasuda model
      ! convert to kinematic visc
      t = ( 1._rk + (lambda*shearRate) ** a ) ** n_1_a
      nuLB = vInf + v0_vInf * t

      ! calc omage and save state vector
  outState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) ) = 1._rk / ( 0.5_rk + 3._rk*nuLB )

    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_nNwtn_CY
! ****************************************************************************** !

! ****************************************************************************** !
  !> author: Jiaxing Qi
  !! Advection relaxation routine for the D3Q19 model with BGK and nonNewtonian
  !! Casson (CS model. \n
  !! After BGK scheme, strain rate is calculated from nonEquilibilirium.
  !! Then shear rate is computed by strain rate. Next, viscosity is updated
  !! according to nonNewtonian model. Finally viscosity is converted into omega
  !! and updated.
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_nNwtn_CS )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    real(kind=rk) :: fEq000
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2
    real(kind=rk) :: omega_2, cmpl_o, omega, inv_omg_2, coeffSR
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &              sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1,        &
      &              sum11_2, sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------
    ! parameters for non-Newtonian
    integer :: nScalars, omegaStatePos
    real(kind=rk) :: fNeq_NE_SW, fNeq_NW_SE, fNeq_BW_TE, &
      &              fNeq_BE_TW, fNeq_BS_TN, fNeq_BN_TS
    real(kind=rk) :: strainRate(6)
    real(kind=rk) :: shearRate ! second invariant of strainRate
    real(kind=rk) :: nuLB, t
    real(kind=rk) :: convSR
    real(kind=rk) :: convVisc
    real(kind=rk) :: k0, k1 ! parameters for Casson model
    ! ---------------------------------------------------------------------------

    ! omega pos in state
    omegaStatePos = QQ + 1

    nScalars = varSys%nScalars

    ! conversion factor for Shear Rate (or Strain Rate)
    convSR = params%physics%fac(level)%strainRate

    ! conversion factor for kinematic viscosity
    convVisc = params%physics%fac(level)%visc

    ! get nonNewtonian Casson model parameter
    k0 = fieldProp(1)%fluid%nNwtn%CS%k0
    k1 = fieldProp(1)%fluid%nNwtn%CS%k1

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve

      ! read the relaxation parameter omega for the current element
      omega   = inState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) )
      inv_omg_2 = 2._rk / omega

      ! pre-calculate partial collision constants
      omega_2 = 2._rk * omega
      cmpl_o  = 1._rk - omega

      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

      ! inverse local density
      inv_rho = 1._rk / rho
      coeffSR = -1.5_rk * omega * convSR / rho

      ! transfer moments to velocities
?? copy :: ux_d3q19( u_x, f, inv_rho )
?? copy :: uy_d3q19( u_y, f, inv_rho )
?? copy :: uz_d3q19( u_z, f, inv_rho )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      usqn = div1_36 * (1._rk - 1.5_rk * usq) * rho
      fEq000 = rho*(div1_3-0.5_rk*usq)

      ! f = (1-omega) * f + omega * fEq
      outState(?SAVE?(q000,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                f000*cmpl_o + omega*fEq000

      coeff_1 = div1_8 * omega * rho

      usqn_o1 = omega * usqn

      ui1     = u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fNN0 *cmpl_o-sum1_1 +sum1_2

      fNeq_NE_SW = f110 + fNN0 - sum1_2*inv_omg_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f1N0 *cmpl_o-sum3_1 +sum3_2

      fNeq_NW_SE = fN10 + f1N0 - sum3_2*inv_omg_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fN0N*cmpl_o-sum10_1+sum10_2

      fNeq_BW_TE = fN0N + f101 - sum10_2*inv_omg_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f10N*cmpl_o-sum12_1+sum12_2

      fNeq_BE_TW = f10N + fN01 - sum12_2*inv_omg_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f011*cmpl_o +sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f0NN*cmpl_o -sum11_1+sum11_2

      fNeq_BS_TN = f0NN + f011 - sum11_2*inv_omg_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f0N1*cmpl_o +sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,nScalars,nElems,neigh)) =                       &
        &                                   f01N*cmpl_o -sum13_1+sum13_2

      fNeq_BN_TS = f01N + f0N1 - sum13_2*inv_omg_2

      coeff_2 = div1_8 * omega_2 * rho
      usqn_o2 = omega_2 * usqn

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f010*cmpl_o +sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f0N0*cmpl_o -sum2_1 +sum2_2

      strainRate(2) = f010 + f0N0 - sum2_2*inv_omg_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   fN00*cmpl_o -sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f100*cmpl_o +sum4_1 +sum4_2

      strainRate(1) = fN00 + f100 - sum4_2*inv_omg_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f001*cmpl_o +sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,nScalars,nElems,neigh )) =                      &
        &                                   f00N*cmpl_o -sum9_1 +sum9_2

      strainRate(3) = f001 + f00N - sum9_2*inv_omg_2

      ! s1 = (W+E + (SW+NE + (SE+NW + (BW+TE + (BE+TW
      strainRate(1) =   strainRate(1) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BW_TE + fNeq_BE_TW

      ! s2 = (S+N + (NE+SW + (NW+SE + (BS+TN + (BN+TS
      strainRate(2) =   strainRate(2) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BS_TN + fNeq_BN_TS

      ! s3 = (B+T + (BS+TN + (BN+TS + (BW+TE + (BE+TW
      strainRate(3) =   strainRate(3) &
        &             + fNeq_BS_TN + fNeq_BN_TS + fNeq_BW_TE + fNeq_BE_TW

      ! s4 = (SW+NE - (NW+SE
      strainRate(4) = fNeq_NE_SW - fNeq_NW_SE

      ! s5 = (BS+TN - (TS+BN
      strainRate(5) = fNeq_BS_TN - fNeq_BN_TS

      ! s6 = (BW+TE - (BE+TW
      strainRate(6) = fNeq_BW_TE - fNeq_BE_TW

      ! convert strain rate to physics unit
      strainRate = strainRate * coeffSR

      ! Calculate physical shear rate
      shearRate = 2._rk * sqrt( sum( strainRate * strainRate ) )

      ! calculate dynamic visc by Carreau-Yasuda model
      ! this equation could be simplified to k0^2/shear + k1^2 + 2k1/sqrt(shear)
      t = ( k0 + k1 * sqrt(shearRate) )
      nuLB = t * t / shearRate
      ! convert to kinematic visc
      nuLB = nuLB / params%physics%rho0 / convVisc

      ! calc omage and save state vector
outState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) ) = 1._rk / ( 0.5_rk + 3._rk*nuLB )

    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_nNwtn_CS
! ****************************************************************************** !

! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK for
  !! incompressible lbm model
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_incomp_nNwtn_CY )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho0 ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    real(kind=rk) :: fEq000
    ! derived constants
    real(kind=rk) usqn, usqn_o1, usqn_o2
    real(kind=rk) omega_2, cmpl_o, omega, inv_omg_2, coeffSR
    real(kind=rk) coeff_1, coeff_2
    real(kind=rk) ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,   &
      &           fac_13
    real(kind=rk) sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,      &
      &           sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1, sum11_2,  &
      &           sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------
    ! parameters for non-Newtonian
    integer :: nScalars, omegaStatePos
    real(kind=rk) :: fNeq_NE_SW, fNeq_NW_SE, fNeq_BW_TE, &
      &              fNeq_BE_TW, fNeq_BS_TN, fNeq_BN_TS
    real(kind=rk) :: strainRate(6)
    real(kind=rk) :: shearRate ! second invariant of strainRate
    real(kind=rk) :: nuLB, t
    real(kind=rk) :: convSR
    real(kind=rk) :: convVisc
    real(kind=rk) :: a, n_1_a, lambda, v0_vInf, vInf ! parameters for CY model
    ! ---------------------------------------------------------------------------

    ! omega pos in state
    omegaStatePos = QQ + 1

    nScalars = varSys%nScalars

    ! conversion factor for Shear Rate (or Strain Rate)
    convSR = params%physics%fac(level)%strainRate

    ! conversion factor for kinematic viscosity
    convVisc = params%physics%fac(level)%visc

    ! get nonNewtonian power law parameter
    a       = fieldProp(1)%fluid%nNwtn%CY%a
    n_1_a   = fieldProp(1)%fluid%nNwtn%CY%n_1_a
    lambda  = fieldProp(1)%fluid%nNwtn%CY%lambda * 2.0_rk
    vInf    = fieldProp(1)%fluid%nNwtn%CY%viscInf
    v0_vInf = fieldProp(1)%fluid%nNwtn%CY%visc0 - vInf

    vInf = vInf / params%physics%rho0 / convVisc
    v0_vInf = v0_vInf / params%physics%rho0 / convVisc

    ! inverse local density
    inv_rho0 = 1._rk / rho0

!$omp do schedule(static)

    !NEC$ ivdep
?? IF (SOA) THEN
?? copy :: dir_vector
?? ELSE
?? copy :: dir_novec
?? END IF
    nodeloop: do iElem = 1, nSolve
      ! read the relaxation parameter omega for the current element
      omega   = inState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) )
      inv_omg_2 = 2._rk / omega
      coeffSR = -1.5_rk * omega * convSR * inv_rho0

      ! pre-calculate partial collision constants
      omega_2 = 2._rk * omega
      cmpl_o  = 1._rk - omega

      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

?? copy :: ux_d3q19( u_x, f, inv_rho0 )
?? copy :: uy_d3q19( u_y, f, inv_rho0 )
?? copy :: uz_d3q19( u_z, f, inv_rho0 )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      usqn = div1_36 * (rho - 1.5_rk * usq * rho0 )
      fEq000 = rho*div1_3 - 0.5_rk*usq*rho0

      outState(?SAVE?(q000,1,iElem,QQ,nScalars,nElems,neigh)) = f000*cmpl_o + omega*fEq000

      coeff_1 = div1_8 * omega * rho0

      usqn_o1 = omega * usqn

      ui1     =  u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,nScalars,nElems,neigh))=f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,nScalars,nElems,neigh))=fNN0 *cmpl_o-sum1_1 +sum1_2

      fNeq_NE_SW = f110 + fNN0 - sum1_2*inv_omg_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,nScalars,nElems,neigh))=fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,nScalars,nElems,neigh))=f1N0 *cmpl_o-sum3_1 +sum3_2

      fNeq_NW_SE = fN10 + f1N0 - sum3_2*inv_omg_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,nScalars,nElems,neigh))=f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,nScalars,nElems,neigh))=fN0N*cmpl_o-sum10_1+sum10_2

      fNeq_BW_TE = fN0N + f101 - sum10_2*inv_omg_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,nScalars,nElems,neigh))=fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,nScalars,nElems,neigh))=f10N*cmpl_o-sum12_1+sum12_2

      fNeq_BE_TW = f10N + fN01 - sum12_2*inv_omg_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,nScalars,nElems,neigh))=f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,nScalars,nElems,neigh))=f0NN*cmpl_o-sum11_1+sum11_2

      fNeq_BS_TN = f0NN + f011 - sum11_2*inv_omg_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,nScalars,nElems,neigh))=f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,nScalars,nElems,neigh))=f01N*cmpl_o-sum13_1+sum13_2

      fNeq_BN_TS = f01N + f0N1 - sum13_2*inv_omg_2

      coeff_2 = div1_8 * omega_2 * rho0
      usqn_o2 = omega_2 * usqn

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,nScalars,nElems,neigh ))=f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,nScalars,nElems,neigh ))=f0N0 *cmpl_o-sum2_1 +sum2_2

      strainRate(2) = f010 + f0N0 - sum2_2*inv_omg_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,nScalars,nElems,neigh ))=fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,nScalars,nElems,neigh ))=f100 *cmpl_o+sum4_1 +sum4_2

      strainRate(1) = fN00 + f100 - sum4_2*inv_omg_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,nScalars,nElems,neigh ))=f001 *cmpl_o+sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,nScalars,nElems,neigh ))=f00N *cmpl_o-sum9_1 +sum9_2

      strainRate(3) = f001 + f00N - sum9_2*inv_omg_2

      ! s1 = (W+E + (SW+NE + (SE+NW + (BW+TE + (BE+TW
      strainRate(1) =   strainRate(1) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BW_TE + fNeq_BE_TW

      ! s2 = (S+N + (NE+SW + (NW+SE + (BS+TN + (BN+TS
      strainRate(2) =   strainRate(2) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BS_TN + fNeq_BN_TS

      ! s3 = (B+T + (BS+TN + (BN+TS + (BW+TE + (BE+TW
      strainRate(3) =   strainRate(3) &
        &             + fNeq_BS_TN + fNeq_BN_TS + fNeq_BW_TE + fNeq_BE_TW

      ! s4 = (SW+NE - (NW+SE
      strainRate(4) = fNeq_NE_SW - fNeq_NW_SE

      ! s5 = (BS+TN - (TS+BN
      strainRate(5) = fNeq_BS_TN - fNeq_BN_TS

      ! s6 = (BW+TE - (BE+TW
      strainRate(6) = fNeq_BW_TE - fNeq_BE_TW

      strainRate = strainRate * coeffSR

      ! Calculate physical shear rate
      shearRate = sqrt( sum( strainRate * strainRate ) )

      ! calculate dynamic visc by Carreau-Yasuda model
      ! convert to kinematic visc
      t = ( 1._rk + (lambda*shearRate) ** a ) ** n_1_a
      nuLB = vInf + v0_vInf * t

      ! calc omage and save state vector
outState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) ) = 1._rk / ( 0.5_rk + 3._rk*nuLB )

    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_incomp_nNwtn_CY
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK for
  !! incompressible lbm model
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_incomp_nNwtn_PL )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho0 ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    real(kind=rk) :: fEq000
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2, inv_omg_2, coeffSR
    real(kind=rk) :: omega_2, cmpl_o, omega
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &           sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1, sum11_2,  &
      &           sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------
    ! parameters for non-Newtonian
    integer :: nScalars, omegaStatePos
    real(kind=rk) :: fNeq_NE_SW, fNeq_NW_SE, fNeq_BW_TE, &
      &              fNeq_BE_TW, fNeq_BS_TN, fNeq_BN_TS
    real(kind=rk) :: strainRate(6)
    real(kind=rk) :: shearRate ! second invariant of strainRate
    real(kind=rk) :: nuLB
    real(kind=rk) :: convSR
    real(kind=rk) :: convVisc
    real(kind=rk) :: k, n_1 ! parameters for power-law
    ! ---------------------------------------------------------------------------

    ! omega pos in state
    omegaStatePos = QQ + 1

    nScalars = varSys%nScalars

    ! conversion factor for Shear Rate (or Strain Rate)
    convSR = params%physics%fac(level)%strainRate

    ! conversion factor for kinematic viscosity
    convVisc = params%physics%fac(level)%visc

    ! get nonNewtonian power law parameter
    k   = fieldProp(1)%fluid%nNwtn%PL%k / params%physics%rho0 / convVisc
    n_1 = fieldProp(1)%fluid%nNwtn%PL%n_1


    ! inverse local density
    inv_rho0 = 1._rk / rho0

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve
      ! read the relaxation parameter omega for the current element
      omega   = inState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) )
      inv_omg_2 = 2._rk / omega
      coeffSR = -1.5_rk * omega * convSR * inv_rho0

      ! pre-calculate partial collision constants
      omega_2 = 2._rk * omega
      cmpl_o  = 1._rk - omega

      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

?? copy :: ux_d3q19( u_x, f, inv_rho0 )
?? copy :: uy_d3q19( u_y, f, inv_rho0 )
?? copy :: uz_d3q19( u_z, f, inv_rho0 )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      usqn = div1_36 * (rho - 1.5_rk * usq * rho0 )
      fEq000 = rho*div1_3 - 0.5_rk*usq*rho0

      outState(?SAVE?(q000,1,iElem,QQ,nScalars,nElems,neigh)) = f000*cmpl_o + omega*fEq000

      coeff_1 = div1_8 * omega * rho0

      usqn_o1 = omega * usqn

      ui1     =  u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,nScalars,nElems,neigh))=f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,nScalars,nElems,neigh))=fNN0 *cmpl_o-sum1_1 +sum1_2

      fNeq_NE_SW = f110 + fNN0 - sum1_2*inv_omg_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,nScalars,nElems,neigh))=fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,nScalars,nElems,neigh))=f1N0 *cmpl_o-sum3_1 +sum3_2

      fNeq_NW_SE = fN10 + f1N0 - sum3_2*inv_omg_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,nScalars,nElems,neigh))=f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,nScalars,nElems,neigh))=fN0N*cmpl_o-sum10_1+sum10_2

      fNeq_BW_TE = fN0N + f101 - sum10_2*inv_omg_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,nScalars,nElems,neigh))=fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,nScalars,nElems,neigh))=f10N*cmpl_o-sum12_1+sum12_2

      fNeq_BE_TW = f10N + fN01 - sum12_2*inv_omg_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,nScalars,nElems,neigh))=f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,nScalars,nElems,neigh))=f0NN*cmpl_o-sum11_1+sum11_2

      fNeq_BS_TN = f0NN + f011 - sum11_2*inv_omg_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,nScalars,nElems,neigh))=f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,nScalars,nElems,neigh))=f01N*cmpl_o-sum13_1+sum13_2

      fNeq_BN_TS = f01N + f0N1 - sum13_2*inv_omg_2

      coeff_2 = div1_8 * omega_2 * rho0
      usqn_o2 = omega_2 * usqn

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,nScalars,nElems,neigh ))=f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,nScalars,nElems,neigh ))=f0N0 *cmpl_o-sum2_1 +sum2_2

      strainRate(2) = f010 + f0N0 - sum2_2*inv_omg_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,nScalars,nElems,neigh ))=fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,nScalars,nElems,neigh ))=f100 *cmpl_o+sum4_1 +sum4_2

      strainRate(1) = fN00 + f100 - sum4_2*inv_omg_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,nScalars,nElems,neigh ))=f001 *cmpl_o+sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,nScalars,nElems,neigh ))=f00N *cmpl_o-sum9_1 +sum9_2

      strainRate(3) = f001 + f00N - sum9_2*inv_omg_2

      ! s1 = (W+E + (SW+NE + (SE+NW + (BW+TE + (BE+TW
      strainRate(1) =   strainRate(1) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BW_TE + fNeq_BE_TW

      ! s2 = (S+N + (NE+SW + (NW+SE + (BS+TN + (BN+TS
      strainRate(2) =   strainRate(2) &
        &             + fNeq_NE_SW + fNeq_NW_SE + fNeq_BS_TN + fNeq_BN_TS

      ! s3 = (B+T + (BS+TN + (BN+TS + (BW+TE + (BE+TW
      strainRate(3) =   strainRate(3) &
        &             + fNeq_BS_TN + fNeq_BN_TS + fNeq_BW_TE + fNeq_BE_TW

      ! s4 = (SW+NE - (NW+SE
      strainRate(4) = fNeq_NE_SW - fNeq_NW_SE

      ! s5 = (BS+TN - (TS+BN
      strainRate(5) = fNeq_BS_TN - fNeq_BN_TS

      ! s6 = (BW+TE - (BE+TW
      strainRate(6) = fNeq_BW_TE - fNeq_BE_TW

      strainRate = strainRate * coeffSR

      ! Calculate physical shear rate
      shearRate = 2._rk * sqrt( sum( strainRate * strainRate ) )

      ! the exponential operation is quite expensive.
      ! it decreases performance about 3-4 MLUPS on SHU
      ! calculate kinematic visc by POWER-LAW model
      ! from paper: LBMs for non-newtonian flows, page 803
      ! k is actually k / rho0 / convVisc
      nuLB = ( shearRate ** n_1 ) * k

      ! calc omage and save state vector
outState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) ) = 1._rk / ( 0.5_rk + 3._rk*nuLB )

    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_incomp_nNwtn_PL
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK for
  !! incompressible lbm model
  !!
?? copy :: compute_routineHeader( bgk_advRel_d3q19_incomp_nNwtn )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    ! derived constants
    real(kind=rk) :: usqn_o1, usqn_o2
    real(kind=rk) :: omega_2, cmpl_o, omega
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &           sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1, sum11_2,  &
      &           sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------
    ! parameters for non-Newtonian
    integer :: nScalars, omegaStatePos
    ! ---------------------------------------------------------------------------

    ! omega pos in state
    omegaStatePos = QQ + 1

    nScalars = varSys%nScalars
    ! write(*, "(A,I0)") 'nScalars: ', nScalars
    ! write(*, "(A,F10.4)") 'omega: ', inState( ?IDX?(omegaStatePos, 10, nScalars, nElems) )

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve

      ! read the relaxation parameter omega for the current element
      omega = inState( ?IDX?(omegaStatePos, iElem, nScalars, nElems) )
      ! write(*, "(A,F10.4)") 'omega: ', omega
      ! inv_omg_2 = 2._rk / omega

      ! pre-calculate partial collision constants
      omega_2 = 2._rk * omega
      cmpl_o  = 1._rk - omega

      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! local density
?? copy :: rho_d3q19( rho, f )

?? copy :: ux_d3q19( u_x, f, 1._rk )
?? copy :: uy_d3q19( u_y, f, 1._rk )
?? copy :: uz_d3q19( u_z, f, 1._rk )

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      ! usqn = div1_36 * (rho - 1.5_rk * usq * rho0 )
      usqn_o1 = omega * div1_36 * ( rho - 1.5d0 * usq )

      outState(?SAVE?(q000,1,iElem,QQ,nScalars,nElems,neigh))=f000*cmpl_o + 12._rk*usqn_o1

      coeff_1 = div1_8 * omega

      ui1     =  u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,nScalars,nElems,neigh))=f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,nScalars,nElems,neigh))=fNN0 *cmpl_o-sum1_1 +sum1_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,nScalars,nElems,neigh))=fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,nScalars,nElems,neigh))=f1N0 *cmpl_o-sum3_1 +sum3_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,nScalars,nElems,neigh))=f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,nScalars,nElems,neigh))=fN0N*cmpl_o-sum10_1+sum10_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,nScalars,nElems,neigh))=fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,nScalars,nElems,neigh))=f10N*cmpl_o-sum12_1+sum12_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,nScalars,nElems,neigh))=f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,nScalars,nElems,neigh))=f0NN*cmpl_o-sum11_1+sum11_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,nScalars,nElems,neigh))=f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,nScalars,nElems,neigh))=f01N*cmpl_o-sum13_1+sum13_2

      coeff_2 = div1_8 * omega_2
      usqn_o2 = 2d0 * usqn_o1

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,nScalars,nElems,neigh ))=f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,nScalars,nElems,neigh ))=f0N0 *cmpl_o-sum2_1 +sum2_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,nScalars,nElems,neigh ))=fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,nScalars,nElems,neigh ))=f100 *cmpl_o+sum4_1 +sum4_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,nScalars,nElems,neigh ))=f001 *cmpl_o+sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,nScalars,nElems,neigh ))=f00N *cmpl_o-sum9_1 +sum9_2

    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_incomp_nNwtn
! ****************************************************************************** !

?? IF (SOA) THEN
! ****************************************************************************** !
  subroutine obscure_setzero(ii)
    integer :: ii
    ii = 0
  end subroutine obscure_setzero
! ****************************************************************************** !
?? ENDIF

! ****************************************************************************** !
?? copy :: compute_routineHeader( bgk_d3q19_block )
?? copy :: compute_routineParams
    ! ---------------------------------------------------------------------------
    integer :: iElem, nScalars, minElem, maxElem
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: inv_rho ! inverse local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2
    real(kind=rk) :: omega_2, cmpl_o, omega
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &              sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1,        &
      &              sum11_2, sum12_1, sum12_2, sum13_1, sum13_2
    ! ---------------------------------------------------------------------------

    nScalars = varSys%nScalars
    ! read the relaxation parameter omega for the current level
    omega   = fieldProp(1)%fluid%omLvl( level )
    ! pre-calculate partial collision constants
    omega_2 = 2._rk * omega
    cmpl_o  = 1._rk - omega

!$omp do schedule(static)
    do minElem = 1, nSolve, vlen

      maxElem = min( minElem + vlen - 1, nSolve )

      !NEC$ ivdep
      !NEC$ shortloop
      do iElem = minElem, maxElem
        ! First load all local values into temp array
        fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
        f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
        f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
        f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
        f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
        f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
        f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
        f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
        f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
        f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
        fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
        f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
        fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
        f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
        fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
        fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
        f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
        f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
        f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

        ! local density
?? copy :: rho_d3q19( rho, f )

        ! inverse local density
        inv_rho = 1._rk / rho

?? copy :: ux_d3q19( u_x, f, inv_rho )
?? copy :: uy_d3q19( u_y, f, inv_rho )
?? copy :: uz_d3q19( u_z, f, inv_rho )

        ! square velocity and derived constants
        usq  = u_x*u_x + u_y*u_y + u_z*u_z
        usqn = div1_36 * (1._rk - 1.5_rk * usq) * rho

        ! f = (1-w) * f + w * fEq
        outState(?SAVE?(q000,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                     f000*cmpl_o+omega*rho*(div1_3-0.5_rk*usq)

        coeff_1 = div1_8 * omega * rho

        usqn_o1 = omega * usqn

        ui1     =  u_x + u_y
        fac_1   = coeff_1 * ui1
        sum1_1  = fac_1 * div3_4h
        sum1_2  = fac_1 * ui1 + usqn_o1

        outState(?SAVE?(q110,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f110 *cmpl_o+sum1_1 +sum1_2
        outState(?SAVE?(qNN0,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   fNN0 *cmpl_o-sum1_1 +sum1_2

        ui3     = -u_x + u_y
        fac_3   = coeff_1 * ui3
        sum3_1  = fac_3 * div3_4h
        sum3_2  = fac_3 * ui3 + usqn_o1

        outState(?SAVE?(qN10,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   fN10 *cmpl_o+sum3_1 +sum3_2
        outState(?SAVE?(q1N0,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f1N0 *cmpl_o-sum3_1 +sum3_2

        ui10    =  u_x + u_z
        fac_10  = coeff_1 * ui10
        sum10_1 = fac_10 * div3_4h
        sum10_2 = fac_10 * ui10 + usqn_o1

        outState(?SAVE?(q101,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f101*cmpl_o+sum10_1+sum10_2
        outState(?SAVE?(qN0N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   fN0N*cmpl_o-sum10_1+sum10_2

        ui12    = -u_x + u_z
        fac_12  = coeff_1 * ui12
        sum12_1 = fac_12 * div3_4h
        sum12_2 = fac_12 * ui12 + usqn_o1

        outState(?SAVE?(qN01,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   fN01*cmpl_o+sum12_1+sum12_2
        outState(?SAVE?(q10N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f10N*cmpl_o-sum12_1+sum12_2

        ui11    =  u_y + u_z
        fac_11  = coeff_1 * ui11
        sum11_1 = fac_11 * div3_4h
        sum11_2 = fac_11 * ui11 + usqn_o1

        outState(?SAVE?(q011,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f011*cmpl_o+sum11_1+sum11_2
        outState(?SAVE?(q0NN,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f0NN*cmpl_o-sum11_1+sum11_2

        ui13    = -u_y + u_z
        fac_13  = coeff_1 * ui13
        sum13_1 = fac_13 * div3_4h
        sum13_2 = fac_13 * ui13 + usqn_o1

        outState(?SAVE?(q0N1,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f0N1*cmpl_o+sum13_1+sum13_2
        outState(?SAVE?(q01N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
          &                                   f01N*cmpl_o-sum13_1+sum13_2

        coeff_2 = div1_8 * omega_2 * rho
        usqn_o2 = omega_2 * usqn

        fac_2   = coeff_2 * u_y
        sum2_1  = fac_2 * div3_4h
        sum2_2  = fac_2 * u_y + usqn_o2

        outState(?SAVE?(q010,1, iElem, QQ, nScalars,nElems,neigh )) =            &
          &                                   f010 *cmpl_o+sum2_1 +sum2_2
        outState(?SAVE?(q0N0,1, iElem, QQ, nScalars,nElems,neigh )) =            &
          &                                   f0N0 *cmpl_o-sum2_1 +sum2_2

        fac_4   = coeff_2 * u_x
        sum4_1  = fac_4 * div3_4h
        sum4_2  = fac_4 * u_x + usqn_o2

        outState(?SAVE?(qN00,1, iElem, QQ, nScalars,nElems,neigh )) =            &
          &                                   fN00 *cmpl_o-sum4_1 +sum4_2
        outState(?SAVE?(q100,1, iElem, QQ, nScalars,nElems,neigh )) =            &
          &                                   f100 *cmpl_o+sum4_1 +sum4_2

        fac_9   = coeff_2 * u_z
        sum9_1  = fac_9 * div3_4h
        sum9_2  = fac_9 * u_z + usqn_o2

        outState(?SAVE?(q001,1, iElem, QQ, nScalars,nElems,neigh )) =            &
          &                                   f001 *cmpl_o+sum9_1 +sum9_2
        outState(?SAVE?(q00N,1, iElem, QQ, nScalars,nElems,neigh )) =            &
          &                                   f00N *cmpl_o-sum9_1 +sum9_2

      end do
    end do
!$omp end do nowait

  end subroutine bgk_d3q19_block
! ****************************************************************************** !

end module mus_d3q19_module
! ****************************************************************************** !
