/*
 * Every kernel in this file works on `double` data, so double-precision
 * support has to be switched on first: use cl_khr_fp64 when that macro is
 * defined, otherwise cl_amd_fp64, and stop the build with an error when
 * neither is available.
 */
#ifdef cl_khr_fp64
  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
  #error Double precision floating point not supported by OpenCL implementation.
#endif




/*
 * Compile-time parameters shared by the kernels in this file.
 *
 * They are declared in the __constant address space because OpenCL C 1.x
 * requires every program-scope variable to live there; a plain `const`
 * declaration is rejected by conforming 1.x compilers.
 */
__constant double alpha = 1.8; /* factor scaling each update in pressure0 / pressure1 */

__constant int WX = 64;  /* grid width: row stride and modulus for x indices */
__constant int WY = 64;  /* grid height: modulus for y indices */
__constant int ipn = 8;  /* particle-coordinate units per grid cell (see ryuusi, out, syokise) */




/*
 * newgrad: for the cell selected by the 2-D global id, add to GX and GY half
 * the difference between the two periodic neighbours of (yn - y), taken along
 * x for GX and along y for GY.
 *
 *   yn, y  : input fields, indexed as col + row * WX
 *   GX, GY : accumulators updated in place
 */
__kernel void newgrad(__global double *yn, __global double *y, __global double *GX, __global double *GY) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    /* Periodic neighbour columns. */
    const int colPrev = (col - 1 + WX) % WX;
    const int colNext = (col + 1) % WX;

    /* Row start offsets for this row and its periodic neighbour rows. */
    const int rowBase = row * WX;
    const int rowPrev = ((row - 1 + WY) % WY) * WX;
    const int rowNext = ((row + 1) % WY) * WX;

    const int ctr = col + rowBase;

    GX[ctr] += (yn[colNext + rowBase] - yn[colPrev + rowBase]
                - y[colNext + rowBase] + y[colPrev + rowBase]) * 0.5;
    GY[ctr] += (yn[col + rowNext] - yn[col + rowPrev]
                - y[col + rowNext] + y[col + rowPrev]) * 0.5;
}


/*
 * dcip0: advance one cell of a field and its two gradient components.
 *
 * For the cell selected by the 2-D global id the kernel
 *   1. forms the displacement (xx, yy) = (-u * DT, -v * DT) at the cell,
 *   2. picks the neighbour column / row lying on the side the displacement
 *      points to (periodic wrap; no neighbour offset when the displacement
 *      component is exactly zero),
 *   3. builds cubic polynomial coefficients from Yd, GXd and GYd at the cell
 *      and those neighbours, and evaluates the polynomial and its two partial
 *      derivatives at (xx, yy),
 *   4. stores the value in fn and the derivatives, corrected by a term built
 *      from central differences of u and v, in gxn / gyn.
 *
 * NOTE(review): this looks like a CIP-type semi-Lagrangian advection step;
 * confirm against the host code / reference formulation.
 *
 *   fn, gxn, gyn  : outputs (value, x-gradient, y-gradient)
 *   u, v          : velocity components
 *   GXd, GYd, Yd  : input gradients and value
 *   DT            : time step
 */
__kernel void dcip0(__global double *fn,__global double *gxn,__global double *gyn,__global double *u,__global double *v,__global double *GXd,__global double *GYd,__global double *Yd,double DT) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const int rowBase = row * WX;
    const int ctr = col + rowBase;

    /* Displacement from the cell centre to the point being sampled. */
    const double xx = -u[ctr] * DT;
    const double yy = -v[ctr] * DT;

    /* Sign selectors: 0 for a zero displacement, -1 for positive, +1 otherwise. */
    const int isn = (xx == 0.0) ? 0 : ((xx > 0.0) ? -1 : 1);
    const int jsn = (yy == 0.0) ? 0 : ((yy > 0.0) ? -1 : 1);
    const double sx = (double)isn;
    const double sy = (double)jsn;

    /* Neighbour column / row on the side of the displacement (periodic). */
    const int colUp = (col - isn + WX) % WX;
    const int rowUpBase = ((row - jsn + WY) % WY) * WX;

    const int upX  = colUp + rowBase;    /* neighbour in x, same row    */
    const int upY  = col + rowUpBase;    /* same column, neighbour in y */
    const int upXY = colUp + rowUpBase;  /* diagonal neighbour          */

    /* Polynomial coefficients. */
    const double a1 = GXd[upX] + GXd[ctr] - 2.0 * sx * (Yd[ctr] - Yd[upX]);
    const double b1 = GYd[upY] + GYd[ctr] - 2.0 * sy * (Yd[ctr] - Yd[upY]);
    const double e1 = 3.0 * (Yd[upX] - Yd[ctr]) + (GXd[upX] + 2.0 * GXd[ctr]) * sx;
    const double f1 = 3.0 * (Yd[upY] - Yd[ctr]) + (GYd[upY] + 2.0 * GYd[ctr]) * sy;
    const double tmp = Yd[ctr] - Yd[upY] - Yd[upX] + Yd[upXY];
    const double tmq = GYd[upX] - GYd[ctr];
    const double d1 = (-tmp - tmq * sy) * sx;
    const double c1 = (-tmp - (GXd[upY] - GXd[ctr]) * sx) * sy;
    const double g1 = (c1 - tmq) * sx;

    /* Interpolated value at the displaced point. */
    fn[ctr] = ((a1 * xx + c1 * yy + e1) * xx + g1 * yy + GXd[ctr]) * xx
            + ((b1 * yy + d1 * xx + f1) * yy + GYd[ctr]) * yy + Yd[ctr];

    /* Partial derivatives of the same polynomial at the displaced point. */
    const double gxAdv = (3.0 * a1 * xx + 2.0 * (c1 * yy + e1)) * xx + (d1 * yy + g1) * yy + GXd[ctr];
    const double gyAdv = (3.0 * b1 * yy + 2.0 * (d1 * xx + f1)) * yy + (c1 * xx + g1) * xx + GYd[ctr];

    /* Periodic neighbours of the cell itself, used for velocity differences. */
    const int xPrev = (col + WX - 1) % WX + row * WX;
    const int xNext = (col + 1) % WX + row * WX;
    const int yPrev = ((row + WY - 1) % WY) * WX + col;
    const int yNext = ((row + 1) % WY) * WX + col;

    gxn[ctr] = gxAdv - 0.5 * DT * (gxAdv * (u[xNext] - u[xPrev]) + gyAdv * (v[xNext] - v[xPrev]));
    gyn[ctr] = gyAdv - 0.5 * DT * (gxAdv * (u[yNext] - u[yPrev]) + gyAdv * (v[yNext] - v[yPrev]));
}


/*
 * pressure0: in-place relaxation update of YPN restricted to cells whose
 * (col + row) is even; all other work-items return without touching memory.
 *
 * The new value moves YPN towards -0.25 * (DIV - sum of the four periodic
 * neighbours), with the correction scaled by `alpha`.
 * NOTE(review): presumably the "even" half of a red-black SOR sweep paired
 * with pressure1 - confirm with the host-side call sequence.
 */
__kernel void pressure0(__global double *DIV,__global double *YPN) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (((col + row) % 2) != 0) {
        return;
    }

    const int colPrev = (col + WX - 1) % WX;
    const int colNext = (col + 1) % WX;
    const int rowPrev = (row + WY - 1) % WY;
    const int rowNext = (row + 1) % WY;
    const int rowBase = row * WX;
    const int ctr = col + rowBase;

    const double source = DIV[ctr];

    YPN[ctr] += (-0.25 * (source
                          - YPN[colPrev + rowBase]
                          - YPN[colNext + rowBase]
                          - YPN[col + WX * rowPrev]
                          - YPN[col + WX * rowNext])
                 - YPN[ctr]) * alpha;
}


/*
 * pressure1: same in-place relaxation update as pressure0, but applied only
 * to cells whose (col + row) % 2 equals 1; all other work-items return
 * without touching memory.
 *
 *   DIV : source term, read at the cell
 *   YPN : field updated in place from its four periodic neighbours
 */
__kernel void pressure1(__global double *DIV,__global double *YPN) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (((col + row) % 2) != 1) {
        return;
    }

    const int colPrev = (col + WX - 1) % WX;
    const int colNext = (col + 1) % WX;
    const int rowPrev = (row + WY - 1) % WY;
    const int rowNext = (row + 1) % WY;
    const int rowBase = row * WX;
    const int ctr = col + rowBase;

    const double source = DIV[ctr];

    YPN[ctr] += (-0.25 * (source
                          - YPN[colPrev + rowBase]
                          - YPN[colNext + rowBase]
                          - YPN[col + rowPrev * WX]
                          - YPN[col + rowNext * WX])
                 - YPN[ctr]) * alpha;
}



/*
 * div: store in DIV the sum of the forward differences of YU along x and of
 * YV along y at the cell selected by the 2-D global id, multiplied by rDT.
 * Neighbours wrap periodically.
 *
 *   DIV    : output
 *   YU, YV : input fields
 *   rDT    : scale factor (NOTE(review): the name suggests 1/DT - confirm)
 */
__kernel void div(__global double *DIV, __global double *YU, __global double *YV, double rDT) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    const int colNext = (col + 1) % WX;
    const int rowBase = row * WX;
    const int rowNext = ((row + 1) % WY) * WX;
    const int ctr = col + rowBase;

    DIV[ctr] = (YU[colNext + rowBase] - YU[ctr] + YV[col + rowNext] - YV[ctr]) * rDT;
}


/*
 * narasi: subtract the scalar yp00 from the element of YP addressed by the
 * 2-D global id (index = id0 + id1 * WX).
 */
__kernel void narasi(__global double *YP, double yp00) {
    const int cell = get_global_id(0) + get_global_id(1) * WX;

    YP[cell] = YP[cell] - yp00;
}



/*
 * rhs: subtract DT times the backward difference of YPN from YUN (difference
 * along x) and from YVN (difference along y) at the cell selected by the 2-D
 * global id. Neighbours wrap periodically.
 *
 *   YUN, YVN : fields updated in place
 *   YPN      : field whose differences are applied
 *   DT       : time step
 */
__kernel void rhs(__global double *YUN,__global double *YVN,__global double *YPN,double DT) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    const int colPrev = (col - 1 + WX) % WX;
    const int rowBase = row * WX;
    const int rowPrev = ((row - 1 + WY) % WY) * WX;
    const int ctr = col + rowBase;

    YUN[ctr] -= (YPN[ctr] - YPN[colPrev + rowBase]) * DT;
    YVN[ctr] -= (YPN[ctr] - YPN[col + rowPrev]) * DT;
}


/*
 * veloc: write four averaged fields at the cell selected by the 2-D global
 * id (periodic neighbours):
 *
 *   YVU : mean of YV over the cell, its next row, the previous column, and
 *         previous column / next row
 *   YUV : mean of YU over the cell, its next column, the previous row, and
 *         next column / previous row
 *   YVT : mean of YV over the cell and its next row
 *   YUT : mean of YU over the cell and its next column
 *
 * NOTE(review): the stencils look like staggered-grid interpolation of each
 * velocity component to other sample locations - confirm the grid layout.
 */
__kernel void veloc(__global double *YU,__global double *YV,__global double *YVU, __global double *YUV ,__global double *YVT, __global double *YUT ) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    const int colPrev = (col - 1 + WX) % WX;
    const int colNext = (col + 1) % WX;

    const int rowBase = row * WX;
    const int rowPrev = ((row - 1 + WY) % WY) * WX;
    const int rowNext = ((row + 1) % WY) * WX;

    const int ctr = col + rowBase;

    YVU[ctr] = 0.25 * (YV[ctr] + YV[col + rowNext] + YV[colPrev + rowBase] + YV[colPrev + rowNext]);
    YUV[ctr] = 0.25 * (YU[ctr] + YU[colNext + rowBase] + YU[col + rowPrev] + YU[colNext + rowPrev]);
    YVT[ctr] = 0.5 * (YV[ctr] + YV[col + rowNext]);
    YUT[ctr] = 0.5 * (YU[ctr] + YU[colNext + rowBase]);
}


/*
 * copyy: copy one element from BBB to AAA; the element index is
 * id0 + id1 * WX for the work-item's 2-D global id.
 */
__kernel void copyy(__global double *AAA,__global double *BBB) {
    const int cell = get_global_id(0) + get_global_id(1) * WX;
    const double value = BBB[cell];

    AAA[cell] = value;
}



/*
 * out: mark one particle in a 3-bytes-per-pixel image.
 *
 * Work-item k reads the coordinate pair RYU[2k], RYU[2k+1], truncates both to
 * int, and addresses the pixel at column x of row (WY*ipn - y - 1) in an
 * image that is WX*ipn pixels wide (so the y axis is flipped). Each of the
 * three bytes of that pixel is then incremented by 128 plus successive base-128
 * digits of the counter 32*k.
 *
 * NOTE(review): several particles landing on the same pixel update it with a
 * plain, non-atomic read-modify-write.
 */
__kernel void out(__global uchar *outt,__global double *RYU) {
    const int pair = get_global_id(0) * 2;

    /* Implicit double -> int conversion truncates the particle coordinates. */
    const int px = (RYU[pair]);
    const int py = (RYU[pair + 1]);

    const int pixel = (px + (WY * ipn - py - 1) * WX * ipn) * 3;

    /* Counter derived from the particle slot; one base-128 digit per byte. */
    int tag = pair * 16;
    for (int channel = 0; channel < 3; ++channel) {
        outt[pixel + channel] += tag % 128 + 128;
        tag /= 128;
    }
}




/*
 * out0: clear one 32-bit word of the buffer, selected by the 1-D global id.
 */
__kernel void out0(__global uint *outt) {
    const size_t word = get_global_id(0);

    outt[word] = 0;
}




/*
 * nensei0: one relaxation sweep writing into GXd / GYd.
 *
 * At the cell selected by the 2-D global id:
 *   GXd = (YU + arufa * (sum of the four periodic neighbours of YUN)) * ar1fa
 *   GYd = (YV + arufa * (sum of the four periodic neighbours of YVN)) * ar1fa
 *
 * NOTE(review): together with nensei1 (which reads GXd/GYd and writes
 * YUN/YVN) this looks like a ping-pong Jacobi iteration for an implicit
 * viscosity step - confirm with the host code.
 */
__kernel void nensei0(__global double *YU,__global double *YUN,__global double *YV,__global double *YVN,__global double *GXd,__global double *GYd,double arufa,double ar1fa) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    const int colPrev = (col - 1 + WX) % WX;
    const int colNext = (col + 1) % WX;

    const int rowBase = row * WX;
    const int rowPrev = ((row - 1 + WY) % WY) * WX;
    const int rowNext = ((row + 1) % WY) * WX;

    const int ctr = col + rowBase;

    GXd[ctr] = (YU[ctr] + arufa * (YUN[colPrev + rowBase] + YUN[colNext + rowBase]
                                   + YUN[col + rowPrev] + YUN[col + rowNext])) * ar1fa;
    GYd[ctr] = (YV[ctr] + arufa * (YVN[colPrev + rowBase] + YVN[colNext + rowBase]
                                   + YVN[col + rowPrev] + YVN[col + rowNext])) * ar1fa;
}


/*
 * nensei1: one relaxation sweep writing into YUN / YVN.
 *
 * At the cell selected by the 2-D global id:
 *   YUN = (YU + arufa * (sum of the four periodic neighbours of GXd)) * ar1fa
 *   YVN = (YV + arufa * (sum of the four periodic neighbours of GYd)) * ar1fa
 *
 * This mirrors nensei0 with the roles of (YUN, YVN) and (GXd, GYd) swapped.
 */
__kernel void nensei1(__global double *YU,__global double *YUN,__global double *YV,__global double *YVN,__global double *GXd,__global double *GYd,double arufa,double ar1fa) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    const int colPrev = (col - 1 + WX) % WX;
    const int colNext = (col + 1) % WX;

    const int rowBase = row * WX;
    const int rowPrev = ((row - 1 + WY) % WY) * WX;
    const int rowNext = ((row + 1) % WY) * WX;

    const int ctr = col + rowBase;

    YUN[ctr] = (YU[ctr] + arufa * (GXd[colPrev + rowBase] + GXd[colNext + rowBase]
                                   + GXd[col + rowPrev] + GXd[col + rowNext])) * ar1fa;
    YVN[ctr] = (YV[ctr] + arufa * (GYd[colPrev + rowBase] + GYd[colNext + rowBase]
                                   + GYd[col + rowPrev] + GYd[col + rowNext])) * ar1fa;
}



/*
 * syokise: fill one (x, y) pair of the interleaved array AA.
 *
 * For 2-D global id (gx, gy) the pair lives at slot (gx + gy * WX * ipn) * 2
 * and is set to (1.0 * gx, 2.0 * gy).
 * NOTE(review): presumably the initial particle positions later moved by
 * ryuusi - confirm; the factor 2.0 on y is kept exactly as in the original.
 */
__kernel void syokise(__global double *AA){
    const uint gx = get_global_id(0);
    const uint gy = get_global_id(1);

    const uint slot = (gx + gy * WX * ipn) * 2;

    AA[slot]     = 1.0 * gx;
    AA[slot + 1] = 2.0 * gy;
}





/*
 * ryuusi: move one particle by the locally interpolated velocity.
 *
 * Work-item k owns the coordinate pair RYS[2k], RYS[2k+1], expressed in units
 * of 1/ipn grid cell. The velocity (YUN, YVN) is interpolated bilinearly from
 * the four surrounding grid nodes (periodic in both directions), multiplied
 * by DT * ipn, and added to the position. The result is wrapped back into
 * [0, ipn*WX) x [0, ipn*WY) and stored in place.
 *
 *   RYS                : interleaved particle coordinates, updated in place
 *   YUN, YVN           : velocity components on the WX x WY grid
 *   DT                 : time step
 *   GXU, GYU, GXV, GYV : accepted for interface compatibility; not read by
 *                        this implementation
 *
 * The wrap handles negative values first and the upper bound second. With the
 * opposite order a tiny negative coordinate plus the domain extent rounds to
 * exactly the extent, leaving the particle outside the half-open range and
 * producing an out-of-range cell index on the next step.
 */
__kernel void ryuusi(__global double *RYS,__global double *YUN,__global double *YVN,double DT,__global double *GXU,__global double *GYU,__global double *GXV,__global double *GYV){
    const uint di = get_global_id(0) * 2;
    double xx = RYS[di];
    double yy = RYS[di + 1];

    /* Containing grid cell and fractional offset inside it. */
    const int ixx = xx / ipn;
    int iyy = yy / ipn;
    const double sxx = xx / ipn - ixx;
    const double syy = yy / ipn - iyy;

    /* Periodic neighbours: next column, and row offsets of both rows. */
    const int im1 = (ixx + 1) % WX;
    const int jm1 = ((iyy + 1) % WY) * WX;
    iyy *= WX;

    /* Bilinear interpolation of each velocity component, scaled to particle units. */
    xx += (( (1.0 - sxx) * YUN[ixx + iyy] + sxx * YUN[im1 + iyy] ) * (1.0 - syy)
         + ( (1.0 - sxx) * YUN[ixx + jm1] + sxx * YUN[im1 + jm1] ) * syy) * DT * ipn;
    yy += (( (1.0 - sxx) * YVN[ixx + iyy] + sxx * YVN[im1 + iyy] ) * (1.0 - syy)
         + ( (1.0 - sxx) * YVN[ixx + jm1] + sxx * YVN[im1 + jm1] ) * syy) * DT * ipn;

    /* Periodic wrap into the half-open domain; negatives first (see header). */
    const double extentX = 1.0 * ipn * WX;
    const double extentY = 1.0 * ipn * WY;

    if (xx < 0.0) { xx += extentX; }
    if (yy < 0.0) { yy += extentY; }
    if (xx >= extentX) { xx -= extentX; }
    if (yy >= extentY) { yy -= extentY; }

    RYS[di] = xx;
    RYS[di + 1] = yy;
}