/* Every kernel in this file works on double; enable whichever fp64 extension
 * the OpenCL implementation advertises, or fail the build if neither exists. */
#ifdef cl_khr_fp64
  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
  #error Double precision floating point not supported by OpenCL implementation.
#endif



/* Factor applied to the pressure correction in pressure0/pressure1
 * (presumably an over-relaxation factor -- confirm). */
#define alpha (double)1.8
/* Grid extent in x; also the row stride of every field buffer (index = i + j*WX).
 * Used as the modulus for periodic wrap-around in x. */
#define WX (uint)128
/* Grid extent in y; used as the modulus for periodic wrap-around in y. */
#define WY (uint)128
/* Scale between grid cells and the particle/output-image coordinates:
 * particle positions and the image span WX*ipn by WY*ipn units. */
#define ipn (uint)4



/*
 * newgrad: accumulate into GX / GY half the centred difference of (yn - y)
 * in x and y respectively, on a periodic WX x WY grid.
 *
 *   yn, y : input fields (read only here); the update uses their difference
 *   GX,GY : accumulated in place
 *   kabe  : per-cell mask; the increment is multiplied by (kabe > 128), i.e.
 *           by 1 where the byte exceeds 128 and by 0 elsewhere
 *
 * One work-item per cell: global id 0 is the x index, global id 1 the y index.
 */
__kernel void newgrad(__global double *yn, __global double *y, __global double *GX, __global double *GY, __global uchar *kabe) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    /* Periodic neighbours in x (plain column indices). */
    int colPrev = (col - 1 + WX) % WX;
    int colNext = (col + 1) % WX;

    /* Periodic neighbours in y, turned into row offsets. */
    int rowPrev = (row - 1 + WY) % WY;
    int rowNext = (row + 1) % WY;
    int rowOff = row * WX;
    rowPrev *= WX;
    rowNext *= WX;

    int here = col + rowOff;

    /* x direction: d/dx of (yn - y), masked. */
    double diffX = yn[colNext + rowOff] - yn[colPrev + rowOff] - y[colNext + rowOff] + y[colPrev + rowOff];
    GX[here] += diffX * 0.5 * (kabe[here] > 128);

    /* y direction: d/dy of (yn - y), masked. */
    double diffY = yn[col + rowNext] - yn[col + rowPrev] - y[col + rowNext] + y[col + rowPrev];
    GY[here] += diffY * 0.5 * (kabe[here] > 128);
}


/*
 * dcip0: advect one scalar field together with its two stored derivative
 * fields by evaluating a bivariate cubic polynomial at the point displaced
 * by (-u*DT, -v*DT) from the cell.
 * NOTE(review): the name and the structure look like a CIP (constrained
 * interpolation profile) advection step -- confirm.
 *
 *   fn, gxn, gyn : outputs (new value, new x-derivative, new y-derivative);
 *                  written only where kabe is non-zero
 *   u, v         : velocity components used for the displacement and for the
 *                  derivative correction at the end
 *   GXd, GYd, Yd : inputs: stored x-derivative, y-derivative and value
 *   DT           : time step
 *   kabe         : per-cell byte mask; zero disables all three writes
 *
 * One work-item per cell: global id 0 is the x index, global id 1 the y index.
 * The grid is periodic with extents WX, WY.
 */
__kernel void dcip0(__global double *fn,__global double *gxn,__global double *gyn,__global double *u,__global double *v,__global double *GXd,__global double *GYd,__global double *Yd,double DT,__global uchar *kabe) {
int i = get_global_id(0);
int j = get_global_id(1);
int jwx=j*WX;
/* Polynomial coefficients; a1 and b1 are reused for several quantities below. */
double a1;
double b1;
double c1;
double d1;
double f1;
double e1;
double g1;

/* Displacement of the evaluation point relative to this cell. */
double xx=-u[i+jwx]*DT;
double yy=-v[i+jwx]*DT;

/* isn / jsn: 0 for zero displacement, -1 for positive, +1 for negative,
 * so (i - isn, j - jsn) is the neighbour lying on the side the
 * displacement points to. */
char isn;
char jsn;

if (xx==0.0){isn=0;}else{
if (xx>0.0){isn=-1;}else{isn=1;}}
if (yy==0.0){jsn=0;}else{
if (yy>0.0){jsn=-1;}else{jsn=1;}}

/* From here on the names hold flat indices:
 *   jwx = (i      , j      )   this cell
 *   im1 = (i - isn, j      )   x neighbour
 *   j   = (i      , j - jsn)   y neighbour
 *   jm1 = (i - isn, j - jsn)   diagonal neighbour
 * (all wrapped periodically). */
int im1=(i-isn+WX)%WX;
int jm1=((j-jsn+WY)%WY)*WX;
j=i+jm1;
jm1+=im1;
im1+=jwx;
jwx+=i;

	/* a1: mixed difference of Yd over the four cells; b1: x-difference of GYd.
	 * Both are temporaries feeding the cross-term coefficients c1, d1, g1. */
	a1=Yd[jwx]-Yd[j]-Yd[im1]+Yd[jm1];
	b1=GYd[im1]-GYd[jwx];
	d1=(-a1-b1*jsn)*isn;
	c1=(-a1-(GXd[j]-GXd[jwx])*isn)*jsn;
	g1=(c1-b1)*isn;
	/* a1, b1 are now redefined as the pure cubic coefficients in x and y;
	 * e1, f1 are the pure quadratic coefficients. */
	a1=GXd[im1]+GXd[jwx]-2.0*isn*(Yd[jwx]-Yd[im1]);
	b1=GYd[j  ]+GYd[jwx]-2.0*jsn*(Yd[jwx]-Yd[j  ]);
	e1=3.0*(Yd[im1]-Yd[jwx])+(GXd[im1]+2.0*GXd[jwx])*isn;
	f1=3.0*(Yd[j  ]-Yd[jwx])+(GYd[j  ]+2.0*GYd[jwx])*jsn;
	/* Pre-multiply by the displacement: a1 = A*xx, b1 = B*yy from here on. */
	a1*=xx;
	b1*=yy;

/* New value: the cubic polynomial evaluated at (xx, yy). */
if (kabe[jwx]){fn[jwx] =((a1+c1*yy+e1)*xx+g1*yy+GXd[jwx])*xx+((b1+d1*xx+f1)*yy+GYd[jwx])*yy+Yd[jwx];}
/* a1, b1 now become the partial derivatives of that same polynomial with
 * respect to xx and yy, evaluated at (xx, yy). */
a1=(3.0*a1+2.0*(c1*yy+e1))*xx+(d1*yy+g1)*yy+GXd[jwx];
b1=(3.0*b1+2.0*(d1*xx+f1))*yy+(c1*xx+g1)*xx+GYd[jwx];

/* Restore the plain grid indices (j was overwritten with a flat index above)
 * and reuse im1 / jm1 as the previous / next neighbour in x. */
j = get_global_id(1);
i = get_global_id(0);
im1=(i+WX-1)%WX+j*WX;
jm1=(i+1)%WX+j*WX;
/* New x-derivative: a1 corrected by 0.5*DT times the derivatives weighted with
 * the centred x-differences of u and v. */
if (kabe[jwx]){gxn[jwx]=1.0*(a1-0.5*DT*(a1*(u[jm1]-u[im1])+b1*(v[jm1]-v[im1])));}

/* Same for the y-derivative, with im1 / jm1 now the previous / next neighbour in y. */
im1=((j+WY-1)%WY)*WX+i;
jm1=((j+1)%WY)*WX+i;
if (kabe[jwx]){gyn[jwx]=1.0*(b1-0.5*DT*(a1*(u[jm1]-u[im1])+b1*(v[jm1]-v[im1])));}
}






/*
 * pressure0: one relaxation update of YPN on the cells whose (x + y) index
 * sum is even; cells with an odd sum are left untouched by this kernel.
 *
 *   DIV   : right-hand side, read at the cell itself
 *   YPN   : updated in place
 *   kabeP : per-cell byte mask. A neighbour with a zero byte contributes the
 *           centre value instead of its own; the centre update is multiplied
 *           by (kabeP > 128).
 *
 * One work-item per cell on the periodic WX x WY grid.
 */
__kernel void pressure0(__global double *DIV,__global double *YPN,__global uchar *kabeP) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    /* Only the even-parity half of the checkerboard is handled here. */
    if (((col + row) % 2) != 0) {
        return;
    }

    /* Flat indices of the cell and its four periodic neighbours. */
    int rowOff = row * WX;
    int here   = col + rowOff;
    int west   = (col + WX - 1) % WX + rowOff;
    int east   = (col + 1) % WX + rowOff;
    int south  = col + ((row + WY - 1) % WY) * WX;
    int north  = col + ((row + 1) % WY) * WX;

    double acc = DIV[here];
    double centre = YPN[here];

    /* Subtract each neighbour, or the centre value where the neighbour is masked out. */
    acc -= (kabeP[west]  != 0) ? YPN[west]  : centre;
    acc -= (kabeP[east]  != 0) ? YPN[east]  : centre;
    acc -= (kabeP[south] != 0) ? YPN[south] : centre;
    acc -= (kabeP[north] != 0) ? YPN[north] : centre;

    YPN[here] += (-0.25 * acc - centre) * alpha * (kabeP[here] > 128);
}



/*
 * pressure1: counterpart of the even-parity pressure sweep -- the same
 * relaxation update of YPN, applied to the cells whose (x + y) index sum is odd.
 *
 *   DIV   : right-hand side, read at the cell itself
 *   YPN   : updated in place
 *   kabeP : per-cell byte mask. A neighbour with a zero byte contributes the
 *           centre value instead of its own; the centre update is multiplied
 *           by (kabeP > 128).
 *
 * One work-item per cell on the periodic WX x WY grid.
 */
__kernel void pressure1(__global double *DIV,__global double *YPN,__global uchar *kabeP) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    /* Only the odd-parity half of the checkerboard is handled here. */
    if (((col + row) % 2) != 1) {
        return;
    }

    /* Flat indices of the cell and its four periodic neighbours. */
    int rowOff = row * WX;
    int here   = col + rowOff;
    int west   = (col + WX - 1) % WX + rowOff;
    int east   = (col + 1) % WX + rowOff;
    int south  = col + ((row + WY - 1) % WY) * WX;
    int north  = col + ((row + 1) % WY) * WX;

    double acc = DIV[here];
    double centre = YPN[here];

    /* Subtract each neighbour, or the centre value where the neighbour is masked out. */
    acc -= (kabeP[west]  != 0) ? YPN[west]  : centre;
    acc -= (kabeP[east]  != 0) ? YPN[east]  : centre;
    acc -= (kabeP[south] != 0) ? YPN[south] : centre;
    acc -= (kabeP[north] != 0) ? YPN[north] : centre;

    YPN[here] += (-0.25 * acc - centre) * alpha * (kabeP[here] > 128);
}



//,__local double PL[]
//barrier(CLK_GLOBAL_MEM_FENCE);
//barrier(CLK_LOCAL_MEM_FENCE);






/*
 * div: store in DIV the forward difference of YU in x plus the forward
 * difference of YV in y, on the periodic WX x WY grid.
 *
 *   DIV    : output, overwritten
 *   YU, YV : inputs (read only here)
 *
 * One work-item per cell: global id 0 is the x index, global id 1 the y index.
 */
__kernel void div(__global double *DIV, __global double *YU, __global double *YV) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    int colNext = (col + 1) % WX;            /* periodic x + 1 */
    int rowNext = ((row + 1) % WY) * WX;     /* periodic y + 1, as a row offset */
    int rowOff  = row * WX;
    int here    = col + rowOff;

    DIV[here] = (YU[colNext + rowOff] - YU[here] + YV[col + rowNext] - YV[here]);
}


/*
 * narasi: subtract the constant yp00 from every element of YP.
 *
 * One work-item per cell; the flat index is x + y*WX.
 */
__kernel void narasi(__global double *YP, double yp00) {
    int cell = get_global_id(0) + get_global_id(1) * WX;
    YP[cell] = YP[cell] - yp00;
}



/*
 * rhs: subtract the backward difference of YPN from the two fields
 * YUN (difference in x) and YVN (difference in y).
 *
 *   YUN, YVN     : updated in place
 *   YPN          : input (read only here)
 *   kabeX, kabeY : per-cell byte masks; each correction is multiplied by
 *                  (mask > 128), so it is applied only where the byte exceeds 128
 *
 * One work-item per cell on the periodic WX x WY grid.
 */
__kernel void rhs(__global double *YUN,__global double *YVN,__global double *YPN,__global uchar *kabeX,__global uchar *kabeY) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    int colPrev = (col - 1 + WX) % WX;           /* periodic x - 1 */
    int rowPrev = ((row - 1 + WY) % WY) * WX;    /* periodic y - 1, as a row offset */
    int rowOff  = row * WX;
    int here    = col + rowOff;

    YUN[here] -= (YPN[here] - YPN[colPrev + rowOff]) * (kabeX[here] > 128);
    YVN[here] -= (YPN[here] - YPN[col + rowPrev]) * (kabeY[here] > 128);
}


/*
 * veloc: build four averaged copies of the fields YU and YV.
 *
 *   YVU : mean of YV over (x, y), (x, y+1), (x-1, y), (x-1, y+1)
 *   YUV : mean of YU over (x, y), (x+1, y), (x, y-1), (x+1, y-1)
 *   YVT : mean of YV over (x, y), (x, y+1)
 *   YUT : mean of YU over (x, y), (x+1, y)
 *
 * All neighbour indices wrap periodically on the WX x WY grid.
 * One work-item per cell: global id 0 is the x index, global id 1 the y index.
 */
__kernel void veloc(__global double *YU,__global double *YV,__global double *YVU, __global double *YUV ,__global double *YVT, __global double *YUT ) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    int colPrev = (col - 1 + WX) % WX;
    int colNext = (col + 1) % WX;

    /* Row offsets of the previous, current and next row. */
    int rowPrev = ((row - 1 + WY) % WY) * WX;
    int rowNext = ((row + 1) % WY) * WX;
    int rowOff  = row * WX;

    int here = col + rowOff;

    /* Four-point averages. */
    YVU[here] = 0.25 * (YV[here] + YV[col + rowNext] + YV[colPrev + rowOff] + YV[colPrev + rowNext]);
    YUV[here] = 0.25 * (YU[here] + YU[colNext + rowOff] + YU[col + rowPrev] + YU[colNext + rowPrev]);

    /* Two-point averages. */
    YVT[here] = 0.5 * (YV[here] + YV[col + rowNext]);
    YUT[here] = 0.5 * (YU[here] + YU[colNext + rowOff]);
}


/*
 * copyy: element-wise copy of BBB into AAA.
 *
 * One work-item per cell; the flat index is x + y*WX.
 */
__kernel void copyy(__global double *AAA,__global double *BBB) {
    int cell = get_global_id(0) + get_global_id(1) * WX;
    double value = BBB[cell];
    AAA[cell] = value;
}



/*
 * out: plot one particle into a 3-byte-per-pixel image.
 *
 *   outt : image of (WX*ipn) x (WY*ipn) pixels, 3 bytes each; the three bytes
 *          of the hit pixel are incremented (not overwritten)
 *   RYU  : particle positions stored as consecutive pairs of doubles; each
 *          coordinate is truncated to an integer pixel index
 *   YU, YV : not used by the active code (an earlier, disabled implementation
 *          derived the colour from differences of these fields)
 *
 * One work-item per particle. The image row is flipped: row index is
 * WY*ipn - second coordinate - 1.
 * NOTE(review): the increments are plain read-modify-write operations, so
 * two particles landing on the same pixel in the same launch may race.
 */
__kernel void out(__global uchar *outt,__global double *RYU,__global double *YU,__global double *YV) {
    /* Index of the first coordinate of this particle's pair. */
    int slot = get_global_id(0) * 2;

    int px = (RYU[slot]);
    int py = (RYU[slot + 1]);

    /* Byte offset of the pixel (3 bytes per pixel, rows flipped vertically). */
    int base = (px + (WY * ipn - py - 1) * WX * ipn) * 3;

    /* The colour is derived from the particle's index, split into three fields. */
    int code = slot * 16;
    int low  = code % 256;
    int mid  = (code / 256) % 128;
    int high = ((code / 256) / 128) % 32;

    outt[base + 0] += low;
    outt[base + 1] += mid + 128;
    outt[base + 2] += high + 128 + 64 + 32;
}




/*
 * out0: zero one 32-bit word of outt per work-item.
 */
__kernel void out0(__global uint *outt) {
    size_t word = get_global_id(0);
    outt[word] = 0;
}




/*
 * nensei0: one sweep that reads YUN / YVN and writes GXd / GYd:
 *
 *   GXd = (YU + arufa * (sum of the four neighbours of YUN)) * ar1fa
 *   GYd = (YV + arufa * (sum of the four neighbours of YVN)) * ar1fa
 *
 * Each output is written only where the corresponding mask byte
 * (kabeX for GXd, kabeY for GYd) is non-zero.
 * NOTE(review): together with the sibling kernel that swaps the roles of the
 * buffers this looks like a ping-pong Jacobi iteration -- confirm.
 *
 * One work-item per cell on the periodic WX x WY grid.
 */
__kernel void nensei0(__global double *YU,__global double *YUN,__global double *YV,__global double *YVN,__global double *GXd,__global double *GYd,double arufa,double ar1fa,__global uchar *kabeX,__global uchar *kabeY) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    /* Flat indices of the cell and its four periodic neighbours. */
    int rowOff = row * WX;
    int here   = col + rowOff;
    int west   = (col - 1 + WX) % WX + rowOff;
    int east   = (col + 1) % WX + rowOff;
    int south  = col + ((row - 1 + WY) % WY) * WX;
    int north  = col + ((row + 1) % WY) * WX;

    if (kabeX[here]) {
        GXd[here] = (YU[here] + arufa * (YUN[west] + YUN[east] + YUN[south] + YUN[north])) * ar1fa;
    }
    if (kabeY[here]) {
        GYd[here] = (YV[here] + arufa * (YVN[west] + YVN[east] + YVN[south] + YVN[north])) * ar1fa;
    }
}


/*
 * nensei1: one sweep that reads GXd / GYd and writes YUN / YVN:
 *
 *   YUN = (YU + arufa * (sum of the four neighbours of GXd)) * ar1fa
 *   YVN = (YV + arufa * (sum of the four neighbours of GYd)) * ar1fa
 *
 * Each output is written only where the corresponding mask byte
 * (kabeX for YUN, kabeY for YVN) is non-zero.
 *
 * One work-item per cell on the periodic WX x WY grid.
 */
__kernel void nensei1(__global double *YU,__global double *YUN,__global double *YV,__global double *YVN,__global double *GXd,__global double *GYd,double arufa,double ar1fa,__global uchar *kabeX,__global uchar *kabeY) {
    int col = get_global_id(0);
    int row = get_global_id(1);

    /* Flat indices of the cell and its four periodic neighbours. */
    int rowOff = row * WX;
    int here   = col + rowOff;
    int west   = (col - 1 + WX) % WX + rowOff;
    int east   = (col + 1) % WX + rowOff;
    int south  = col + ((row - 1 + WY) % WY) * WX;
    int north  = col + ((row + 1) % WY) * WX;

    if (kabeX[here]) {
        YUN[here] = (YU[here] + arufa * (GXd[west] + GXd[east] + GXd[south] + GXd[north])) * ar1fa;
    }
    if (kabeY[here]) {
        YVN[here] = (YV[here] + arufa * (GYd[west] + GYd[east] + GYd[south] + GYd[north])) * ar1fa;
    }
}



/*
 * syokise: fill AA with initial coordinate pairs, one pair per work-item.
 *
 * The global id is split into a column (id modulo WX*ipn) and a row
 * (id divided by WX*ipn). The pair written for that work-item is
 *   first  = WX*ipn - row - 1
 *   second = column
 */
__kernel void syokise(__global double *AA){
    size_t gid = get_global_id(0);
    uint column = gid % (WX * ipn);
    uint line   = gid / (WX * ipn);

    /* Two doubles per entry. */
    uint slot = (column + line * WX * ipn) * 2;

    AA[slot]     = WX * ipn - 1.0 * line - 1.0;
    AA[slot + 1] = 1.0 * column;
}





/*
 * ryuusi: move one tracer particle by one time step.
 *
 *   RYS      : particle positions as consecutive (first, second) pairs of
 *              doubles in units of 1/ipn of a grid cell; updated in place
 *   YUN, YVN : velocity components, sampled with bilinear interpolation
 *              between the four surrounding grid cells (periodic wrap)
 *   DT       : time step
 *   GXU, GYU, GXV, GYV : not used by the active code. A higher-order
 *              interpolation using these buffers was previously present but
 *              disabled; the parameters are kept so callers need not change.
 *
 * After the move the position is brought back into the half-open domain
 * [0, ipn*WX) x [0, ipn*WY):
 *   - a particle leaving through the upper bound of the first coordinate is
 *     re-inserted at first = 0, second = ipn*WY - 0.1;
 *   - all other crossings wrap periodically.
 *
 * One work-item per particle.
 */
__kernel void ryuusi(__global double *RYS,__global double *YUN,__global double *YVN,double DT,__global double *GXU,__global double *GYU,__global double *GXV,__global double *GYV){
    uint di = get_global_id(0)*2;
    double xx = RYS[di];
    double yy = RYS[di+1];

    /* Containing grid cell and fractional position inside it. */
    int ixx = xx/ipn;
    int iyy = yy/ipn;
    double sxx = xx/ipn - ixx;
    double syy = yy/ipn - iyy;

    /* Periodic +1 neighbours; iyy and jm1 become row offsets. */
    int im1 = (ixx+1)%WX;
    int jm1 = ((iyy+1)%WY)*WX;
    iyy *= WX;

    /* Bilinear interpolation of both velocity components. The weights were
     * computed from the old position, so both updates use the same ones. */
    xx += (( (1.0-sxx)*YUN[ixx+iyy]+sxx*YUN[im1+iyy] )*(1.0-syy) + ( (1.0-sxx)*YUN[ixx+jm1]+sxx*YUN[im1+jm1] )*syy)*DT*ipn;
    yy += (( (1.0-sxx)*YVN[ixx+iyy]+sxx*YVN[im1+iyy] )*(1.0-syy) + ( (1.0-sxx)*YVN[ixx+jm1]+sxx*YVN[im1+jm1] )*syy)*DT*ipn;

    const double spanX = 1.0*ipn*WX;
    const double spanY = 1.0*ipn*WY;

    /* Leaving through the upper first-coordinate bound: re-insert the particle. */
    if (xx >= spanX) { xx = 0.0; yy = spanY - 0.1; }
    if (yy >= spanY) { yy -= spanY; }

    /* Periodic wrap for negative coordinates. For a tiny negative value the
     * addition rounds to exactly the span, which lies outside the half-open
     * domain and would produce cell index WX / WY (wrong row or out-of-bounds
     * access) on the next read; the span is periodically equivalent to 0. */
    if (xx < 0.0) {
        xx += spanX;
        if (xx >= spanX) { xx = 0.0; }
    }
    if (yy < 0.0) {
        yy += spanY;
        if (yy >= spanY) { yy = 0.0; }
    }

    RYS[di] = xx;
    RYS[di+1] = yy;
}