--------------------------------------------------------------------------
【 ソフト名 】HSPCL32.dll
【バージョン】2.0
【  作  者  】pippi
【 必要環境１ 】WindowsXP以降
【 必要環境２ 】HSP Ver3.0以降
【 必要環境３ 】openCL対応グラフィックボードまたはCPUまたはCellプロセッサー
			GeForce 9800GT以降				（9800GT、GeForce 200 Series  GeForce 300 Series GeForce 400 Series GeForce 500 Series GeForce 600 Series GeForce GTX Titan）
			R700  (HD 4xxx)以降				（R700  (HD 4xxx) Evergreen  (HD 5xxx) Northern Islands  (HD 6xxx) Southern Islands  (HD 7xxx)）
			HD Graphics 2500/4000以降			（Intel HD Graphics 2500/4000）
【  取扱い  】フリープラグイン
【  内  容  】HSP3用GPGPUプラグイン
【   HP    】http://blog.livedoor.jp/toropippi/
--------------------------------------------------------------------------

概要
OpenCLをHSP3で簡単にあつかえるプラグイン。

OpenCLƂ̓OtBbN{[hCELLvZbT[A}`RACPUŔėpvZł悤ɍꂽB
{HSPCL32łAp̌w΂Ȃ΂Ȃ{͎lZƃrbgVtg_ZAǗxȂ̂ŃvO}[̕S͏ȂB
HSP̌_łvZ̒xA{vOCŃJo[鎖łB


ǂ̂炢ƂCPU1Xbhœ삷Ɖ肵ĂAHSPCL(OpenCL) > C++ >> HSP łB
{vOCł͂ꂼ̃foCXɍ킹OpenCLvOœK邽߁ASIMDƂ@\tɎgACPŨJ^OXybNɋ߂vZ\@oƂ\B
C++SSEȂǂ̍œK΂܂ʂB
}`RAGPŨ}`XbhHSPCL32ł͊ȒPɂłB

܂HSPvO\[Xfloat^悤ɂȂB

インストール
HSPインストールフォルダにHSPCL32.dllをコピー
アンインストールはゴミ箱に削除


使用方法
HSPソースでHSPCL.asをインクルードし、HSPCL32.dllを実行ファイルと同じ階層に入れる
用意するものは、プログラムで読み込むためのOpenCLのコードが書かれたテキストファイル。

ƂĂOpenCLłȂƂȂB
OtBbN{[hI{[h̏ꍇ́A{Iɓ삵ȂBCPUintel CPUcore iȂǂł
http://software.intel.com/en-us/vcsource/tools/opencl-sdk

uOpenCL for intel HDEEEv
CXg[΁AOpenCLintel CPÛ݂œ\B
I{[hłȂꍇ͊{IɃO{̃hCoŐVłΓ삷B


CPU、GPUがdouble型をサポートしていれば、OpenCLカーネルコードの先頭に

#ifdef cl_khr_fp64
  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
  #error Double precision floating point not supported by OpenCL implementation.
#endif

をコピーペーストすれば、double型が使えるようになる。


注意点
このプラグインでは速度を優先したためアクセス違反に対して防護する機能がありません。
よって、アクセス違反によるエラーがシステムに影響を及ぼすことがあります。

最悪、ブルースクリーンになったり、グラボの信号が途絶え画面が消えたり、グラボがフリーズするなどの現象が起きます。
サンプルに陥りやすいエラーサンプルを用意したので、参考になるべくコードミスはしないように気をつけて下さい。
基本的に同じソースなら、アクセス違反は１つの環境で起きなければ別の環境でも起きることはないと考えていい。

これに関するいかなる損失も、作者は責任を負いません。
十分注意して下さい。


命令定義
#cmd hspcllllllllllllllnewcmd1	$000
#cmd hspcllllllllllllllnewcmd2	$001
#cmd clbye		      	$002
#cmd clBuildProgram 		$003
#cmd clCreateKernel		$004
#cmd clSetKernel		$005
#cmd clCreateBuffer		$006
#cmd clWriteBuffer		$007
#cmd clReadBuffer		$008
#cmd clCopyBuffer		$009
#cmd clDoKernel			$00A
#cmd clReleaseKernel		$00B
#cmd clReleaseProgram		$00C
#cmd clReleaseMemObject		$00D
#cmd fdim			$00E
#cmd clSetDev			$00F
#cmd clWaitTask			$010
#cmd clDoKrn1			$011
#cmd clDoKrn2			$012
#cmd clDoKrn3			$013



命令説明
[CLini]
hspcl.asをインクルードしたあと最初に実行して下さい。
変数cldevcountにopenCL実行可能デバイス数が入ります。
以降デバイスidは  0～(cldevcount-1)  まで指定出来ます。


[CLbye]
CLinisĂꍇŌɂŃĂB
ȂĂoc{^ŏI܂end߂ŉƎv܂B
܂Adɓǂ݂ꍇvʃG[ɂȂ邱Ƃ܂B


[clGetDevInfo(int p1)]
int p1	foCXid [in]
p1Ŏw肳ꂽfoCXid̏𕶎ŕԂ܂B
uGetDevInfovƍȂłBHSP̖߂ɑ݂邽߃RpCʂĂ܂܂B


[clGetDevName(int p1)]
int p1	foCXid [in]
p1Ŏw肳ꂽfoCXid̃foCX𕶎ŕԂ܂
clGetDevInfołfoCX擾ł܂


[clSetDev int p1]
int p1	foCXid [in]
J[l߂J[lo^AmۂsfoCXw肵܂B
ȉ̖߂sfoCX̖߂Ŏw肵܂B

clBuildProgram
clCreateKernel
clCreateBuffer
clWriteBuffer
clReadBuffer
clCopyBuffer
clDoKernel
clWaitTask

clSetDevǂłȂꍇ́AJgfoCX̓foCX0łB
p1ɂ0`(cldevcount-1)w肵ĉB


[clBuildProgram str p1,int p2]
openCL̃vOrh܂B
clSetDevŃZbgꂽfoCXɈԍœKꂽ`Ń\[XRpCArh܂B
str p1 : J[l.cl\[X  [in]
int p2 : vOid[OUT]
p1ɂ̓\[X̃t@CĉB
p2̓RpCꂽvOid܂B

RpCꂽopenclvÓÃfoCXłg܂B
Q̃foCXœJ[lsƂAꂼ̃foCXidclSetDevŃZbgclBuildProgram񂸂ĂŉB


[clCreateKernel int p1,str p2,int p3]
J[lƂAopenCL̊֐sPPʂo^܂B
int p1 : vOid          [in]
str p2 : J[l֐        [in]
int p3 : J[lid[OUT]
p2p1J[l\[Xɂu__kernel vn܂֐u__kernel v̕Ŏw肵܂B
Ⴆp1\[XɁu__kernel void vector_add(__global float *A) {vƂsp2 "vector_add" w肵܂B
p3ɂ̓J[lido͂܂B
opencl\[X̊֐́ÃJ[lƂ`ŊǗAsł܂B


[clSetKernel int p1,int p2,p3,int p4]
ŁAJ[ls̈\߃ZbgĂȂ΂܂B
int p1 : J[lid			[in]
int p2 : Zbgʒu		[in]
    p3 : ɓn(萔mem_object)[in]
int p4 : [JtO		[in]
J[l̈Ƀf[^n܂B
例えばopenclのソースが「__kernel void vector_add(__global float *hairetu,float teisu) {」というものなら
clSetKernel p1,0,mem_object_A(clCreateBufferō쐬mem_object id)
clSetKernel p1,1,float(5.0)
2ɓnw肵܂B
܂̖̐߂ĂŉB

PZbg΁AƂ͉xJ[ls߂𑗂邱Ƃł܂B̓J[ls̃I[o[wbh炷߂ł܂B

p3ɂ͕ϐAint^ϐAAfloat^ϐAdouble^ϐȂǂwłA܂mem_object idw\łB


p40ȊOɂƁÄ̓[JiLjƂēo^܂B[J̓O[oeʂȂɃANZX\ȏǎ\łB
̃J[lłlێł܂Bl͐ݒs0܂͕słB
̃XbhƏLƂɎg܂B

p40ȊÔƂAp4ɂ͊mۂ[JTCY(byte)intŎw肵ĉBp3͖܂
[J̎g͈ȉ̂Ƃł



J[lR[h
__kernel void vector_add(__global double *A, __global double *B, __local double block[] , int bekii, int n) {
EEEEE

ɑ΂HSPXNvgłclSetKernel
clSetKernel p1,0,mem_object_dpA
clSetKernel p1,1,mem_object_dpB
clSetKernel p1,2,0,64 (p3ɂ0wAp464byte܂double^*8̋LƂӖAopencl\[Xłblock[0]`block[7][NO[vŋLĎg)
clSetKernel p1,3,10
clSetKernel p1,4,1024



[clCreateBuffer int p1,int p2]
int p1 : mem_object id	[OUT]
int p2 : mۂbyte			[in]
clSetDevŎw肳ꂽOtBbN{[hȂǂ̃foCXɃmۂ܂B
GDDR5Ȃǂ̃̂O{ɁAw肵TCỸmۂ邱ƂɂȂ܂B
΂clReleaseMemObjectŉł


[clWriteBuffer int p1,array p2,int p3,int p4,int p5,int p6]
int p1 : Rs[mem_object id		[in]
array p2:Rs[zϐ		[in]
int p3 : Rs[TCYbyte		[in]
int p4 : Rs[̃ItZbg		[in]
int p5 : Rs[̃ItZbg		[in]
int p6 : ubLO[hoff		[in]
zXg(CPU)clSetDevŎw肵foCX(GPU)Ƀf[^]܂B
p6̃ubLO[h1w肷offɂȂA]ȂɎ̖߂Ɉڂ܂B
]܂łclWaitTaskő҂Ƃł܂B


[clReadBuffer int p1,array p2,int p3,int p4,int p5,int p6]
int p1 : Rs[mem_object id		[in]
array p2:Rs[zϐ		[OUT]
int p3 : Rs[TCYbyte		[in]
int p4 : Rs[̃ItZbg		[in]
int p5 : Rs[̃ItZbg		[in]
int p6 : ubLO[hoff		[in]
clSetDevŎw肵foCX(GPU)zXg(CPU)Ƀf[^]܂B
p6̃ubLO[h1w肷offɂȂA]ȂɎ̖߂Ɉڂ܂B



[clCopyBuffer int p1,int p2,int p3,int p4,int p5]
int p1 : Rs[mem_object id		[in]
int p2 : Rs[mem_object id		[in]
int p3 : Rs[TCYbyte		[in]
int p4 : Rs[̃ItZbg		[in]
int p5 : Rs[̃ItZbg		[in]
clSetDevŎw肵foCX̃ԂŃRs[܂B

傫TCYŊmۂmԂ̃Rs[Ńu[XN[ɂȂ錻ۂF߂Ă܂B
ȂׂgȂƂ߂܂



[clDoKernel int p1,int p2,array p3,array p4]
int p1 : J[lid			[in]
int p2 : work_dim(1`3)			[in]
array p3:global_work_size		[in]
array p4:local_work_size		[in]
clSetDevŎw肵foCXŃJ[ls܂B
(J[lɑΉopencl\[X̊֐s܂B)
clSetDevŎw肵foCX́AvÕRpC̃foCXōsȂĂKv܂B
炩clSetKernelŃJ[lZbgĒuȂ΂܂

p2work_dimƌAglobal_work_sizelocal_work_size̎ݒ肵܂B1`3wł܂B

p3ɂ́AJ[lsw肵܂Bsɂ̐̕[NO[v܂B
p2work_dimŎw肵l2ȏ̏ꍇAp3͔zϐŎw肵Ȃ΂܂(vfwork_dim)
p4̓[J[NTCYƌA1`256(foCXɂ)włAp4vfŎw肵܂B
p3̊evfp4̊evfŊ؂Ȃ΂܂B
܂p4̊evf̑S̐ςclGetDevInfoœꂽuЂƂ̃[NO[ṽ[NACe̍őlv𒴂ꍇG[ɂȂ܂

p4̐l̈Ӗ͗lXAGPUł1w肷ƔɂȂĂ܂ƂłB
Ă12481632EEEƎw肵̎s\1{2{4{8{16{32{EEƂȂĂ܂64őł~߂ɂȂ܂B
GPŨRA̍\ɂ܂B64őł~߂Ȃ̂GPURA1Pʂ64[̏Xbhł܂Ƃ܂Ă邽߂ŁAGPUɂĂ܂܂Ȃ̂ōœKȒlTĉB
CPUł́ASIMDtɎgȂ߂p4̒l148ɂقǂꍇ܂܂֌WȂAJ[l\[XɁufloat4vufloat32vgvZꂽقSIMDtɎgł傤B
p4ɑ傫lݒ肵ĂxIɕsɂȂ邱Ƃ͊{Iɂ͂܂񂪁AeX̊Ŋm߂Ȃ璲߂ĉB

xȊO̖ʂł́Ap4Ŏw肵[NACe؂̃[NO[vŋLƂ̂g܂B
ȉɃ[NO[vO[oidȂǂ̗pg̉ڂ܂B



work_dim=2
global_work_size=6,3
local_work_size=2,3
w肵Ƃ܂B

uO[o[NTCYvu[NACev  global_work_size.0*global_work_size.1  18 @@iuv͓ӖƂӖj

ł
܂sXbh6*318ƂȂ܂B


Ƀ[J[NTCYłB
u[J[NTCYvu[NO[v[NACev  local_work_size.0*local_work_size.1  6 

łB
̒ĺuclGetDevInfovŎ擾uЂƂ̃[NO[ṽ[NACe̍őlvႢlłȂƂ܂BĂ2561024łB


Ƀ[NO[vł
u[NO[vv(global_work_size.0 / local_work_size.0)  *  (global_work_size.1 / local_work_size.1)  3
łB
̃[NO[vŃ[J[NTCY̃Xbh삷ƍlĉB
̃XbhmŋLŒlLł܂(J[l)BL̂Ƃ[JƂ܂B


x̓J[l̃Tv܂B
J[l\[Xł
__kernel void vector_add(__global double *A, __global double *B, __local double block[] , int bekii, int n) {
	int i0 = get_global_id(0);
	int i1 = get_global_id(1);
EEEEEEEEEȉ
ƂȂĂƂ܂B
J[l18쓮܂ꂼidAꂪget_global_idŎ擾ł܂B
bȒPɂ邽work_dim1global_work_size18Ƃݒɂ܂B
get_global_id(0)e[NACe0`17Ԃ܂B

work_dim 2̗ł
get_global_id(0)0`5
get_global_id(1)0`2ꂼ̃[NACeɕԂ܂Biȉ̐}QƁj



get_local_id ̓[NO[v̎idԂ܂B
get_local_id(0)0`1
get_local_id(1)0`2ꂼ̃[NACeɕԂ܂Biȉ̐}QƁj



get_group_id̓[NO[ṽO[vʔԍԂ܂B
get_group_id(0)0`2
get_group_id(1)0݂̂ꂼ̃[NACeɕԂ܂Biȉ̐}QƁj


get_global_size̓O[oTCYŁAǂ̃[NACełԂl͓l
get_global_size(0)6
get_global_size(1)3
ƂȂ܂B

get_local_size̓[JTCYŁAǂ̃[NACełԂl͓l
get_local_size(0)2
get_local_size(1)3
ƂȂ܂


get_num_groups̓O[vŁAǂ̃[NACełԂl͓l
get_num_groups(0)3
get_num_groups(1)1
ƂȂ܂


ȉA18Xbh̊el̐}
get_global_id	get_group_id	get_local_id
(0,0)			(0,0)			(0,0)
(0,1)			(0,0)			(0,1)
(0,2)			(0,0)			(0,2)
(1,0)			(0,0)			(1,0)
(1,1)			(0,0)			(1,1)
(1,2)			(0,0)			(1,2)
(2,0)			(1,0)			(0,0)
(2,1)			(1,0)			(0,1)
(2,2)			(1,0)			(0,2)
(3,0)			(1,0)			(1,0)
(3,1)			(1,0)			(1,1)
(3,2)			(1,0)			(1,2)
(4,0)			(2,0)			(0,0)
(4,1)			(2,0)			(0,1)
(4,2)			(2,0)			(0,2)
(5,0)			(2,0)			(1,0)
(5,1)			(2,0)			(1,1)
(5,2)			(2,0)			(1,2)





ɂ
܂uclDoKernelv̓G[`FbN@\work_dim򏈗Ă邽߁AI[o[wbhCɂȂ́uclDoKrn1vuclDoKrn2vuclDoKrn3v
G[`FbNȂ̓sȂ܂Błł鍂ƂĂɌyȂ̂łBbJ[ls߂1000ĂяoxōĂ܂B



[clDoKrn1 int p1,int p2,int p3]
int p1 : J[lid			[in]
int p2 : global_work_size		[in]
int p3 : local_work_size		[in]
work_dim1̏ꍇclDoKernelƓłB

[clDoKrn2 int p1,int p2,int p3,int p4,int p5]
int p1 : J[lid			[in]
int p2 : global_work_size.0		[in]
int p3 : global_work_size.1		[in]
int p4 : local_work_size.0		[in]
int p5 : local_work_size.1		[in]
work_dim2̏ꍇ

[clDoKrn3 int p1,int p2,int p3,int p4,int p5,int p6,int p7]
int p1 : J[lid			[in]
int p2 : global_work_size.0		[in]
int p3 : global_work_size.1		[in]
int p4 : global_work_size.2		[in]
int p5 : local_work_size.0		[in]
int p6 : local_work_size.1		[in]
int p7 : local_work_size.2		[in]



[clWaitTask]
clSetDevŎw肳ĂfoCX̉Z⃁[hSďI܂Ŏ̖߂Ɉڂ܂


[clReleaseKernel int p1]
int p1 : J[lid			[in]
o^J[lj܂B


[clReleaseProgram int p1]
int p1 : vOid			[in]
o^RpCς݃vOj܂B


[clReleaseMemObject int p1]
int p1 : mem_object id			[in]
foCX̃܂B


[fdim p1,int p2]
p1     : ϐ			[in]
int p2 : z			[in]
p1floatzϐɂ܂


[varsize p1]
p1Ŏw肳ꂽCӂ̌^̔zϐ̊mۍς݃TCY(byte)Ԃ܂B
i̊֐Kerupani129 Project ̃uOhttp://blogs.yahoo.co.jp/kerupani/13754300.htmlq؂܂j


配布場所
http://blog.livedoor.jp/toropippi/
連絡先
http://blog.livedoor.jp/toropippi/のメッセージ送信ページ
免責
このプラグインの使用により発生した如何なる問題について当方は一切の責任を負いません。

著作権
著作権は一部を除いて、pippiが保有します。

HSPCL.asの「varsize」関数は「Kerupani129 Project」さんから拝借した
HSPCL32.dllの「HSPでfloat型変数を扱う」機能は、HSPインストールフォルダの「hspsdk」のサンプルをそのまま拝借、使用しています。

更新履歴
    Ver 2.0
	2013/8/29
	clbye実行時の不具合を修正
	clDoKrn1～3まで追加
	clWaitTaskでタスク待ちができるように
	ANZXᔽȊȎT̃vO~Xɑ΂G[bZ[Wo悤Ƀ`FbN@\
	̑G[bZ[Wł悤ɋ@\ǉ
	[XOŏI
    Ver 1.11
	2013/8/10
	J[lsɂlocal_work_sizeglobal_work_sizeɔzwł悤ɕύX
    Ver 1.10
	2013/5/14
	vOrhAJ[lA̎
    Ver 1.0
	2013/5/9
	ReLXg̍쐬