Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions examples/cast_kernel.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using WGPUCompute

@wgpukernel workgroupSizes=(4, 4) workgroupCount=(2, 2) function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
xdim = workgroupDims.x
ydim = workgroupDims.y
gIdx = workgroupId.x*xdim + localId.x
Expand All @@ -11,11 +11,11 @@ end

# Cast the elements of `x` to element type `S` on the GPU.
#
# Allocates an uninitialized output of the same shape as `x` and launches
# `cast_kernel` with a fixed 4x4 workgroup size and 2x2 workgroup count
# (sized for the 8x8 example input below).
function cast(S::DataType, x::WgpuArray{T, N}) where {T, N}
    y = WgpuArray{S}(undef, size(x))
    # Fix: the stale pre-refactor direct call `cast_kernel(x, y)` was left in
    # alongside its replacement; the kernel is launched only through the
    # call-site @wgpukernel macro now.
    @wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) cast_kernel(x, y)
    return y
end

# Example: cast a small random Float32 array to UInt32 on the GPU.
# Fix: the stale 1024x1024 initialization was left in alongside its 8x8
# replacement; only the 8x8 input matches the (4,4)x(2,2) launch configuration.
x = WgpuArray{Float32}(rand(Float32, 8, 8) .- 0.5f0)
z = cast(UInt32, x)

# TODO Bool cast is not working yet
Expand Down
23 changes: 23 additions & 0 deletions examples/relu_kernel.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using WGPUCompute
using MacroTools

y = WgpuArray((rand(4, 4) .-0.5) .|> Float32)

# GPU kernel body for element-wise ReLU, written in the WGSL-translation DSL.
# `workgroupDims`, `workgroupId`, `localId` and `xDims` are kernel builtins
# that the @wgpukernel machinery maps to WGSL intrinsics — they are not Julia
# bindings, so this function is only meaningful when compiled to a shader.
function relu_kernel(x::WgpuArray{T, N}, out::WgpuArray{T, N}) where {T, N}
xdim = workgroupDims.x
ydim = workgroupDims.y
# Global x/y coordinates of this invocation within the dispatch grid.
gIdx = workgroupId.x*xdim + localId.x
gIdy = workgroupId.y*ydim + localId.y
# Flatten to a linear index; row length presumably comes from the input's
# x extent (xDims.x) — TODO confirm against the shader codegen.
gId = xDims.x*gIdy + gIdx
value = x[gId]
# NOTE(review): literal 0.0 is Float64 while T is typically Float32 —
# presumably the WGSL generator normalizes the literal type; confirm.
out[gId] = max(value, 0.0)
end

# Element-wise ReLU of `x`, computed on the GPU.
#
# Allocates an output with the same shape/eltype as `x` and launches
# `relu_kernel` with a 4x4 workgroup in a single workgroup (sized for the
# 4x4 example input above).
function relu(x::WgpuArray{T, N}) where {T, N}
    y = similar(x)
    # Fix: keyword was misspelled `workgrouSizes` (missing 'p'), so the macro
    # would not have received the intended workgroup size.
    @wgpukernel launch=true workgroupSizes=(4,4) workgroupCount=(1,1) relu_kernel(x, y)
    return y
end

relu(y)

34 changes: 19 additions & 15 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ mutable struct WgpuArray{T, N} <: AbstractGPUArray{T, N}
end

dev = getCurrentDevice()

if bufsize > 0
storageData = Array{T}(undef, prod(dims))
(storageBuffer, _) = WGPUCore.createBufferWithData(
Expand All @@ -150,14 +150,14 @@ mutable struct WgpuArray{T, N} <: AbstractGPUArray{T, N}
storageData,
["Storage", "CopyDst", "CopySrc"],
)
bindGroup = nothing
computePipeline = nothing
obj = new(dims, storageData, maxSize, 0, storageBuffer, bindGroup, computePipeline)
finalizer(obj) do arr
obj = nothing
end
return obj
end
bindGroup = nothing
computePipeline = nothing
obj = new(dims, storageData, maxSize, 0, storageBuffer, bindGroup, computePipeline)
finalizer(obj) do arr
obj = nothing
end
return obj
end

function WgpuArray{T, N}(buffer::WgpuArray, dims::Dims{T}) where {T, N}
Expand All @@ -173,21 +173,26 @@ end
Base.eltype(::Type{WgpuArray{T}}) where T = T
Base.eltype(::Type{WgpuArray{T, N}}) where {T, N} = T

# constructors (borrowed from WgpuArray for quick prototyping)
# constructors (borrowed from CUDA.jl for quick prototyping)
WgpuArray{T, N}(::UndefInitializer, dims::Integer...) where {T, N} =
WgpuArray{T, N}(undef, Dims(dims))
WgpuArray{T, N}(::UndefInitializer, dims::NTuple{N, Integer}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))
WgpuArray{T, N}(::UndefInitializer, dims::Vararg{Integer, N}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))

# type but not dimensionality specified
WgpuArray{T}(::UndefInitializer, dims::Dims{N}) where {T, N} = WgpuArray{T, N}(undef, dims)
WgpuArray{T}(::UndefInitializer, dims::Integer...) where T =
WgpuArray{T}(undef, convert(Tuple{Vararg{Int}}, dims))
WgpuArray{T}(::UndefInitializer, dims::NTuple{N, Integer}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))
WgpuArray{T}(::UndefInitializer, dims::Vararg{Integer, N}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))

# empty vector constructors
WgpuArray{T, 1}() where {T} = WgpuArray{T, 1}(undef, 0)

Base.similar(a::WgpuArray{T,N}) where {T,N} = WgpuArray{T,N}(undef, size(a))
Base.similar(a::WgpuArray{T}, dims::Base.Dims{N}) where {T,N} = WgpuArray{T,N}(undef, dims)
Base.similar(a::WgpuArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} =
Base.similar(a::WgpuArray{T, <:Any}, dims::Base.Dims{N}) where {T,N} = WgpuArray{T,N}(undef, dims)
Base.similar(a::WgpuArray{<:Any, <:Any}, ::Type{T}, dims::Base.Dims{N}) where {T,N} =
WgpuArray{T,N}(undef, dims)

function Base.copy(a::WgpuArray{T,N}) where {T,N}
Expand Down Expand Up @@ -274,7 +279,6 @@ Base.collect(x::WgpuArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x))
device(array::WgpuArray) = array.storageBuffer.device

## memory copying

function Base.copyto!(dest::WgpuArray{T}, doffs::Integer, src::Array{T}, soffs::Integer,
n::Integer) where T
(n==0 || sizeof(T) == 0) && return dest
Expand Down
8 changes: 4 additions & 4 deletions src/broadcast.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ WgpuArrayStyle{M}(::Val{N}) where {N,M} = WgpuArrayStyle{N}()

BroadcastStyle(::Type{<:WgpuArray{T,N}}) where {T,N} = WgpuArrayStyle{N}()

Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}) where {N,T} =
similar(WgpuArray{T}, axes(bc))
#Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}) where {N,T} =
# similar(WgpuArray{T}, axes(bc))

Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}, dims...) where {N,T} =
WgpuArray{T}(undef, dims...)
Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}, dims) where {N,T} =
Base.similar(WgpuArray{T, length(dims)}, dims)

# broadcasting type ctors isnt GPU compatible
Broadcast.broadcasted(::WgpuArrayStyle{N}, f::Type{T}, args...) where {N, T} =
Expand Down
75 changes: 54 additions & 21 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
export @wgpukernel, getShaderCode
export @wgpukernel, getShaderCode, WGPUKernelObject, wgpuCall

using MacroTools
using CodeTracking
using Lazy
using Infiltrator

# TODO remove
using WGPUCompute
using Infiltrator

mutable struct KernelContext
# Thin wrapper around a launchable kernel closure so that launches can be
# dispatched through `wgpuCall(::WGPUKernelObject, args...)` rather than a
# bare function; also lets `@wgpukernel launch=false` hand back a deferred
# kernel object.
struct WGPUKernelObject
# the closure built by @wgpukernel that prepares the pipeline and dispatches
kernelFunc::Function
end

mutable struct KernelBuildContext
inargs::Dict{Symbol, Any}
outargs::Dict{Symbol, Any}
tmpargs::Array{Symbol}
Expand All @@ -19,11 +24,9 @@ mutable struct KernelContext
kernel # TODO function body with all statements will reside here
end

function getShaderCode(f, args::WgpuArray...)
function getShaderCode(f, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
fexpr = @code_string(f(args...)) |> Meta.parse
@capture(fexpr, @wgpukernel workgroupSizes_ workgroupCount_ function fname_(fargs__) where Targs__ fbody__ end)
workgroupSizes = Meta.eval(workgroupSizes)
workgroupCount = Meta.eval(workgroupCount)
@capture(fexpr, function fname_(fargs__) where Targs__ fbody__ end)
originArgs = fargs[:]
builtinArgs = [
:(@builtin(global_invocation_id, global_id::Vec3{UInt32})),
Expand All @@ -46,7 +49,7 @@ function getShaderCode(f, args::WgpuArray...)
ins = Dict{Symbol, Any}()
outs = Dict{Symbol, Any}()

cntxt = KernelContext(ins, outs, Symbol[], Symbol[], Expr[], Expr[], 0, nothing)
cntxt = KernelBuildContext(ins, outs, Symbol[], Symbol[], Expr[], Expr[], 0, nothing)
ins[:Targs] = Targs
ins[:workgroupDims] = :workgroupDims

Expand Down Expand Up @@ -118,7 +121,7 @@ function wgslFunctionStatements(cntxt, stmnts)
end
end

function wgslFunctionStatement(cntxt::KernelContext, stmnt; isLast = false)
function wgslFunctionStatement(cntxt::KernelBuildContext, stmnt; isLast = false)
if @capture(stmnt, a_[b_] = c_)
stmnt = :($(wgslFunctionStatement(cntxt, a))[$(wgslFunctionStatement(cntxt, b))] = $(wgslFunctionStatement(cntxt, c)))
push!(cntxt.stmnts, wgslAssignment(stmnt, nothing))
Expand Down Expand Up @@ -204,8 +207,8 @@ function wgslFunctionStatement(cntxt::KernelContext, stmnt; isLast = false)
end
end

function compileShader(f, args::WgpuArray...)
shaderSrc = getShaderCode(f, args...)
function compileShader(f, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
shaderSrc = getShaderCode(f, args...; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
cShader = nothing
try
cShader = createShaderObj(WGPUCompute.getWgpuDevice(), shaderSrc; savefile=true)
Expand All @@ -218,10 +221,10 @@ function compileShader(f, args::WgpuArray...)
return cShader
end

function preparePipeline(f, args::WgpuArray...)
function preparePipeline(f::Function, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
gpuDevice = WGPUCompute.getWgpuDevice()
cShader = get!(task_local_storage(), (f, :shader, eltype.(args), size.(args))) do
compileShader(f, args...)
compileShader(f, args...; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
end
bindingLayouts = []
bindings = []
Expand Down Expand Up @@ -270,23 +273,53 @@ function compute(f, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
WGPUCore.submit(gpuDevice.queue, [WGPUCore.finish(commandEncoder),])
end

function kernelFunc(funcExpr; workgroupSizes=nothing, workgroupCount=nothing)
workgroupSizes = Meta.eval(workgroupSizes)
workgroupCount = Meta.eval(workgroupCount)
function kernelFunc(funcExpr)
if @capture(funcExpr, function fname_(fargs__) where Targs__ fbody__ end)
kernelfunc = quote
function $fname(args::WgpuArray...)
function $fname(args::Tuple{WgpuArray}, workgroupSizes, workgroupCount)
$preparePipeline($(funcExpr), args...)
$compute($(funcExpr), args...; workgroupSizes=$workgroupSizes, workgroupCount=$workgroupCount)
$compute($(funcExpr), args...; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
return nothing
end
end
return esc(kernelfunc)
end |> unblock
return esc(quote $kernelfunc end)
else
error("Couldnt capture function")
end
end

macro wgpukernel(workgroupSizes, workgroupCount, expr)
kernelFunc(expr; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
# Look up the source text of the method of `func` that matches `args`
# (via CodeTracking) and return it parsed back into a Julia expression.
function getFunctionBlock(func, args)
    method = which(func, args)
    srctext = first(CodeTracking.definition(String, method))
    return Meta.parse(srctext)
end

# Invoke the wrapped kernel closure with the supplied launch arguments.
function wgpuCall(kernelObj::WGPUKernelObject, args...)
    launcher = kernelObj.kernelFunc
    return launcher(args...)
end

#function wgpuKernel(f, args...)
# preparePipeline(f, args...)
#end

# Call-site kernel launch macro:
#
#     @wgpukernel launch=true workgroupSizes=(4,4) workgroupCount=(2,2) kernel(args...)
#
# Expands to code that (1) evaluates the call-site arguments once via
# `assign_args!`, (2) builds a launcher closure that prepares the compute
# pipeline for `kernel` and dispatches it with the given workgroup
# configuration, and (3) either launches immediately (`launch == true`) or
# returns the un-launched `WGPUKernelObject`.
macro wgpukernel(launch, wgSize, wgCount, ex)
    code = quote end
    # Fresh names so the expansion cannot collide with caller locals.
    @gensym kernel_args kernel_tt kernel
    # Fix: a failed capture previously fell through silently, expanding to an
    # empty block; report a usage error instead.
    @capture(ex, fname_(fargs__)) ||
        error("@wgpukernel expects a kernel call expression, got: $ex")
    (vars, var_exprs) = assign_args!(code, fargs)
    push!(code.args, quote
        $kernel_args = ($(var_exprs...),)
        # NOTE(review): the argument-type tuple is currently unused —
        # presumably intended for caching compiled kernels by type; kept.
        $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
        # Fix: the closure was previously bound to the literal name `kernel`,
        # leaking into caller scope despite the @gensym; bind the gensym.
        $kernel = function wgpuKernel(args...)
            $preparePipeline($fname, args...; workgroupSizes=$wgSize, workgroupCount=$wgCount)
            $compute($fname, args...; workgroupSizes=$wgSize, workgroupCount=$wgCount)
        end
        if $launch == true
            wgpuCall(WGPUKernelObject($kernel), $(kernel_args)...)
        else
            WGPUKernelObject($kernel)
        end
    end)
    return esc(code)
end
19 changes: 16 additions & 3 deletions test/test4.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,32 @@ using MacroTools

y = WgpuArray((rand(4, 4, 1) .-0.5) .|> Float32)

(@macroexpand @kernel function Relu(x::WgpuArray{T, N}) where {T, N}
(@macroexpand @wgpukernel workgroupSizes=(4,4) workgroupCount=(2,2) function relu_kernel(
x::WgpuArray{T, N},
out::WgpuArray{T, N}
) where {T, N}
gIdx = globalId.x * globalId.y + globalId.z
value = x[gIdx]
out[gIdx] = max(value, 0.0)
end) |> MacroTools.striplines

@kernel function Relu(x::WgpuArray{T, N}) where {T, N}
# NOTE(review): the reworked @wgpukernel macro in src/compiler/execution.jl now
# takes four arguments (launch, workgroupSizes, workgroupCount, call expression)
# and is applied at the call site, not around a function definition; this
# three-argument definition-site form looks stale — confirm it still expands.
# NOTE(review): gIdx combines globalId.x * globalId.y + globalId.z, which looks
# unusual for a linear index — verify against the 4x4x1 input used above.
@wgpukernel workgroupSizes=(4,4) workgroupCount=(2,2) function relu_kernel(
x::WgpuArray{T, N},
out::WgpuArray{T, N}
) where {T, N}
gIdx = globalId.x * globalId.y + globalId.z
value = x[gIdx]
# Element-wise ReLU: clamp negatives to zero.
out[gIdx] = max(value, 0.0)
end

Relu(y)

# Host-side wrapper: allocate an output like `x` and run the ReLU kernel into it.
# NOTE(review): under the reworked execution API kernels are launched via
# `@wgpukernel launch=true ... relu_kernel(x, y)` (see examples/relu_kernel.jl);
# this bare call presumably relies on the old definition-time wrapper — confirm.
function relu(x::WgpuArray{T, N}) where {T, N}
y = similar(x)
relu_kernel(x, y)
return y
end

relu(y)

# TODO this version should also be useful but may be not
# @kernel Relu(x)