Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions examples/cast_kernel.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using WGPUCompute

@wgpukernel workgroupSizes=(4, 4) workgroupCount=(2, 2) function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
xdim = workgroupDims.x
ydim = workgroupDims.y
gIdx = workgroupId.x*xdim + localId.x
Expand All @@ -11,11 +11,11 @@ end

# Cast the elements of `x` to element type `S` on the GPU.
#
# Allocates an uninitialized output of the same shape as `x` and launches
# `cast_kernel` with a fixed 4x4 workgroup size and 2x2 workgroup count
# (sized for the 8x8 example input below).
function cast(S::DataType, x::WgpuArray{T, N}) where {T, N}
    y = WgpuArray{S}(undef, size(x))
    # Fix: the stale pre-refactor direct call `cast_kernel(x, y)` was left in
    # alongside its replacement; the kernel is launched only through the
    # call-site @wgpukernel macro now.
    @wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) cast_kernel(x, y)
    return y
end

# Example: cast a small random Float32 array to UInt32 on the GPU.
# Fix: the stale 1024x1024 initialization was left in alongside its 8x8
# replacement; only the 8x8 input matches the (4,4)x(2,2) launch configuration.
x = WgpuArray{Float32}(rand(Float32, 8, 8) .- 0.5f0)
z = cast(UInt32, x)

# TODO Bool cast is not working yet
Expand Down
23 changes: 23 additions & 0 deletions examples/relu_kernel.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using WGPUCompute
using MacroTools

y = WgpuArray((rand(4, 4) .-0.5) .|> Float32)

# GPU kernel body for element-wise ReLU, written in the WGSL-translation DSL.
# `workgroupDims`, `workgroupId`, `localId` and `xDims` are kernel builtins
# that the @wgpukernel machinery maps to WGSL intrinsics — they are not Julia
# bindings, so this function is only meaningful when compiled to a shader.
function relu_kernel(x::WgpuArray{T, N}, out::WgpuArray{T, N}) where {T, N}
xdim = workgroupDims.x
ydim = workgroupDims.y
# Global x/y coordinates of this invocation within the dispatch grid.
gIdx = workgroupId.x*xdim + localId.x
gIdy = workgroupId.y*ydim + localId.y
# Flatten to a linear index; row length presumably comes from the input's
# x extent (xDims.x) — TODO confirm against the shader codegen.
gId = xDims.x*gIdy + gIdx
value = x[gId]
# NOTE(review): literal 0.0 is Float64 while T is typically Float32 —
# presumably the WGSL generator normalizes the literal type; confirm.
out[gId] = max(value, 0.0)
end

# Element-wise ReLU of `x`, computed on the GPU.
#
# Allocates an output with the same shape/eltype as `x` and launches
# `relu_kernel` with a 4x4 workgroup in a single workgroup (sized for the
# 4x4 example input above).
function relu(x::WgpuArray{T, N}) where {T, N}
    y = similar(x)
    # Fix: keyword was misspelled `workgrouSizes` (missing 'p'), so the macro
    # would not have received the intended workgroup size.
    @wgpukernel launch=true workgroupSizes=(4,4) workgroupCount=(1,1) relu_kernel(x, y)
    return y
end

relu(y)

34 changes: 19 additions & 15 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ mutable struct WgpuArray{T, N} <: AbstractGPUArray{T, N}
end

dev = getCurrentDevice()

if bufsize > 0
storageData = Array{T}(undef, prod(dims))
(storageBuffer, _) = WGPUCore.createBufferWithData(
Expand All @@ -150,14 +150,14 @@ mutable struct WgpuArray{T, N} <: AbstractGPUArray{T, N}
storageData,
["Storage", "CopyDst", "CopySrc"],
)
bindGroup = nothing
computePipeline = nothing
obj = new(dims, storageData, maxSize, 0, storageBuffer, bindGroup, computePipeline)
finalizer(obj) do arr
obj = nothing
end
return obj
end
bindGroup = nothing
computePipeline = nothing
obj = new(dims, storageData, maxSize, 0, storageBuffer, bindGroup, computePipeline)
finalizer(obj) do arr
obj = nothing
end
return obj
end

function WgpuArray{T, N}(buffer::WgpuArray, dims::Dims{T}) where {T, N}
Expand All @@ -173,21 +173,26 @@ end
Base.eltype(::Type{WgpuArray{T}}) where T = T
Base.eltype(::Type{WgpuArray{T, N}}) where {T, N} = T

# constructors (borrowed from WgpuArray for quick prototyping)
# constructors (borrowed from CUDA.jl for quick prototyping)
WgpuArray{T, N}(::UndefInitializer, dims::Integer...) where {T, N} =
WgpuArray{T, N}(undef, Dims(dims))
WgpuArray{T, N}(::UndefInitializer, dims::NTuple{N, Integer}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))
WgpuArray{T, N}(::UndefInitializer, dims::Vararg{Integer, N}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))

# type but not dimensionality specified
WgpuArray{T}(::UndefInitializer, dims::Dims{N}) where {T, N} = WgpuArray{T, N}(undef, dims)
WgpuArray{T}(::UndefInitializer, dims::Integer...) where T =
WgpuArray{T}(undef, convert(Tuple{Vararg{Int}}, dims))
WgpuArray{T}(::UndefInitializer, dims::NTuple{N, Integer}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))
WgpuArray{T}(::UndefInitializer, dims::Vararg{Integer, N}) where {T, N} =
WgpuArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims))

# empty vector constructors
WgpuArray{T, 1}() where {T} = WgpuArray{T, 1}(undef, 0)

Base.similar(a::WgpuArray{T,N}) where {T,N} = WgpuArray{T,N}(undef, size(a))
Base.similar(a::WgpuArray{T}, dims::Base.Dims{N}) where {T,N} = WgpuArray{T,N}(undef, dims)
Base.similar(a::WgpuArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} =
Base.similar(a::WgpuArray{T, <:Any}, dims::Base.Dims{N}) where {T,N} = WgpuArray{T,N}(undef, dims)
Base.similar(a::WgpuArray{<:Any, <:Any}, ::Type{T}, dims::Base.Dims{N}) where {T,N} =
WgpuArray{T,N}(undef, dims)

function Base.copy(a::WgpuArray{T,N}) where {T,N}
Expand Down Expand Up @@ -274,7 +279,6 @@ Base.collect(x::WgpuArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x))
device(array::WgpuArray) = array.storageBuffer.device

## memory copying

function Base.copyto!(dest::WgpuArray{T}, doffs::Integer, src::Array{T}, soffs::Integer,
n::Integer) where T
(n==0 || sizeof(T) == 0) && return dest
Expand Down
8 changes: 4 additions & 4 deletions src/broadcast.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ WgpuArrayStyle{M}(::Val{N}) where {N,M} = WgpuArrayStyle{N}()

BroadcastStyle(::Type{<:WgpuArray{T,N}}) where {T,N} = WgpuArrayStyle{N}()

Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}) where {N,T} =
similar(WgpuArray{T}, axes(bc))
#Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}) where {N,T} =
# similar(WgpuArray{T}, axes(bc))

Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}, dims...) where {N,T} =
WgpuArray{T}(undef, dims...)
Base.similar(bc::Broadcasted{WgpuArrayStyle{N}}, ::Type{T}, dims) where {N,T} =
Base.similar(WgpuArray{T, length(dims)}, dims)

# broadcasting type ctors isnt GPU compatible
Broadcast.broadcasted(::WgpuArrayStyle{N}, f::Type{T}, args...) where {N, T} =
Expand Down
75 changes: 54 additions & 21 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
export @wgpukernel, getShaderCode
export @wgpukernel, getShaderCode, WGPUKernelObject, wgpuCall

using MacroTools
using CodeTracking
using Lazy
using Infiltrator

# TODO remove
using WGPUCompute
using Infiltrator

mutable struct KernelContext
# Thin wrapper around a launchable kernel closure so that launches can be
# dispatched through `wgpuCall(::WGPUKernelObject, args...)` rather than a
# bare function; also lets `@wgpukernel launch=false` hand back a deferred
# kernel object.
struct WGPUKernelObject
# the closure built by @wgpukernel that prepares the pipeline and dispatches
kernelFunc::Function
end

mutable struct KernelBuildContext
inargs::Dict{Symbol, Any}
outargs::Dict{Symbol, Any}
tmpargs::Array{Symbol}
Expand All @@ -19,11 +24,9 @@ mutable struct KernelContext
kernel # TODO function body with all statements will reside here
end

function getShaderCode(f, args::WgpuArray...)
function getShaderCode(f, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
fexpr = @code_string(f(args...)) |> Meta.parse
@capture(fexpr, @wgpukernel workgroupSizes_ workgroupCount_ function fname_(fargs__) where Targs__ fbody__ end)
workgroupSizes = Meta.eval(workgroupSizes)
workgroupCount = Meta.eval(workgroupCount)
@capture(fexpr, function fname_(fargs__) where Targs__ fbody__ end)
originArgs = fargs[:]
builtinArgs = [
:(@builtin(global_invocation_id, global_id::Vec3{UInt32})),
Expand All @@ -46,7 +49,7 @@ function getShaderCode(f, args::WgpuArray...)
ins = Dict{Symbol, Any}()
outs = Dict{Symbol, Any}()

cntxt = KernelContext(ins, outs, Symbol[], Symbol[], Expr[], Expr[], 0, nothing)
cntxt = KernelBuildContext(ins, outs, Symbol[], Symbol[], Expr[], Expr[], 0, nothing)
ins[:Targs] = Targs
ins[:workgroupDims] = :workgroupDims

Expand Down Expand Up @@ -118,7 +121,7 @@ function wgslFunctionStatements(cntxt, stmnts)
end
end

function wgslFunctionStatement(cntxt::KernelContext, stmnt; isLast = false)
function wgslFunctionStatement(cntxt::KernelBuildContext, stmnt; isLast = false)
if @capture(stmnt, a_[b_] = c_)
stmnt = :($(wgslFunctionStatement(cntxt, a))[$(wgslFunctionStatement(cntxt, b))] = $(wgslFunctionStatement(cntxt, c)))
push!(cntxt.stmnts, wgslAssignment(stmnt, nothing))
Expand Down Expand Up @@ -204,8 +207,8 @@ function wgslFunctionStatement(cntxt::KernelContext, stmnt; isLast = false)
end
end

function compileShader(f, args::WgpuArray...)
shaderSrc = getShaderCode(f, args...)
function compileShader(f, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
shaderSrc = getShaderCode(f, args...; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
cShader = nothing
try
cShader = createShaderObj(WGPUCompute.getWgpuDevice(), shaderSrc; savefile=true)
Expand All @@ -218,10 +221,10 @@ function compileShader(f, args::WgpuArray...)
return cShader
end

function preparePipeline(f, args::WgpuArray...)
function preparePipeline(f::Function, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
gpuDevice = WGPUCompute.getWgpuDevice()
cShader = get!(task_local_storage(), (f, :shader, eltype.(args), size.(args))) do
compileShader(f, args...)
compileShader(f, args...; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
end
bindingLayouts = []
bindings = []
Expand Down Expand Up @@ -270,23 +273,53 @@ function compute(f, args::WgpuArray...; workgroupSizes=(), workgroupCount=())
WGPUCore.submit(gpuDevice.queue, [WGPUCore.finish(commandEncoder),])
end

function kernelFunc(funcExpr; workgroupSizes=nothing, workgroupCount=nothing)
workgroupSizes = Meta.eval(workgroupSizes)
workgroupCount = Meta.eval(workgroupCount)
function kernelFunc(funcExpr)
if @capture(funcExpr, function fname_(fargs__) where Targs__ fbody__ end)
kernelfunc = quote
function $fname(args::WgpuArray...)
function $fname(args::Tuple{WgpuArray}, workgroupSizes, workgroupCount)
$preparePipeline($(funcExpr), args...)
$compute($(funcExpr), args...; workgroupSizes=$workgroupSizes, workgroupCount=$workgroupCount)
$compute($(funcExpr), args...; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
return nothing
end
end
return esc(kernelfunc)
end |> unblock
return esc(quote $kernelfunc end)
else
error("Couldnt capture function")
end
end

macro wgpukernel(workgroupSizes, workgroupCount, expr)
kernelFunc(expr; workgroupSizes=workgroupSizes, workgroupCount=workgroupCount)
# Look up the source text of the method of `func` that matches `args`
# (via CodeTracking) and return it parsed back into a Julia expression.
function getFunctionBlock(func, args)
    method = which(func, args)
    srctext = first(CodeTracking.definition(String, method))
    return Meta.parse(srctext)
end

# Invoke the wrapped kernel closure with the supplied launch arguments.
function wgpuCall(kernelObj::WGPUKernelObject, args...)
    launcher = kernelObj.kernelFunc
    return launcher(args...)
end

#function wgpuKernel(f, args...)
# preparePipeline(f, args...)
#end

# Call-site kernel launch macro:
#
#     @wgpukernel launch=true workgroupSizes=(4,4) workgroupCount=(2,2) kernel(args...)
#
# Expands to code that (1) evaluates the call-site arguments once via
# `assign_args!`, (2) builds a launcher closure that prepares the compute
# pipeline for `kernel` and dispatches it with the given workgroup
# configuration, and (3) either launches immediately (`launch == true`) or
# returns the un-launched `WGPUKernelObject`.
macro wgpukernel(launch, wgSize, wgCount, ex)
    code = quote end
    # Fresh names so the expansion cannot collide with caller locals.
    @gensym kernel_args kernel_tt kernel
    # Fix: a failed capture previously fell through silently, expanding to an
    # empty block; report a usage error instead.
    @capture(ex, fname_(fargs__)) ||
        error("@wgpukernel expects a kernel call expression, got: $ex")
    (vars, var_exprs) = assign_args!(code, fargs)
    push!(code.args, quote
        $kernel_args = ($(var_exprs...),)
        # NOTE(review): the argument-type tuple is currently unused —
        # presumably intended for caching compiled kernels by type; kept.
        $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
        # Fix: the closure was previously bound to the literal name `kernel`,
        # leaking into caller scope despite the @gensym; bind the gensym.
        $kernel = function wgpuKernel(args...)
            $preparePipeline($fname, args...; workgroupSizes=$wgSize, workgroupCount=$wgCount)
            $compute($fname, args...; workgroupSizes=$wgSize, workgroupCount=$wgCount)
        end
        if $launch == true
            wgpuCall(WGPUKernelObject($kernel), $(kernel_args)...)
        else
            WGPUKernelObject($kernel)
        end
    end)
    return esc(code)
end
19 changes: 16 additions & 3 deletions test/test4.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,32 @@ using MacroTools

y = WgpuArray((rand(4, 4, 1) .-0.5) .|> Float32)

(@macroexpand @kernel function Relu(x::WgpuArray{T, N}) where {T, N}
(@macroexpand @wgpukernel workgroupSizes=(4,4) workgroupCount=(2,2) function relu_kernel(
x::WgpuArray{T, N},
out::WgpuArray{T, N}
) where {T, N}
gIdx = globalId.x * globalId.y + globalId.z
value = x[gIdx]
out[gIdx] = max(value, 0.0)
end) |> MacroTools.striplines

@kernel function Relu(x::WgpuArray{T, N}) where {T, N}
# NOTE(review): the reworked @wgpukernel macro in src/compiler/execution.jl now
# takes four arguments (launch, workgroupSizes, workgroupCount, call expression)
# and is applied at the call site, not around a function definition; this
# three-argument definition-site form looks stale — confirm it still expands.
# NOTE(review): gIdx combines globalId.x * globalId.y + globalId.z, which looks
# unusual for a linear index — verify against the 4x4x1 input used above.
@wgpukernel workgroupSizes=(4,4) workgroupCount=(2,2) function relu_kernel(
x::WgpuArray{T, N},
out::WgpuArray{T, N}
) where {T, N}
gIdx = globalId.x * globalId.y + globalId.z
value = x[gIdx]
# Element-wise ReLU: clamp negatives to zero.
out[gIdx] = max(value, 0.0)
end

Relu(y)

# Host-side wrapper: allocate an output like `x` and run the ReLU kernel into it.
# NOTE(review): under the reworked execution API kernels are launched via
# `@wgpukernel launch=true ... relu_kernel(x, y)` (see examples/relu_kernel.jl);
# this bare call presumably relies on the old definition-time wrapper — confirm.
function relu(x::WgpuArray{T, N}) where {T, N}
y = similar(x)
relu_kernel(x, y)
return y
end

relu(y)

# TODO this version should also be useful but may be not
# @kernel Relu(x)