43 changes: 43 additions & 0 deletions examples/atomic.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

# Silly example
# Need to come up with a better example

empty!(task_local_storage())

function atomiccount_kernel(hist::WgpuArray{T, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
gId = workgroupId.x*workgroupDims.x + localId.x # global thread index
stride = workgroupDims.x*workgroupCount.x # total number of launched threads
@wgpuatomic a::UInt32
val = x[gId]
a = hist[val]
while gId < iSize
val = x[gId]
a += T(1)
gId += stride
end
hist[val] = a
end

function atomiccount(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
y = WgpuArray{UInt32}(undef, size(hist))
copyto!(y, hist)
@wgpukernel(
launch=true,
workgroupSizes=(64,),
workgroupCount=(1,),
shmem=(),
atomiccount_kernel(y, x, reduce(*, size(x)) |> UInt32)
)
return y
end

nbins = 10
x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
count = WgpuArray{UInt32}(zeros(UInt32, nbins))

z = atomiccount(x, count)

# histogram(x)
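A CPU cross-check in the style of the other examples could accompany this once the kernel settles; a minimal sketch, assuming the kernel is meant to bin values 0:nbins-1 (hypothetical, not part of this diff):

# Hypothetical CPU reference, mirroring examples/histogram.jl
x_cpu = x |> collect
expected = zeros(UInt32, nbins)
for v in x_cpu
    expected[v % nbins + 1] += 1  # kernel bins are 0-based, Julia arrays 1-based
end
# @test expected == (z |> collect)  # left commented while the kernel is a placeholder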
17 changes: 15 additions & 2 deletions examples/cast_kernel.jl
@@ -1,4 +1,5 @@
using WGPUCompute
using Test

function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
xdim = workgroupDims.x
@@ -9,14 +10,26 @@ function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
out[gId] = S(ceil(x[gId]))
end

# NOTE: same signature as the definition above, so this flat-index version replaces that method
function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
gId = xDims.x*globalId.y + globalId.x
out[gId] = S(ceil(x[gId]))
end

function cast(S::DataType, x::WgpuArray{T, N}) where {T, N}
y = WgpuArray{S}(undef, size(x))
@wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() cast_kernel(x, y)
return y
end

x = WgpuArray{Float32}(rand(Float32, 8, 8) .- 0.5f0)
z = cast(UInt32, x)
x = rand(Float32, 8, 8) .- 0.5f0

x_gpu = WgpuArray{Float32}(x)
z_gpu = cast(UInt32, x_gpu)
z_cpu = z_gpu |> collect

z = UInt32.(x .> 0.0)

@test z ≈ z_cpu

# TODO Bool cast is not working yet
# y = cast(Bool, x)
18 changes: 12 additions & 6 deletions examples/clamp_kernel.jl
@@ -1,20 +1,26 @@
using Revise
using WGPUCompute
using Test

function clamp_kernel(x::WgpuArray{T, N}, out::WgpuArray{T, N}, minval::T, maxval::T) where {T, N}
gId = xDims.x * globalId.y + globalId.x
value = x[gId]
out[gId] = clamp(value, minval, maxval)
end


function Base.clamp(x::WgpuArray{T, N}, minValue::T, maxValue::T) where {T, N}
y = similar(x)
@wgpukernel launch=true workgroupSizes=size(y) workgroupCount=(1, 1) shmem=() clamp_kernel(x, y, minValue, maxValue)
return y
end

x = WgpuArray{Float32, 2}(rand(Float32, 16, 16))

y = Base.clamp(x, 0.2f0, 0.5f0)
y_cpu = y |> collect

@testset "Clamp minimum and maximum" begin
@test minimum(y_cpu) == 0.2f0
@test maximum(y_cpu) == 0.5f0
end
43 changes: 43 additions & 0 deletions examples/divfree_reduce_kernel.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function divfree_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
gId = xDims.x * globalId.y + globalId.x
W = Float32(xDims.x * xDims.y)
steps = UInt32(ceil(log2(W)))
out[gId] = x[gId]
base = 2.0f0
for itr in 0:steps
exponent = Float32(steps - itr - 1) # stride halves each step: 2^(steps-1), ..., 2, 1
baseexp = pow(base, exponent)
stride = UInt32(baseexp)
if localId.x < stride # active threads stay contiguous, avoiding branch divergence
out[gId] = op(out[gId], out[gId + stride])
end
end
end

function divfree_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
y = WgpuArray{T}(undef, size(x))
@wgpukernel(
launch = true,
workgroupSizes = (8, 8),
workgroupCount = (1, 1),
shmem = (),
divfree_reduce_kernel(x, y, op)
)
return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = divfree_reduce(x, +)

x_cpu = (x |> collect)

sum_cpu = sum(x_cpu)
sum_gpu = (z|>collect)[1]

@test sum_cpu ≈ sum_gpu
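For reference, the kernel's halving-stride schedule can be traced sequentially on the CPU; a minimal sketch, assuming a single workgroup covering a power-of-two-length array (hypothetical, not part of this diff):

function tree_reduce_cpu(v::Vector{Float32}, op::Function)
    out = copy(v)
    stride = length(out) ÷ 2
    while stride >= 1
        for i in 1:stride  # plays the role of the localId.x < stride threads
            out[i] = op(out[i], out[i + stride])
        end
        stride ÷= 2
    end
    return out[1]  # the reduced value lands in the first slot
end

@test tree_reduce_cpu(vec(x_cpu), +) ≈ sum_cpu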
13 changes: 13 additions & 0 deletions examples/gpuarrays.jl
@@ -0,0 +1,13 @@
using Revise
using WGPUCompute

x = WgpuArray{UInt32}(rand(UInt32, 10, 10))
y = WgpuArray{WAtomic{UInt32}}(undef, 10, 10)

cntxt = WGPUCompute.WgpuKernelContext()
Base.unsafe_copyto!(WGPUCompute.device(y), pointer(y, 1), pointer(x, 1), reduce(*, size(x)))

copyto!(y, x)

copyto!(x, y)
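A round-trip check would make these copies verifiable; a minimal sketch, assuming collect is defined for the plain UInt32 array (hypothetical):

using Test
x_before = x |> collect  # snapshot before the round trip
copyto!(y, x)            # UInt32 buffer into the WAtomic{UInt32} buffer
copyto!(x, y)            # and back again
@test (x |> collect) == x_before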

42 changes: 42 additions & 0 deletions examples/histogram.jl
@@ -0,0 +1,42 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function histogram_kernel(hist::WgpuArray{WAtomic{T}, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
gId = workgroupId.x*workgroupDims.x + localId.x # global thread index
stride = workgroupDims.x*workgroupCount.x # total number of launched threads
while gId < iSize
val = x[gId]
hist[val] += T(1)
gId += stride
end
end

function histogram(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
y = WgpuArray{WAtomic{UInt32}}(undef, size(hist))
copyto!(y, hist)
@wgpukernel(
launch=true,
workgroupSizes=(64,),
workgroupCount=(1,),
shmem=(),
histogram_kernel(y, x, reduce(*, size(x)) |> UInt32)
)
copyto!(hist, y)
return hist
end

nbins = 10
x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
hist = WgpuArray{UInt32}(zeros(UInt32, nbins))

z = histogram(x, hist)

hist_cpu = zeros(UInt32, nbins)
for i in (x |> collect)
hist_cpu[i%nbins + 1] += 1
end

@test hist_cpu ≈ hist |> collect
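The kernel relies on a grid-stride loop so a fixed pool of 64 threads can cover inputs of any length; traced per thread on the CPU, the access pattern looks like this sketch (hypothetical helper, not part of this diff):

# Indices one kernel thread visits under the grid-stride loop.
function thread_indices(t::Int, nthreads::Int, isize::Int)
    idxs = Int[]
    g = t  # gId starts at the thread's global index (0-based)
    while g < isize
        push!(idxs, g)
        g += nthreads  # stride = workgroupDims.x * workgroupCount.x
    end
    return idxs
end

# With 64 threads and 64 elements, each thread visits exactly one index:
@test thread_indices(0, 64, 64) == [0]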
26 changes: 26 additions & 0 deletions examples/localarray.jl
@@ -0,0 +1,26 @@
using Revise
using WGPUCompute
using Test

function localarray_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
a = Vec4{Float32}(1.0f0, 2.0f0, 3.0f0, 4.0f0) # demonstrates declaring a local Vec4; otherwise unused
gId = xDims.x*globalId.y + globalId.x
out[gId] = S(ceil(x[gId]))
end

function localarray(S::DataType, x::WgpuArray{T, N}) where {T, N}
y = WgpuArray{S}(undef, size(x))
@wgpukernel launch = true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() localarray_kernel(x, y)
return y
end

x = rand(Float32, 8, 8) .- 0.5f0

x_gpu = WgpuArray{Float32}(x)
z_gpu = localarray(UInt32, x_gpu)
z_cpu = z_gpu |> collect

z = UInt32.(x .> 0.0)

@test z ≈ z_cpu

2 changes: 1 addition & 1 deletion examples/matmul_kernel.jl
@@ -46,4 +46,4 @@ out = matmul(x, y)

xcpu*ycpu

@test (xcpu*ycpu) ≈ (out |> collect)
43 changes: 43 additions & 0 deletions examples/naive_reduce_mul.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
gId = xDims.x * globalId.y + globalId.x
W = Float32(xDims.x * xDims.y)
steps = UInt32(ceil(log2(W)))
out[gId] = x[gId]
base = 2.0f0
for itr in 0:steps
if gId%2 == 0
exponent = Float32(itr)
baseexp = pow(base, exponent)
stride = UInt32(baseexp)
out[gId] = op(out[gId], out[gId + stride])
end
end
end

function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
y = WgpuArray{T}(undef, size(x))
@wgpukernel(
launch = true,
workgroupSizes = (8, 8),
workgroupCount = (1, 1),
shmem = (),
naive_reduce_kernel(x, y, op)
)
return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = naive_reduce(x, *)

x_cpu = (x |> collect)

mul_cpu = reduce(*, x_cpu)
mul_gpu = (z|>collect)[1]

@test mul_cpu ≈ mul_gpu
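The doubling-stride schedule can be traced sequentially on the CPU to see what the kernel computes; a minimal sketch under the same even-index rule (hypothetical; sequential execution sidesteps the read/write race that makes the GPU version naive):

function naive_reduce_cpu(v::Vector{T}, op::Function) where T
    out = copy(v)
    n = length(out)
    steps = Int(ceil(log2(n)))
    for itr in 0:steps-1
        stride = 2^itr  # 1, 2, 4, ...
        for g in 0:2:n-1  # even gIds, as in the kernel
            if g + stride < n  # skip the out-of-bounds folds
                out[g + 1] = op(out[g + 1], out[g + stride + 1])
            end
        end
    end
    return out[1]
end

@test naive_reduce_cpu(vec(x_cpu), *) ≈ mul_cpu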
43 changes: 43 additions & 0 deletions examples/naive_reduce_plus.jl
@@ -0,0 +1,43 @@
using Revise
using WGPUCompute
using Test

empty!(task_local_storage())

function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
gId = xDims.x * globalId.y + globalId.x
W = Float32(xDims.x * xDims.y)
steps = UInt32(ceil(log2(W)))
out[gId] = x[gId]
base = 2.0f0
for itr in 0:steps
if gId%2 == 0
exponent = Float32(itr)
baseexp = pow(base, exponent)
stride = UInt32(baseexp)
out[gId] = op(out[gId], out[gId + stride])
end
end
end

function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
y = WgpuArray{T}(undef, size(x))
@wgpukernel(
launch = true,
workgroupSizes = (8, 8),
workgroupCount = (1, 1),
shmem = (),
naive_reduce_kernel(x, y, op)
)
return (y |> collect)
end

x = WgpuArray{Float32}(rand(Float32, 8, 8))
z = naive_reduce(x, +)

x_cpu = (x |> collect)

sum_cpu = sum(x_cpu)
sum_gpu = (z|>collect)[1]

@test sum_cpu ≈ sum_gpu
3 changes: 2 additions & 1 deletion examples/reduce_kernel.jl
@@ -13,7 +13,8 @@ function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}) where {T,N}
for itr in 0:steps
if gId%2 == 0
exponent = Float32(itr)
baseexp = pow(base, exponent)
stride = UInt32(baseexp)
out[gId] += out[gId + stride]
end
end