Commit 2f8c002

Merge pull request #44 from arhik/main
atomics support
2 parents e130cff + bfe0dd6

15 files changed: +521 −36 lines changed

examples/atomic.jl

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+using Revise
+using WGPUCompute
+using Test
+
+# Silly example
+# Need to come up with a better example
+
+empty!(task_local_storage())
+
+function atomiccount_kernel(hist::WgpuArray{T, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
+    gId = workgroupId.x*workgroupId.y + localId.x
+    stride = workgroupDims.x*workgroupCount.x
+    @wgpuatomic a::UInt32
+    val = x[gId]
+    a = hist[val]
+    while gId < iSize
+        val = x[gId]
+        a += T(1)
+        gId += stride
+    end
+    hist[val] = a
+end
+
+function atomiccount(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
+    y = WgpuArray{UInt32}(undef, nbins)
+    copyto!(y, hist)
+    @wgpukernel(
+        launch=true,
+        workgroupSizes=(64,),
+        workgroupCount=(1,),
+        shmem=(),
+        atomiccount_kernel(y, x, reduce(*, size(x)) |> UInt32)
+    )
+    return y
+end
+
+nbins = 10
+x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
+count = WgpuArray{UInt32}(zeros(UInt32, 10))
+
+z = atomiccount(x, count)
+
+# histogram(x)
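Since the example is a placeholder, a plain-Julia sketch of the per-bin counting it gestures at may help; the atomiccount_cpu helper below is hypothetical and not part of this commit:

using Test

# Hypothetical CPU reference: count how many inputs land in each bin.
function atomiccount_cpu(x::Vector{UInt32}, nbins::Integer)
    hist = zeros(UInt32, nbins)
    for v in x
        hist[v % UInt32(nbins) + 1] += UInt32(1)  # 1-based bins on the CPU
    end
    return hist
end

xs = rand(UInt32, 64) .% UInt32(10)
@test sum(atomiccount_cpu(xs, 10)) == 64  # every input is counted exactly once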

examples/cast_kernel.jl

Lines changed: 15 additions & 2 deletions
@@ -1,4 +1,5 @@
 using WGPUCompute
+using Test
 
 function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
     xdim = workgroupDims.x
@@ -9,14 +10,26 @@ function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
     out[gId] = S(ceil(x[gId]))
 end
 
+function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
+    gId = xDims.x*globalId.y + globalId.x
+    out[gId] = S(ceil(x[gId]))
+end
+
 function cast(S::DataType, x::WgpuArray{T, N}) where {T, N}
     y = WgpuArray{S}(undef, size(x))
     @wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() cast_kernel(x, y)
     return y
 end
 
-x = WgpuArray{Float32}(rand(Float32, 8, 8) .- 0.5f0)
-z = cast(UInt32, x)
+x = rand(Float32, 8, 8) .- 0.5f0
+
+x_gpu = WgpuArray{Float32}(x)
+z_gpu = cast(UInt32, x_gpu)
+z_cpu = z_gpu |> collect
+
+z = UInt32.(x .> 0.0)
+
+@test z ≈ z_cpu
 
 # TODO Bool cast is not working yet
 # y = cast(Bool, x)

examples/clamp_kernel.jl

Lines changed: 12 additions & 6 deletions
@@ -1,20 +1,26 @@
 using Revise
 using WGPUCompute
+using Test
 
 function clamp_kernel(x::WgpuArray{T, N}, out::WgpuArray{T, N}, minval::T, maxval::T) where {T, N}
-	gId = xDims.x*globalId.y + globalId.x
-	value = x[gId]
-	out[gId] = clamp(value, minval, maxval)
+    gId = xDims.x * globalId.y + globalId.x
+    value = x[gId]
+    out[gId] = clamp(value, minval, maxval)
 end
 
 
 function Base.clamp(x::WgpuArray{T, N}, minValue::T, maxValue::T) where {T, N}
-	y = similar(x)
-	@wgpukernel launch=true workgroupSizes=size(y) workgroupCount=(1, 1) shmem=() clamp_kernel(x, y, minValue, maxValue)
-	return y
+    y = similar(x)
+    @wgpukernel launch=true workgroupSizes=size(y) workgroupCount=(1, 1) shmem=() clamp_kernel(x, y, minValue, maxValue)
+    return y
 end
 
 x = WgpuArray{Float32, 2}(rand(16, 16))
 
 y = Base.clamp(x, 0.2f0, 0.5f0)
+y_cpu = y |> collect
 
+@testset "Clamp minimum and maximum" begin
+    @test minimum(y_cpu) == 0.2f0
+    @test maximum(y_cpu) == 0.5f0
+end
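A stricter elementwise cross-check could follow the testset; the sketch below assumes the example's x and y_cpu are still in scope and that collect round-trips the GPU input:

x_cpu = x |> collect                        # copy the GPU input back to the CPU
@test clamp.(x_cpu, 0.2f0, 0.5f0) ≈ y_cpu  # Base.clamp broadcast should match the kernel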

examples/divfree_reduce_kernel.jl

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+using Revise
+using WGPUCompute
+using Test
+
+empty!(task_local_storage())
+
+function divfree_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
+    gId = xDims.x * globalId.y + globalId.x
+    W = Float32(xDims.x * xDims.y)
+    steps = UInt32(ceil(log2(W)))
+    out[gId] = x[gId]
+    base = 2.0f0
+    for itr in 0:steps
+        exponent = Float32(steps - itr - 1)
+        baseexp = pow(base, exponent)
+        stride = UInt32(baseexp)
+        if localId.x < stride
+            out[gId] = op(out[gId], out[gId + stride])
+        end
+    end
+end
+
+function divfree_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
+    y = WgpuArray{T}(undef, size(x))
+    @wgpukernel(
+        launch = true,
+        workgroupSizes = (8, 8),
+        workgroupCount = (1, 1),
+        shmem = (),
+        divfree_reduce_kernel(x, y, op)
+    )
+    return (y |> collect)
+end
+
+x = WgpuArray{Float32}(rand(Float32, 8, 8))
+z = divfree_reduce(x, +)
+
+x_cpu = (x |> collect)
+
+sum_cpu = sum(x_cpu)
+sum_gpu = (z |> collect)[1]
+
+@test sum_cpu ≈ sum_gpu
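For intuition, a hypothetical CPU model of the stride-halving pass (a sketch, not part of this commit): the stride halves each step and slot i folds in slot i + stride, so the first slot ends up holding the reduction of the whole power-of-two-length array, and the active slots are always a contiguous prefix, which is what avoids divergent branching on the GPU.

using Test

# Hypothetical CPU model of a divergence-free tree reduction.
function divfree_reduce_cpu(x::Vector{T}, op) where {T}
    out = copy(x)
    stride = length(out) ÷ 2      # assumes a power-of-two length
    while stride >= 1
        for i in 1:stride         # a contiguous prefix stays active
            out[i] = op(out[i], out[i + stride])
        end
        stride ÷= 2
    end
    return out[1]
end

xs = rand(Float32, 64)
@test divfree_reduce_cpu(xs, +) ≈ sum(xs)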

examples/gpuarrays.jl

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+using Revise
+using WGPUCompute
+
+x = WgpuArray{UInt32}(rand(UInt32, 10, 10))
+y = WgpuArray{WAtomic{UInt32}}(undef, 10, 10)
+
+cntxt = WGPUCompute.WgpuKernelContext()
+Base.unsafe_copyto!(WGPUCompute.device(y), pointer(y, 1), pointer(x, 1), reduce(*, size(x)))
+
+copyto!(y, x)
+
+copyto!(x, y)
+

examples/histogram.jl

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+using Revise
+using WGPUCompute
+using Test
+
+empty!(task_local_storage())
+
+function histogram_kernel(hist::WgpuArray{WAtomic{T}, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
+    gId = workgroupId.x*workgroupId.y + localId.x
+    stride = workgroupDims.x*workgroupCount.x
+    while gId < iSize
+        val = x[gId]
+        hist[val] += T(1)
+        gId += stride
+    end
+end
+
+function histogram(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
+    y = WgpuArray{WAtomic{UInt32}}(undef, nbins)
+    copyto!(y, hist)
+    @wgpukernel(
+        launch=true,
+        workgroupSizes=(64,),
+        workgroupCount=(1,),
+        shmem=(),
+        histogram_kernel(y, x, reduce(*, size(x)) |> UInt32)
+    )
+    copyto!(hist, y)
+    return hist
+end
+
+nbins = 10
+x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
+hist = WgpuArray{UInt32}(zeros(UInt32, 10))
+
+z = histogram(x, hist)
+
+hist_cpu = zeros(UInt32, nbins)
+for i in (x |> collect)
+    hist_cpu[i%nbins + 1] += 1
+end
+
+@test hist_cpu ≈ (hist |> collect)
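The WAtomic element type matters here because hist[val] += T(1) is a cross-thread read-modify-write. A CPU analogue of the race it guards against, using Base.Threads (a sketch, not part of this commit):

using Base.Threads

vals = rand(UInt32, 10_000) .% UInt32(10)

# Racy version: a plain += from many threads can lose updates.
racy = zeros(Int, 10)
@threads for v in vals
    racy[v + 1] += 1               # unsynchronized read-modify-write
end

# Safe version: atomic counters make each increment indivisible.
safe = [Atomic{Int}(0) for _ in 1:10]
@threads for v in vals
    atomic_add!(safe[v + 1], 1)    # atomic read-modify-write, no lost updates
end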

examples/localarray.jl

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+using Revise
+using WGPUCompute
+using Test
+
+function localarray_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
+    a = Vec4{Float32}(1.0f0, 2.0f0, 3.0f0, 4.0f0);  # unused local value; exercises local-array support
+    gId = xDims.x*globalId.y + globalId.x
+    out[gId] = S(ceil(x[gId]))
+end
+
+function localarray(S::DataType, x::WgpuArray{T, N}) where {T, N}
+    y = WgpuArray{S}(undef, size(x))
+    @wgpukernel launch = true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() localarray_kernel(x, y)
+    return y
+end
+
+x = rand(Float32, 8, 8) .- 0.5f0
+
+x_gpu = WgpuArray{Float32}(x)
+z_gpu = localarray(UInt32, x_gpu)
+z_cpu = z_gpu |> collect
+
+z = UInt32.(x .> 0.0)
+
+@test z ≈ z_cpu
+

examples/matmul_kernel.jl

Lines changed: 1 addition & 1 deletion
@@ -46,4 +46,4 @@ out = matmul(x, y)
 
 xcpu*ycpu
 
-@test (xcpu*ycpu) == (out |> collect)
+@test (xcpu*ycpu) ≈ (out |> collect)

examples/naive_reduce_mul.jl

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+using Revise
+using WGPUCompute
+using Test
+
+empty!(task_local_storage())
+
+function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
+    gId = xDims.x * globalId.y + globalId.x
+    W = Float32(xDims.x * xDims.y)
+    steps = UInt32(ceil(log2(W)))
+    out[gId] = x[gId]
+    base = 2.0f0
+    for itr in 0:steps
+        if gId%2 == 0
+            exponent = Float32(itr)
+            baseexp = pow(base, exponent)
+            stride = UInt32(baseexp)
+            out[gId] = op(out[gId], out[gId + stride])
+        end
+    end
+end
+
+function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
+    y = WgpuArray{T}(undef, size(x))
+    @wgpukernel(
+        launch = true,
+        workgroupSizes = (8, 8),
+        workgroupCount = (1, 1),
+        shmem = (),
+        naive_reduce_kernel(x, y, op)
+    )
+    return (y |> collect)
+end
+
+x = WgpuArray{Float32}(rand(Float32, 8, 8))
+z = naive_reduce(x, *)
+
+x_cpu = (x |> collect)
+
+mul_cpu = reduce(*, x_cpu)
+mul_gpu = (z |> collect)[1]
+
+@test mul_cpu ≈ mul_gpu
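For reference, a hypothetical CPU model of the interleaved-addressing scheme this kernel approximates (a sketch, not part of this commit): the stride doubles each step and only every (2*stride)-th slot stays active, which is exactly the scattered access pattern that makes the naive variant branch-divergent on a GPU.

using Test

# Hypothetical CPU model of an interleaved-addressing ("naive") reduction.
function naive_reduce_cpu(x::Vector{T}, op) where {T}
    out = copy(x)
    n = length(out)               # assumes a power-of-two length
    stride = 1
    while stride < n
        for i in 1:(2*stride):n   # only every (2*stride)-th slot stays active
            out[i] = op(out[i], out[i + stride])
        end
        stride *= 2
    end
    return out[1]
end

xs = rand(Float32, 64) .+ 0.5f0   # keep the running product well-conditioned
@test naive_reduce_cpu(xs, *) ≈ reduce(*, xs)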

examples/naive_reduce_plus.jl

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+using Revise
+using WGPUCompute
+using Test
+
+empty!(task_local_storage())
+
+function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}, op::Function) where {T,N}
+    gId = xDims.x * globalId.y + globalId.x
+    W = Float32(xDims.x * xDims.y)
+    steps = UInt32(ceil(log2(W)))
+    out[gId] = x[gId]
+    base = 2.0f0
+    for itr in 0:steps
+        if gId%2 == 0
+            exponent = Float32(itr)
+            baseexp = pow(base, exponent)
+            stride = UInt32(baseexp)
+            out[gId] = op(out[gId], out[gId + stride])
+        end
+    end
+end
+
+function naive_reduce(x::WgpuArray{T,N}, op::Function) where {T,N}
+    y = WgpuArray{T}(undef, size(x))
+    @wgpukernel(
+        launch = true,
+        workgroupSizes = (8, 8),
+        workgroupCount = (1, 1),
+        shmem = (),
+        naive_reduce_kernel(x, y, op)
+    )
+    return (y |> collect)
+end
+
+x = WgpuArray{Float32}(rand(Float32, 8, 8))
+z = naive_reduce(x, +)
+
+x_cpu = (x |> collect)
+
+sum_cpu = sum(x_cpu)
+sum_gpu = (z |> collect)[1]
+
+@test sum_cpu ≈ sum_gpu
