Commit 14810fd

update examples
1 parent a1b38dc commit 14810fd

5 files changed: +77 -4 lines changed

examples/atomic.jl

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+using Revise
+using WGPUCompute
+using Test
+
+# Silly example
+# Need to come up with a better example
+
+empty!(task_local_storage())
+
+function atomiccount_kernel(hist::WgpuArray{T, N1}, x::WgpuArray{T, N2}, iSize::UInt32) where {T, N1, N2}
+    gId = workgroupId.x*workgroupId.y + localId.x
+    stride = workgroupDims.x*workgroupCount.x
+    @wgpuatomic a::UInt32
+    val = x[gId]
+    a = hist[val]
+    while gId < iSize
+        val = x[gId]
+        a += T(1)
+        gId += stride
+    end
+    hist[val] = a
+end
+
+function atomiccount(x::WgpuArray{T, N1}, hist::WgpuArray{S, N2}) where {T, S, N1, N2}
+    y = WgpuArray{UInt32}(undef, nbins)
+    copyto!(y, hist)
+    @wgpukernel(
+        launch=true,
+        workgroupSizes=(64,),
+        workgroupCount=(1,),
+        shmem=(),
+        atomiccount_kernel(y, x, reduce(*, size(x)) |> UInt32)
+    )
+    return y
+end
+
+nbins = 10
+x = WgpuArray{UInt32}(rand(UInt32, 64) .% nbins)
+count = WgpuArray{UInt32}(zeros(UInt32, 10))
+
+z = atomiccount(x, count)
+
+# histogram(x)
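
As a reading aid (not part of this commit), a minimal CPU-side sketch of the histogram this kernel is aiming for could look like the lines below; it assumes the 0-based bin values 0:nbins-1 produced by `rand(UInt32, 64) .% nbins` map onto Julia's 1-based indices, and that `z` will hold the per-bin counts once the kernel example is worked out.

# Hypothetical CPU reference (not in the commit): count each bin value in x.
x_cpu = x |> collect
hist_cpu = zeros(UInt32, nbins)
for v in x_cpu
    hist_cpu[Int(v) + 1] += UInt32(1)   # shift 0-based bin values to 1-based indices
end
# @test hist_cpu == (z |> collect)      # enable once the kernel example is finalized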

examples/cast_kernel.jl

Lines changed: 5 additions & 0 deletions
@@ -10,6 +10,11 @@ function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
 	out[gId] = S(ceil(x[gId]))
 end
 
+function cast_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
+    gId = xDims.x*globalId.y + globalId.x
+    out[gId] = S(ceil(x[gId]))
+end
+
 function cast(S::DataType, x::WgpuArray{T, N}) where {T, N}
 	y = WgpuArray{S}(undef, size(x))
 	@wgpukernel launch=true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() cast_kernel(x, y)
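
The hunk above shows only part of examples/cast_kernel.jl; as a hedged sketch (not part of the commit), a call to `cast` would presumably mirror the other examples, with an 8x8 input matching the (4, 4) workgroup size and (2, 2) workgroup count:

# Hypothetical usage, assuming cast(S, x) returns a WgpuArray{S} as defined above.
x_gpu = WgpuArray{Float32}(rand(Float32, 8, 8))
y_gpu = cast(UInt32, x_gpu)
@test (y_gpu |> collect) == UInt32.(ceil.(x_gpu |> collect))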

examples/localarray.jl

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+using Revise
+using WGPUCompute
+using Test
+
+function localarray_kernel(x::WgpuArray{T, N}, out::WgpuArray{S, N}) where {T, S, N}
+    a = Vec4{Float32}(1.0f0, 2.0f0, 3.0f0, 4.0f0);
+    gId = xDims.x*globalId.y + globalId.x
+    out[gId] = S(ceil(x[gId]))
+end
+
+function localarray(S::DataType, x::WgpuArray{T, N}) where {T, N}
+    y = WgpuArray{S}(undef, size(x))
+    @wgpukernel launch = true workgroupSizes=(4, 4) workgroupCount=(2, 2) shmem=() localarray_kernel(x, y)
+    return y
+end
+
+x = rand(Float32, 8, 8) .- 0.5f0
+
+x_gpu = WgpuArray{Float32}(x)
+z_gpu = localarray(UInt32, x_gpu)
+z_cpu = z_gpu |> collect
+
+z = UInt32.(x .> 0.0)
+
+@test z ≈ z_cpu
+

examples/naive_reduce_mul.jl

Lines changed: 3 additions & 3 deletions
@@ -37,7 +37,7 @@ z = naive_reduce(x, *)
 
 x_cpu = (x |> collect)
 
-sum_cpu = reduce(*, x_cpu)
-sum_gpu = (z|>collect)[1]
+mul_cpu = reduce(*, x_cpu)
+mul_gpu = (z|>collect)[1]
 
-@test sum_cpu ≈ sum_gpu
+@test mul_cpu ≈ mul_gpu

src/array.jl

Lines changed: 0 additions & 1 deletion
@@ -163,7 +163,6 @@ function unsafe_fill!(gpuDevice, dst::WgpuArrayPtr{T}, value::Union{UInt8, Int8}
 	WGPUCore.writeBuffer(gpuDevice.queue, dst.buffer, fill(value, N))
 end
 
-
 mutable struct WgpuArray{T, N} <: AbstractGPUArray{T, N}
 	dims::Dims{N}
 	storageData::Union{Nothing, Vector{T}, Array{T}}
