Merge pull request #43 from arhik/main

arhik · web-flow · commit e130cffb4043 · 2024-06-20T22:11:15.000+05:30
Update reduce_kernel.jl: use one 8x8 workgroup, drop shmem and synchronize()
diff --git a/examples/reduce_kernel.jl b/examples/reduce_kernel.jl
@@ -2,32 +2,33 @@ using Revise
 using WGPUCompute
 using Test
 
-function naive_reduce_kernel(x::WgpuArray{T, N}, out::WgpuArray{T, N}) where {T, N}
-	gId = xDims.x*globalId.y + globalId.x
-	W = Float32(xDims.x*xDims.y)
-	steps = UInt32(ceil(log2(W)))
-	out[gId] = x[gId]
-	base=2.0
-	for itr in 0:steps
-		exponent = Float32(itr)
-		stride = UInt32(pow(base, exponent))
-		if gId%(2*stride) == 0
+empty!(task_local_storage())
+
+function naive_reduce_kernel(x::WgpuArray{T,N}, out::WgpuArray{T,N}) where {T,N}
+    gId = xDims.x * globalId.y + globalId.x
+    W = Float32(xDims.x * xDims.y)
+    steps = UInt32(ceil(log2(W)))
+    out[gId] = x[gId]
+    base = 2.0f0
+    for itr in 0:steps
+	    if gId%2 == 0
+    		exponent = Float32(itr)
+			stride = UInt32(pow(base, exponent))
 			out[gId] += out[gId + stride]
-		end
-		synchronize()
+	    end
 	end
 end
 
-function naive_reduce(x::WgpuArray{T, N}) where {T, N}
-	y = WgpuArray{T}(undef, size(x))
-	@wgpukernel(
-		launch=true, 
-		workgroupSizes=(4, 4),
-		workgroupCount=(2, 2),
-		shmem=(:shmem=>(Float32, (4, 4)),),
-		naive_reduce_kernel(x, y)
-	)
-	return (y |> collect)[1]
+function naive_reduce(x::WgpuArray{T,N}) where {T,N}
+    y = WgpuArray{T}(undef, size(x))
+    @wgpukernel(
+        launch = true,
+        workgroupSizes = (8, 8),
+        workgroupCount = (1, 1),
+        shmem = (),
+        naive_reduce_kernel(x, y)
+    )
+    return (y |> collect)
 end
 
 x = WgpuArray{Float32}(rand(Float32, 8, 8))
@@ -36,7 +37,6 @@ z = naive_reduce(x)
 x_cpu = (x |> collect)
 
 sum_cpu = sum(x_cpu)
-sum_gpu = (z |> collect)[1]
+sum_gpu = (z|>collect)[1]
 
 @test sum_cpu ≈ sum_gpu
-