@@ -22,40 +22,69 @@ function preprocess(renderer::GaussianRenderer2D)
2222end
2323
"""
    preprocess(renderer::GaussianRenderer3D)

Run the GPU preprocessing passes for a 3D gaussian renderer.

Steps, in order:

1. Load camera index 1 from the packaged `assets/bonsai/cameras.json` and upload the
   linear parts of its transform and projection to the GPU.
2. Launch the `frustumCulling` and `tValues` kernels over the gaussians.
3. Compute a permutation from the third row of `tps` (presumably a depth value —
   confirm against `frustumCulling`) and store reordered copies of the 2D covariances
   and the 2D positions on the renderer.
4. Launch `computeInvCov2d` and `computeBB` on the *reordered* arrays so that the
   inverse covariances and bounding boxes share the ordering of `renderer.positions`.

Side effects: reassigns `renderer.cov2ds` and `renderer.positions`; passes
`renderer.invCov2ds`, `renderer.cov3ds` and `renderer.bbs` to kernels (presumably as
outputs — confirm against the kernel definitions).
"""
function preprocess(renderer::GaussianRenderer3D)
    n = renderer.nGaussians

    # Worldspace, clip space initializations
    # TODO avoid dynamic memory allocations
    ts = CUDA.zeros(4, n)
    tps = CUDA.zeros(4, n)
    μ′ = CUDA.zeros(2, n)

    # Camera related params
    camerasPath = joinpath(pkgdir(WGPUgfx), "assets", "bonsai", "cameras.json")
    camIdx = 1
    near = 0.1f0
    far = 10.0f0
    camera = getCamera(camerasPath, camIdx)
    T = computeTransform(camera).linear |> MArray |> gpu
    P = computeProjection(camera, near, far).linear |> gpu

    # Image extent and intrinsics come from the camera, not from renderer.imageData.
    w = camera.width
    h = camera.height
    cx = div(w, 2)
    cy = div(h, 2)
    fx = camera.fx
    fy = camera.fy

    means = renderer.splatData.means |> gpu
    quaternions = renderer.splatData.quaternions |> gpu
    scales = renderer.splatData.scales |> gpu
    cov2ds = renderer.cov2ds
    cov3ds = renderer.cov3ds
    bbs = renderer.bbs
    invCov2ds = renderer.invCov2ds

    # NOTE(review): div(n, 32) launches no thread for the last n % 32 gaussians when n is
    # not a multiple of 32. Switching to cld(n, 32) is only safe if every kernel guards
    # its index against n — confirm before changing.
    nBlocks = div(n, 32)

    CUDA.@sync begin
        @cuda threads=32 blocks=nBlocks frustumCulling(
            ts, tps, cov3ds, means, μ′, fx, fy,
            quaternions, scales, T, P, w, h, cx, cy,
            cov2ds, far, near
        )
    end

    CUDA.@sync begin
        @cuda threads=32 blocks=nBlocks tValues(
            ts, cov3ds, fx, fy,
            quaternions, scales, cov2ds
        )
    end

    # Order gaussians by the third row of tps, then release the scratch buffers.
    sortIdxs = CUDA.sortperm(tps[3, :])
    CUDA.unsafe_free!(ts)
    CUDA.unsafe_free!(tps)

    sortedCov2ds = cov2ds[:, :, sortIdxs]
    sortedPositions = μ′[:, sortIdxs]
    renderer.cov2ds = sortedCov2ds
    renderer.positions = sortedPositions

    # TODO this is temporary hack
    # CUDA.@sync begin @cuda threads=32 blocks=div(n, 32) computeCov2d_kernel(cov2ds, rots, scales) end

    # Both kernels must see the sorted covariances: the positions they are paired with
    # are already sorted, so using the unsorted array would mix different gaussians.
    CUDA.@sync begin
        @cuda threads=32 blocks=nBlocks computeInvCov2d(sortedCov2ds, invCov2ds)
    end
    CUDA.@sync begin
        @cuda threads=32 blocks=nBlocks computeBB(sortedCov2ds, bbs, sortedPositions, (w, h))
    end
end
80+
81+
"""
    packedTileIds(renderer)

Allocate a zeroed `UInt64` device buffer with one entry per gaussian and launch the
`binPacking` kernel on it, blocking until the launch has completed.

NOTE(review): `nGaussians`, `threads` and `blocks` are not assigned anywhere in this
function, so they must resolve from an enclosing scope — confirm they exist there, or
whether `renderer.nGaussians` and explicit tile dimensions were intended.
"""
function packedTileIds(renderer)
    # NOTE(review): read here but never handed to the kernel — confirm whether
    # `binPacking` is supposed to receive the bounding boxes.
    bbs = renderer.bbs
    tileKeys = CUDA.zeros(UInt64, nGaussians)
    launchBlocks = div(nGaussians, 32)
    CUDA.@sync @cuda threads=32 blocks=launchBlocks binPacking(
        tileKeys, threads..., blocks...
    )
end
6089
6190function compactIdxs (renderer)
@@ -65,11 +94,21 @@ function compactIdxs(renderer)
6594 CUDA. @sync begin
6695 @cuda threads= 32 blocks= div (n, 32 ) hitBinning (hits, bbs, threads... , blocks... )
6796 end
68- hitScans = CUDA. zeros (UInt16, size (hits));
97+
98+ # This is not memory efficient but works for small list of gaussians in tile ...
99+ # hitScans = CUDA.zeros(UInt16, size(hits));
69100 CUDA. @sync CUDA. scan! (+ , hitScans, hits; dims= 3 );
70101 CUDA. @sync maxHits = CUDA. maximum (hitScans) |> Int
71- maxBinSize = min (typemax (UInt16) |> Int, nextpow (2 , maxHits))# TODO limiting maxBinSize hardcoded to 4096
72- renderer. hitIdxs = CUDA. zeros (UInt32, blocks... , maxBinSize);
102+
103+ # TODO hardcoding UInt16 will cause issues if number of gaussians in a Tile
104+ if maxHits < typemax (UInt16)
105+ maxBinSize = min ((typemax (UInt16) |> Int), nextpow (2 , maxHits))# TODO limiting maxBinSize hardcoded to 4096
106+ renderer. hitIdxs = CUDA. zeros (UInt32, blocks... , maxBinSize);
107+ else
108+ maxBinSize = 2 * nextpow (2 , maxHits)
109+ renderer. hitIdxs = CUDA. zeros (UInt32, blocks... , maxBinSize);
110+ end
111+
73112 CUDA. @sync begin
74113 @cuda threads= blocks blocks= (32 , div (n, 32 )) shmem= reduce (* , blocks)* sizeof (UInt32) compactHits (
75114 hits,
0 commit comments