small fixes and formatting

arhik · arhik · commit 5399411b8d76 · 2024-04-16T23:41:56.000+05:30
diff --git a/src/boundingbox.jl b/src/boundingbox.jl
@@ -1,4 +1,4 @@
-
+# Compute Bounding Boxes 
 function computeBB(cov2ds, bbs, means, sz)
     idx = (blockIdx().x - 1i32)*blockDim().x + threadIdx().x
     BB = MArray{Tuple{2, 2}, Float32}(undef)
diff --git a/src/camera.jl b/src/camera.jl
@@ -22,15 +22,15 @@ mutable struct Camera
 end
 
 function defaultCamera(;id=0)
-	eye = [0.0, 0.0, 30.0] .|> Float32
+	eye = [0.0, 0.0, 35.0] .|> Float32
 	lookat = [0, 0, 0] .|> Float32
 	up = [0, 1, 0] .|> Float32
 	scale = [1, 1, 1] .|> Float32
     fx = 3200.0f0
     fy = 3200.0f0
 	aspectRatio = 1.0 |> Float32
-	nearPlane = 0.1 |> Float32
-	farPlane = 100.0 |> Float32
+	nearPlane = -10.0 |> Float32
+	farPlane = -100.0 |> Float32
 	return Camera(
         fx,
         fy,
@@ -94,6 +94,7 @@ function computeTransform(camera::Camera)
 	v = cross(w, u)
 	m = MMatrix{4, 4, Float32}(I)
 	m[1:3, 1:3] .= (cat([u, v, w]..., dims=2) |> adjoint .|> Float32 |> collect)
+	m[4, 4] = 0.0
 	m = SMatrix(m)
 	return LinearMap(m) ∘ translateCamera(camera)
 end
@@ -128,9 +129,9 @@ function getCamera(path, idx)
 	id = camera["id"]
 	up = [0, 1, 0] .|> Float32
 	eye = -(rotation |> adjoint)*position
-	lookAt = (rotation |> adjoint)*[0.0f0, 0.0f0, -1.0f0]
-	near = 0.1f0 # TODO hardcoded
-	far = 100.0f0 # TODO hardcoded
+	lookAt = -(rotation |> adjoint)*[0.0f0, 0.0f0, 1.0f0]
+	near = -1.0f0 # TODO hardcoded
+	far = -100.0f0 # TODO hardcoded
 	scale = [1, 1, 1] .|> Float32
 	aspectRatio=1.0f0
 	data = imgName
diff --git a/src/forward.jl b/src/forward.jl
@@ -15,9 +15,20 @@ function preprocess(renderer::GaussianRenderer2D)
     scales = renderer.splatData.scales;
     n = renderer.nGaussians
     bbs = renderer.bbs
-    CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeCov2d_kernel(cov2ds, rots, scales) end
-    CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeInvCov2d(cov2ds, invCov2ds) end
-    CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeBB(cov2ds, bbs, means, size(renderer.imageData)[1:end-1]) end
+    CUDA.@sync begin 
+        @cuda threads=32 blocks=div(n, 32) computeCov2d_kernel(cov2ds, rots, scales) 
+    end
+    CUDA.@sync begin 
+        @cuda threads=32 blocks=div(n, 32) computeInvCov2d(cov2ds, invCov2ds) 
+    end
+    CUDA.@sync begin 
+        @cuda threads=32 blocks=div(n, 32) computeBB(
+            cov2ds,
+            bbs,
+            means,
+            size(renderer.imageData)[1:end-1]
+        )
+    end
     return nothing
 end
 
@@ -30,17 +41,22 @@ function preprocess(renderer::GaussianRenderer3D)
     (w, h) = size(renderer.imageData)[1:2];
     # Camera related params
 	camerasPath = joinpath(
-	    ENV["HOMEPATH"], "Downloads", "GaussianSplatting", "GaussianSplatting", "bonsai", "cameras.json"
+	    ENV["HOMEPATH"], 
+	    "Downloads", 
+	    "GaussianSplatting", 
+	    "GaussianSplatting", 
+	    "bonsai", 
+	    "cameras.json"
     ) # TODO this is hardcoded
-	camIdx = 2
-    #camera = getCamera(camerasPath, camIdx) # defaultCamera();
+	camIdx = 1
+    # camera = getCamera(camerasPath, camIdx) # defaultCamera();
     camera = defaultCamera();
     near = camera.near
     far = camera.far
     T = computeTransform(camera).linear |> gpu;
     P = computeProjection(camera, w, h).linear |> gpu;
     cx = w/2.0
-    cy = w/2.0
+    cy = h/2.0
     n = renderer.nGaussians
     fx = camera.fx |> Float32
     fy = camera.fy |> Float32
@@ -52,84 +68,110 @@ function preprocess(renderer::GaussianRenderer3D)
     quaternions = renderer.splatData.quaternions |> gpu;
     scales = renderer.splatData.scales |> gpu;
     n = renderer.nGaussians;
-
+    
     CUDA.@sync begin @cuda threads=32 blocks=div(n, 32) frustumCulling(
-            ts, tps, cov3ds, means,  μ′, fx, fy,
-            quaternions, scales, T, P, w, h, cx, cy,
-            cov2ds, far, near
-        ) 
+            ts, tps, μ′,  # outs
+            means, T, P,  # ins
+            w, h, cx, cy, # Numbers
+        )
     end
-
+    
     CUDA.@sync begin @cuda threads=32 blocks=div(n, 32) tValues(
             ts, cov3ds, fx, fy,
             quaternions, scales, cov2ds
+        )
+    end
+    
+    CUDA.unsafe_free!(ts)
+    
+    # renderer.positions = μ′
+    # TODO this is temporary hack
+    # CUDA.@sync begin   
+    #    @cuda threads=32 blocks=div(n, 32) computeCov2d_kernel(cov2ds, rots, scales) 
+    # end
+    
+    CUDA.@sync begin   
+        @cuda threads=32 blocks=div(n, 32) computeInvCov2d(
+            cov2ds, 
+            invCov2ds
         ) 
     end
-
-    #renderer.positions = μ′
+    
+    CUDA.@sync begin   
+        @cuda threads=32 blocks=div(n, 32) computeBB(
+            cov2ds, 
+            bbs, 
+            μ′, 
+            (w, h)
+        ) 
+    end
+    
+    sortIdxs = CUDA.sortperm(tps[3, :], lt=!isless)
     renderer.camera = camera
-    sortIdxs = CUDA.sortperm(tps[3, :], lt=isless) # chck
     renderer.sortIdxs = sortIdxs
-    CUDA.unsafe_free!(ts)
-    #CUDA.unsafe_free!(tps)
     renderer.cov2ds = cov2ds[:, :, sortIdxs]
     renderer.positions = μ′[:, sortIdxs]
-    # TODO this is temporary hack
-    #CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeCov2d_kernel(cov2ds, rots, scales) end
-    CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeInvCov2d(renderer.cov2ds, invCov2ds) end
-    CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeBB(renderer.cov2ds, bbs, renderer.positions, (w, h)) end
+    renderer.invCov2ds = invCov2ds[:, :, sortIdxs]
+    renderer.bbs = bbs[:, :, sortIdxs]
     return tps
 end
 
+"""
+    compactIdxs(renderer)
 
-function packedTileIds(renderer)
-    bbs = renderer.bbs
-    packedIds = CUDA.zeros(UInt64, nGaussians)
-    CUDA.@sync begin
-        @cuda threads=32 blocks=div(nGaussians, 32) binPacking(bbs, packedIds, threads..., blocks...)
-    end
-end
-
-
+This function computes compact indexes.
+"""
 function compactIdxs(renderer)
     bbs = renderer.bbs
     hits = CUDA.zeros(UInt8, blocks..., renderer.nGaussians);
     n = renderer.nGaussians
+    
     CUDA.@sync begin 
         @cuda threads=32 blocks=div(n, 32) hitBinning(hits, bbs, threads..., blocks...)
     end
 
     # This is not memory efficient but works for small list of gaussians in tile ... 
+    
     hitScans = CUDA.zeros(UInt16, size(hits));
     CUDA.@sync CUDA.scan!(+, hitScans, hits; dims=3);
     CUDA.@sync maxHits = CUDA.maximum(hitScans) |> Int
 
     # TODO hardcoding UInt16 will cause issues if the number of gaussians in a tile exceeds typemax(UInt16)
     # if maxHits < typemax(UInt32)
-    maxBinSize = min((typemax(UInt16) |> Int), nextpow(2, maxHits))# TODO limiting maxBinSize hardcoded to 4096
+    # TODO maxBinSize is capped at typemax(UInt16); this limit is hardcoded
+    
+    maxBinSize = min((typemax(UInt16) |> Int), nextpow(2, maxHits))
     renderer.hitIdxs  = CUDA.zeros(UInt32, blocks..., maxBinSize);
+
     # else
         # maxBinSize = 2*nextpow(2, maxHits)
         # renderer.hitIdxs = CUDA.zeros(UInt32, blocks..., maxBinSize);
     # end
 
     CUDA.@sync begin
-        @cuda threads=blocks blocks=(32, div(n, 32)) shmem=reduce(*, blocks)*sizeof(UInt32) compactHits(
-            hits, 
-            bbs, 
-            hitScans, 
-            renderer.hitIdxs
+        @cuda(
+            threads=blocks,
+            blocks=(32, div(n, 32)),
+            shmem=reduce(*, blocks)*sizeof(UInt32),
+            compactHits(
+                hits, 
+                bbs, 
+                hitScans, 
+                renderer.hitIdxs
+            )
         )
     end
+    
     CUDA.unsafe_free!(hits)
     CUDA.unsafe_free!(hitScans)
     return nothing
 end
 
 function forward(renderer, tps)
+    sortIdxs = renderer.sortIdxs
+    tps = tps[:, sortIdxs]
     cimage = renderer.imageData
     invCov2ds = renderer.invCov2ds
-    sortIdxs = renderer.sortIdxs
     transmittance = renderer.transmittance
     positions = renderer.positions
     bbs = renderer.bbs
@@ -141,18 +183,23 @@ function forward(renderer, tps)
     eye = renderer.camera.eye .|> Float32 |>gpu
     lookAt = renderer.camera.lookAt .|> Float32 |> gpu
     CUDA.@sync begin
-        @cuda threads=threads blocks=blocks shmem=(4*(reduce(*, threads))*sizeof(Float32)) splatDraw(
-            cimage, 
-            transmittance,
-            positions, 
-            tps,
-            bbs,
-            invCov2ds,
-            hitIdxs,
-            opacities,
-            shs,
-            eye,
-            lookAt
+        @cuda(
+            threads=threads, 
+            blocks=blocks, 
+            shmem=(4*(reduce(*, threads))*sizeof(Float32)), 
+            splatDraw(
+                cimage, 
+                transmittance,
+                positions, 
+                tps,
+                bbs,
+                invCov2ds,
+                hitIdxs,
+                opacities,
+                shs,
+                eye,
+                lookAt
+            )
         )
     end
     return nothing
diff --git a/src/main.jl b/src/main.jl
@@ -17,7 +17,7 @@ imSize = (512, 512, 3)
 # renderer = getRenderer(GAUSSIAN_2D, imSize, nGaussians, threads, blocks)
 renderer = getRenderer(
         GAUSSIAN_3D, 
-        joinpath(ENV["HOMEPATH"], "Downloads", "GaussianSplatting", "GaussianSplatting", "bonsai", "bonsai_30000.ply"),
+        joinpath(ENV["HOMEPATH"], "Downloads", "GaussianSplatting", "GaussianSplatting", "train", "train_30000.ply"),
         imSize, 
         threads, 
         blocks; 
@@ -26,9 +26,9 @@ renderer = getRenderer(
 GC.gc()
 CUDA.reclaim()
 
-ts = preprocess(renderer)
+tps = preprocess(renderer)
 compactIdxs(renderer)
-forward(renderer, ts[:, renderer.sortIdxs])
+forward(renderer, tps)
 renderer.imageData[findall((x) -> isequal(x, NaN), renderer.imageData)] .= 0.0f0
 img = renderer.imageData |> cpu;
 tmpimageview = reshape(renderer.imageData, size(renderer.imageData)..., 1)
@@ -41,10 +41,10 @@ yimg = colorview(RGB{N0f8},
 yimg = Images.imrotate(yimg, -pi/2)
 imshow(yimg)
 
-include("train.jl")
+# include("train.jl")
 
-windowSize = 11
-nChannels = 3
-lossFunc = getLossFunction(imSize, windowSize, nChannels)
+# windowSize = 11
+# nChannels = 3
+# lossFunc = getLossFunction(imSize, windowSize, nChannels)
 
-#train(renderer, gtimg, 1e-5, lossFunc)
+# train(renderer, gtimg, 1e-5, lossFunc)
diff --git a/src/projection.jl b/src/projection.jl

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-`
	`1`	`+# Compute Bounding Boxes`
`2`	`2`	`function computeBB(cov2ds, bbs, means, sz)`
`3`	`3`	`idx = (blockIdx().x - 1i32)*blockDim().x + threadIdx().x`
`4`	`4`	`BB = MArray{Tuple{2, 2}, Float32}(undef)`