snapshot

arhik · arhik · commit 628e2757f711 · 2024-02-14T23:47:14.000+05:30
diff --git a/src/binning.jl b/src/binning.jl
@@ -29,7 +29,33 @@ function hitBinning(hits, bbs, blockSizeX, blockSizeY, gridSizeX, gridSizeY)
     return
 end
 
+function packTileId(x::UInt64)
+    # TODO: stub with an empty body, so a call currently returns `nothing`. Presumably meant to pack a tile id into a UInt64 key — confirm.
+end
+
+function packZValue(x::UInt64)
+    # TODO: stub with an empty body, so a call currently returns `nothing`. Presumably meant to pack a depth (z) value into a UInt64 key — confirm.
+end
+
+function unpackTileId(x::UInt64)  # TODO: stub with an empty body; a call currently returns `nothing`
 
-function bbTileHit(bbs, )
+end
 
+function unpackZValue(x::UInt64)
+    # TODO: stub with an empty body, so a call currently returns `nothing`. Presumably the inverse of packZValue — confirm.
+end
+
+function binPacking(bbs, packedIds, blockSizeX, blockSizeY, gridSizeX, gridSizeY)  # work-in-progress kernel: derives per-box tile index ranges; packedIds, gridSizeX and gridSizeY are not used yet
+    idx = (blockIdx().x - 1i32)*blockDim().x + threadIdx().x  # 1-based global thread index, used as the third index into bbs (no bounds check against size(bbs, 3))
+    xbbmin = (floor(bbs[1, 1, idx]))  # bbs[1, 1, idx] rounded down; named as the box's minimum x
+    xbbmax = (ceil(bbs[1, 2, idx]))  # bbs[1, 2, idx] rounded up; named as the box's maximum x
+    ybbmin = (floor(bbs[2, 1, idx]))  # same outward rounding for the second row, named as the y extent
+    ybbmax = (ceil(bbs[2, 2, idx]))
+    # sync_threads()
+    bminxIdx = Int32(div(xbbmin, float32(blockSizeX))) + 1i32  # 1-based index along x after dividing by blockSizeX; NOTE(review): `float32` is not a Base function, presumably defined elsewhere — verify
+    bminyIdx = Int32(div(ybbmin, float32(blockSizeY))) + 1i32
+    bmaxxIdx = Int32(div(xbbmax, float32(blockSizeX))) + 1i32
+    bmaxyIdx = Int32(div(ybbmax, float32(blockSizeY))) + 1i32
+    # TODO: BB cover — the four indices computed above are not used or stored anywhere yet.
+    sync_threads()
 end
diff --git a/src/camera.jl b/src/camera.jl
@@ -87,7 +87,7 @@ function computeTransform(camera::Camera)
 	eye = camera.eye
 	lookat = camera.lookat
 	up = camera.up
-	w = -(lookat .- eye) |> normalize
+	w = (lookat .- eye) |> normalize
 	u =	cross(up, w) |> normalize
 	v = cross(w, u)
 	m = MMatrix{4, 4, Float32}(I)
@@ -96,6 +96,13 @@ function computeTransform(camera::Camera)
 	return LinearMap(m) ∘ translateCamera(camera)
 end
 
+function computeTransform(camera::GroundTruthCamera)  # builds a 4x4 Float32 transform from the camera's stored rotation and position and wraps it in a LinearMap
+	m = MMatrix{4, 4, Float32}(I)  # start from the identity so the bottom row stays (0, 0, 0, 1)
+	m[1:3, 1:3] .= camera.rotation  # upper-left 3x3 block; presumably a world-to-camera rotation — TODO confirm
+	m[1:3, 4] .= -camera.position  # translation column is the negated position, not rotated by camera.rotation — NOTE(review): confirm this matches the dataset convention
+	return LinearMap(m)
+end
+
 function computeProjection(camera::Camera, w, h)
     p = MArray{Tuple{4, 4}, Float32}(undef)
     p .= 0.0f0
@@ -107,6 +114,17 @@ function computeProjection(camera::Camera, w, h)
     return LinearMap(p)
 end
 
+function computeProjection(camera::GroundTruthCamera, near, far)  # perspective matrix built from camera.fx/fy, camera.width/height and the near/far planes, returned as a LinearMap
+    p = MArray{Tuple{4, 4}, Float32}(undef)
+    p .= 0.0f0  # the array is allocated uninitialised, so every entry is cleared first
+    p[1, 1] = 2.0f0*(camera.fx)/camera.width  # x scale: twice fx divided by the camera width
+    p[2, 2] = 2.0f0*(camera.fy)/camera.height  # y scale: twice fy divided by the camera height
+    p[3, 3] = (far + near)/(far - near)  # with p[3, 4] and w = z, z = near maps to -1 and z = far to +1 after the divide by w
+    p[3, 4] = -2.0f0*(far*near)/(far - near)
+    p[4, 3] = 1  # output w equals input z (no sign flip is applied)
+    return LinearMap(p)
+end
+
 function loadCameras(path)
 	cameras = JSON.parsefile(path)
 	return cameras
diff --git a/src/forward.jl b/src/forward.jl
@@ -22,40 +22,69 @@ function preprocess(renderer::GaussianRenderer2D)
 end
 
 function preprocess(renderer::GaussianRenderer3D)
+    # Runs frustumCulling and tValues with a camera loaded from disk, reorders cov2ds/positions by tps[3, :], then computes inverse covariances and bounding boxes.
+    # Worldspace and clip space buffers. TODO avoid dynamic memory allocations
     ts = CUDA.zeros(4, renderer.nGaussians);
     tps = CUDA.zeros(4, renderer.nGaussians);
     μ′ = CUDA.zeros(2, renderer.nGaussians);
-    camera = defaultCamera();
+    
+    # Camera related params (the camera file path, camera index and near/far planes are hard-coded for now)
+	camerasPath = joinpath(pkgdir(WGPUgfx), "assets", "bonsai", "cameras.json")
+	camIdx = 1
+    near = 0.1f0
+    far = 10.0f0
+    camera = getCamera(camerasPath, camIdx)
     T = computeTransform(camera).linear |> MArray |> gpu;
     (w, h) = size(renderer.imageData)[1:2];
-    P = computeProjection(camera, w, h).linear |> gpu;
+    P = computeProjection(camera, near, far).linear |> gpu;
+    w = camera.width  # the camera's own resolution replaces the image-derived (w, h) assigned above
+    h = camera.height
     cx = div(w, 2)
     cy = div(h, 2)
     n = renderer.nGaussians
-    fx = 3200.7f0
-    fy = 3200.7f0
+    fx = camera.fx
+    fy = camera.fy
     means = renderer.splatData.means |> gpu
-    cov2ds = renderer.cov2ds
-    cov3ds = renderer.cov3ds
-    bbs = renderer.bbs
+    cov2ds = renderer.cov2ds;
+    cov3ds = renderer.cov3ds;
+    bbs = renderer.bbs;
     invCov2ds = renderer.invCov2ds;
-    quaternions = renderer.splatData.quaternions |> gpu
-    scales = renderer.splatData.scales |> gpu
-    n = renderer.nGaussians
-    bbs = renderer.bbs
-    CUDA.@sync begin @cuda threads=32 blocks=div(n, 32) tValues(
+    quaternions = renderer.splatData.quaternions |> gpu;
+    scales = renderer.splatData.scales |> gpu;
+    n = renderer.nGaussians;
+
+    CUDA.@sync begin @cuda threads=32 blocks=div(n, 32) frustumCulling(
             ts, tps, cov3ds, means,  μ′, fx, fy,
             quaternions, scales, T, P, w, h, cx, cy,
-            cov2ds,
+            cov2ds, far, near
         ) 
     end
+
+    CUDA.@sync begin @cuda threads=32 blocks=div(n, 32) tValues(
+            ts, cov3ds, fx, fy,
+            quaternions, scales, cov2ds
+        ) 
+    end
+
+    # Order by the third row of tps (presumably clip-space depth — confirm); the permutation is taken before tps is freed.
+    sortIdxs = CUDA.sortperm(tps[3, :])
     CUDA.unsafe_free!(ts)
     CUDA.unsafe_free!(tps)
-    renderer.positions = μ′
+    cov2ds = renderer.cov2ds = cov2ds[:, :, sortIdxs]  # rebind the local too, so the kernels below read the same ordering as renderer.positions
+    renderer.positions = μ′[:, sortIdxs]
     # TODO this is temporary hack
     #CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeCov2d_kernel(cov2ds, rots, scales) end
     CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeInvCov2d(cov2ds, invCov2ds) end
-    CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeBB(cov2ds, bbs, μ′, size(renderer.imageData)[1:end-1]) end
+    CUDA.@sync begin   @cuda threads=32 blocks=div(n, 32) computeBB(cov2ds, bbs, renderer.positions, (w, h)) end
+end
+
+
+function packedTileIds(renderer)
+    # Allocates one zeroed UInt64 slot per gaussian and launches `binPacking` over the renderer's bounding boxes.
+    n = renderer.nGaussians  # read from the renderer; a bare `nGaussians` is not bound inside this function
+    packedIds = CUDA.zeros(UInt64, n)
+    # NOTE(review): `threads` and `blocks` are not defined in this function; they must resolve elsewhere (compactIdxs splats the same names) — confirm.
+    CUDA.@sync @cuda threads=32 blocks=div(n, 32) binPacking(renderer.bbs, packedIds, threads..., blocks...)
 end
 
 function compactIdxs(renderer)
@@ -65,11 +94,21 @@ function compactIdxs(renderer)
     CUDA.@sync begin 
         @cuda threads=32 blocks=div(n, 32) hitBinning(hits, bbs, threads..., blocks...)
     end
-    hitScans = CUDA.zeros(UInt16, size(hits));
+
+    # This is not memory efficient, but it works while each tile holds only a small list of gaussians.
+    # hitScans = CUDA.zeros(UInt16, size(hits));
     CUDA.@sync CUDA.scan!(+, hitScans, hits; dims=3);
     CUDA.@sync maxHits = CUDA.maximum(hitScans) |> Int
-    maxBinSize = min(typemax(UInt16) |> Int, nextpow(2, maxHits))# TODO limiting maxBinSize hardcoded to 4096
-    renderer.hitIdxs  = CUDA.zeros(UInt32, blocks..., maxBinSize);
+
+    # TODO hardcoding UInt16 will cause issues if the number of gaussians in a tile exceeds typemax(UInt16)
+    if maxHits < typemax(UInt16)
+        maxBinSize = min((typemax(UInt16) |> Int), nextpow(2, maxHits))# TODO limiting maxBinSize hardcoded to 4096
+        renderer.hitIdxs  = CUDA.zeros(UInt32, blocks..., maxBinSize);
+    else
+        maxBinSize = 2*nextpow(2, maxHits)
+        renderer.hitIdxs = CUDA.zeros(UInt32, blocks..., maxBinSize);
+    end
+
     CUDA.@sync begin
         @cuda threads=blocks blocks=(32, div(n, 32)) shmem=reduce(*, blocks)*sizeof(UInt32) compactHits(
             hits, 
diff --git a/src/main.jl b/src/main.jl
@@ -3,6 +3,7 @@ include("cov2d.jl")
 include("boundingbox.jl")
 include("binning.jl")
 include("compact.jl")
+
 include("camera.jl")
 include("renderer.jl")
 include("projection.jl")
@@ -38,9 +39,6 @@ yimg = colorview(RGB{N0f8},
 )
 yimg = Images.imrotate(yimg, -pi/2)
 
-
-
-
 include("train.jl")
 
 windowSize = 11
diff --git a/src/projection.jl b/src/projection.jl
@@ -9,9 +9,9 @@ function computeCov3dProjection_kernel(cov2ds, cov3ds, rotation, affineTransform
             S[i, j] = 0.0f0
         end
     end
-    S[1, 1] = scales[1, idx]
-    S[2, 2] = scales[2, idx]
-    S[3, 3] = scales[3, idx]
+    S[1, 1] = exp(scales[1, idx])
+    S[2, 2] = exp(scales[2, idx])
+    S[3, 3] = exp(scales[3, idx])
     W = R*S
     J = W*adjoint(W)
     for i in 1:3
@@ -40,76 +40,10 @@ end
     return R
 end
 
-function tValues(ts, tps, meansList, μ′, T, P, w, h, cx, cy)
-    idx = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
-    
-    meanVec = MVector{4, Float32}(undef)
-    meanVec[1] = meansList[1, idx]
-    meanVec[2] = meansList[2, idx]
-    meanVec[3] = meansList[3, idx]
-    meanVec[4] = 1
-
-    Tcw = MArray{Tuple{4, 4}, Float32}(undef)
-    for ii in 1:4
-        for jj in 1:4
-            Tcw[ii, jj] = T[ii, jj]
-        end
-    end
-
-    tstmp = Tcw*meanVec
-    ts[1, idx] = tstmp[1]
-    ts[2, idx] = tstmp[2]
-    ts[3, idx] = tstmp[3]
-    ts[4, idx] = tstmp[4]
-
-    Ptmp = MArray{Tuple{4, 4}, Float32}(undef)
-    for ii in 1:4
-        for jj in 1:4
-            Ptmp[ii, jj] = P[ii, jj]
-        end
-    end
-
-    tpstmp = Ptmp*tstmp
-    tps[1, idx] = tpstmp[1]
-    tps[2, idx] = tpstmp[2]
-    tps[3, idx] = tpstmp[3]
-    tps[4, idx] = tpstmp[4] 
-    
-    tx = tpstmp[1]
-    ty = tpstmp[2]
-    tz = tpstmp[3]
-    tw = tpstmp[4]
-
-    μ′[1, idx] = (w*tx/tw) + cx
-    μ′[2, idx] = (w*ty/tw) + cy
-
-    quat = quaternions[1, idx]
-    @inline R = quatToRot(quat)
-    S = MArray{Tuple{3, 3}, Float32}(undef)
-    for i in 1:3
-        for j in 1:3
-            S[i, j] = 0.0f0
-        end
-    end
-    S[1, 1] = scales[1, idx]
-    S[2, 2] = scales[2, idx]
-    S[3, 3] = scales[3, idx]
-    W = R*S
-    J = W*adjoint(W)
-    for i in 1:3
-        for j in 1:3
-            cov3ds[i, j, idx] = J[i, j]
-        end
-    end
-    
-    return nothing
-end
-
-
-function tValues(
+function frustumCulling(  # kernel, one thread per gaussian: stores the screen position in μ′, or zeros when it falls outside the accepted window
     ts, tps, cov3ds, meansList, μ′, fx, fy,
     quaternions, scales, T, P, w, h, cx, cy,
-    cov2ds
+    cov2ds, far, near
 )
     idx = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
     
@@ -155,9 +89,31 @@ function tValues(
     tz′ = tpstmp[3]
     tw′ = tpstmp[4]
 
-    μ′[1, idx] = ((w*tx′/tw′) + 1)/2 + cx
-    μ′[2, idx] = ((h*ty′/tw′) + 1)/2 + cy
+    x = ((w*tx′/tw′) + 1)/2 + cx
+    y = ((h*ty′/tw′) + 1)/2 + cy
 
+    if (-w < x < w) && (-h < y < h)# && (near < tz′ < far)
+        μ′[1, idx] = x  # reuse the coordinates computed above instead of evaluating the same expressions again
+        μ′[2, idx] = y
+    else
+        # TODO zero values are used for culling checks for now; both coordinates are cleared
+        μ′[1, idx] = 0.0f0
+        μ′[2, idx] = 0.0f0
+    end
+    return nothing
+end
+
+
+function tValues(
+    ts, cov3ds, fx, fy, quaternions, scales, cov2ds
+)
+    idx = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x
+
+    tx = ts[1, idx]
+    ty = ts[2, idx]
+    tz = ts[3, idx]
+    tw = ts[4, idx]
+    
     quat = MVector{4, Float32}(undef)
     quat[1] = quaternions[1, idx]
     quat[2] = quaternions[2, idx]
@@ -195,7 +151,7 @@ function tValues(
 
     for ii in 1:2
         for jj in 1:2
-            cov2ds[ii, jj, idx] = cov2d[ii, jj]
+            cov2ds[ii, jj, idx] = cov2d[ii, jj] + 0.1
         end
     end
 
diff --git a/src/splat.jl b/src/splat.jl
@@ -220,13 +220,17 @@ function splatDraw(cimage, transGlobal, means, bbs, invCov2ds, hitIdxs, opacitie
             deltaY = float(j) - means[2, bIdx]
             delta[2] = deltaY
             disttmp  = invCov2d*delta
-            dist = disttmp[1]*delta[1] + disttmp[2]*delta[2]
+            dist = 0.50f0*(disttmp[1]*delta[1] + disttmp[2]*delta[2])
             alpha = cusigmoid(opacity*exp(-dist))
             transmittance = splatData[txIdx, tyIdx, transIdx]
-            CUDA.@atomic splatData[txIdx, tyIdx, 1] += (colors[1, bIdx]*alpha*transmittance)
-            CUDA.@atomic splatData[txIdx, tyIdx, 2] += (colors[2, bIdx]*alpha*transmittance)
-            CUDA.@atomic splatData[txIdx, tyIdx, 3] += (colors[3, bIdx]*alpha*transmittance)
-            CUDA.@atomic splatData[txIdx, tyIdx, transIdx] *= (1.0f0 - alpha)
+            color1 = SH_C0*colors[1, bIdx]
+            color2 = SH_C0*colors[2, bIdx]
+            color3 = SH_C0*colors[3, bIdx]
+
+            splatData[txIdx, tyIdx, 1] += (color1*alpha*transmittance)
+            splatData[txIdx, tyIdx, 2] += (color2*alpha*transmittance)
+            splatData[txIdx, tyIdx, 3] += (color3*alpha*transmittance)
+            splatData[txIdx, tyIdx, transIdx] *= (1.0f0 - alpha)
         end
     end
     sync_threads()
@@ -312,7 +316,7 @@ function splatGrads(
             dist = disttmp[1]*delta[1] + disttmp[2]*delta[2]
             ΔMean = invCov2d*delta
             ΔΣ = ΔMean*adjoint(ΔMean)
-            Δo = exp(-dist)
+            Δo = exp(-0.5f0*dist)
             Δσ = -opacity*Δo
             transmittance = transData[txIdx, tyIdx]
             alpha = opacity*exp(-dist)