torch
diff --git a/‎lib/THCUNN/VolumetricUpSamplingNearest.cu‎
Lines changed: 95 additions & 0 deletions b/‎lib/THCUNN/VolumetricUpSamplingNearest.cu‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎lib/THCUNN/VolumetricUpSamplingTrilinear.cu‎
Lines changed: 155 additions & 0 deletions b/‎lib/THCUNN/VolumetricUpSamplingTrilinear.cu‎
Lines changed: 155 additions & 0 deletions
diff --git a/‎lib/THCUNN/generic/THCUNN.h‎
Lines changed: 34 additions & 0 deletions b/‎lib/THCUNN/generic/THCUNN.h‎
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,95 @@
+#include "THCUNN.h"
+#include "common.h"
+
+#include <thrust/transform.h>
+#include <thrust/reduce.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/functional.h>
+
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
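+/*
+ * Description:
+ *    Index-translation helpers and kernels for nearest-neighbour volumetric
+ *    upsampling (vupscale) and the matching gradient accumulation (vdownscale).
+ */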
+
+// Maps a flat offset in the enlarged tensor to the flat offset of the element
+// it is taken from in the smaller tensor.
+//
+//   ii            flat offset; the four innermost extents of the enlarged
+//                 tensor are d1, d2, d3, d4, with d4 varying fastest
+//   scale_factor  integer factor applied to the three innermost dimensions
+//
+// The three innermost coordinates and extents are integer-divided by
+// scale_factor; the two outer coordinates and d1 are used unchanged.
+__device__ int translate_idx(int ii, int d1, int d2, int d3, int d4, int scale_factor)
+{
+  // Peel the coordinates off the flat offset, fastest dimension first.
+  const int big_v = ii % d4;
+  ii /= d4;
+  const int big_w = ii % d3;
+  ii /= d3;
+  const int big_z = ii % d2;
+  ii /= d2;
+  const int y = ii % d1;
+  const int x = ii / d1;
+
+  // Extents of the three scaled dimensions on the smaller side.
+  const int small_d2 = d2 / scale_factor;
+  const int small_d3 = d3 / scale_factor;
+  const int small_d4 = d4 / scale_factor;
+
+  // Integer division selects the coordinate on the smaller side.
+  const int small_z = big_z / scale_factor;
+  const int small_w = big_w / scale_factor;
+  const int small_v = big_v / scale_factor;
+
+  return ((((x * d1 + y) * small_d2) + small_z) * small_d3 + small_w) * small_d4 + small_v;
+}
+// Maps a flat offset in the smaller tensor to the flat offset of one of the
+// elements it corresponds to in the enlarged tensor.
+//
+//   ii                   flat offset; the four innermost extents of the
+//                        smaller tensor are d1, d2, d3, d4 (d4 fastest)
+//   scale_factor         integer factor applied to the three innermost
+//                        dimensions
+//   off_x, off_y, off_z  offsets added after scaling to the d4, d3 and d2
+//                        coordinates respectively
+//
+// The two outer coordinates and d1 are used unchanged.
+__device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int d4, int scale_factor, int off_x, int off_y, int off_z)
+{
+  // Peel the coordinates off the flat offset, fastest dimension first.
+  const int small_v = ii % d4;
+  ii /= d4;
+  const int small_w = ii % d3;
+  ii /= d3;
+  const int small_z = ii % d2;
+  ii /= d2;
+  const int y = ii % d1;
+  const int x = ii / d1;
+
+  // Extents of the three scaled dimensions on the enlarged side.
+  const int big_d2 = d2 * scale_factor;
+  const int big_d3 = d3 * scale_factor;
+  const int big_d4 = d4 * scale_factor;
+
+  // Scale each coordinate, then step to the requested element of the cell.
+  const int big_z = small_z * scale_factor + off_z;
+  const int big_w = small_w * scale_factor + off_y;
+  const int big_v = small_v * scale_factor + off_x;
+
+  return ((((x * d1 + y) * big_d2) + big_z) * big_d3 + big_w) * big_d4 + big_v;
+}
+
+// Nearest-neighbour upsampling kernel: every thread whose flat offset is
+// below no_elements copies one element from input to output.
+//
+//   input, output         flat element buffers (read / written)
+//   no_elements           number of output elements to produce
+//   scale_factor, d1..d4  forwarded unchanged to translate_idx()
+//
+// NOTE(review): threadIdx.y is added without a stride, so the flat offset
+// only enumerates distinct elements when blockDim.y == 1 -- confirm the
+// launch configuration used by the host code.
+template <typename Dtype>
+__global__ void vupscale(Dtype *input, Dtype *output, long no_elements,
+                         int scale_factor, int d1, int d2, int d3, int d4)
+{
+  // Flat output offset assembled from the x and y launch coordinates.
+  const long x_part = threadIdx.x + blockDim.x * blockIdx.x;
+  const long y_part = threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
+  const long out_idx = x_part + y_part;
+
+  if (out_idx < no_elements) {
+    // Offset of the input element that is replicated at this position.
+    const int in_idx = translate_idx(out_idx, d1, d2, d3, d4, scale_factor);
+    output[out_idx] = input[in_idx];
+  }
+}
+
+/*
+ * Description:
+ *    Gradient kernel for nearest-neighbour upsampling. Every thread whose
+ *    flat offset is below no_elements sums the scale_factor^3 elements of
+ *    gradOutput_data selected by translate_idx_inv() and adds that sum to
+ *    its element of gradInput_data (the existing value is kept and
+ *    accumulated into).
+ *
+ *    NOTE(review): threadIdx.y is added without a stride, so the flat offset
+ *    only enumerates distinct elements when blockDim.y == 1 -- confirm the
+ *    launch configuration used by the host code.
+ */
+template <typename Dtype, typename Acctype>
+__global__ void vdownscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements,
+                              int scale_factor, int d1, int d2, int d3, int d4)
+{
+  // Flat gradInput offset assembled from the x and y launch coordinates.
+  const long x_part = threadIdx.x + blockDim.x * blockIdx.x;
+  const long y_part = threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
+  const long dst_idx = x_part + y_part;
+
+  if (dst_idx < no_elements) {
+    // Accumulate in Acctype, converting back to Dtype only once at the end.
+    Acctype acc = Acctype(0);
+    for (int off_x = 0; off_x < scale_factor; ++off_x) {
+      for (int off_y = 0; off_y < scale_factor; ++off_y) {
+        for (int off_z = 0; off_z < scale_factor; ++off_z) {
+          const int src_idx =
+              translate_idx_inv(dst_idx, d1, d2, d3, d4, scale_factor, off_x, off_y, off_z);
+          acc += gradOutput_data[src_idx];
+        }
+      }
+    }
+    gradInput_data[dst_idx] += ScalarConvert<Acctype, Dtype>::to(acc);
+  }
+}
+
+#include "generic/VolumetricUpSamplingNearest.cu"
+#include "THCGenerateFloatTypes.h"
@@ -0,0 +1,155 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+#include "THCUNN.h"
+#include "common.h"
+#include "THCDeviceTensor.cuh"
+#include "THCDeviceTensorUtils.cuh"
+#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+
+// Trilinear interpolation, forward direction: reads data1, writes data2.
+//
+// Each thread with index < n handles one (depth, height, width) position of
+// data2 and loops over every batch entry and channel at that position.
+//
+//   n                        number of data2 positions to process; presumably
+//                            depth * height * width of data2 -- confirm
+//                            against the host launcher
+//   rdepth, rheight, rwidth  factors multiplied with a data2 coordinate to
+//                            obtain the (fractional) data1 coordinate
+//   data1                    tensor read from, indexed [batch][channel][t][h][w]
+//   data2                    tensor written to, same index layout
+template<typename Dtype, typename Acctype>
+__global__ void caffe_gpu_interp2_kernel(const int n,
+    const Acctype rdepth, const Acctype rheight, const Acctype rwidth,
+    const THCDeviceTensor<Dtype, 5> data1, THCDeviceTensor<Dtype, 5> data2) {
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= n) return;
+
+  const int num_batches = data1.getSize(0);
+  const int num_channels = data1.getSize(1);
+  const int in_depth = data1.getSize(2);
+  const int in_height = data1.getSize(3);
+  const int in_width = data1.getSize(4);
+  const int out_depth = data2.getSize(2);
+  const int out_height = data2.getSize(3);
+  const int out_width = data2.getSize(4);
+
+  // Split the flat index into (depth, height, width) coordinates of data2.
+  const int out_plane = out_height * out_width;
+  const int plane_offset = index % out_plane;
+  const int out_w = plane_offset % out_width;
+  const int out_h = plane_offset / out_width;
+  const int out_t = index / out_plane;
+
+  // Identical spatial sizes: plain element-wise copy.
+  if (in_depth == out_depth && in_height == out_height && in_width == out_width) {
+    for (int b = 0; b < num_batches; ++b) {
+      for (int c = 0; c < num_channels; ++c) {
+        const Dtype val = data1[b][c][out_t][out_h][out_w];
+        data2[b][c][out_t][out_h][out_w] = val;
+      }
+    }
+    return;
+  }
+
+  // Per axis: fractional data1 coordinate, its truncated integer part, the
+  // neighbouring index (same index again at the last element), and the two
+  // interpolation weights.
+  const Acctype t_real = rdepth * out_t;
+  const int t_lo = t_real;
+  const int t_hi = t_lo + ((t_lo < in_depth - 1) ? 1 : 0);
+  const Acctype t_hi_wt = t_real - t_lo;
+  const Acctype t_lo_wt = Acctype(1) - t_hi_wt;
+
+  const Acctype h_real = rheight * out_h;
+  const int h_lo = h_real;
+  const int h_hi = h_lo + ((h_lo < in_height - 1) ? 1 : 0);
+  const Acctype h_hi_wt = h_real - h_lo;
+  const Acctype h_lo_wt = Acctype(1) - h_hi_wt;
+
+  const Acctype w_real = rwidth * out_w;
+  const int w_lo = w_real;
+  const int w_hi = w_lo + ((w_lo < in_width - 1) ? 1 : 0);
+  const Acctype w_hi_wt = w_real - w_lo;
+  const Acctype w_lo_wt = Acctype(1) - w_hi_wt;
+
+  for (int b = 0; b < num_batches; ++b) {
+    for (int c = 0; c < num_channels; ++c) {
+      // Blend the eight surrounding data1 elements.
+      const Acctype val =
+          t_lo_wt * (h_lo_wt * (w_lo_wt * data1[b][c][t_lo][h_lo][w_lo]
+                              + w_hi_wt * data1[b][c][t_lo][h_lo][w_hi])
+                   + h_hi_wt * (w_lo_wt * data1[b][c][t_lo][h_hi][w_lo]
+                              + w_hi_wt * data1[b][c][t_lo][h_hi][w_hi]))
+        + t_hi_wt * (h_lo_wt * (w_lo_wt * data1[b][c][t_hi][h_lo][w_lo]
+                              + w_hi_wt * data1[b][c][t_hi][h_lo][w_hi])
+                   + h_hi_wt * (w_lo_wt * data1[b][c][t_hi][h_hi][w_lo]
+                              + w_hi_wt * data1[b][c][t_hi][h_hi][w_hi]));
+      data2[b][c][out_t][out_h][out_w] = ScalarConvert<Acctype, Dtype>::to(val);
+    }
+  }
+}
+
+// Backward (adjoint) operation 1 <- 2: accumulates data2 into data1.
+//
+// Each thread with index < n handles one (depth, height, width) position of
+// data2 and, for every batch entry and channel, adds its weighted value to
+// the eight surrounding data1 elements with atomicAdd. Existing data1
+// contents are kept and accumulated into.
+//
+//   n                        number of data2 positions to process; presumably
+//                            depth * height * width of data2 -- confirm
+//                            against the host launcher
+//   rdepth, rheight, rwidth  factors multiplied with a data2 coordinate to
+//                            obtain the (fractional) data1 coordinate
+//   data1                    tensor accumulated into, indexed
+//                            [batch][channel][t][h][w]
+//   data2                    tensor read from, same index layout
+template <typename Dtype, typename Acctype>
+__global__ void caffe_gpu_interp2_kernel_backward(const int n,
+    const Acctype rdepth, const Acctype rheight, const Acctype rwidth,
+    THCDeviceTensor<Dtype, 5> data1, const THCDeviceTensor<Dtype, 5> data2){
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= n) return;
+
+  const int num_batches = data1.getSize(0);
+  const int num_channels = data1.getSize(1);
+  const int in_depth = data1.getSize(2);
+  const int in_height = data1.getSize(3);
+  const int in_width = data1.getSize(4);
+  const int out_depth = data2.getSize(2);
+  const int out_height = data2.getSize(3);
+  const int out_width = data2.getSize(4);
+
+  // Split the flat index into (depth, height, width) coordinates of data2.
+  const int out_plane = out_height * out_width;
+  const int plane_offset = index % out_plane;
+  const int out_w = plane_offset % out_width;
+  const int out_h = plane_offset / out_width;
+  const int out_t = index / out_plane;
+
+  // Identical spatial sizes: element-wise accumulation, one thread per
+  // position, so no atomics are used here.
+  if (in_depth == out_depth && in_height == out_height && in_width == out_width) {
+    for (int b = 0; b < num_batches; ++b) {
+      for (int c = 0; c < num_channels; ++c) {
+        const Dtype val = data2[b][c][out_t][out_h][out_w];
+        data1[b][c][out_t][out_h][out_w] += val;
+      }
+    }
+    return;
+  }
+
+  // Per axis: fractional data1 coordinate, its truncated integer part, the
+  // neighbouring index (same index again at the last element), and the two
+  // interpolation weights.
+  const Acctype t_real = rdepth * out_t;
+  const int t_lo = t_real;
+  const int t_hi = t_lo + ((t_lo < in_depth - 1) ? 1 : 0);
+  const Acctype t_hi_wt = t_real - t_lo;
+  const Acctype t_lo_wt = Acctype(1) - t_hi_wt;
+
+  const Acctype h_real = rheight * out_h;
+  const int h_lo = h_real;
+  const int h_hi = h_lo + ((h_lo < in_height - 1) ? 1 : 0);
+  const Acctype h_hi_wt = h_real - h_lo;
+  const Acctype h_lo_wt = Acctype(1) - h_hi_wt;
+
+  const Acctype w_real = rwidth * out_w;
+  const int w_lo = w_real;
+  const int w_hi = w_lo + ((w_lo < in_width - 1) ? 1 : 0);
+  const Acctype w_hi_wt = w_real - w_lo;
+  const Acctype w_lo_wt = Acctype(1) - w_hi_wt;
+
+  // Corner weights, named wt_<t><h><w> with 0 = lower index, 1 = neighbour.
+  const Acctype wt_000 = t_lo_wt * h_lo_wt * w_lo_wt;
+  const Acctype wt_001 = t_lo_wt * h_lo_wt * w_hi_wt;
+  const Acctype wt_010 = t_lo_wt * h_hi_wt * w_lo_wt;
+  const Acctype wt_011 = t_lo_wt * h_hi_wt * w_hi_wt;
+  const Acctype wt_100 = t_hi_wt * h_lo_wt * w_lo_wt;
+  const Acctype wt_101 = t_hi_wt * h_lo_wt * w_hi_wt;
+  const Acctype wt_110 = t_hi_wt * h_hi_wt * w_lo_wt;
+  const Acctype wt_111 = t_hi_wt * h_hi_wt * w_hi_wt;
+
+  for (int b = 0; b < num_batches; ++b) {
+    for (int c = 0; c < num_channels; ++c) {
+      const Dtype src_val = data2[b][c][out_t][out_h][out_w];
+      // Several data2 positions can map onto the same data1 element, hence
+      // the atomic accumulation.
+      atomicAdd(data1[b][c][t_lo][h_lo][w_lo].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_000 * src_val));
+      atomicAdd(data1[b][c][t_lo][h_lo][w_hi].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_001 * src_val));
+      atomicAdd(data1[b][c][t_lo][h_hi][w_lo].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_010 * src_val));
+      atomicAdd(data1[b][c][t_lo][h_hi][w_hi].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_011 * src_val));
+      atomicAdd(data1[b][c][t_hi][h_lo][w_lo].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_100 * src_val));
+      atomicAdd(data1[b][c][t_hi][h_lo][w_hi].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_101 * src_val));
+      atomicAdd(data1[b][c][t_hi][h_hi][w_lo].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_110 * src_val));
+      atomicAdd(data1[b][c][t_hi][h_hi][w_hi].data(),
+                ScalarConvert<Acctype, Dtype>::to(wt_111 * src_val));
+    }
+  }
+}
+
+
+#include "generic/VolumetricUpSamplingTrilinear.cu"
+#include "THCGenerateFloatTypes.h"
@@ -1370,4 +1370,38 @@ TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
                   int ptop, int pbottom,
                   int pfront, int pback);
 
+TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)(  // prototype only; the definition is not in this header
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,   // presumably the gradient w.r.t. the upsampled output -- confirm in the implementation
+                  THCTensor *gradInput,    // presumably receives the gradient w.r.t. input -- confirm in the implementation
+                  int scale_factor);       // integer upsampling factor; NOTE(review): valid range is not visible here
+
+TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)(  // prototype only; the definition is not in this header
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,       // presumably receives the upsampled result -- confirm in the implementation
+                  int scale_factor);       // integer upsampling factor; NOTE(review): valid range is not visible here
+
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(  // prototype only; the definition is not in this header
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,       // presumably receives the interpolated result -- confirm in the implementation
+                  int outputDepth,         // requested output extents; NOTE(review): whether output is resized to
+                  int outputHeight,        // these values is decided by the implementation, not visible here
+                  int outputWidth);
+
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(  // prototype only; the definition is not in this header
+                  THCState *state,
+                  THCTensor *gradOutput,   // presumably the gradient w.r.t. the interpolated output -- confirm in the implementation
+                  THCTensor *gradInput,    // presumably receives the gradient w.r.t. the input -- confirm in the implementation
+                  int nbatch,              // no input tensor is passed to this function; the input
+                  int nchannels,           // shape is presumably described by these five integers
+                  int inputDepth,          // -- confirm in the implementation
+                  int inputHeight,
+                  int inputWidth,
+                  int outputDepth,         // presumably the extents of gradOutput -- confirm in the implementation
+                  int outputHeight,
+                  int outputWidth);
+
 #endif