
Commit 7e62971

Merge commit '71ccedbc6c4e460d38c794737bba780e7673e888'
2 parents: a7d9875 + 71ccedb

7 files changed: +902 / -202 lines

torch/lib/THCUNN/ClassNLLCriterion.cu (+30 / -18)
@@ -15,19 +15,22 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(Dtype *output,
     THCIndex_t *target,
     Dtype *weights,
     int size_average,
-    int n_classes) {
+    int n_classes,
+    long ignore_index) {
   assert(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
 
   // TODO: T4951791 Reuse code between updateOutput_kernel1 and
   // updateOutput_kernel.
 
   int t = (int)*target - TH_INDEX_BASE;
-  assert(t >= 0 && t < n_classes);
-  Dtype cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
-  *output = -cur_weight * input[t];
-  *total_weight = cur_weight;
-  if (size_average && *total_weight > 0) {
-    *output /= *total_weight;
+  if (t != ignore_index) {
+    assert(t >= 0 && t < n_classes);
+    Dtype cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
+    *output = -cur_weight * input[t];
+    *total_weight = cur_weight;
+    if (size_average && *total_weight > 0) {
+      *output /= *total_weight;
+    }
   }
 }
 
@@ -40,7 +43,8 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *output,
     int size_average,
     int nframe,
     int ndim,
-    int n_classes) {
+    int n_classes,
+    long ignore_index) {
   __shared__ Acctype shInputs[NTHREADS], acc_weight[NTHREADS];
   int i, t;
   Dtype cur_weight;
@@ -49,10 +53,12 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *output,
   acc_weight[threadIdx.x] = ScalarConvert<int, Acctype>::to(0);
   for (i = threadIdx.x; i < nframe; i += NTHREADS) {
     t = target[i] - TH_INDEX_BASE;
-    assert(t >= 0 && t < n_classes);
-    cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
-    shInputs[threadIdx.x] -= input[i * ndim + t] * cur_weight;
-    acc_weight[threadIdx.x] += cur_weight;
+    if (t != ignore_index) {
+      assert(t >= 0 && t < n_classes);
+      cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
+      shInputs[threadIdx.x] -= input[i * ndim + t] * cur_weight;
+      acc_weight[threadIdx.x] += cur_weight;
+    }
   }
   __syncthreads();
 
@@ -84,15 +90,18 @@ __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel1(
   THCIndex_t* target,
   Dtype* total_weight,
   int size_average,
-  int n_classes)
+  int n_classes,
+  long ignore_index)
 {
   if (*total_weight <= 0) {
     return;
   }
   Dtype norm = size_average ? (ScalarConvert<int, Dtype>::to(1) / *total_weight) : ScalarConvert<int, Dtype>::to(1);
   int t = (int)*target - TH_INDEX_BASE;
-  assert(t >= 0 && t < n_classes);
-  gradInput[t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;
+  if (t != ignore_index) {
+    assert(t >= 0 && t < n_classes);
+    gradInput[t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;
+  }
 }
 
 template <typename Dtype>
@@ -104,7 +113,8 @@ __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel(
   int size_average,
   int nframe,
   int ndim,
-  int n_classes)
+  int n_classes,
+  long ignore_index)
 {
   if (*total_weight <= 0) {
     return;
@@ -114,8 +124,10 @@ __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel(
 
   for (i = threadIdx.x; i < nframe; i += NTHREADS) {
     t = (int)target[i] - TH_INDEX_BASE;
-    assert(t >= 0 && t < n_classes);
-    gradInput[i * ndim + t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;
+    if (t != ignore_index) {
+      assert(t >= 0 && t < n_classes);
+      gradInput[i * ndim + t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;
+    }
   }
 }
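
For reference, the kernels above give ignore_index a simple semantics: a target equal to ignore_index contributes neither to the loss nor to the weight normalizer. The following plain CPU sketch illustrates that behavior; the helper name nll_loss_reference and its flat-array interface are invented for this note and are not part of the commit.

// Illustrative CPU sketch of the ignore_index semantics (hypothetical helper).
static double nll_loss_reference(const double *input, const long *target,
                                 const double *weights, int nframe, int ndim,
                                 long ignore_index, bool size_average) {
  double loss = 0.0, total_weight = 0.0;
  for (int i = 0; i < nframe; ++i) {
    long t = target[i];                      // already 0-based here
    if (t == ignore_index) continue;         // skipped class: no loss, no weight
    double w = weights ? weights[t] : 1.0;   // optional per-class weight
    loss -= w * input[i * ndim + t];         // negative log-likelihood term
    total_weight += w;
  }
  if (size_average && total_weight > 0) loss /= total_weight;
  return loss;
}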

torch/lib/THCUNN/IndexLinear.cu (+11 / -30)
@@ -15,32 +15,11 @@ const long NNZ_PER_BLOCK_MAX = 1024;
 #define clamp(a, low, high) max(min((a), (high)), (low))
 #endif
 
-#ifndef ATOMIC_REAL_MINMAX
-#define ATOMIC_REAL_MINMAX(func) \
-  __device__ void atomic_##func(double *address, double val) { \
-    unsigned long long int* address_as_ull = (unsigned long long int*)address; \
-    unsigned long long int old = *address_as_ull; \
-    unsigned long long int assumed; \
-    do { \
-      assumed = old; \
-      old = atomicCAS(address_as_ull, assumed, \
-                      __double_as_longlong(func(val, __longlong_as_double(assumed)))); \
-    } while (assumed != old); \
-  } \
-  __device__ void atomic_##func(float *address, float val) { \
-    int* address_as_int = (int*)address; \
-    int old = *address_as_int; \
-    int assumed; \
-    do { \
-      assumed = old; \
-      old = atomicCAS(address_as_int, assumed, \
-                      __float_as_int(func(val, __int_as_float(assumed)))); \
-    } while (assumed != old); \
-  } \
-
-ATOMIC_REAL_MINMAX(max)
-ATOMIC_REAL_MINMAX(min)
-#endif
+__device__ double atomicExch(double *address, double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long res = atomicExch(address_as_ull, __double_as_longlong(val));
+  return __longlong_as_double(res);
+}
 
 template<typename Ty, bool train>
 __global__ static
@@ -113,14 +92,16 @@ void updateOutput(
       Ty *nWeightCurr = nWeight + nWeightOffset;
       if (train) {
         Ty absVal = fabs(val);
-        Ty maxVal = nWeight[key * weightStride + 0];
+        Ty maxVal = nWeightCurr[0];
         if (absVal > maxVal) {
           // Updating maxVal and invMaxVal. Go hogwild!
-          atomic_max(nWeightCurr + 0, absVal);
-          atomic_min(nWeightCurr + 1, 1.0/absVal);
+          Ty invAbsVal = 1.0 / absVal;
+          atomicExch(nWeightCurr + 0, absVal);
+          atomicExch(nWeightCurr + 1, invAbsVal);
         }
-        val = val * nWeightCurr[1] + nWeightCurr[3];
+        val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3];
         normalizedValues[id + tid] = val;
+        nWeightCurr[2] = 1;
       } else {
         val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3];
       }
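
The double overload of atomicExch added above works because CUDA's built-in atomicExch supports unsigned long long int, and a double can be reinterpreted bit-for-bit with __double_as_longlong / __longlong_as_double, so no CAS retry loop is needed. A minimal usage sketch (the kernel name publish_last is hypothetical, not part of the commit):

// Hypothetical kernel using the new overload: every thread swaps its value in
// unconditionally, so the last writer wins -- the same "go hogwild" policy the
// updateOutput kernel applies to maxVal/invMaxVal.
__global__ void publish_last(double *slot, const double *vals) {
  double previous = atomicExch(slot, vals[threadIdx.x]);
  (void)previous;  // the old value is returned if the caller needs it
}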
New file (+9 / -0)

@@ -0,0 +1,9 @@
+#include "THCUNN.h"
+#include "common.h"
+#include "im2col.h"
+
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
+#include "generic/SpatialDepthWiseConvolution.cu"
+#include "THCGenerateFloatTypes.h"
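
This new file follows the usual THCUNN layout: it only pulls in headers and then includes the generic/ implementation together with THCGenerateFloatTypes.h, which re-includes that implementation once per floating-point type. A self-contained, simplified analogue of this per-type instantiation idea, with invented names (DEFINE_SCALE, scale_Float, ...):

#include <stdio.h>

// One macro expansion per element type stands in for the repeated inclusion of
// the generic/ source that THCGenerateFloatTypes.h performs.
#define DEFINE_SCALE(TYPE, SUFFIX)                        \
  static void scale_##SUFFIX(TYPE *x, int n, TYPE a) {    \
    for (int i = 0; i < n; ++i) x[i] *= a;                \
  }

DEFINE_SCALE(float, Float)    // emits scale_Float
DEFINE_SCALE(double, Double)  // emits scale_Double

int main(void) {
  float x[2] = {1.f, 2.f};
  scale_Float(x, 2, 3.f);
  printf("%g %g\n", x[0], x[1]);  // prints: 3 6
  return 0;
}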

torch/lib/THCUNN/generic/ClassNLLCriterion.cu (+14 / -6)
@@ -9,9 +9,11 @@ void THNN_(ClassNLLCriterion_updateOutput)(
            THCTensor *output,
            bool sizeAverage,
            THCTensor *weights,
-           THCTensor *total_weight) {
+           THCTensor *total_weight,
+           long ignore_index) {
   THCUNN_check_dim_size(state, output, 1, 0, 1);
   THCUNN_check_dim_size(state, total_weight, 1, 0, 1);
+  ignore_index -= TH_INDEX_BASE;
 
   if (THCIndexTensor_(nDimension)(state, target) > 1) {
     THError("multi-target not supported");
@@ -63,7 +65,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(
         target_data,
         weights_data,
         sizeAverage,
-        n_classes
+        n_classes,
+        ignore_index
     );
 
   } else if (THCTensor_(nDimension)(state, input) == 2) {
@@ -77,7 +80,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(
         sizeAverage,
         THCTensor_(size)(state, input, 0),
         THCTensor_(size)(state, input, 1),
-        n_classes
+        n_classes,
+        ignore_index
     );
   }
   THCudaCheck(cudaGetLastError());
@@ -96,10 +100,12 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
            THCTensor *gradInput,
            bool sizeAverage,
            THCTensor *weights,
-           THCTensor *total_weight) {
+           THCTensor *total_weight,
+           long ignore_index) {
   if (THCIndexTensor_(nDimension)(state, target) > 1) {
     THError("multi-target not supported");
   }
+  ignore_index -= TH_INDEX_BASE;
 
   int n_dims = THCTensor_(nDimension)(state, input);
   int n_classes = THCTensor_(size)(state, input, n_dims - 1);
@@ -145,7 +151,8 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
         target_data,
         total_weight_data,
         sizeAverage,
-        n_classes
+        n_classes,
+        ignore_index
     );
   } else {
     cunn_ClassNLLCriterion_updateGradInput_kernel<real>
@@ -157,7 +164,8 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
         sizeAverage,
         THCTensor_(size)(state, input, 0),
         THCTensor_(size)(state, input, 1),
-        n_classes
+        n_classes,
+        ignore_index
     );
   }
   THCudaCheck(cudaGetLastError());
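
Note that these host wrappers shift ignore_index by TH_INDEX_BASE before launching the kernels, mirroring the shift the kernels already apply to each target. The self-contained, host-side-only sketch below illustrates that convention; the values and the standalone definition of TH_INDEX_BASE are examples only, not part of the commit.

#include <cstdio>

#ifndef TH_INDEX_BASE
#define TH_INDEX_BASE 1   // 1 for the Lua Torch front end, 0 for zero-based ones
#endif

int main() {
  long ignore_index = 3;                // user-facing (1-based) class to ignore
  long target = 3;                      // user-facing (1-based) target label
  ignore_index -= TH_INDEX_BASE;        // what updateOutput/updateGradInput do
  int t = (int)target - TH_INDEX_BASE;  // what the kernels do per sample
  std::printf("skipped: %s\n", t == ignore_index ? "yes" : "no");  // prints: yes
  return 0;
}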
