
Commit ae2b2cb

gchanan authored and soumith committed
Make keepdim work with autograd.
1 parent f4cf1d6 commit ae2b2cb
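
The changes below thread a keepdim flag through the autograd reduction Functions. For context, a minimal sketch of what keepdim means for result shapes (written against a plain torch install, not taken from this commit): with keepdim=False the reduced dimension is dropped, and with keepdim=True a size-1 dimension is kept so the result can be expanded back over the input.

import torch

x = torch.randn(10, 20)

s = x.sum(1)                              # keepdim=False: shape (10,), dim 1 is dropped
s_keep = x.sum(1, True)                   # keepdim=True: shape (10, 1)
normalized = x / s_keep.expand(10, 20)    # valid because the singleton dim survived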

File tree

9 files changed, +106 -78 lines changed


test/common_nn.py

Lines changed: 2 additions & 2 deletions
@@ -88,7 +88,7 @@
     dict(
         module_name='Softmax',
         input_size=(10, 20),
-        reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1).expand(10, 20))
+        reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, True).expand(10, 20))
     ),
     dict(
         module_name='Softmax2d',
@@ -98,7 +98,7 @@
     dict(
         module_name='LogSoftmax',
         input_size=(10, 20),
-        reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1).expand(10, 20)).log_()
+        reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, True).expand(10, 20)).log_()
     ),
     dict(
         module_name='LogSoftmax',

test/test_autograd.py

Lines changed: 1 addition & 1 deletion
@@ -1041,7 +1041,7 @@ def test_stochastic(self):
         x = Variable(torch.rand(2, 10), requires_grad=True)
         stddevs = Variable(torch.rand(2, 10) * 5, requires_grad=True)
         y = (x * 2).clamp(0, 1)
-        y = y / y.sum(1).expand_as(y)
+        y = y / y.sum(1, True).expand_as(y)
         samples_multi = y.multinomial(5)
         samples_multi_flat = y[0].multinomial(5)
         samples_bernoulli = y.bernoulli()

test/test_torch.py

Lines changed: 28 additions & 28 deletions
@@ -159,17 +159,17 @@ def _testSelection(self, torchfn, mathfn):
         # with indices
         m1 = torch.randn(100, 100)
         res1val, res1ind = torchfn(m1, 1)
-        res2val = m1[:, 0:1].clone()
+        res2val = m1[:, 0:1].clone().squeeze()
         res2ind = res1ind.clone().fill_(0)
         for i, j in iter_indices(m1):
-            if mathfn(res2val[i, 0], m1[i, j]) != res2val[i, 0]:
-                res2val[i, 0] = m1[i, j]
-                res2ind[i, 0] = j
+            if mathfn(res2val[i], m1[i, j]) != res2val[i]:
+                res2val[i] = m1[i, j]
+                res2ind[i] = j

         maxerr = 0
         for i in range(res1val.size(0)):
-            maxerr = max(maxerr, abs(res1val[i][0] - res2val[i][0]))
-            self.assertEqual(res1ind[i][0], res2ind[i][0])
+            maxerr = max(maxerr, abs(res1val[i] - res2val[i]))
+            self.assertEqual(res1ind[i], res2ind[i])
         self.assertLessEqual(abs(maxerr), 1e-5)

         # NaNs
@@ -514,22 +514,22 @@ def test_addbmm(self):
         res2 = torch.Tensor().resize_as_(res[0]).zero_()

         res2.addbmm_(b1, b2)
-        self.assertEqual(res2, res.sum(0)[0])
+        self.assertEqual(res2, res.sum(0))

         res2.addbmm_(1, b1, b2)
-        self.assertEqual(res2, res.sum(0)[0] * 2)
+        self.assertEqual(res2, res.sum(0) * 2)

         res2.addbmm_(1., .5, b1, b2)
-        self.assertEqual(res2, res.sum(0)[0] * 2.5)
+        self.assertEqual(res2, res.sum(0) * 2.5)

         res3 = torch.addbmm(1, res2, 0, b1, b2)
         self.assertEqual(res3, res2)

         res4 = torch.addbmm(1, res2, .5, b1, b2)
-        self.assertEqual(res4, res.sum(0)[0] * 3)
+        self.assertEqual(res4, res.sum(0) * 3)

         res5 = torch.addbmm(0, res2, 1, b1, b2)
-        self.assertEqual(res5, res.sum(0)[0])
+        self.assertEqual(res5, res.sum(0))

         res6 = torch.addbmm(.1, res2, .5, b1, b2)
         self.assertEqual(res6, res2 * .1 + res.sum(0) * .5)
@@ -744,7 +744,7 @@ def renorm(matrix, value, dim, max_norm):
             m1 = matrix.transpose(dim, 0).contiguous()
             # collapse non-dim dimensions.
             m2 = m1.clone().resize_(m1.size(0), int(math.floor(m1.nelement() / m1.size(0))))
-            norms = m2.norm(value, 1)
+            norms = m2.norm(value, 1, True)
             # clip
             new_norms = norms.clone()
             new_norms[torch.gt(norms, max_norm)] = max_norm
@@ -1070,23 +1070,23 @@ def test_kthvalue(self):
         res1val, res1ind = torch.kthvalue(x, k)
         res2val, res2ind = torch.sort(x)

-        self.assertEqual(res1val[:, :, 0], res2val[:, :, k - 1], 0)
-        self.assertEqual(res1ind[:, :, 0], res2ind[:, :, k - 1], 0)
+        self.assertEqual(res1val[:, :], res2val[:, :, k - 1], 0)
+        self.assertEqual(res1ind[:, :], res2ind[:, :, k - 1], 0)
         # test use of result tensors
         k = random.randint(1, SIZE)
         res1val = torch.Tensor()
         res1ind = torch.LongTensor()
         torch.kthvalue(x, k, out=(res1val, res1ind))
         res2val, res2ind = torch.sort(x)
-        self.assertEqual(res1val[:, :, 0], res2val[:, :, k - 1], 0)
-        self.assertEqual(res1ind[:, :, 0], res2ind[:, :, k - 1], 0)
+        self.assertEqual(res1val[:, :], res2val[:, :, k - 1], 0)
+        self.assertEqual(res1ind[:, :], res2ind[:, :, k - 1], 0)

         # test non-default dim
         k = random.randint(1, SIZE)
         res1val, res1ind = torch.kthvalue(x, k, 0)
         res2val, res2ind = torch.sort(x, 0)
-        self.assertEqual(res1val[0], res2val[k - 1], 0)
-        self.assertEqual(res1ind[0], res2ind[k - 1], 0)
+        self.assertEqual(res1val, res2val[k - 1], 0)
+        self.assertEqual(res1ind, res2ind[k - 1], 0)

         # non-contiguous
         y = x.narrow(1, 0, 1)
@@ -1110,12 +1110,12 @@ def test_median(self):
         x = torch.rand(size, size)
         x0 = x.clone()

-        res1val, res1ind = torch.median(x)
+        res1val, res1ind = torch.median(x, False)
         res2val, res2ind = torch.sort(x)
         ind = int(math.floor((size + 1) / 2) - 1)

-        self.assertEqual(res2val.select(1, ind), res1val.select(1, 0), 0)
-        self.assertEqual(res2val.select(1, ind), res1val.select(1, 0), 0)
+        self.assertEqual(res2val.select(1, ind), res1val, 0)
+        self.assertEqual(res2val.select(1, ind), res1val, 0)

         # Test use of result tensor
         res2val = torch.Tensor()
@@ -1127,8 +1127,8 @@ def test_median(self):
         # Test non-default dim
         res1val, res1ind = torch.median(x, 0)
         res2val, res2ind = torch.sort(x, 0)
-        self.assertEqual(res1val[0], res2val[ind], 0)
-        self.assertEqual(res1ind[0], res2ind[ind], 0)
+        self.assertEqual(res1val, res2val[ind], 0)
+        self.assertEqual(res1ind, res2ind[ind], 0)

         # input unchanged
         self.assertEqual(x, x0, 0)
@@ -1140,9 +1140,9 @@ def test_mode(self):
         x0 = x.clone()

         # Pre-calculated results.
-        res1val = torch.Tensor(SIZE, 1).fill_(1)
+        res1val = torch.Tensor(SIZE).fill_(1)
         # The indices are the position of the last appearance of the mode element.
-        res1ind = torch.LongTensor(SIZE, 1).fill_(1)
+        res1ind = torch.LongTensor(SIZE).fill_(1)
         res1ind[0] = SIZE - 1
         res1ind[1] = SIZE - 1

@@ -1160,8 +1160,8 @@ def test_mode(self):

         # Test non-default dim
         res2val, res2ind = torch.mode(x, 0)
-        self.assertEqual(res1val.view(1, SIZE), res2val, 0)
-        self.assertEqual(res1ind.view(1, SIZE), res2ind, 0)
+        self.assertEqual(res1val, res2val, 0)
+        self.assertEqual(res1ind, res2ind, 0)

         # input unchanged
         self.assertEqual(x, x0, 0)
@@ -2217,7 +2217,7 @@ def _test_gather(self, cast, test_bounds=True):
         self.assertRaises(RuntimeError, lambda: torch.gather(src, dim, idx))

         src = cast(torch.randn(3, 4, 5))
-        expected, idx = src.max(2)
+        expected, idx = src.max(2, True)
         expected = cast(expected)
         idx = cast(idx)
         actual = torch.gather(src, 2, idx)
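
The _test_gather change switches to src.max(2, True) because torch.gather expects the index tensor to have the same number of dimensions as the source, so keeping the reduced dimension makes the argmax indices usable directly. A small shape sketch (an assumed example, not taken from the test file):

import torch

src = torch.randn(3, 4, 5)
values, idx = src.max(2, True)        # idx shape: (3, 4, 1) -- same ndim as src
gathered = torch.gather(src, 2, idx)  # shape: (3, 4, 1); equals values
# Without keepdim, idx would have shape (3, 4) and would need idx.unsqueeze(2)
# before it could be used with gather along dim 2.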

torch/autograd/_functions/reduce.py

Lines changed: 42 additions & 17 deletions
@@ -7,36 +7,41 @@
 class Sum(Function):

     @staticmethod
-    def forward(ctx, input, dim=None):
+    def forward(ctx, input, dim=None, keepdim=False):
         ctx.dim = dim
+        ctx.keepdim = keepdim
         ctx.input_size = input.size()
         if dim is None:
             return input.new((input.sum(),))
         else:
-            return input.sum(dim)
+            return input.sum(dim, keepdim)

     @staticmethod
     def backward(ctx, grad_output):
         if ctx.dim is None:
-            return grad_output.expand(ctx.input_size), None
+            return grad_output.expand(ctx.input_size), None, None
         else:
+            if ctx.keepdim is False:
+                grad_output = grad_output.unsqueeze(ctx.dim)
+
             repeats = [1 for _ in ctx.input_size]
             repeats[ctx.dim] = ctx.input_size[ctx.dim]
-            return grad_output.repeat(*repeats), None
+            return grad_output.repeat(*repeats), None, None


 class Prod(Function):

     @staticmethod
-    def forward(ctx, input, dim=None):
+    def forward(ctx, input, dim=None, keepdim=False):
         ctx.dim = dim
+        ctx.keepdim = keepdim
         ctx.input_size = input.size()
         if dim is None:
             ctx.result = input.prod()
             ctx.save_for_backward(input)
             return input.new((ctx.result,))
         else:
-            output = input.prod(dim)
+            output = input.prod(dim, keepdim)
             ctx.save_for_backward(input, output)
             return output

@@ -59,8 +64,11 @@ def backward(ctx, grad_output):
         else:
             input, output = ctx.saved_variables
             dim = ctx.dim if ctx.dim >= 0 else ctx.dim + input.dim()
+            if ctx.keepdim is False:
+                grad_output = grad_output.unsqueeze(dim)
+
             zero_mask = input == 0
-            slice_zero_count = zero_mask.sum(dim)
+            slice_zero_count = zero_mask.sum(dim, True)
             total_zeros = slice_zero_count.sum()
             grad_input = grad_output.mul(output).expand_as(input).div(input)
             if total_zeros == 0:
@@ -93,24 +101,28 @@ def backward(ctx, grad_output):
 class Mean(Function):

     @staticmethod
-    def forward(ctx, input, dim=None):
+    def forward(ctx, input, dim=None, keepdim=False):
         ctx.dim = dim
+        ctx.keepdim = keepdim
         ctx.input_size = input.size()
         if dim is None:
             return input.new((input.mean(),))
         else:
-            return input.mean(dim)
+            return input.mean(dim, keepdim)

     @staticmethod
     def backward(ctx, grad_output):
         if ctx.dim is None:
             grad_input_val = grad_output / reduce(lambda x, y: x * y, ctx.input_size, 1)
-            return grad_input_val.expand(ctx.input_size), None
+            return grad_input_val.expand(ctx.input_size), None, None
         else:
+            if ctx.keepdim is False:
+                grad_output = grad_output.unsqueeze(ctx.dim)
+
             repeats = [1 for _ in ctx.input_size]
             dim_size = ctx.input_size[ctx.dim]
             repeats[ctx.dim] = dim_size
-            return grad_output.repeat(*repeats).div_(dim_size), None
+            return grad_output.repeat(*repeats).div_(dim_size), None, None


 class _SelectionFunction(Function):
@@ -120,9 +132,10 @@ class _SelectionFunction(Function):
     # kthvalue not only requires us to pass a dim, but also preceed it with k.
     additional_args = tuple()

-    def __init__(self, dim=None):
+    def __init__(self, dim=None, keepdim=False):
         super(_SelectionFunction, self).__init__()
         self.dim = dim
+        self.keepdim = keepdim

     def forward(self, input):
         fn = getattr(input, type(self).__name__.lower())
@@ -136,7 +149,7 @@ def forward(self, input):
             dim = input.dim() - 1
         else:
             dim = self.dim
-        args = (dim,)
+        args = (dim, self.keepdim)
         if self.additional_args:
             args = self.additional_args + args
         output, indices = fn(*args)
@@ -153,7 +166,13 @@ def backward(self, grad_output, grad_indices=None):
             dim = input.dim() - 1
         else:
             dim = self.dim
+
         indices, = self.saved_tensors
+        if self.keepdim is False:
+            grad_output = grad_output.unsqueeze(dim)
+            grad_indices = grad_indices.unsqueeze(dim)
+            indices = indices.unsqueeze(dim)
+
         grad_input.scatter_(dim, indices, grad_output)
         return grad_input

@@ -177,25 +196,26 @@ class Median(_SelectionFunction):
 class Kthvalue(_SelectionFunction):
     has_all_reduce = False

-    def __init__(self, k, dim=None):
-        super(Kthvalue, self).__init__(dim)
+    def __init__(self, k, dim=None, keepdim=False):
+        super(Kthvalue, self).__init__(dim, keepdim)
         self.additional_args = (k,)


 class Norm(Function):

-    def __init__(self, norm_type=2, dim=None):
+    def __init__(self, norm_type=2, dim=None, keepdim=False):
         super(Norm, self).__init__()
         self.norm_type = norm_type
         self.dim = dim
+        self.keepdim = keepdim

     def forward(self, input):
         if self.dim is None:
             self.norm = input.norm(self.norm_type)
             self.save_for_backward(input)
             return input.new((self.norm,))
         else:
-            output = input.norm(self.norm_type, self.dim)
+            output = input.norm(self.norm_type, self.dim, self.keepdim)
             self.save_for_backward(input, output)
             return output

@@ -210,6 +230,11 @@ def backward(self, grad_output):
             return input.mul(pow).mul(scale)
         else:
             input, output = self.saved_tensors
+
+            if self.keepdim is False:
+                grad_output = grad_output.unsqueeze(self.dim)
+                output = output.unsqueeze(self.dim)
+
             big_grad_output = grad_output.expand_as(input)
             if self.norm_type == 2:
                 big_output = output.expand_as(input)
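
The pattern repeated across these backward methods: if the forward reduction ran with keepdim=False, the incoming gradient is missing the reduced dimension, so it is unsqueezed back at that dim before being repeated or expanded to the input's shape. A stripped-down sketch of that bookkeeping for Sum (an illustrative helper, not the module's code):

import torch

def sum_backward(grad_output, input_size, dim, keepdim):
    # Re-insert the reduced dimension if the forward pass dropped it,
    # so the gradient broadcasts back over the original input shape.
    if not keepdim:
        grad_output = grad_output.unsqueeze(dim)
    # Each input element contributes to the sum with weight 1, so the
    # gradient is just the output gradient replicated along dim.
    return grad_output.expand(input_size)

x = torch.randn(4, 5)
g = torch.ones(4)                          # grad of y = x.sum(1) with keepdim=False
gx = sum_backward(g, x.size(), 1, False)   # shape: (4, 5)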

torch/autograd/_functions/stochastic.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ def backward(self, reward):
             probs = probs.unsqueeze(0)
             samples = samples.unsqueeze(0)
         # normalize probs (multinomial accepts weights)
-        probs /= probs.sum(1).expand_as(probs)
+        probs /= probs.sum(1, True).expand_as(probs)
         grad_probs = probs.new().resize_as_(probs).zero_()
         output_probs = probs.gather(1, samples)
         output_probs.add_(1e-6).reciprocal_()

torch/autograd/_functions/tensor.py

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ def backward(ctx, grad_output):
         for i in range(ctx.num_unsqueezed):
            grad_input = grad_input.sum(0).squeeze(0)
         for dim in ctx.expanded_dims:
-            grad_input = grad_input.sum(dim)
+            grad_input = grad_input.sum(dim, True)
         return grad_input, None
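
The Expand backward change sums over each expanded dimension with keepdim=True, so the gradient collapses back to the original singleton dimension instead of losing it. A rough illustration with a hypothetical helper (not the Function itself):

import torch

def reduce_expanded_grad(grad, expanded_dims):
    # Sum over every dim that was expanded from size 1, keeping the dim
    # so the result matches the pre-expand shape (e.g. (4, 1), not (4,)).
    for dim in expanded_dims:
        grad = grad.sum(dim, True)
    return grad

# Suppose x had shape (4, 1) and was expanded to (4, 3).
grad_out = torch.ones(4, 3)
grad_in = reduce_expanded_grad(grad_out, expanded_dims=[1])
print(grad_in.shape)   # torch.Size([4, 1]); each entry is 3.0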

0 commit comments
