
Commit 09abaa2

make keepdim backcompat warnings emit in autograd as well (pytorch#2157)
1 parent 575a4a9 commit 09abaa2
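
Before the per-file diffs, a minimal sketch of the behavior this change targets, stitched together from the new test below (illustrative only; the warning text itself comes from the tensor-level backcompat machinery, not from this commit): with the keepdim backcompat flag enabled, calling a reduction on a Variable without an explicit keepdim should emit the same UserWarning the raw tensor call does, and backward should still run.

import warnings
import torch
from torch.autograd import Variable

torch.utils.backcompat.keepdim_warning.enabled = True
x = Variable(torch.randn(3, 4), requires_grad=True)
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    y = torch.sum(x, 1)  # keepdim omitted: expect a UserWarning mentioning "keepdim"
assert any("keepdim" in str(m.message) for m in w)
y.backward(torch.ones(y.size()))  # gradients still flow through the warned call
torch.utils.backcompat.keepdim_warning.enabled = False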

File tree (4 files changed: +93 -31 lines)
  setup.py
  test/test_autograd.py
  torch/autograd/_functions/reduce.py
  torch/autograd/variable.py


setup.py

Lines changed: 1 addition & 1 deletion
@@ -236,7 +236,7 @@ def run(self):
         CXXNAME = os.getenv('CXX', 'g++')
         path = subprocess.check_output([CXXNAME, '-print-file-name=libstdc++.a'])
         path = path[:-1]
-        if type(path) != str:  # python 3
+        if type(path) != str:  # python 3
             path = path.decode(sys.stdout.encoding)
         extra_link_args += [path]

test/test_autograd.py

Lines changed: 46 additions & 0 deletions
@@ -1366,6 +1366,52 @@ def backward(self, grad_output):
         c.backward(torch.ones(c.size()))
         self.assertEqual(x.grad.data, torch.ones(x.size()))
 
+    def test_keepdim_warning(self):
+        torch.utils.backcompat.keepdim_warning.enabled = True
+        x = Variable(torch.randn(3, 4), requires_grad=True)
+
+        def run_backward(y):
+            y_ = y
+            if type(y) is tuple:
+                y_ = y[0]
+            # check that backward runs smooth
+            y_.backward(y_.data.new(y_.size()).normal_())
+
+        def keepdim_check(f):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                y = f(x, 1)
+                self.assertTrue(len(w) == 1)
+                self.assertTrue(issubclass(w[-1].category, UserWarning))
+                self.assertTrue("keepdim" in str(w[-1].message))
+                run_backward(y)
+                self.assertEqual(x.size(), x.grad.size())
+
+                # check against explicit keepdim
+                y2 = f(x, 1, keepdim=False)
+                self.assertEqual(y, y2)
+                run_backward(y2)
+
+                y3 = f(x, 1, keepdim=True)
+                if type(y3) == tuple:
+                    y3 = (y3[0].squeeze(1), y3[1].squeeze(1))
+                else:
+                    y3 = y3.squeeze(1)
+                self.assertEqual(y, y3)
+                run_backward(y3)
+
+        keepdim_check(torch.sum)
+        keepdim_check(torch.prod)
+        keepdim_check(torch.mean)
+        keepdim_check(torch.max)
+        keepdim_check(torch.min)
+        keepdim_check(torch.mode)
+        keepdim_check(torch.median)
+        keepdim_check(torch.kthvalue)
+        keepdim_check(torch.var)
+        keepdim_check(torch.std)
+        torch.utils.backcompat.keepdim_warning.enabled = False
+
 
 def index_variable(shape, max_indices):
     if not isinstance(shape, tuple):

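The new test asserts, for each reduction f, that the warned default produces the same result as an explicit keepdim=False, and that keepdim=True differs only by the kept dimension. The same equivalences in a compressed, stand-alone sketch (not part of the test; torch.equal and the stated shapes are the only assumptions beyond what the test exercises):

import torch
from torch.autograd import Variable

x = Variable(torch.randn(3, 4), requires_grad=True)
y = torch.sum(x, 1)                    # default: warns while the backcompat flag is on, dim 1 removed
y2 = torch.sum(x, 1, keepdim=False)    # explicit False: same values and shape as y
y3 = torch.sum(x, 1, keepdim=True)     # explicit True: dim 1 kept with size 1
assert torch.equal(y.data, y2.data)
assert torch.equal(y.data, y3.squeeze(1).data)
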
torch/autograd/_functions/reduce.py

Lines changed: 32 additions & 17 deletions
@@ -8,14 +8,17 @@
 class Sum(Function):
 
     @staticmethod
-    def forward(ctx, input, dim=None, keepdim=False):
+    def forward(ctx, input, dim=None, keepdim=None):
         ctx.dim = dim
-        ctx.keepdim = keepdim
+        ctx.keepdim = False if keepdim is None else keepdim
         ctx.input_size = input.size()
         if dim is None:
             return input.new((input.sum(),))
         else:
-            return input.sum(dim, keepdim)
+            if keepdim is not None:
+                return input.sum(dim, keepdim=keepdim)
+            else:
+                return input.sum(dim)
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -33,16 +36,19 @@ def backward(ctx, grad_output):
 class Prod(Function):
 
     @staticmethod
-    def forward(ctx, input, dim=None, keepdim=False):
+    def forward(ctx, input, dim=None, keepdim=None):
         ctx.dim = dim
-        ctx.keepdim = keepdim
+        ctx.keepdim = False if keepdim is None else keepdim
         ctx.input_size = input.size()
         if dim is None:
             ctx.result = input.prod()
             ctx.save_for_backward(input)
             return input.new((ctx.result,))
         else:
-            output = input.prod(dim, keepdim)
+            if keepdim is not None:
+                output = input.prod(dim, keepdim=keepdim)
+            else:
+                output = input.prod(dim)
             ctx.save_for_backward(input, output)
             return output
 
@@ -105,14 +111,17 @@ def reverse_dim(var, dim):
 class Mean(Function):
 
     @staticmethod
-    def forward(ctx, input, dim=None, keepdim=False):
+    def forward(ctx, input, dim=None, keepdim=None):
         ctx.dim = dim
-        ctx.keepdim = keepdim
+        ctx.keepdim = False if keepdim is None else keepdim
         ctx.input_size = input.size()
         if dim is None:
             return input.new((input.mean(),))
         else:
-            return input.mean(dim, keepdim)
+            if keepdim is not None:
+                return input.mean(dim, keepdim=keepdim)
+            else:
+                return input.mean(dim)
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -136,10 +145,10 @@ class _SelectionFunction(Function):
     # kthvalue not only requires us to pass a dim, but also preceed it with k.
 
     @classmethod
-    def forward(cls, ctx, input, dim=None, keepdim=False, additional_args=tuple()):
+    def forward(cls, ctx, input, dim=None, keepdim=None, additional_args=tuple()):
         fn = getattr(input, cls.__name__.lower())
         ctx.dim = dim
-        ctx.keepdim = keepdim
+        ctx.keepdim = False if keepdim is None else keepdim
         ctx.additional_args = additional_args
         ctx.input_size = input.size()
         if ctx.dim is None and cls.has_all_reduce:
@@ -151,10 +160,13 @@ def forward(cls, ctx, input, dim=None, keepdim=False, additional_args=tuple()):
                 dim = input.dim() - 1
             else:
                 dim = ctx.dim
-            args = (dim, keepdim)
+            args = (dim,)
             if additional_args:
                 args = additional_args + args
-            output, indices = fn(*args)
+            if keepdim is not None:
+                output, indices = fn(*args, keepdim=keepdim)
+            else:
+                output, indices = fn(*args)
             ctx.save_for_backward(indices)
             ctx.mark_non_differentiable(indices)
             return output, indices
@@ -200,24 +212,27 @@ class Kthvalue(_SelectionFunction):
     has_all_reduce = False
 
     @classmethod
-    def forward(cls, ctx, input, k, dim=None, keepdim=False):
+    def forward(cls, ctx, input, k, dim=None, keepdim=None):
         return super(Kthvalue, cls).forward(ctx, input, dim, keepdim, (k,))
 
 
 class Norm(Function):
 
     @staticmethod
-    def forward(ctx, input, p=2, dim=None, keepdim=False):
+    def forward(ctx, input, p=2, dim=None, keepdim=None):
         ctx.p = p
         ctx.dim = dim
-        ctx.keepdim = keepdim
+        ctx.keepdim = False if keepdim is None else keepdim
 
         if dim is None:
             ctx.norm = input.norm(p)
             ctx.save_for_backward(input)
             return input.new((ctx.norm,))
         else:
-            output = input.norm(p, dim, keepdim)
+            if keepdim is not None:
+                output = input.norm(p, dim, keepdim=keepdim)
+            else:
+                output = input.norm(p, dim)
             ctx.save_for_backward(input, output)
             return output

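The recurring edit in reduce.py above follows one pattern: keepdim now defaults to None so each Function can distinguish "caller omitted keepdim" from an explicit False, it records an effective value (False when omitted) for use in backward, and it forwards the keyword to the underlying tensor op only when the caller supplied it, so the tensor-level backcompat warning fires exactly when the argument was left out. The pattern in isolation (illustrative names, not part of the real API):

def call_reduction(tensor_op, dim, keepdim=None):
    # effective value kept for backward; None falls back to the new False default
    effective_keepdim = False if keepdim is None else keepdim
    if keepdim is not None:
        out = tensor_op(dim, keepdim=keepdim)  # explicit: no backcompat warning expected
    else:
        out = tensor_op(dim)  # omitted: the tensor op itself may warn
    return out, effective_keepdim
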
torch/autograd/variable.py

Lines changed: 14 additions & 13 deletions
@@ -449,32 +449,32 @@ def lerp(self, tensor, weight):
     def rsqrt(self):
         return Rsqrt.apply(self)
 
-    def sum(self, dim=None, keepdim=False):
+    def sum(self, dim=None, keepdim=None):
         return Sum.apply(self, dim, keepdim)
 
-    def prod(self, dim=None, keepdim=False):
+    def prod(self, dim=None, keepdim=None):
         return Prod.apply(self, dim, keepdim)
 
-    def mean(self, dim=None, keepdim=False):
+    def mean(self, dim=None, keepdim=None):
         return Mean.apply(self, dim, keepdim)
 
-    def max(self, dim=None, keepdim=False):
+    def max(self, dim=None, keepdim=None):
         if isinstance(dim, Variable):
             return Cmax.apply(self, dim)
         return Max.apply(self, dim, keepdim)
 
-    def min(self, dim=None, keepdim=False):
+    def min(self, dim=None, keepdim=None):
         if isinstance(dim, Variable):
             return Cmin.apply(self, dim)
         return Min.apply(self, dim, keepdim)
 
-    def mode(self, dim=None, keepdim=False):
+    def mode(self, dim=None, keepdim=None):
         return Mode.apply(self, dim, keepdim)
 
-    def median(self, dim=None, keepdim=False):
+    def median(self, dim=None, keepdim=None):
         return Median.apply(self, dim, keepdim)
 
-    def kthvalue(self, k, dim=None, keepdim=False):
+    def kthvalue(self, k, dim=None, keepdim=None):
         return Kthvalue.apply(self, k, dim, keepdim)
 
     def sort(self, dim=None, descending=False):
@@ -508,20 +508,21 @@ def cumprod(self, dim):
     def unfold(self, dim, size, step):
         return Unfold.apply(self, dim, size, step)
 
-    def var(self, dim=None, keepdim=False, unbiased=True):
+    def var(self, dim=None, keepdim=None, unbiased=True):
+        keepdim_ = False if keepdim is None else keepdim
         mean = self.mean(dim, keepdim)
         if dim is None:
             mean = mean.view(*(1 for s in self.size()))
         # we could just set keepdim to True, but this preserves some fidelity
-        elif keepdim is False and self.dim() != 1:
+        elif keepdim_ is False and self.dim() != 1:
             mean = mean.unsqueeze(dim)
         mean_expanded = mean.expand_as(self)
         zero_centered = self.sub(mean_expanded)
-        var = zero_centered.mul(zero_centered).sum(dim, keepdim)
+        var = zero_centered.mul(zero_centered).sum(dim, keepdim=keepdim_)
         numel = self.numel() if dim is None else self.size(dim)
         return var.div(numel - int(unbiased))
 
-    def std(self, dim=None, keepdim=False, unbiased=True):
+    def std(self, dim=None, keepdim=None, unbiased=True):
         return self.var(dim, keepdim, unbiased).sqrt()
 
     def renorm(self, p, dim, maxnorm):
@@ -626,7 +627,7 @@ def addcmul_(self, *args):
     def addcdiv_(self, *args):
         return self._addcop(Addcdiv, args, True)
 
-    def norm(self, p=2, dim=None, keepdim=False):
+    def norm(self, p=2, dim=None, keepdim=None):
         return Norm.apply(self, p, dim, keepdim)
 
     def dist(self, tensor, p=2):

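The variable.py changes are the user-facing part: every reduction method on Variable now defaults keepdim to None and passes it through unchanged, so omitting it behaves (and warns) like the corresponding tensor call, while var and std normalize None to False internally via keepdim_. A minimal usage sketch (shapes assume a 3x4 input, as in the test above):

import torch
from torch.autograd import Variable

v = Variable(torch.randn(3, 4), requires_grad=True)
a = v.std(1)                 # keepdim omitted: forwarded as None, may warn, result has size (3,)
b = v.std(1, keepdim=True)   # explicit: result keeps dim 1, size (3, 1)
c = v.var(1, keepdim=False)  # explicit False: handled via the internal keepdim_ normalization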