
Commit 65b6626 (parent 34ce58c)

Improve broadcast/reduce performance by coalescing tensors

3 files changed (+175, -46 lines)

torch/cuda/comm.py

Lines changed: 93 additions & 0 deletions

@@ -28,6 +28,35 @@ def broadcast(tensor, devices):
     return tuple(tensor.cuda(gpu, async=True) for gpu in devices)


+def broadcast_coalesced(tensors, devices, buffer_size=10485760):
+    """Broadcasts a sequence of tensors to the specified GPUs.
+
+    Small tensors are first coalesced into a buffer to reduce the number
+    of synchronizations.
+
+    Arguments:
+        tensors (sequence): tensors to broadcast.
+        devices (Iterable): an iterable of devices to which to broadcast.
+        buffer_size (int): maximum size of the buffer used for coalescing.
+
+    Returns:
+        A tuple containing copies of the ``tensors``, placed on devices
+        corresponding to indices from ``devices``.
+    """
+    for tensor in tensors:
+        if tensor.get_device() != devices[0]:
+            raise RuntimeError('all tensors must be on devices[0]')
+    outputs = [[] for _ in devices]
+    # use the original tensors for the first device
+    outputs[0].extend(tensors)
+    for chunk in _take_tensors(tensors, buffer_size):
+        results = broadcast(_flatten_tensors(chunk), devices)
+        # use the broadcasted tensors for the remaining devices
+        for dst, res in zip(outputs[1:], results[1:]):
+            dst.extend(_unflatten_tensors(res, chunk))
+    return tuple(outputs)
+
+
 def reduce_add(inputs, destination=None):
     """Sums tensors from multiple GPUs.

@@ -68,6 +97,31 @@ def reduce_add(inputs, destination=None):
     return result


+def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
+    """Sums tensors from multiple GPUs.
+
+    Small tensors are first coalesced into a buffer to reduce the number
+    of synchronizations.
+
+    Arguments:
+        inputs (Iterable[Iterable[Tensor]]): iterables of tensors to add, one per device.
+        destination (int, optional): a device on which the output will be
+            placed (default: current device).
+        buffer_size (int): maximum size of the buffer used for coalescing.
+
+    Returns:
+        A tuple of tensors containing an elementwise sum of each group of
+        inputs, placed on the ``destination`` device.
+    """
+    output = []
+    itrs = [_take_tensors(tensors, buffer_size) for tensors in inputs]
+    for chunks in zip(*itrs):
+        flattened = [_flatten_tensors(chunk) for chunk in chunks]
+        result = reduce_add(flattened, destination)
+        output.extend(_unflatten_tensors(result, chunks[0]))
+    return tuple(output)
+
+
 def scatter(tensor, devices, chunk_sizes=None, dim=0):
     """Scatters tensor across multiple GPUs.

@@ -142,3 +196,42 @@ def gather(tensors, dim=0, destination=None):
         result.narrow(dim, chunk_start, tensor.size(dim)).copy_(tensor, True)
         chunk_start += tensor.size(dim)
     return result
+
+
+def _flatten_tensors(tensors):
+    """Flatten tensors into a single contiguous 1D buffer."""
+    if len(tensors) == 1:
+        return tensors[0].contiguous().view(-1)
+    size = sum(tensor.numel() for tensor in tensors)
+    offset = 0
+    flat = tensors[0].new(size)
+    for tensor in tensors:
+        flat.narrow(0, offset, tensor.numel()).copy_(tensor)
+        offset += tensor.numel()
+    return flat
+
+
+def _unflatten_tensors(flat, tensors):
+    """View a flat buffer using the sizes of tensors."""
+    outputs = []
+    offset = 0
+    for tensor in tensors:
+        outputs.append(flat.narrow(0, offset, tensor.numel()).view_as(tensor))
+        offset += tensor.numel()
+    return tuple(outputs)
+
+
+def _take_tensors(tensors, size_limit):
+    """Group tensors into lists of up to size_limit bytes."""
+    buf = []
+    size = 0
+    for tensor in tensors:
+        param_size = tensor.numel() * tensor.element_size()
+        if size + param_size > size_limit and size > 0:
+            yield buf
+            size = 0
+            buf = []
+        buf.append(tensor)
+        size += param_size
+    if len(buf) > 0:
+        yield buf
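
For illustration, a minimal sketch of the coalescing round trip through the public entry point added above, assuming two visible GPUs (tensor sizes are arbitrary):

    import torch
    import torch.cuda.comm as comm

    # two small tensors living on GPU 0
    tensors = [torch.randn(4, 4).cuda(0), torch.randn(16).cuda(0)]

    # _take_tensors groups them into chunks of at most buffer_size bytes,
    # _flatten_tensors packs each chunk into one contiguous 1D buffer,
    # a single broadcast per chunk copies it to every device, and
    # _unflatten_tensors restores the original shapes as views of the buffer.
    copies = comm.broadcast_coalesced(tensors, [0, 1])

    for device_tensors in copies:
        print([tuple(t.size()) for t in device_tensors])
    # each per-device replica mirrors the input shapes: [(4, 4), (16,)]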

torch/nn/parallel/_functions.py

Lines changed: 13 additions & 8 deletions

@@ -1,4 +1,3 @@
-import torch.cuda
 import torch.cuda.comm as comm
 from torch.autograd import Function

@@ -9,13 +8,19 @@ def __init__(self, target_gpus):
         super(Broadcast, self).__init__()
         self.target_gpus = target_gpus

-    def forward(self, input):
-        assert input.is_cuda, "Broadcast function not implemented for CPU tensors"
-        self.input_device = input.get_device()
-        return comm.broadcast(input, self.target_gpus)
-
-    def backward(self, *grad_output):
-        return comm.reduce_add(grad_output, self.input_device)
+    def forward(self, *inputs):
+        if not all(input.is_cuda for input in inputs):
+            raise TypeError('Broadcast function not implemented for CPU tensors')
+        if len(inputs) == 0:
+            return tuple()
+        self.input_device = inputs[0].get_device()
+        outputs = comm.broadcast_coalesced(inputs, self.target_gpus)
+        return tuple([t for tensors in outputs for t in tensors])
+
+    def backward(self, *grad_outputs):
+        grad_outputs = [grad_outputs[i:i + self.num_inputs]
+                        for i in range(0, len(grad_outputs), self.num_inputs)]
+        return comm.reduce_add_coalesced(grad_outputs, self.input_device)


 class Gather(Function):
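
Because forward now flattens its per-device outputs into one tuple, backward has to slice the incoming gradients back into per-device groups before reduce_add_coalesced sums them. A small illustration of that indexing convention (plain Python; the values stand in for gradient tensors):

    # assume 3 broadcast inputs and 2 target GPUs, so 6 gradients arrive,
    # laid out device-major: (g0_gpu0, g1_gpu0, g2_gpu0, g0_gpu1, g1_gpu1, g2_gpu1)
    grad_outputs = list(range(6))   # stand-ins for the 6 gradient tensors
    num_inputs = 3                  # number of tensors passed to forward
    groups = [grad_outputs[i:i + num_inputs]
              for i in range(0, len(grad_outputs), num_inputs)]
    assert groups == [[0, 1, 2], [3, 4, 5]]   # one list per source device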

torch/nn/parallel/replicate.py

Lines changed: 69 additions & 38 deletions

@@ -1,42 +1,73 @@
-from copy import copy
-from collections import OrderedDict
-
-from ..modules import Module
 import torch.cuda.comm as comm


-def _replicate_module(module, gpu, param_remap):
-    if module is None:
-        return module
-    replica = copy(module)
-    replica._parameters = OrderedDict()
-    for key, param in module._parameters.items():
-        replica._parameters[key] = param_remap.get(param)
-    replica._buffers = {}
-    for key, buffer in module._buffers.items():
-        replica._buffers[key] = param_remap.get(buffer)
-    if replica._modules:
-        replica._modules = OrderedDict()
-        for name, child in module._modules.items():
-            replica._modules[name] = _replicate_module(child, gpu, param_remap)
-    return replica
-
-
-def replicate(module, device_ids):
+def replicate(network, devices):
     from ._functions import Broadcast
-    seen_params = set()
-    param_remap = [{} for dev_id in device_ids]
-    for param in module.parameters():
-        if param in seen_params:
-            continue
-        seen_params.add(param)
-        param_copies = Broadcast(device_ids)(param)
-        for param_copy, remap in zip(param_copies, param_remap):
-            remap[param] = param_copy
-    for m in module.modules():
-        for buffer in m._buffers.values():
-            copies = comm.broadcast(buffer, device_ids)
-            for buf_copy, remap in zip(copies, param_remap):
-                remap[buffer] = buf_copy
-    return [_replicate_module(module, device_id, remap)
-            for device_id, remap in zip(device_ids, param_remap)]
+
+    devices = tuple(devices)
+    num_replicas = len(devices)
+
+    params = list(network.parameters())
+    param_indices = {param: idx for idx, param in enumerate(params)}
+    param_copies = Broadcast(devices)(*params)
+    if len(params) > 0:
+        param_copies = [param_copies[i:i + len(params)]
+                        for i in range(0, len(param_copies), len(params))]
+
+    buffers = _buffers(network)
+    buffer_indices = {buf: idx for idx, buf in enumerate(buffers)}
+    buffer_copies = comm.broadcast_coalesced(buffers, devices)
+
+    modules = list(network.modules())
+    module_copies = [[] for device in devices]
+    module_indices = {}
+
+    for i, module in enumerate(modules):
+        module_indices[module] = i
+        for j in range(num_replicas):
+            replica = module.__new__(type(module))
+            replica.__dict__ = module.__dict__.copy()
+            replica._parameters = replica._parameters.copy()
+            replica._buffers = replica._buffers.copy()
+            replica._modules = replica._modules.copy()
+            module_copies[j].append(replica)
+
+    for i, module in enumerate(modules):
+        for key, child in module._modules.items():
+            module_idx = module_indices[child]
+            for j in range(num_replicas):
+                replica = module_copies[j][i]
+                replica._modules[key] = module_copies[j][module_idx]
+        for key, param in module._parameters.items():
+            if param is None:
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    replica._parameters[key] = None
+            else:
+                param_idx = param_indices[param]
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    replica._parameters[key] = param_copies[j][param_idx]
+        for key, buf in module._buffers.items():
+            if buf is None:
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    replica._buffers[key] = None
+            else:
+                buffer_idx = buffer_indices[buf]
+                for j in range(num_replicas):
+                    replica = module_copies[j][i]
+                    replica._buffers[key] = buffer_copies[j][buffer_idx]
+
+    return [module_copies[j][0] for j in range(num_replicas)]
+
+
+def _buffers(network):
+    buffers = []
+    seen = set()
+    for module in network.modules():
+        for buf in module._buffers.values():
+            if buf not in seen and buf is not None:
+                seen.add(buf)
+                buffers.append(buf)
+    return buffers
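
The rewritten replicate ties the pieces together: all parameters go through one coalesced Broadcast call and all buffers through comm.broadcast_coalesced, instead of one transfer per tensor. A minimal usage sketch, assuming two visible GPUs (the model below is purely illustrative):

    import torch.nn as nn
    from torch.nn.parallel.replicate import replicate

    # any nn.Module works; BatchNorm1d adds buffers (running stats), so both
    # the parameter and the buffer broadcast paths are exercised
    model = nn.Sequential(nn.Linear(10, 20), nn.BatchNorm1d(20)).cuda(0)

    replicas = replicate(model, [0, 1])
    # replicas[0] reuses the tensors already on GPU 0; replicas[1] holds
    # broadcast copies of every parameter and buffer placed on GPU 1,
    # while sharing the module structure of the original network.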
