Post-training quantization.

qinjian623 · qinjian623 · commit eaffc5aa7242 · 2019-09-16T07:44:53.000+08:00
diff --git a/post_quant/README.md b/post_quant/README.md
@@ -0,0 +1,45 @@
+# Usage
+
+```python
+import torch
+import torchvision
+import torchvision.datasets as datasets
+import torchvision.transforms.transforms as transforms
+from post_quant.fake_quantization import fake_quant, load_fake_quant_model
+
+model = torchvision.models.resnet50(True)
+model.eval()
+
+db = datasets.ImageFolder(
+    "ILSVRC2012_img_val",
+    transforms.Compose([
+        transforms.Resize(256),
+        transforms.CenterCrop(224),
+        transforms.ToTensor(),
+        transforms.Normalize(
+            mean=[0.485, 0.456, 0.406],
+            std=[0.229, 0.224, 0.225]
+        ),
+    ]))
+dataset = torch.utils.data.DataLoader(
+    db,
+    batch_size=128,
+    num_workers=8,
+    shuffle=False,
+    pin_memory=True)
+
+# Quantize model
+q_model = fake_quant(model, dataset)
+
+# Save the quantized model; its state dict includes the scales & zero points:
+torch.save(q_model.state_dict(), 'model.quant')
+
+# Reload model:
+m = load_fake_quant_model(torchvision.models.resnet50(), 'model.quant')
+```
+
+
+# TODO
+ - [ ] Symmetric quantization
+ - [ ] Channel-wise weight quantization
+ - [ ] More sophisticated activation range calibration
diff --git a/post_quant/__init__.py b/post_quant/__init__.py
diff --git a/post_quant/accuracy_test.py b/post_quant/accuracy_test.py
@@ -0,0 +1,89 @@
+# Borrowed from examples
+import torch
+import time
+
+
+class AverageMeter(object):
+    """Keep the most recent value together with a running weighted mean.
+
+    Attributes maintained by `reset`/`update`: `val` (last value), `sum`
+    (weighted total), `count` (total weight) and `avg` (`sum / count`).
+    """
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Zero every statistic."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record `val` as the latest value, counted with weight `n`."""
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def accuracy(output, target, topk=(1,)):
+    """Compute the top-k accuracy, in percent, for every k in `topk`.
+
+    Args:
+        output: 2-D tensor of per-class scores, one row per sample.
+        target: tensor holding the true class index of each sample.
+        topk: iterable of k values to evaluate.
+
+    Returns:
+        A list with one 1-element tensor per entry of `topk`.
+    """
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            # `correct` is built from the transposed `pred` and may be
+            # non-contiguous, in which case view(-1) raises for k > 1;
+            # reshape(-1) copies only when it has to.
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+
+    return res
+
+
+def validate(val_loader, model,
+             shut_up=False,
+             criterion=None, half=False):
+    """Evaluate `model` on `val_loader` and return the average top-1 accuracy.
+
+    Args:
+        val_loader: iterable yielding `(input, target)` batches.
+        model: module to evaluate; it is switched to eval mode.
+        shut_up: when True, print nothing.
+        criterion: optional loss callable `criterion(output, target)`; when
+            omitted the reported loss is 0.
+        half: when True, cast each input batch to half precision.
+
+    Returns:
+        `top1.avg`, the batch-size-weighted mean top-1 accuracy in percent
+        (0 if `val_loader` yields nothing).
+    """
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
+
+    # switch to evaluate mode
+    model.eval()
+
+    # Run every batch on the device the model lives on.  Moving batches to
+    # CUDA merely because CUDA is available breaks CPU-resident models.
+    first_param = next(model.parameters(), None)
+    if first_param is not None:
+        device = first_param.device
+    else:
+        # Parameter-free model: keep the previous "CUDA if present" choice.
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    with torch.no_grad():
+        end = time.time()
+        for i, (input, target) in enumerate(val_loader):
+            input = input.to(device, non_blocking=True)
+            target = target.to(device, non_blocking=True)
+            if half:
+                input = input.half()
+            # compute output
+            output = model(input)
+            loss = criterion(output, target) if criterion else None
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item() if loss is not None else 0,
+                          input.size(0))
+            top1.update(acc1[0], input.size(0))
+            top5.update(acc5[0], input.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if i % 10 == 0 and not shut_up:
+                print('Test: [{0}/{1}]\t'
+                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
+                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
+                      'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
+                      'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
+                       i, len(val_loader), batch_time=batch_time, loss=losses,
+                       top1=top1, top5=top5))
+
+        if not shut_up:
+            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+                  .format(top1=top1, top5=top5))
+
+    return top1.avg
+
diff --git a/post_quant/activation.py b/post_quant/activation.py
@@ -0,0 +1,84 @@
+import torch
+
+from post_quant.accuracy_test import validate
+from post_quant.common import _weight_quantize_range, dequantize, quantize
+
+
+class ActivationMonitor(object):
+    """Forward hook that tracks a module's output range during calibration.
+
+    After every forward pass the hooked module's `output_min` / `output_max`
+    are updated and `output_scale` / `output_zero_point` are recomputed from
+    them for `bits`-bit quantization.
+
+    With `smooth=True` the range is a 0.9 / 0.1 moving average of the
+    per-batch min and max; otherwise it is the running min and max.
+    The module must have been prepared with `hook_monitor` first.
+    """
+    def __init__(self, bits=8, smooth=True):
+        self.bits = bits
+        self.smooth = smooth
+
+    def __call__(self, m, _, output_):
+        """Hook entry point: `m` is the module, `output_` its output tensor."""
+        o_max = output_.max().item()
+        o_min = output_.min().item()
+        if m.output_max is None:
+            # first batch seen by this module
+            m.output_max = torch.tensor(o_max)
+            m.output_min = torch.tensor(o_min)
+        elif self.smooth:
+            m.output_max = m.output_max * 0.9 + o_max * 0.1
+            m.output_min = m.output_min * 0.9 + o_min * 0.1
+        else:
+            # Store tensors here as well: the range is read back with
+            # .item() below, which a plain Python float does not have.
+            if m.output_max < o_max:
+                m.output_max = torch.tensor(o_max)
+            if m.output_min > o_min:
+                m.output_min = torch.tensor(o_min)
+        range_min = m.output_min.item()
+        range_max = m.output_max.item()
+        s, z = _weight_quantize_range(range_min, range_max, bits=self.bits)
+        m.output_scale = torch.tensor(s)
+        m.output_zero_point = torch.tensor(z)
+
+
+def register_activation_monitor(net, func):
+    """Attach `func` as a forward hook to every monitored module of `net`.
+
+    Returns the hook handles, in `net.named_modules()` order.
+    """
+    return [
+        hook_monitor(module, func)
+        for _, module in net.named_modules()
+        if need_monitor(module)
+    ]
+
+
+def fake_quant_activation_module(net):
+    """Install the fake-quantizing forward on every monitored module of `net`."""
+    for _, module in net.named_modules():
+        if not need_monitor(module):
+            continue
+        replace_forward_op(module)
+
+
+def need_monitor(module):
+    """Return True if `module` is a Conv2d, BatchNorm2d or Linear layer."""
+    monitored_types = (
+        torch.nn.Conv2d,
+        torch.nn.BatchNorm2d,
+        torch.nn.Linear,
+    )
+    return isinstance(module, monitored_types)
+
+
+def hook_monitor(m, func):
+    """Prepare `m` for calibration and register `func` as its forward hook.
+
+    Adds empty `output_scale` / `output_zero_point` buffers and resets the
+    observed range; returns the handle of the registered hook.
+    """
+    for buffer_name in ('output_scale', 'output_zero_point'):
+        m.register_buffer(buffer_name, None)
+    m.output_max = m.output_min = None  # no range observed yet
+    return m.register_forward_hook(func)
+
+
+# Wrap the module's forward so that its output is fake-quantized
+def replace_forward_op(module):
+    """Make `module.forward` return its output quantized then dequantized.
+
+    The scale and zero point are read once, from `module.output_scale` and
+    `module.output_zero_point`, at the time of this call.
+    """
+    wrapped_forward = module.forward
+    scale = module.output_scale.item()
+    zero_point = module.output_zero_point.item()
+
+    def quant_forward(*input):
+        raw_output = wrapped_forward(*input)
+        levels = quantize(raw_output, scale, zero_point)
+        return dequantize(levels, scale, zero_point)
+
+    module.forward = quant_forward
+
+
+def calibrate_activation_range(m, db, bits):
+    """Run `db` through `m` with range monitors attached, then detach them.
+
+    Args:
+        m: model whose monitored modules get calibrated in place.
+        db: data loader handed to `validate`.
+        bits: bit width used when deriving scales and zero points.
+    """
+    hooks = register_activation_monitor(m, ActivationMonitor(bits=bits))
+    try:
+        validate(db, m)
+    finally:
+        # Detach the monitors even if the calibration pass fails, so the
+        # model is never left with calibration hooks installed.
+        for h in hooks:
+            h.remove()
diff --git a/post_quant/common.py b/post_quant/common.py
@@ -0,0 +1,41 @@
+import torch
+
+
+def _weight_quantize_range(min_w, max_w, bits):
+    """Return `(scale, zero_point)` mapping a value range onto 0..2**bits - 1.
+
+    The range `[min_w, max_w]` is first widened to contain 0.0.  That keeps
+    zero exactly representable and keeps all-positive or all-negative ranges
+    from saturating every value at one end of the integer grid.
+
+    Args:
+        min_w: smallest observed value.
+        max_w: largest observed value.
+        bits: bit width of the quantized representation.
+
+    Returns:
+        `scale` as a float and `zero_point` as an int.  A range that
+        collapses to the single point 0.0 yields `(1.0, 0)`.
+    """
+    level = 2 ** bits - 1
+    min_w = min(min_w, 0.0)
+    max_w = max(max_w, 0.0)
+    if max_w > min_w:
+        scale = (max_w - min_w) / level
+    else:
+        # Degenerate all-zero range: any non-zero scale works and avoids the
+        # division by zero below.
+        scale = 1.0
+    zero_point = round((0.0 - min_w) / scale)
+    return scale, zero_point
+
+
+def dequantize(weight, S, Z):
+    """Map quantized levels back to real values: `S * (weight - Z)`."""
+    offset_from_zero = weight - Z
+    return S * offset_from_zero
+
+
+def quantize(weight, S, Z, bits=8):
+    """Quantize `weight` to levels in `[0, 2**bits - 1]`.
+
+    Each value is divided by the scale `S`, rounded, shifted by the zero
+    point `Z` and clamped to the representable range.
+    """
+    top_level = 2 ** bits - 1
+    shifted = torch.round(weight / S) + Z
+    return shifted.clamp(0, top_level)
+
+
+def quantize_tensor(tensor, bits):
+    """Fake-quantize `tensor` to `bits` bits over its own min/max range.
+
+    Returns `(dequantized_tensor, scale, zero_point)`.
+    """
+    scale, zero_point = _weight_quantize_parameter(tensor, bits)
+    levels = quantize(tensor, scale, zero_point, bits)
+    return dequantize(levels, scale, zero_point), scale, zero_point
+
+
+def _weight_quantize_parameter(weight, bits=8):
+    """Return `(scale, zero_point)` for the min/max range of `weight`."""
+    lowest = weight.min().item()
+    highest = weight.max().item()
+    return _weight_quantize_range(lowest, highest, bits)
+
+
+def register_quant_params(m):
+    """Register placeholder quantization buffers on the modules of `m`.
+
+    This gives a freshly built model the same buffer names a fake-quantized
+    model carries, so that a saved state dict can be loaded into it.
+
+    Conv2d and Linear modules get `<prefix>_scale` / `<prefix>_zero_point`
+    buffers for `weight`, `bias` and `output`.  BatchNorm2d modules get only
+    the `output` pair: `need_monitor` in activation.py monitors them too, so
+    a BatchNorm2d that was not fused away carries those two buffers and has
+    them read back by `replace_forward_op`.
+    """
+    with torch.no_grad():
+        for _, module in m.named_modules():
+            if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
+                prefixes = ('weight', 'bias', 'output')
+            elif isinstance(module, torch.nn.BatchNorm2d):
+                prefixes = ('output',)
+            else:
+                continue
+            for prefix in prefixes:
+                module.register_buffer(prefix + '_scale', torch.tensor(0.0))
+                module.register_buffer(prefix + '_zero_point', torch.tensor(0))
diff --git a/post_quant/fake_quantization.py b/post_quant/fake_quantization.py
@@ -0,0 +1,24 @@
+import torch
+from .common import register_quant_params
+from .fusion import fuse_module
+from .weights import quantize_module
+from .activation import fake_quant_activation_module, calibrate_activation_range
+
+
+def load_fake_quant_model(m, f):
+    """Rebuild a fake-quantized model from a saved state dict.
+
+    Args:
+        m: freshly constructed model with the architecture that was quantized.
+        f: file (path or file object) passed to `torch.load`.
+
+    Returns:
+        `m`, modified in place.
+    """
+    # NOTE(review): torch.load unpickles `f`; only load files you trust.
+    state_dict = torch.load(f)
+    m.eval()
+    # The module tree must be fused and carry the quantization buffers
+    # before the saved state can be loaded into it.
+    fuse_module(m)
+    register_quant_params(m)
+    m.load_state_dict(state_dict)
+    # Done last: the wrapped forwards read the scales / zero points that
+    # load_state_dict just filled in.
+    fake_quant_activation_module(m)
+    return m
+
+
+def fake_quant(m, db, bits=8):
+    """Fake-quantize `m` in place, calibrating activations on `db`.
+
+    Args:
+        m: model to quantize; it is put in eval mode and modified in place.
+        db: data loader used for the calibration pass.
+        bits: bit width for calibration and for `quantize_module`.
+
+    Returns:
+        `m` itself.
+    """
+    m.eval()
+    fuse_module(m)
+    # Calibrate on the fused model, before anything is quantized.
+    calibrate_activation_range(m, db, bits)
+    # presumably quantizes the weights (defined in weights.py) - TODO confirm
+    quantize_module(m, bits)
+    # Wrap forwards last; this reads the scales found during calibration.
+    fake_quant_activation_module(m)
+    return m
diff --git a/post_quant/fusion.py b/post_quant/fusion.py
@@ -0,0 +1,81 @@
+import torch
+import torch.nn as nn
+from utils.modules import DummyModule
+
+
+def fuse(conv, bn):
+    """Fold a BatchNorm2d into the Conv2d that feeds it.
+
+    The batch norm is applied with its running statistics, i.e. the result
+    matches `bn(conv(x))` for `bn` in eval mode.
+
+    Args:
+        conv: the `nn.Conv2d` whose output goes into `bn`.
+        bn: the `nn.BatchNorm2d`; `affine=False` is supported.
+
+    Returns:
+        A new `nn.Conv2d` (always with bias) with the same geometry as `conv`.
+    """
+    with torch.no_grad():
+        mean = bn.running_mean
+        var_sqrt = torch.sqrt(bn.running_var + bn.eps)
+
+        # BatchNorm2d(affine=False) has weight/bias set to None; that is an
+        # identity scale and a zero shift.
+        gamma = bn.weight if bn.weight is not None else torch.ones_like(mean)
+        beta = bn.bias if bn.bias is not None else torch.zeros_like(mean)
+
+        if conv.bias is not None:
+            b = conv.bias
+        else:
+            b = mean.new_zeros(mean.shape)
+
+        # One scale factor per output channel, broadcast over the kernel.
+        w = conv.weight * (gamma / var_sqrt).reshape(
+            [conv.out_channels, 1, 1, 1])
+        b = (b - mean) / var_sqrt * gamma + beta
+
+    fused_conv = nn.Conv2d(
+        conv.in_channels,
+        conv.out_channels,
+        conv.kernel_size,
+        conv.stride,
+        conv.padding,
+        conv.dilation,
+        conv.groups,
+        bias=True,
+        padding_mode=conv.padding_mode
+    )
+    fused_conv.weight = nn.Parameter(w)
+    fused_conv.bias = nn.Parameter(b)
+    return fused_conv
+
+
+def fuse_module(m):
+    """Recursively fold BatchNorm2d children into the preceding Conv2d.
+
+    A BatchNorm2d is fused only when the child registered immediately before
+    it is a Conv2d.  The conv is replaced by the fused conv and the batch
+    norm by a `DummyModule` (presumably a pass-through placeholder; it is
+    defined in utils.modules).  `m` is modified in place.
+    """
+    conv = None
+    conv_name = None
+
+    for name, child in list(m.named_children()):
+        if isinstance(child, nn.BatchNorm2d) and conv is not None:
+            m._modules[conv_name] = fuse(conv, child)
+            m._modules[name] = DummyModule()
+            conv = None
+        elif isinstance(child, nn.Conv2d):
+            conv = child
+            conv_name = name
+        else:
+            # Any other child breaks the conv -> bn adjacency.  Forget the
+            # pending conv so a later batch norm (e.g. conv, relu, bn) is
+            # not folded into a conv it does not directly follow.
+            conv = None
+            fuse_module(child)
+
+
+def validate(net, input_, cuda=True):
+    """Return the largest absolute output change caused by `fuse_module(net)`.
+
+    `net` is put in eval mode, run on `input_`, fused in place and run
+    again.  With `cuda=True` both the input and the net are moved to CUDA
+    and each forward pass is followed by a synchronize.
+    """
+    net.eval()
+    if cuda:
+        input_ = input_.cuda()
+        net.cuda()
+
+    def forward_once():
+        result = net(input_)
+        if cuda:
+            torch.cuda.synchronize()
+        return result
+
+    before = forward_once()
+    fuse_module(net)
+    after = forward_once()
+    return (before - after).abs().max().item()
+
+
+if __name__ == '__main__':
+    # Smoke test: print how far fusing moves MobileNetV2's outputs on one
+    # random batch (run on CUDA, since cuda=True is passed).
+    import torchvision
+    net = torchvision.models.mobilenet_v2(True)
+    net.eval()
+    random_batch = torch.randn(32, 3, 224, 224)
+    print(validate(net, random_batch, True))
diff --git a/post_quant/main.py b/post_quant/main.py
diff --git a/post_quant/weights.py b/post_quant/weights.py