Skip to content

Commit e87d2c3

Browse files
committed
Merge branch 'chong-dev'
YOLACT++ Release
2 parents f38e14b + ef56a8d commit e87d2c3

26 files changed

+2706
-118
lines changed

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010

1111
A simple, fully convolutional model for real-time instance segmentation. This is the code for [our paper](https://arxiv.org/abs/1904.02689).
1212

13+
#### YOLACT++ implementation and models released!
14+
The YOLACT++ Resnet50 model runs at 33.5 fps on a Titan Xp and achieves 34.1 mAP on COCO's `test-dev`.
15+
16+
The related paper will be posted on arXiv soon.
17+
18+
In order to use YOLACT++, make sure you compile the DCNv2 code. (See [Installation](https://github.com/dbolya/yolact#installation))
19+
1320
#### ICCV update (v1.1) released! Check out the ICCV trailer here:
1421
[![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/0pMfmo8qfpQ/0.jpg)](https://www.youtube.com/watch?v=0pMfmo8qfpQ)
1522

@@ -37,6 +44,11 @@ Some examples from our base model (33.5 fps on a Titan Xp and 29.8 mAP on COCO's
3744
git clone https://github.com/dbolya/yolact.git
3845
cd yolact
3946
```
47+
- Compile deformable convolutional layers (from [DCNv2](https://github.com/CharlesShang/DCNv2/tree/pytorch_1.0))
48+
```Shell
49+
cd external/DCNv2
50+
./make.sh
51+
```
4052
- If you'd like to train YOLACT, download the COCO dataset and the 2014/2017 annotations. Note that this script will take a while and dump 21gb of files into `./data/coco`.
4153
```Shell
4254
sh data/scripts/COCO.sh
@@ -57,6 +69,13 @@ As of April 5th, 2019 here are our latest models along with their FPS on a Titan
5769
| 550 | Resnet101-FPN | 33.0 | 29.8 | [yolact_base_54_800000.pth](https://drive.google.com/file/d/1UYy3dMapbH1BnmtZU4WH1zbYgOzzHHf_/view?usp=sharing) | [Mirror](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/EYRWxBEoKU9DiblrWx2M89MBGFkVVB_drlRd_v5sdT3Hgg)
5870
| 700 | Resnet101-FPN | 23.6 | 31.2 | [yolact_im700_54_800000.pth](https://drive.google.com/file/d/1lE4Lz5p25teiXV-6HdTiOJSnS7u7GBzg/view?usp=sharing) | [Mirror](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/Eagg5RSc5hFEhp7sPtvLNyoBjhlf2feog7t8OQzHKKphjw)
5971

72+
YOLACT++ models (released on Dec. 6th, 2019):
73+
74+
| Image Size | Backbone | FPS | mAP | Weights | |
75+
|:----------:|:-------------:|:----:|:----:|----------------------------------------------------------------------------------------------------------------------|--------|
76+
| 550 | Resnet50-FPN | 33.5 | 34.1 | [yolact_plus_resnet50_54_800000.pth](https://drive.google.com/file/d/1ZPu1YR2UzGHQD0o1rEqy-j5bmEm3lbyP/view?usp=sharing) | [Mirror](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/EcJAtMiEFlhAnVsDf00yWRIBUC4m8iE9NEEiV05XwtEoGw) |
77+
| 550 | Resnet101-FPN | 27.3 | 34.6 | [yolact_plus_base_54_800000.pth](https://drive.google.com/file/d/15id0Qq5eqRbkD-N3ZjDZXdCvRyIaHpFB/view?usp=sharing) | [Mirror](https://ucdavis365-my.sharepoint.com/:u:/g/personal/yongjaelee_ucdavis_edu/EVQ62sF0SrJPrl_68onyHF8BpG7c05A8PavV4a849sZgEA)
78+
6079
To evaluate the model, put the corresponding weights file in the `./weights` directory and run one of the following commands.
6180
## Quantitative Results on COCO
6281
```Shell

backbone.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,25 @@
44

55
from collections import OrderedDict
66

7+
from dcn_v2 import DCN
8+
79
class Bottleneck(nn.Module):
810
""" Adapted from torchvision.models.resnet """
911
expansion = 4
1012

11-
def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1):
13+
def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d, dilation=1, use_dcn=False):
1214
super(Bottleneck, self).__init__()
1315
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, dilation=dilation)
1416
self.bn1 = norm_layer(planes)
15-
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
16-
padding=dilation, bias=False, dilation=dilation)
17+
if use_dcn:
18+
self.conv2 = DCN(planes, planes, kernel_size=3, stride=stride,
19+
padding=dilation, dilation=dilation, deformable_groups=1)
20+
self.conv2.bias.data.zero_()
21+
self.conv2.conv_offset_mask.weight.data.zero_()
22+
self.conv2.conv_offset_mask.bias.data.zero_()
23+
else:
24+
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
25+
padding=dilation, bias=False, dilation=dilation)
1726
self.bn2 = norm_layer(planes)
1827
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False, dilation=dilation)
1928
self.bn3 = norm_layer(planes * 4)
@@ -47,7 +56,7 @@ def forward(self, x):
4756
class ResNetBackbone(nn.Module):
4857
""" Adapted from torchvision.models.resnet """
4958

50-
def __init__(self, layers, atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d):
59+
def __init__(self, layers, dcn_layers=[0, 0, 0, 0], dcn_interval=1, atrous_layers=[], block=Bottleneck, norm_layer=nn.BatchNorm2d):
5160
super().__init__()
5261

5362
# These will be populated by _make_layer
@@ -66,10 +75,10 @@ def __init__(self, layers, atrous_layers=[], block=Bottleneck, norm_layer=nn.Bat
6675
self.relu = nn.ReLU(inplace=True)
6776
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
6877

69-
self._make_layer(block, 64, layers[0])
70-
self._make_layer(block, 128, layers[1], stride=2)
71-
self._make_layer(block, 256, layers[2], stride=2)
72-
self._make_layer(block, 512, layers[3], stride=2)
78+
self._make_layer(block, 64, layers[0], dcn_layers=dcn_layers[0], dcn_interval=dcn_interval)
79+
self._make_layer(block, 128, layers[1], stride=2, dcn_layers=dcn_layers[1], dcn_interval=dcn_interval)
80+
self._make_layer(block, 256, layers[2], stride=2, dcn_layers=dcn_layers[2], dcn_interval=dcn_interval)
81+
self._make_layer(block, 512, layers[3], stride=2, dcn_layers=dcn_layers[3], dcn_interval=dcn_interval)
7382

7483
# This contains every module that should be initialized by loading in pretrained weights.
7584
# Any extra layers added onto this that won't be initialized by init_backbone will not be
@@ -78,7 +87,7 @@ def __init__(self, layers, atrous_layers=[], block=Bottleneck, norm_layer=nn.Bat
7887
self.backbone_modules = [m for m in self.modules() if isinstance(m, nn.Conv2d)]
7988

8089

81-
def _make_layer(self, block, planes, blocks, stride=1):
90+
def _make_layer(self, block, planes, blocks, stride=1, dcn_layers=0, dcn_interval=1):
8291
""" Here one layer means a string of n Bottleneck blocks. """
8392
downsample = None
8493

@@ -97,11 +106,12 @@ def _make_layer(self, block, planes, blocks, stride=1):
97106
)
98107

99108
layers = []
100-
layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation))
109+
use_dcn = (dcn_layers >= blocks)
110+
layers.append(block(self.inplanes, planes, stride, downsample, self.norm_layer, self.dilation, use_dcn=use_dcn))
101111
self.inplanes = planes * block.expansion
102112
for i in range(1, blocks):
103-
layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer))
104-
113+
use_dcn = ((i+dcn_layers) >= blocks) and (i % dcn_interval == 0)
114+
layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer, use_dcn=use_dcn))
105115
layer = nn.Sequential(*layers)
106116

107117
self.channels.append(planes * block.expansion)

data/config.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,11 @@ def print(self):
247247
'pred_aspect_ratios': [ [[0.66685089, 1.7073535, 0.87508774, 1.16524493, 0.49059086]] ] * 6,
248248
})
249249

250+
resnet101_dcn_inter3_backbone = resnet101_backbone.copy({
251+
'name': 'ResNet101_DCN_Interval3',
252+
'args': ([3, 4, 23, 3], [0, 4, 23, 3], 3),
253+
})
254+
250255
resnet50_backbone = resnet101_backbone.copy({
251256
'name': 'ResNet50',
252257
'path': 'resnet50-19c8e357.pth',
@@ -255,6 +260,11 @@ def print(self):
255260
'transform': resnet_transform,
256261
})
257262

263+
resnet50_dcnv2_backbone = resnet50_backbone.copy({
264+
'name': 'ResNet50_DCNv2',
265+
'args': ([3, 4, 6, 3], [0, 4, 6, 3]),
266+
})
267+
258268
darknet53_backbone = backbone_base.copy({
259269
'name': 'DarkNet53',
260270
'path': 'darknet53.pth',
@@ -618,6 +628,19 @@ def print(self):
618628

619629
'backbone': None,
620630
'name': 'base_config',
631+
632+
# Fast Mask Re-scoring Network
633+
# Inspired by Mask Scoring R-CNN (https://arxiv.org/abs/1903.00241)
634+
# Do not crop out the mask with bbox but slide a convnet on the image-size mask,
635+
# then use global pooling to get the final mask score
636+
'use_maskiou': False,
637+
'maskiou_net': [],
638+
'remove_small_gt_mask': -1,
639+
640+
'maskiou_alpha': 1.0,
641+
'rescore_mask': False,
642+
'rescore_bbox': False,
643+
'maskious_to_train': -1,
621644
})
622645

623646

@@ -740,6 +763,44 @@ def print(self):
740763
})
741764
})
742765

766+
# ----------------------- YOLACT++ CONFIGS ----------------------- #
767+
768+
yolact_plus_base_config = yolact_base_config.copy({
769+
'name': 'yolact_plus_base',
770+
771+
'backbone': resnet101_dcn_inter3_backbone.copy({
772+
'selected_layers': list(range(1, 4)),
773+
774+
'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5,
775+
'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]],
776+
'use_pixel_scales': True,
777+
'preapply_sqrt': False,
778+
'use_square_anchors': False,
779+
}),
780+
781+
'use_maskiou': True,
782+
'maskiou_net': [(8, 3, {'stride': 2}), (16, 3, {'stride': 2}), (32, 3, {'stride': 2}), (64, 3, {'stride': 2}), (128, 3, {'stride': 2}), (80, 1, {})],
783+
'maskiou_alpha': 25,
784+
'rescore_bbox': False,
785+
'rescore_mask': True,
786+
787+
'remove_small_gt_mask': 5*5,
788+
})
789+
790+
yolact_plus_resnet50_config = yolact_plus_base_config.copy({
791+
'name': 'yolact_plus_resnet50',
792+
793+
'backbone': resnet50_dcnv2_backbone.copy({
794+
'selected_layers': list(range(1, 4)),
795+
796+
'pred_aspect_ratios': [ [[1, 1/2, 2]] ]*5,
797+
'pred_scales': [[i * 2 ** (j / 3.0) for j in range(3)] for i in [24, 48, 96, 192, 384]],
798+
'use_pixel_scales': True,
799+
'preapply_sqrt': False,
800+
'use_square_anchors': False,
801+
}),
802+
})
803+
743804

744805
# Default config
745806
cfg = yolact_base_config.copy()

eval.py

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def parse_args(argv=None):
132132
coco_cats_inv = {}
133133
color_cache = defaultdict(lambda: {})
134134

135-
def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, mask_alpha=0.45, fps_str=''):
135+
def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, mask_alpha=0.45, fps_str='', maskiou_net=None):
136136
"""
137137
Note: If undo_transform=False then im_h and im_w are allowed to be None.
138138
"""
@@ -146,14 +146,34 @@ def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False, ma
146146
with timer.env('Postprocess'):
147147
t = postprocess(dets_out, w, h, visualize_lincomb = args.display_lincomb,
148148
crop_masks = args.crop,
149-
score_threshold = args.score_threshold)
149+
score_threshold = args.score_threshold,
150+
maskiou_net = maskiou_net)
150151
torch.cuda.synchronize()
151152

153+
# FIXME reduce copy
152154
with timer.env('Copy'):
153155
if cfg.eval_mask_branch:
154156
# Masks are drawn on the GPU, so don't copy
155-
masks = t[3][:args.top_k]
156-
classes, scores, boxes = [x[:args.top_k].cpu().numpy() for x in t[:3]]
157+
masks = t[3]
158+
classes, scores, boxes = [x for x in t[:3]]
159+
if isinstance(scores, list):
160+
box_scores = scores[0].cpu().numpy()
161+
mask_scores = scores[1].cpu().numpy()
162+
# Re-rank predictions by mask scores
163+
_scores = mask_scores * box_scores
164+
idx = np.argsort(-_scores)
165+
scores = box_scores[idx]
166+
classes = classes.cpu().numpy()[idx]
167+
boxes = boxes.cpu().numpy()[idx]
168+
masks = masks[idx]
169+
else:
170+
scores = scores.cpu().numpy()
171+
classes = classes.cpu().numpy()
172+
boxes = boxes.cpu().numpy()
173+
scores = scores[:args.top_k]
174+
classes = classes[:args.top_k]
175+
boxes = boxes[:args.top_k]
176+
masks = masks[:args.top_k]
157177

158178
num_dets_to_consider = min(args.top_k, classes.shape[0])
159179
for j in range(num_dets_to_consider):
@@ -257,12 +277,20 @@ def get_color(j, on_gpu=None):
257277

258278
return img_numpy
259279

260-
def prep_benchmark(dets_out, h, w):
280+
def prep_benchmark(dets_out, h, w, maskiou_net=None):
261281
with timer.env('Postprocess'):
262-
t = postprocess(dets_out, w, h, crop_masks=args.crop, score_threshold=args.score_threshold)
282+
t = postprocess(dets_out, w, h, crop_masks=args.crop, score_threshold=args.score_threshold, maskiou_net=maskiou_net)
263283

264284
with timer.env('Copy'):
265-
classes, scores, boxes, masks = [x[:args.top_k].cpu().numpy() for x in t]
285+
classes, scores, boxes, masks = [x[:args.top_k] for x in t]
286+
if isinstance(scores, list):
287+
box_scores = scores[0].cpu().numpy()
288+
mask_scores = scores[1].cpu().numpy()
289+
else:
290+
scores = scores.cpu().numpy()
291+
classes = classes.cpu().numpy()
292+
boxes = boxes.cpu().numpy()
293+
masks = masks.cpu().numpy()
266294

267295
with timer.env('Sync'):
268296
# Just in case
@@ -371,7 +399,7 @@ def _bbox_iou(bbox1, bbox2, iscrowd=False):
371399
ret = jaccard(bbox1, bbox2, iscrowd)
372400
return ret.cpu()
373401

374-
def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, detections:Detections=None):
402+
def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, detections:Detections=None, maskiou_net=None):
375403
""" Returns a list of APs for this image, with each element being for a class """
376404
if not args.output_coco_json:
377405
with timer.env('Prepare gt'):
@@ -388,13 +416,19 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de
388416
crowd_classes, gt_classes = split(gt_classes)
389417

390418
with timer.env('Postprocess'):
391-
classes, scores, boxes, masks = postprocess(dets, w, h, crop_masks=args.crop, score_threshold=args.score_threshold)
419+
classes, scores, boxes, masks = postprocess(dets, w, h, crop_masks=args.crop, score_threshold=args.score_threshold, maskiou_net=maskiou_net)
392420

393421
if classes.size(0) == 0:
394422
return
395423

396424
classes = list(classes.cpu().numpy().astype(int))
397-
scores = list(scores.cpu().numpy().astype(float))
425+
if isinstance(scores, list):
426+
box_scores = list(scores[0].cpu().numpy().astype(float))
427+
mask_scores = list(scores[1].cpu().numpy().astype(float))
428+
else:
429+
scores = list(scores.cpu().numpy().astype(float))
430+
box_scores = scores
431+
mask_scores = scores
398432
masks = masks.view(-1, h*w).cuda()
399433
boxes = boxes.cuda()
400434

@@ -406,8 +440,8 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de
406440
for i in range(masks.shape[0]):
407441
# Make sure that the bounding box actually makes sense and a mask was produced
408442
if (boxes[i, 3] - boxes[i, 1]) * (boxes[i, 2] - boxes[i, 0]) > 0:
409-
detections.add_bbox(image_id, classes[i], boxes[i,:], scores[i])
410-
detections.add_mask(image_id, classes[i], masks[i,:,:], scores[i])
443+
detections.add_bbox(image_id, classes[i], boxes[i,:], box_scores[i])
444+
detections.add_mask(image_id, classes[i], masks[i,:,:], mask_scores[i])
411445
return
412446

413447
with timer.env('Eval Setup'):
@@ -425,8 +459,8 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de
425459
crowd_bbox_iou_cache = None
426460

427461
iou_types = [
428-
('box', lambda i,j: bbox_iou_cache[i, j].item(), lambda i,j: crowd_bbox_iou_cache[i,j].item()),
429-
('mask', lambda i,j: mask_iou_cache[i, j].item(), lambda i,j: crowd_mask_iou_cache[i,j].item())
462+
('box', lambda i,j: bbox_iou_cache[i, j].item(), lambda i,j: crowd_bbox_iou_cache[i,j].item(), lambda i: box_scores[i]),
463+
('mask', lambda i,j: mask_iou_cache[i, j].item(), lambda i,j: crowd_mask_iou_cache[i,j].item(), lambda i: mask_scores[i])
430464
]
431465

432466
timer.start('Main loop')
@@ -437,7 +471,7 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de
437471
for iouIdx in range(len(iou_thresholds)):
438472
iou_threshold = iou_thresholds[iouIdx]
439473

440-
for iou_type, iou_func, crowd_func in iou_types:
474+
for iou_type, iou_func, crowd_func, score_func in iou_types:
441475
gt_used = [False] * len(gt_classes)
442476

443477
ap_obj = ap_data[iou_type][iouIdx][_class]
@@ -461,7 +495,7 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de
461495

462496
if max_match_idx >= 0:
463497
gt_used[max_match_idx] = True
464-
ap_obj.push(scores[i], True)
498+
ap_obj.push(score_func(i), True)
465499
else:
466500
# If the detection matches a crowd, we can just ignore it
467501
matched_crowd = False
@@ -481,7 +515,7 @@ def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id, de
481515
# same result as COCOEval. There aren't even that many crowd annotations to
482516
# begin with, but accuracy is of the utmost importance.
483517
if not matched_crowd:
484-
ap_obj.push(scores[i], False)
518+
ap_obj.push(score_func(i), False)
485519
timer.stop('Main loop')
486520

487521

@@ -846,6 +880,7 @@ def evaluate(net:Yolact, dataset, train_mode=False):
846880
net.detect.use_cross_class_nms = args.cross_class_nms
847881
cfg.mask_proto_debug = args.mask_proto_debug
848882

883+
# TODO Currently we do not support Fast Mask Re-scoring in evalimage, evalimages, and evalvideo
849884
if args.image is not None:
850885
if ':' in args.image:
851886
inp, out = args.image.split(':')
@@ -921,13 +956,14 @@ def evaluate(net:Yolact, dataset, train_mode=False):
921956
with timer.env('Network Extra'):
922957
preds = net(batch)
923958

959+
maskiou_net = net.get_maskiou_net()
924960
# Perform the meat of the operation here depending on our mode.
925961
if args.display:
926-
img_numpy = prep_display(preds, img, h, w)
962+
img_numpy = prep_display(preds, img, h, w, maskiou_net=maskiou_net)
927963
elif args.benchmark:
928-
prep_benchmark(preds, h, w)
964+
prep_benchmark(preds, h, w, maskiou_net=maskiou_net)
929965
else:
930-
prep_metrics(ap_data, preds, img, gt, gt_masks, h, w, num_crowd, dataset.ids[image_idx], detections)
966+
prep_metrics(ap_data, preds, img, gt, gt_masks, h, w, num_crowd, dataset.ids[image_idx], detections, maskiou_net=maskiou_net)
931967

932968
# First couple of images take longer because we're constructing the graph.
933969
# Since that's technically initialization, don't include those in the FPS calculations.

0 commit comments

Comments
 (0)