
Commit 5097d55

[Bug fix] Fix efficient test for multi-node (open-mmlab#707)
* [Bug fix] Fix efficient test for multi-node
* fixed CI
* add efficient test dir
* remove unused args
1 parent e610ed1 commit 5097d55
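
Context for the fix, inferred from the diff below: with efficient_test enabled, intermediate results were saved via tempfile.NamedTemporaryFile with no dir argument, so they landed in the node-local default temp directory; on a multi-node run the collecting rank cannot read files written on other nodes' local disks. The patch threads a tmpdir through np2tmp and creates a .efficient_test directory under the working directory, which is typically on storage shared by all nodes. A minimal sketch of the difference (paths are illustrative, not from this commit):

    import os
    import tempfile

    # Before: no dir= argument, so the file goes to the node-local default
    # temp dir (e.g. /tmp) and is invisible to the other nodes.
    f = tempfile.NamedTemporaryFile(suffix='.npy', delete=False)
    print(f.name)  # e.g. /tmp/tmp3k2_9x1c.npy

    # After: dir= points at a directory under the (shared) working directory.
    os.makedirs('.efficient_test', exist_ok=True)
    g = tempfile.NamedTemporaryFile(
        suffix='.npy', delete=False, dir='.efficient_test')
    print(g.name)  # e.g. .efficient_test/tmp7q8rwv2d.npy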

1 file changed: +14 -87 lines changed

mmseg/apis/test.py

Lines changed: 14 additions & 87 deletions
@@ -1,32 +1,31 @@
 import os.path as osp
-import pickle
-import shutil
 import tempfile
 
 import mmcv
 import numpy as np
 import torch
-import torch.distributed as dist
+from mmcv.engine import collect_results_cpu, collect_results_gpu
 from mmcv.image import tensor2imgs
 from mmcv.runner import get_dist_info
 
 
-def np2tmp(array, temp_file_name=None):
+def np2tmp(array, temp_file_name=None, tmpdir=None):
     """Save ndarray to local numpy file.
 
     Args:
         array (ndarray): Ndarray to save.
         temp_file_name (str): Numpy file name. If 'temp_file_name=None', this
             function will generate a file name with tempfile.NamedTemporaryFile
             to save ndarray. Default: None.
+        tmpdir (str): Temporary directory to save Ndarray files. Default: None.
 
     Returns:
         str: The numpy file name.
     """
 
     if temp_file_name is None:
         temp_file_name = tempfile.NamedTemporaryFile(
-            suffix='.npy', delete=False).name
+            suffix='.npy', delete=False, dir=tmpdir).name
     np.save(temp_file_name, array)
     return temp_file_name
 
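
As a quick check of the new argument, a hypothetical round trip through the updated helper; the dummy array contents and the import path are the only assumptions beyond the diff:

    import mmcv
    import numpy as np

    from mmseg.apis.test import np2tmp

    # Hypothetical round trip; the segmentation map here is a dummy array.
    mmcv.mkdir_or_exist('.efficient_test')
    seg_map = np.random.randint(0, 19, size=(512, 512), dtype=np.uint8)
    fname = np2tmp(seg_map, tmpdir='.efficient_test')
    # The file now sits under .efficient_test/ in the shared working
    # directory, so any rank can load it back during evaluation.
    assert (np.load(fname) == seg_map).all()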

@@ -58,6 +57,8 @@ def single_gpu_test(model,
     results = []
     dataset = data_loader.dataset
     prog_bar = mmcv.ProgressBar(len(dataset))
+    if efficient_test:
+        mmcv.mkdir_or_exist('.efficient_test')
     for i, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, **data)
@@ -90,11 +91,11 @@ def single_gpu_test(model,
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_) for _ in result]
+                result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
             results.extend(result)
         else:
             if efficient_test:
-                result = np2tmp(result)
+                result = np2tmp(result, tmpdir='.efficient_test')
             results.append(result)
 
         batch_size = len(result)
@@ -120,7 +121,8 @@ def multi_gpu_test(model,
         model (nn.Module): Model to be tested.
         data_loader (utils.data.Dataloader): Pytorch data loader.
         tmpdir (str): Path of directory to save the temporary results from
-            different gpus under cpu mode.
+            different gpus under cpu mode. The same path is used for efficient
+            test.
         gpu_collect (bool): Option to use either gpu or cpu to collect results.
         efficient_test (bool): Whether save the results as local numpy files to
             save CPU memory during evaluation. Default: False.
@@ -135,17 +137,19 @@ def multi_gpu_test(model,
     rank, world_size = get_dist_info()
     if rank == 0:
         prog_bar = mmcv.ProgressBar(len(dataset))
+    if efficient_test:
+        mmcv.mkdir_or_exist('.efficient_test')
     for i, data in enumerate(data_loader):
         with torch.no_grad():
             result = model(return_loss=False, rescale=True, **data)
 
         if isinstance(result, list):
             if efficient_test:
-                result = [np2tmp(_) for _ in result]
+                result = [np2tmp(_, tmpdir='.efficient_test') for _ in result]
             results.extend(result)
         else:
             if efficient_test:
-                result = np2tmp(result)
+                result = np2tmp(result, tmpdir='.efficient_test')
             results.append(result)
 
         if rank == 0:
@@ -159,80 +163,3 @@ def multi_gpu_test(model,
     else:
         results = collect_results_cpu(results, len(dataset), tmpdir)
     return results
-
-
-def collect_results_cpu(result_part, size, tmpdir=None):
-    """Collect results with CPU."""
-    rank, world_size = get_dist_info()
-    # create a tmp dir if it is not specified
-    if tmpdir is None:
-        MAX_LEN = 512
-        # 32 is whitespace
-        dir_tensor = torch.full((MAX_LEN, ),
-                                32,
-                                dtype=torch.uint8,
-                                device='cuda')
-        if rank == 0:
-            tmpdir = tempfile.mkdtemp()
-            tmpdir = torch.tensor(
-                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
-            dir_tensor[:len(tmpdir)] = tmpdir
-        dist.broadcast(dir_tensor, 0)
-        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
-    else:
-        mmcv.mkdir_or_exist(tmpdir)
-    # dump the part result to the dir
-    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
-    dist.barrier()
-    # collect all parts
-    if rank != 0:
-        return None
-    else:
-        # load results of all parts from tmp dir
-        part_list = []
-        for i in range(world_size):
-            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
-            part_list.append(mmcv.load(part_file))
-        # sort the results
-        ordered_results = []
-        for res in zip(*part_list):
-            ordered_results.extend(list(res))
-        # the dataloader may pad some samples
-        ordered_results = ordered_results[:size]
-        # remove tmp dir
-        shutil.rmtree(tmpdir)
-        return ordered_results
-
-
-def collect_results_gpu(result_part, size):
-    """Collect results with GPU."""
-    rank, world_size = get_dist_info()
-    # dump result part to tensor with pickle
-    part_tensor = torch.tensor(
-        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
-    # gather all result part tensor shape
-    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
-    shape_list = [shape_tensor.clone() for _ in range(world_size)]
-    dist.all_gather(shape_list, shape_tensor)
-    # padding result part tensor to max length
-    shape_max = torch.tensor(shape_list).max()
-    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
-    part_send[:shape_tensor[0]] = part_tensor
-    part_recv_list = [
-        part_tensor.new_zeros(shape_max) for _ in range(world_size)
-    ]
-    # gather all result part
-    dist.all_gather(part_recv_list, part_send)
-
-    if rank == 0:
-        part_list = []
-        for recv, shape in zip(part_recv_list, shape_list):
-            part_list.append(
-                pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()))
-        # sort the results
-        ordered_results = []
-        for res in zip(*part_list):
-            ordered_results.extend(list(res))
-        # the dataloader may pad some samples
-        ordered_results = ordered_results[:size]
-        return ordered_results
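
The two functions deleted above were local copies of collectors that mmcv now provides; the new "from mmcv.engine import collect_results_cpu, collect_results_gpu" at the top of the file uses the upstream versions instead. For orientation, a hedged sketch of driving the fixed path end to end; the model/dataloader helper and the paths are illustrative assumptions, not part of this commit:

    import torch
    from mmcv.parallel import MMDistributedDataParallel
    from mmcv.runner import init_dist

    from mmseg.apis import multi_gpu_test

    # Illustrative multi-node driver; build_model_and_loader() is a
    # hypothetical helper standing in for the usual config/checkpoint setup.
    init_dist('pytorch')  # one process per GPU, possibly spanning nodes
    model, data_loader = build_model_and_loader()
    model = MMDistributedDataParallel(
        model.cuda(), device_ids=[torch.cuda.current_device()])

    # With efficient_test=True, every rank now writes its .npy results into
    # .efficient_test/ under the shared working directory, so the collecting
    # rank can read them all; tmpdir is the (shared) directory used by
    # collect_results_cpu when gpu_collect=False.
    results = multi_gpu_test(
        model,
        data_loader,
        tmpdir='./work_dirs/tmp',
        gpu_collect=False,
        efficient_test=True)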
