Skip to content

Commit 83b367d

Browse files
authored
add dygraph mnist CE (PaddlePaddle#2453)
* add ce for dygraph mnist * add ce for dygraph mnist * del mnist_dygraph.py * change mnist_dygraph to train * fix print style
1 parent dbc27b8 commit 83b367d

File tree

4 files changed

+88
-3
lines changed

4 files changed

+88
-3
lines changed

dygraph/mnist/.run_ce.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash

# This file is only used for continuous evaluation.
# dygraph single card

# Make the pipeline below fail when train.py fails. Without pipefail the
# pipeline's exit status is that of _ce.py alone, so a crashed training run
# (which simply yields a log with no "kpis" lines) would be reported as success.
set -o pipefail

# NOTE(review): presumably requests deterministic cuDNN kernels so that the
# tracked KPI values are reproducible between runs - confirm against Paddle docs.
export FLAGS_cudnn_deterministic=True
# Expose only GPU 0 to the training process (single-card run).
export CUDA_VISIBLE_DEVICES=0

# Train for one epoch in CE mode and pipe the log into the KPI collector.
python train.py --ce --epoch 1 | python _ce.py
8+

dygraph/mnist/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515
## 训练
1616
教程中使用`paddle.dataset.mnist`数据集作为训练数据,可以通过如下的方式启动训练:
1717
```
18-
env CUDA_VISIBLE_DEVICES=0 python mnist_dygraph.py
18+
env CUDA_VISIBLE_DEVICES=0 python train.py
1919
```
2020
Paddle动态图支持多进程多卡进行模型训练,启动训练的方式:
2121
```
22-
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog mnist_dygraph.py --use_data_parallel 1
22+
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py --use_data_parallel 1
2323
```
2424
此时,程序会将每个进程的输出log导入到`./mylog`路径下:
2525
```

dygraph/mnist/_ce.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
####this file is only used for continuous evaluation test!
2+
from __future__ import absolute_import
3+
from __future__ import division
4+
from __future__ import print_function
5+
import os
6+
import sys
7+
sys.path.append(os.environ['ceroot'])
8+
from kpi import CostKpi, DurationKpi, AccKpi
9+
10+
#### NOTE: kpi.py should be shared across models in some way.
11+
12+
# KPI objects recorded by continuous evaluation for this model.
# NOTE(review): the two positional numbers after the KPI name are presumably
# a tolerance threshold and a count of leading records to skip - confirm
# against kpi.py.
test_acc = AccKpi(
    'test_acc',
    0.001,
    0,
    actived=True,
    desc="test acc")
test_cost = CostKpi(
    'test_cost',
    0.001,
    0,
    actived=True,
    desc='test cost')

# Training-speed tracking is currently disabled.
#train_speed_kpi = DurationKpi(
#    'train_speed',
#    0.05,
#    0,
#    actived=True,
#    unit_repr='seconds/image',
#    desc='train speed in one GPU card')

# Only the KPIs listed here can be recorded by log_to_ce().
tracking_kpis = [test_acc, test_cost]
22+
23+
def parse_log(log):
    """Yield ``(kpi_name, kpi_value)`` pairs extracted from a training log.

    The log is split on newlines. A line counts as a KPI record only when,
    after stripping surrounding whitespace, it has exactly three tab-separated
    fields and the first field is the literal ``kpis``, for example::

        kpis<TAB>test_acc<TAB>0.97
        kpis<TAB>test_cost<TAB>0.08

    All other lines are ignored. The split fields of every line are echoed
    to stdout, and matching records are echoed a second time with a
    ``-----`` prefix.

    Args:
        log: The complete log text as a single string.

    Yields:
        Tuples of ``(name, value)`` where ``name`` is the second field and
        ``value`` is the third field converted with ``float``.

    Raises:
        ValueError: If the third field of a ``kpis`` record is not a valid
            float literal.
    """
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        # Guard clause: skip anything that is not a "kpis" record.
        if len(fields) != 3 or fields[0] != 'kpis':
            continue
        print("-----%s" % fields)
        _, name, raw_value = fields
        yield name, float(raw_value)
47+
48+
49+
def log_to_ce(log):
    """Record every KPI value found in ``log`` on its tracked KPI object.

    Each ``(name, value)`` pair produced by ``parse_log`` is echoed to
    stdout, appended to the KPI of the same name from ``tracking_kpis`` via
    ``add_record``, and that KPI is then persisted.

    Args:
        log: The complete log text as a single string.

    Raises:
        KeyError: If the log reports a KPI name that is not present in
            ``tracking_kpis``.
    """
    # Index the tracked KPIs by name for direct lookup.
    trackers = {kpi.name: kpi for kpi in tracking_kpis}

    for name, value in parse_log(log):
        print(name, value)
        tracker = trackers[name]
        tracker.add_record(value)
        # Persist after each record, as the KPI is updated incrementally.
        tracker.persist()
58+
59+
60+
if __name__ == '__main__':
    # Read the whole piped training output, echo it between marker lines
    # for debugging, then hand it to the KPI recorder.
    captured = sys.stdin.read()
    print("*****")
    print(captured)
    print("****")
    log_to_ce(captured)

dygraph/mnist/mnist_dygraph.py renamed to dygraph/mnist/train.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ def parse_args():
3232
type=ast.literal_eval,
3333
default=False,
3434
help="The flag indicating whether to shuffle instances in each pass.")
35+
parser.add_argument("-e", "--epoch", default=5, type=int, help="set epoch")
36+
parser.add_argument("--ce", action="store_true", help="run ce")
3537
args = parser.parse_args()
3638
return args
3739

@@ -170,13 +172,20 @@ def load_image(file):
170172

171173

172174
def train_mnist(args):
173-
epoch_num = 5
175+
epoch_num = args.epoch
174176
BATCH_SIZE = 64
175177

176178
trainer_count = fluid.dygraph.parallel.Env().nranks
177179
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
178180
if args.use_data_parallel else fluid.CUDAPlace(0)
179181
with fluid.dygraph.guard(place):
182+
if args.ce:
183+
print("ce mode")
184+
seed = 33
185+
np.random.seed(seed)
186+
fluid.default_startup_program().random_seed = seed
187+
fluid.default_main_program().random_seed = seed
188+
180189
if args.use_data_parallel:
181190
strategy = fluid.dygraph.parallel.prepare_context()
182191
mnist = MNIST("mnist")
@@ -226,6 +235,9 @@ def train_mnist(args):
226235
mnist.eval()
227236
test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE)
228237
mnist.train()
238+
if args.ce:
239+
print("kpis\ttest_acc\t%s" % test_acc)
240+
print("kpis\ttest_cost\t%s" % test_cost)
229241
print("Loss at epoch {} , Test avg_loss is: {}, acc is: {}".format(
230242
epoch, test_cost, test_acc))
231243

0 commit comments

Comments
 (0)