add dygraph mnist CE (PaddlePaddle#2453)

DDDivano · web-flow · commit 83b367d711e0 · 2019-06-20T11:37:27.000+08:00
* add ce for dygraph mnist

* add ce for dygraph mnist

* del mnist_dygraph.py

* change mnist_dygraph to train

* fix print style
diff --git a/dygraph/mnist/.run_ce.sh b/dygraph/mnist/.run_ce.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+# dygraph single card
+export FLAGS_cudnn_deterministic=True
+export CUDA_VISIBLE_DEVICES=0
+python train.py --ce --epoch 1 | python _ce.py
+
diff --git a/dygraph/mnist/README.md b/dygraph/mnist/README.md
@@ -15,11 +15,11 @@
 ## 训练
 教程中使用`paddle.dataset.mnist`数据集作为训练数据，可以通过如下的方式启动训练：
 ```
-env CUDA_VISIBLE_DEVICES=0 python mnist_dygraph.py
+env CUDA_VISIBLE_DEVICES=0 python train.py
 ```
 Paddle动态图支持多进程多卡进行模型训练，启动训练的方式：
 ```
-python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog mnist_dygraph.py   --use_data_parallel 1
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py   --use_data_parallel 1
 ```
 此时，程序会将每个进程的输出log导入到`./mylog`路径下：
 ```
diff --git a/dygraph/mnist/_ce.py b/dygraph/mnist/_ce.py
@@ -0,0 +1,65 @@
+#### This file is only used for the continuous evaluation (CE) test.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across models in some way.
+
+test_acc = AccKpi('test_acc', 0.001, 0, actived=True, desc="test acc")
+test_cost = CostKpi('test_cost', 0.001, 0, actived=True, desc='test cost')
+#train_speed_kpi = DurationKpi(
+#    'train_speed',
+#    0.05,
+#    0,
+#    actived=True,
+#    unit_repr='seconds/image',
+#    desc='train speed in one GPU card')
+tracking_kpis = [test_acc, test_cost]
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each KPI line in the log should be the literal "kpis", the KPI name and
+    the value, separated by tabs; any other line yields nothing, for example:
+
+    "
+    kpis\ttest_acc\t0.98
+    kpis\ttest_cost\t0.05
+    kpis\ttest_acc\t0.99
+    kpis\ttest_cost\t0.04
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/dygraph/mnist/train.py b/dygraph/mnist/train.py
@@ -32,6 +32,8 @@ def parse_args():
         type=ast.literal_eval,
         default=False,
         help="The flag indicating whether to shuffle instances in each pass.")
+    parser.add_argument("-e", "--epoch", default=5, type=int, help="set epoch")
+    parser.add_argument("--ce", action="store_true", help="run ce")
     args = parser.parse_args()
     return args
 
@@ -170,13 +172,20 @@ def load_image(file):
 
 
 def train_mnist(args):
-    epoch_num = 5
+    epoch_num = args.epoch
     BATCH_SIZE = 64
 
     trainer_count = fluid.dygraph.parallel.Env().nranks
     place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
         if args.use_data_parallel else fluid.CUDAPlace(0)
     with fluid.dygraph.guard(place):
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
         if args.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()
         mnist = MNIST("mnist")
@@ -226,6 +235,9 @@ def train_mnist(args):
             mnist.eval()
             test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE)
             mnist.train()
+            if args.ce:
+                print("kpis\ttest_acc\t%s" % test_acc)
+                print("kpis\ttest_cost\t%s" % test_cost)
             print("Loss at epoch {} , Test avg_loss is: {}, acc is: {}".format(
                 epoch, test_cost, test_acc))