15
15
import os
16
16
import subprocess
17
17
import sys
18
+ import tempfile
18
19
import unittest
19
20
20
21
import numpy
23
24
from paddle import base
24
25
from paddle .distributed import fleet
25
26
from paddle .distributed .fleet .base import role_maker
27
+ from paddle .distributed .utils .launch_utils import find_free_ports
26
28
27
29
paddle .enable_static ()
28
30
29
31
30
32
class TestCommunicatorHalfAsyncEnd2End (unittest .TestCase ):
31
33
def net(self):
    """Build the test network: dense features plus a sparse embedding
    feeding a single-output fc regression head.

    Returns:
        (avg_cost, x, x1, y) where ``avg_cost`` is the mean squared-error
        loss to minimize, ``x`` is the dense float input, ``x1`` is the
        sparse int64 id input (lod_level=1), and ``y`` is the label.
    """
    x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32')
    x1 = paddle.static.data(
        name='x1', shape=[-1, 1], dtype='int64', lod_level=1
    )

    # Sparse embedding lookup; is_sparse=True so the distributed
    # optimizer sends sparse gradient updates for the "embedding" table.
    emb = paddle.static.nn.embedding(
        input=x1,
        size=[10000, 10],
        param_attr=base.ParamAttr(
            name="embedding",
            initializer=paddle.nn.initializer.Constant(value=0.01),
        ),
        is_sparse=True,
    )

    # Sum-pool the variable-length embedded sequence into one vector
    # per example, then concatenate with the dense features.
    pool = paddle.static.nn.sequence_lod.sequence_pool(
        input=emb.squeeze(-2), pool_type="sum"
    )
    z = paddle.concat([x, pool], axis=1)

    y_predict = paddle.static.nn.fc(x=z, size=1)
    y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32')
    cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
    avg_cost = paddle.mean(cost)
    return avg_cost, x, x1, y
39
59
40
60
def fake_reader(self):
    """Return a reader producing 10000 synthetic samples.

    Each sample is a (dense, sparse_id, label) triple matching the
    feed list built in the trainer: a (1, 13) float32 feature row, a
    (1, 1) int64 embedding id in [0, 9999), and a (1, 1) int64 binary
    label.
    """

    def reader():
        for _ in range(10000):
            dense = numpy.random.random((1, 13)).astype('float32')
            sparse_id = numpy.random.randint(0, 9999, (1, 1)).astype('int64')
            label = numpy.random.randint(0, 2, (1, 1)).astype('int64')
            yield dense, sparse_id, label

    return reader
48
69
49
70
def run_pserver (self , role , strategy ):
50
71
fleet .init (role )
51
- avg_cost , x , y = self .net ()
72
+ avg_cost , x , z , y = self .net ()
52
73
optimizer = paddle .optimizer .SGD (0.01 )
53
74
optimizer = fleet .distributed_optimizer (optimizer , strategy )
54
75
optimizer .minimize (avg_cost )
@@ -61,102 +82,79 @@ def run_trainer(self, role, strategy):
61
82
exe = base .Executor (place )
62
83
63
84
fleet .init (role )
64
- avg_cost , x , y = self .net ()
85
+ avg_cost , x , z , y = self .net ()
65
86
optimizer = paddle .optimizer .SGD (0.01 )
66
87
optimizer = fleet .distributed_optimizer (optimizer , strategy )
67
88
optimizer .minimize (avg_cost )
68
89
69
- exe .run (paddle . static .default_startup_program ())
90
+ exe .run (base .default_startup_program ())
70
91
fleet .init_worker ()
71
92
72
93
train_reader = paddle .batch (self .fake_reader (), batch_size = 24 )
73
- feeder = base .DataFeeder (place = place , feed_list = [x , y ])
94
+ feeder = base .DataFeeder (place = place , feed_list = [x , z , y ])
74
95
75
96
for batch_id , data in enumerate (train_reader ()):
76
97
exe .run (
77
- paddle . static .default_main_program (),
98
+ base .default_main_program (),
78
99
feed = feeder .feed (data ),
79
100
fetch_list = [],
80
101
)
81
102
82
103
fleet .stop_worker ()
83
104
84
105
def run_ut(self):
    """Run one side of the distributed job in this process.

    Reads TRAINING_ROLE to decide whether to act as the trainer or the
    parameter server, sets up the environment PaddleCloudRoleMaker
    expects, and dispatches to run_trainer / run_pserver with an
    a_sync (half-async) distributed strategy.
    """
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    # Single-node topology consumed by PaddleCloudRoleMaker.
    # (Fixed: PADDLE_TRAINERS_NUM was previously assigned twice.)
    os.environ["PADDLE_PSERVER_NUMS"] = "1"
    os.environ["PADDLE_TRAINERS_NUM"] = "1"
    os.environ["PADDLE_TRAINER_ID"] = "0"
    os.environ["POD_IP"] = "127.0.0.1"

    role = role_maker.PaddleCloudRoleMaker()

    strategy = paddle.distributed.fleet.DistributedStrategy()
    # a_sync=True selects the half-async communicator under test.
    strategy.a_sync = True

    if training_role == "TRAINER":
        self.run_trainer(role, strategy)
    else:
        self.run_pserver(role, strategy)
103
123
104
124
def test_communicator(self):
    """End-to-end half-async communicator test.

    Launches a parameter-server subprocess, waits for it to signal
    readiness through a named pipe, then runs the trainer side
    in-process via run_ut(), and finally tears the server down.
    """
    # Temp dir holds the fifo; cleaned up explicitly in the finally
    # block below (fix: previously never cleaned up, leaking the
    # directory until interpreter exit).
    temp_dir = tempfile.TemporaryDirectory()
    try:
        pipe_name = os.path.join(temp_dir.name, 'mypipe')
        try:
            os.mkfifo(pipe_name)
        except OSError as oe:
            # Best-effort: keep going so the failure surfaces in the
            # pipe open below with full context.
            print(f"Failed to create pipe: {oe}")

        port = find_free_ports(1).pop()

        # Environment inherited by the server subprocess.
        os.environ["TRAINING_ROLE"] = "PSERVER"
        os.environ["PADDLE_PORT"] = str(port)
        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = f"127.0.0.1:{port}"
        os.environ["PIPE_FILE"] = pipe_name

        _python = sys.executable
        server_file = "run_server_for_communicator_half_async.py"
        ps_cmd = f"{_python} {server_file}"

        ps_proc = subprocess.Popen(
            ps_cmd.strip().split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        # Readiness barrier: this open blocks until the server process
        # opens/writes the fifo (presumably once init_server is done —
        # see the server script).
        with open(pipe_name, 'r') as pipe:
            start_command = pipe.read()

        os.environ["TRAINING_ROLE"] = "TRAINER"

        self.run_ut()

        ps_proc.kill()
        # communicate() reaps the process and drains stdout/stderr,
        # avoiding a zombie and pipe-buffer deadlock (fix: the extra
        # wait() before communicate() was redundant).
        outs, errs = ps_proc.communicate()
    finally:
        temp_dir.cleanup()
160
158
161
159
162
160
if __name__ == '__main__' :
0 commit comments