1
+ #[ cfg( target_os = "linux" ) ]
2
+ use std:: { io, thread, time:: Duration } ;
1
3
use {
2
4
crate :: {
3
5
admin_rpc_service,
4
- commands:: { monitor, wait_for_restart_window, FromClapArgMatches , Result } ,
6
+ commands:: { monitor, wait_for_restart_window, Error , FromClapArgMatches , Result } ,
5
7
} ,
6
8
clap:: { value_t_or_exit, App , Arg , ArgMatches , SubCommand } ,
7
9
solana_clap_utils:: input_validators:: { is_parsable, is_valid_percentage} ,
@@ -13,10 +15,18 @@ const COMMAND: &str = "exit";
13
15
const DEFAULT_MIN_IDLE_TIME : & str = "10" ;
14
16
const DEFAULT_MAX_DELINQUENT_STAKE : & str = "5" ;
15
17
18
/// Follow-up action the `exit` command performs once the exit request has
/// been sent to the running validator.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum PostExitAction {
    /// Run the agave-validator monitor command indefinitely
    Monitor,
    /// Block until the exiting validator process has terminated
    Wait,
}
25
+
16
26
#[ derive( Debug , PartialEq ) ]
17
27
pub struct ExitArgs {
18
28
pub force : bool ,
19
- pub monitor : bool ,
29
+ pub post_exit_action : Option < PostExitAction > ,
20
30
pub min_idle_time : usize ,
21
31
pub max_delinquent_stake : u8 ,
22
32
pub skip_new_snapshot_check : bool ,
@@ -25,9 +35,17 @@ pub struct ExitArgs {
25
35
26
36
impl FromClapArgMatches for ExitArgs {
27
37
fn from_clap_arg_match ( matches : & ArgMatches ) -> Result < Self > {
38
+ let post_exit_action = if matches. is_present ( "monitor" ) {
39
+ Some ( PostExitAction :: Monitor )
40
+ } else if matches. is_present ( "wait_for_exit" ) {
41
+ Some ( PostExitAction :: Wait )
42
+ } else {
43
+ None
44
+ } ;
45
+
28
46
Ok ( ExitArgs {
29
47
force : matches. is_present ( "force" ) ,
30
- monitor : matches . is_present ( "monitor" ) ,
48
+ post_exit_action ,
31
49
min_idle_time : value_t_or_exit ! ( matches, "min_idle_time" , usize ) ,
32
50
max_delinquent_stake : value_t_or_exit ! ( matches, "max_delinquent_stake" , u8 ) ,
33
51
skip_new_snapshot_check : matches. is_present ( "skip_new_snapshot_check" ) ,
@@ -55,6 +73,12 @@ pub fn command<'a>() -> App<'a, 'a> {
55
73
. takes_value ( false )
56
74
. help ( "Monitor the validator after sending the exit request" ) ,
57
75
)
76
+ . arg (
77
+ Arg :: with_name ( "wait_for_exit" )
78
+ . long ( "wait-for-exit" )
79
+ . conflicts_with ( "monitor" )
80
+ . help ( "Wait for the validator to terminate after sending the exit request" ) ,
81
+ )
58
82
. arg (
59
83
Arg :: with_name ( "min_idle_time" )
60
84
. long ( "min-idle-time" )
@@ -101,17 +125,99 @@ pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
101
125
) ?;
102
126
}
103
127
104
- let admin_client = admin_rpc_service:: connect ( ledger_path) ;
105
- admin_rpc_service:: runtime ( ) . block_on ( async move { admin_client. await ?. exit ( ) . await } ) ?;
128
+ // Grab the pid from the process before initiating exit as the running
129
+ // validator will be unable to respond after exit has returned.
130
+ //
131
+ // Additionally, only check the pid() RPC call result if it will be used.
132
+ // In an upgrade scenario, it is possible that a binary that calls pid()
133
+ // will be initiating exit against a process that doesn't support pid().
134
+ // Since PostExitAction::Wait case is opt-in (via --wait-for-exit), the
135
+ // result is checked ONLY in that case to provide a friendlier upgrade
136
+ // path for users who are NOT using --wait-for-exit
137
+ const WAIT_FOR_EXIT_UNSUPPORTED_ERROR : & str =
138
+ "remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process" ;
139
+ let post_exit_action = exit_args. post_exit_action . clone ( ) ;
140
+ let validator_pid = admin_rpc_service:: runtime ( ) . block_on ( async move {
141
+ let admin_client = admin_rpc_service:: connect ( ledger_path) . await ?;
142
+ let validator_pid = match post_exit_action {
143
+ Some ( PostExitAction :: Wait ) => admin_client
144
+ . pid ( )
145
+ . await
146
+ . map_err ( |_err| Error :: Dynamic ( WAIT_FOR_EXIT_UNSUPPORTED_ERROR . into ( ) ) ) ?,
147
+ _ => 0 ,
148
+ } ;
149
+ admin_client. exit ( ) . await ?;
150
+
151
+ Ok :: < u32 , Error > ( validator_pid)
152
+ } ) ?;
153
+
106
154
println ! ( "Exit request sent" ) ;
107
155
108
- if exit_args. monitor {
109
- monitor:: execute ( matches, ledger_path) ?;
156
+ match exit_args. post_exit_action {
157
+ None => Ok ( ( ) ) ,
158
+ Some ( PostExitAction :: Monitor ) => monitor:: execute ( matches, ledger_path) ,
159
+ Some ( PostExitAction :: Wait ) => poll_until_pid_terminates ( validator_pid) ,
160
+ } ?;
161
+
162
+ Ok ( ( ) )
163
+ }
164
+
165
+ #[ cfg( target_os = "linux" ) ]
166
+ fn poll_until_pid_terminates ( pid : u32 ) -> Result < ( ) > {
167
+ let pid = i32:: try_from ( pid) ?;
168
+
169
+ println ! ( "Waiting for agave-validator process {pid} to terminate" ) ;
170
+ loop {
171
+ // From man kill(2)
172
+ //
173
+ // If sig is 0, then no signal is sent, but existence and permission
174
+ // checks are still performed; this can be used to check for the
175
+ // existence of a process ID or process group ID that the caller is
176
+ // permitted to signal.
177
+ let result = unsafe {
178
+ libc:: kill ( pid, /*sig:*/ 0 )
179
+ } ;
180
+ if result >= 0 {
181
+ // Give the process some time to exit before checking again
182
+ thread:: sleep ( Duration :: from_millis ( 500 ) ) ;
183
+ } else {
184
+ let errno = io:: Error :: last_os_error ( )
185
+ . raw_os_error ( )
186
+ . ok_or ( Error :: Dynamic ( "unable to read raw os error" . into ( ) ) ) ?;
187
+ match errno {
188
+ libc:: ESRCH => {
189
+ println ! ( "Done, agave-validator process {pid} has terminated" ) ;
190
+ break ;
191
+ }
192
+ libc:: EINVAL => {
193
+ // An invalid signal was specified, we only pass sig=0 so
194
+ // this should not be possible
195
+ Err ( Error :: Dynamic (
196
+ format ! ( "unexpected invalid signal error for kill({pid}, 0)" ) . into ( ) ,
197
+ ) ) ?;
198
+ }
199
+ libc:: EPERM => {
200
+ Err ( io:: Error :: from ( io:: ErrorKind :: PermissionDenied ) ) ?;
201
+ }
202
+ unknown => {
203
+ Err ( Error :: Dynamic (
204
+ format ! ( "unexpected errno for kill({pid}, 0): {unknown}" ) . into ( ) ,
205
+ ) ) ?;
206
+ }
207
+ }
208
+ }
110
209
}
111
210
112
211
Ok ( ( ) )
113
212
}
114
213
214
/// Fallback used on every target other than Linux: waiting for the validator
/// process is not implemented there, so this always returns an error and the
/// pid argument is ignored.
#[cfg(not(target_os = "linux"))]
fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
    let unsupported = "Unable to wait for agave-validator process termination on this platform";
    Err(Error::Dynamic(unsupported.into()))
}
220
+
115
221
#[ cfg( test) ]
116
222
mod tests {
117
223
use { super :: * , crate :: commands:: tests:: verify_args_struct_by_command} ;
@@ -126,7 +232,7 @@ mod tests {
126
232
. parse ( )
127
233
. expect ( "invalid DEFAULT_MAX_DELINQUENT_STAKE" ) ,
128
234
force : false ,
129
- monitor : false ,
235
+ post_exit_action : None ,
130
236
skip_new_snapshot_check : false ,
131
237
skip_health_check : false ,
132
238
}
@@ -151,12 +257,21 @@ mod tests {
151
257
}
152
258
153
259
#[ test]
154
- fn verify_args_struct_by_command_exit_with_monitor ( ) {
260
+ fn verify_args_struct_by_command_exit_with_post_exit_action ( ) {
155
261
verify_args_struct_by_command (
156
262
command ( ) ,
157
263
vec ! [ COMMAND , "--monitor" ] ,
158
264
ExitArgs {
159
- monitor : true ,
265
+ post_exit_action : Some ( PostExitAction :: Monitor ) ,
266
+ ..ExitArgs :: default ( )
267
+ } ,
268
+ ) ;
269
+
270
+ verify_args_struct_by_command (
271
+ command ( ) ,
272
+ vec ! [ COMMAND , "--wait-for-exit" ] ,
273
+ ExitArgs {
274
+ post_exit_action : Some ( PostExitAction :: Wait ) ,
160
275
..ExitArgs :: default ( )
161
276
} ,
162
277
) ;
0 commit comments