Skip to content

Commit 0a75c9c

Browse files
v2.3: validator: Add --wait-for-exit flag to exit subcommand (backport of #6780) (#6908)
validator: Add --wait-for-exit flag to exit subcommand (#6780) agave-validator exit currently returns immediately after the AdminRpc call returns. However, the running validator has not exited at this point and may continue to tear itself down for multiple seconds. The exit subcommand now has an optional flag, --wait-for-exit, that queries the PID from the validator and loops until that PID has fully terminated. Use of this flag means that a caller can be sure the running validator is dead when agave-validator exit returns. (cherry picked from commit 6bcd5ba) Co-authored-by: steviez <[email protected]>
1 parent d844c32 commit 0a75c9c

File tree

4 files changed

+140
-11
lines changed

4 files changed

+140
-11
lines changed

validator/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ jsonrpc-core = { workspace = true }
2727
jsonrpc-core-client = { workspace = true, features = ["ipc"] }
2828
jsonrpc-derive = { workspace = true }
2929
jsonrpc-ipc-server = { workspace = true }
30+
libc = { workspace = true }
3031
libloading = { workspace = true }
3132
log = { workspace = true }
3233
num_cpus = { workspace = true }

validator/src/admin_rpc_service.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,15 @@ impl solana_cli_output::QuietDisplay for AdminRpcRepairWhitelist {}
151151
pub trait AdminRpc {
152152
type Metadata;
153153

154+
/// Initiates validator exit; exit is asynchronous so the validator
155+
/// will almost certainly still be running when this method returns
154156
#[rpc(meta, name = "exit")]
155157
fn exit(&self, meta: Self::Metadata) -> Result<()>;
156158

159+
/// Return the process id (pid)
160+
#[rpc(meta, name = "pid")]
161+
fn pid(&self, meta: Self::Metadata) -> Result<u32>;
162+
157163
#[rpc(meta, name = "reloadPlugin")]
158164
fn reload_plugin(
159165
&self,
@@ -266,7 +272,7 @@ impl AdminRpc for AdminRpcImpl {
266272
// receive a confusing error as the validator shuts down before a response is sent back.
267273
thread::sleep(Duration::from_millis(100));
268274

269-
warn!("validator exit requested");
275+
info!("validator exit requested");
270276
meta.validator_exit.write().unwrap().exit();
271277

272278
if !meta.validator_exit_backpressure.is_empty() {
@@ -311,6 +317,10 @@ impl AdminRpc for AdminRpcImpl {
311317
Ok(())
312318
}
313319

320+
fn pid(&self, _meta: Self::Metadata) -> Result<u32> {
321+
Ok(std::process::id())
322+
}
323+
314324
fn reload_plugin(
315325
&self,
316326
meta: Self::Metadata,

validator/src/commands/exit/mod.rs

Lines changed: 125 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
#[cfg(target_os = "linux")]
2+
use std::{io, thread, time::Duration};
13
use {
24
crate::{
35
admin_rpc_service,
4-
commands::{monitor, wait_for_restart_window, FromClapArgMatches, Result},
6+
commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
57
},
68
clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
79
solana_clap_utils::input_validators::{is_parsable, is_valid_percentage},
@@ -13,10 +15,18 @@ const COMMAND: &str = "exit";
1315
const DEFAULT_MIN_IDLE_TIME: &str = "10";
1416
const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
1517

18+
#[derive(Clone, Debug, PartialEq)]
19+
pub enum PostExitAction {
20+
// Run the agave-validator monitor command indefinitely
21+
Monitor,
22+
// Block until the exiting validator process has terminated
23+
Wait,
24+
}
25+
1626
#[derive(Debug, PartialEq)]
1727
pub struct ExitArgs {
1828
pub force: bool,
19-
pub monitor: bool,
29+
pub post_exit_action: Option<PostExitAction>,
2030
pub min_idle_time: usize,
2131
pub max_delinquent_stake: u8,
2232
pub skip_new_snapshot_check: bool,
@@ -25,9 +35,17 @@ pub struct ExitArgs {
2535

2636
impl FromClapArgMatches for ExitArgs {
2737
fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
38+
let post_exit_action = if matches.is_present("monitor") {
39+
Some(PostExitAction::Monitor)
40+
} else if matches.is_present("wait_for_exit") {
41+
Some(PostExitAction::Wait)
42+
} else {
43+
None
44+
};
45+
2846
Ok(ExitArgs {
2947
force: matches.is_present("force"),
30-
monitor: matches.is_present("monitor"),
48+
post_exit_action,
3149
min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
3250
max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
3351
skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
@@ -55,6 +73,12 @@ pub fn command<'a>() -> App<'a, 'a> {
5573
.takes_value(false)
5674
.help("Monitor the validator after sending the exit request"),
5775
)
76+
.arg(
77+
Arg::with_name("wait_for_exit")
78+
.long("wait-for-exit")
79+
.conflicts_with("monitor")
80+
.help("Wait for the validator to terminate after sending the exit request"),
81+
)
5882
.arg(
5983
Arg::with_name("min_idle_time")
6084
.long("min-idle-time")
@@ -101,17 +125,99 @@ pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
101125
)?;
102126
}
103127

104-
let admin_client = admin_rpc_service::connect(ledger_path);
105-
admin_rpc_service::runtime().block_on(async move { admin_client.await?.exit().await })?;
128+
// Grab the pid from the process before initiating exit as the running
129+
// validator will be unable to respond after exit has returned.
130+
//
131+
// Additionally, only check the pid() RPC call result if it will be used.
132+
// In an upgrade scenario, it is possible that a binary that calls pid()
133+
// will be initiating exit against a process that doesn't support pid().
134+
// Since PostExitAction::Wait case is opt-in (via --wait-for-exit), the
135+
// result is checked ONLY in that case to provide a friendlier upgrade
136+
// path for users who are NOT using --wait-for-exit
137+
const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str =
138+
"remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process";
139+
let post_exit_action = exit_args.post_exit_action.clone();
140+
let validator_pid = admin_rpc_service::runtime().block_on(async move {
141+
let admin_client = admin_rpc_service::connect(ledger_path).await?;
142+
let validator_pid = match post_exit_action {
143+
Some(PostExitAction::Wait) => admin_client
144+
.pid()
145+
.await
146+
.map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
147+
_ => 0,
148+
};
149+
admin_client.exit().await?;
150+
151+
Ok::<u32, Error>(validator_pid)
152+
})?;
153+
106154
println!("Exit request sent");
107155

108-
if exit_args.monitor {
109-
monitor::execute(matches, ledger_path)?;
156+
match exit_args.post_exit_action {
157+
None => Ok(()),
158+
Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
159+
Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
160+
}?;
161+
162+
Ok(())
163+
}
164+
165+
#[cfg(target_os = "linux")]
166+
fn poll_until_pid_terminates(pid: u32) -> Result<()> {
167+
let pid = i32::try_from(pid)?;
168+
169+
println!("Waiting for agave-validator process {pid} to terminate");
170+
loop {
171+
// From man kill(2)
172+
//
173+
// If sig is 0, then no signal is sent, but existence and permission
174+
// checks are still performed; this can be used to check for the
175+
// existence of a process ID or process group ID that the caller is
176+
// permitted to signal.
177+
let result = unsafe {
178+
libc::kill(pid, /*sig:*/ 0)
179+
};
180+
if result >= 0 {
181+
// Give the process some time to exit before checking again
182+
thread::sleep(Duration::from_millis(500));
183+
} else {
184+
let errno = io::Error::last_os_error()
185+
.raw_os_error()
186+
.ok_or(Error::Dynamic("unable to read raw os error".into()))?;
187+
match errno {
188+
libc::ESRCH => {
189+
println!("Done, agave-validator process {pid} has terminated");
190+
break;
191+
}
192+
libc::EINVAL => {
193+
// An invalid signal was specified, we only pass sig=0 so
194+
// this should not be possible
195+
Err(Error::Dynamic(
196+
format!("unexpected invalid signal error for kill({pid}, 0)").into(),
197+
))?;
198+
}
199+
libc::EPERM => {
200+
Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
201+
}
202+
unknown => {
203+
Err(Error::Dynamic(
204+
format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
205+
))?;
206+
}
207+
}
208+
}
110209
}
111210

112211
Ok(())
113212
}
114213

214+
#[cfg(not(target_os = "linux"))]
215+
fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
216+
Err(Error::Dynamic(
217+
"Unable to wait for agave-validator process termination on this platform".into(),
218+
))
219+
}
220+
115221
#[cfg(test)]
116222
mod tests {
117223
use {super::*, crate::commands::tests::verify_args_struct_by_command};
@@ -126,7 +232,7 @@ mod tests {
126232
.parse()
127233
.expect("invalid DEFAULT_MAX_DELINQUENT_STAKE"),
128234
force: false,
129-
monitor: false,
235+
post_exit_action: None,
130236
skip_new_snapshot_check: false,
131237
skip_health_check: false,
132238
}
@@ -151,12 +257,21 @@ mod tests {
151257
}
152258

153259
#[test]
154-
fn verify_args_struct_by_command_exit_with_monitor() {
260+
fn verify_args_struct_by_command_exit_with_post_exit_action() {
155261
verify_args_struct_by_command(
156262
command(),
157263
vec![COMMAND, "--monitor"],
158264
ExitArgs {
159-
monitor: true,
265+
post_exit_action: Some(PostExitAction::Monitor),
266+
..ExitArgs::default()
267+
},
268+
);
269+
270+
verify_args_struct_by_command(
271+
command(),
272+
vec![COMMAND, "--wait-for-exit"],
273+
ExitArgs {
274+
post_exit_action: Some(PostExitAction::Wait),
160275
..ExitArgs::default()
161276
},
162277
);

validator/src/commands/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ pub enum Error {
2727

2828
#[error(transparent)]
2929
Io(#[from] std::io::Error),
30+
31+
#[error(transparent)]
32+
TryFromInt(#[from] std::num::TryFromIntError),
3033
}
3134
pub type Result<T> = std::result::Result<T, Error>;
3235

0 commit comments

Comments
 (0)