Skip to content

v2.3: validator: Add --wait-for-exit flag to exit subcommand (backport of #6780) #6908

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions validator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jsonrpc-core = { workspace = true }
jsonrpc-core-client = { workspace = true, features = ["ipc"] }
jsonrpc-derive = { workspace = true }
jsonrpc-ipc-server = { workspace = true }
libc = { workspace = true }
libloading = { workspace = true }
log = { workspace = true }
num_cpus = { workspace = true }
Expand Down
12 changes: 11 additions & 1 deletion validator/src/admin_rpc_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,15 @@ impl solana_cli_output::QuietDisplay for AdminRpcRepairWhitelist {}
pub trait AdminRpc {
type Metadata;

/// Initiates validator exit; exit is asynchronous so the validator
/// will almost certainly still be running when this method returns
#[rpc(meta, name = "exit")]
fn exit(&self, meta: Self::Metadata) -> Result<()>;

/// Return the process id (pid)
#[rpc(meta, name = "pid")]
fn pid(&self, meta: Self::Metadata) -> Result<u32>;

#[rpc(meta, name = "reloadPlugin")]
fn reload_plugin(
&self,
Expand Down Expand Up @@ -266,7 +272,7 @@ impl AdminRpc for AdminRpcImpl {
// receive a confusing error as the validator shuts down before a response is sent back.
thread::sleep(Duration::from_millis(100));

warn!("validator exit requested");
info!("validator exit requested");
meta.validator_exit.write().unwrap().exit();

if !meta.validator_exit_backpressure.is_empty() {
Expand Down Expand Up @@ -311,6 +317,10 @@ impl AdminRpc for AdminRpcImpl {
Ok(())
}

fn pid(&self, _meta: Self::Metadata) -> Result<u32> {
Ok(std::process::id())
}

fn reload_plugin(
&self,
meta: Self::Metadata,
Expand Down
135 changes: 125 additions & 10 deletions validator/src/commands/exit/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#[cfg(target_os = "linux")]
use std::{io, thread, time::Duration};
use {
crate::{
admin_rpc_service,
commands::{monitor, wait_for_restart_window, FromClapArgMatches, Result},
commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
},
clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
solana_clap_utils::input_validators::{is_parsable, is_valid_percentage},
Expand All @@ -13,10 +15,18 @@ const COMMAND: &str = "exit";
const DEFAULT_MIN_IDLE_TIME: &str = "10";
const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";

#[derive(Clone, Debug, PartialEq)]
pub enum PostExitAction {
// Run the agave-validator monitor command indefinitely
Monitor,
// Block until the exiting validator process has terminated
Wait,
}

#[derive(Debug, PartialEq)]
pub struct ExitArgs {
pub force: bool,
pub monitor: bool,
pub post_exit_action: Option<PostExitAction>,
pub min_idle_time: usize,
pub max_delinquent_stake: u8,
pub skip_new_snapshot_check: bool,
Expand All @@ -25,9 +35,17 @@ pub struct ExitArgs {

impl FromClapArgMatches for ExitArgs {
fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
let post_exit_action = if matches.is_present("monitor") {
Some(PostExitAction::Monitor)
} else if matches.is_present("wait_for_exit") {
Some(PostExitAction::Wait)
} else {
None
};

Ok(ExitArgs {
force: matches.is_present("force"),
monitor: matches.is_present("monitor"),
post_exit_action,
min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
Expand Down Expand Up @@ -55,6 +73,12 @@ pub fn command<'a>() -> App<'a, 'a> {
.takes_value(false)
.help("Monitor the validator after sending the exit request"),
)
.arg(
Arg::with_name("wait_for_exit")
.long("wait-for-exit")
.conflicts_with("monitor")
.help("Wait for the validator to terminate after sending the exit request"),
)
.arg(
Arg::with_name("min_idle_time")
.long("min-idle-time")
Expand Down Expand Up @@ -101,17 +125,99 @@ pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
)?;
}

let admin_client = admin_rpc_service::connect(ledger_path);
admin_rpc_service::runtime().block_on(async move { admin_client.await?.exit().await })?;
// Grab the pid from the process before initiating exit as the running
// validator will be unable to respond after exit has returned.
//
// Additionally, only check the pid() RPC call result if it will be used.
// In an upgrade scenario, it is possible that a binary that calls pid()
// will be initating exit against a process that doesn't support pid().
// Since PostExitAction::Wait case is opt-in (via --wait-for-exit), the
// result is checked ONLY in that case to provide a friendlier upgrade
// path for users who are NOT using --wait-for-exit
const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str =
"remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process";
let post_exit_action = exit_args.post_exit_action.clone();
let validator_pid = admin_rpc_service::runtime().block_on(async move {
let admin_client = admin_rpc_service::connect(ledger_path).await?;
let validator_pid = match post_exit_action {
Some(PostExitAction::Wait) => admin_client
.pid()
.await
.map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
_ => 0,
};
admin_client.exit().await?;

Ok::<u32, Error>(validator_pid)
})?;

println!("Exit request sent");

if exit_args.monitor {
monitor::execute(matches, ledger_path)?;
match exit_args.post_exit_action {
None => Ok(()),
Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
}?;

Ok(())
}

#[cfg(target_os = "linux")]
fn poll_until_pid_terminates(pid: u32) -> Result<()> {
let pid = i32::try_from(pid)?;

println!("Waiting for agave-validator process {pid} to terminate");
loop {
// From man kill(2)
//
// If sig is 0, then no signal is sent, but existence and permission
// checks are still performed; this can be used to check for the
// existence of a process ID or process group ID that the caller is
// permitted to signal.
let result = unsafe {
libc::kill(pid, /*sig:*/ 0)
};
if result >= 0 {
// Give the process some time to exit before checking again
thread::sleep(Duration::from_millis(500));
} else {
let errno = io::Error::last_os_error()
.raw_os_error()
.ok_or(Error::Dynamic("unable to read raw os error".into()))?;
match errno {
libc::ESRCH => {
println!("Done, agave-validator process {pid} has terminated");
break;
}
libc::EINVAL => {
// An invalid signal was specified, we only pass sig=0 so
// this should not be possible
Err(Error::Dynamic(
format!("unexpected invalid signal error for kill({pid}, 0)").into(),
))?;
}
libc::EPERM => {
Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
}
unknown => {
Err(Error::Dynamic(
format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
))?;
}
}
}
}

Ok(())
}

#[cfg(not(target_os = "linux"))]
fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
Err(Error::Dynamic(
"Unable to wait for agave-validator process termination on this platform".into(),
))
}

#[cfg(test)]
mod tests {
use {super::*, crate::commands::tests::verify_args_struct_by_command};
Expand All @@ -126,7 +232,7 @@ mod tests {
.parse()
.expect("invalid DEFAULT_MAX_DELINQUENT_STAKE"),
force: false,
monitor: false,
post_exit_action: None,
skip_new_snapshot_check: false,
skip_health_check: false,
}
Expand All @@ -151,12 +257,21 @@ mod tests {
}

#[test]
fn verify_args_struct_by_command_exit_with_monitor() {
fn verify_args_struct_by_command_exit_with_post_exit_action() {
verify_args_struct_by_command(
command(),
vec![COMMAND, "--monitor"],
ExitArgs {
monitor: true,
post_exit_action: Some(PostExitAction::Monitor),
..ExitArgs::default()
},
);

verify_args_struct_by_command(
command(),
vec![COMMAND, "--wait-for-exit"],
ExitArgs {
post_exit_action: Some(PostExitAction::Wait),
..ExitArgs::default()
},
);
Expand Down
3 changes: 3 additions & 0 deletions validator/src/commands/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ pub enum Error {

#[error(transparent)]
Io(#[from] std::io::Error),

#[error(transparent)]
TryFromInt(#[from] std::num::TryFromIntError),
}
pub type Result<T> = std::result::Result<T, Error>;

Expand Down
Loading