anza-xyz · steviez · Jul 11, 2025 · Jul 10, 2025
@@ -27,6 +27,7 @@ jsonrpc-core = { workspace = true }
 jsonrpc-core-client = { workspace = true, features = ["ipc"] }
 jsonrpc-derive = { workspace = true }
 jsonrpc-ipc-server = { workspace = true }
+libc = { workspace = true }
 libloading = { workspace = true }
 log = { workspace = true }
 num_cpus = { workspace = true }

@@ -151,9 +151,15 @@ impl solana_cli_output::QuietDisplay for AdminRpcRepairWhitelist {}
 pub trait AdminRpc {
     type Metadata;
 
+    /// Initiates validator exit; exit is asynchronous so the validator
+    /// will almost certainly still be running when this method returns
     #[rpc(meta, name = "exit")]
     fn exit(&self, meta: Self::Metadata) -> Result<()>;
 
+    /// Return the process id (pid)
+    #[rpc(meta, name = "pid")]
+    fn pid(&self, meta: Self::Metadata) -> Result<u32>;
+
     #[rpc(meta, name = "reloadPlugin")]
     fn reload_plugin(
         &self,
@@ -266,7 +272,7 @@ impl AdminRpc for AdminRpcImpl {
                 // receive a confusing error as the validator shuts down before a response is sent back.
                 thread::sleep(Duration::from_millis(100));
 
-                warn!("validator exit requested");
+                info!("validator exit requested");
                 meta.validator_exit.write().unwrap().exit();
 
                 if !meta.validator_exit_backpressure.is_empty() {
@@ -311,6 +317,10 @@ impl AdminRpc for AdminRpcImpl {
         Ok(())
     }
 
+    fn pid(&self, _meta: Self::Metadata) -> Result<u32> {
+        Ok(std::process::id())
+    }
+
     fn reload_plugin(
         &self,
         meta: Self::Metadata,

@@ -1,7 +1,9 @@
+#[cfg(target_os = "linux")]
+use std::{io, thread, time::Duration};
 use {
     crate::{
         admin_rpc_service,
-        commands::{monitor, wait_for_restart_window, FromClapArgMatches, Result},
+        commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
     },
     clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
     solana_clap_utils::input_validators::{is_parsable, is_valid_percentage},
@@ -13,10 +15,18 @@ const COMMAND: &str = "exit";
 const DEFAULT_MIN_IDLE_TIME: &str = "10";
 const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
 
+#[derive(Clone, Debug, PartialEq)]
+pub enum PostExitAction {
+    // Run the agave-validator monitor command indefinitely
+    Monitor,
+    // Block until the exiting validator process has terminated
+    Wait,
+}
+
 #[derive(Debug, PartialEq)]
 pub struct ExitArgs {
     pub force: bool,
-    pub monitor: bool,
+    pub post_exit_action: Option<PostExitAction>,
     pub min_idle_time: usize,
     pub max_delinquent_stake: u8,
     pub skip_new_snapshot_check: bool,
@@ -25,9 +35,17 @@ pub struct ExitArgs {
 
 impl FromClapArgMatches for ExitArgs {
     fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
+        let post_exit_action = if matches.is_present("monitor") {
+            Some(PostExitAction::Monitor)
+        } else if matches.is_present("wait_for_exit") {
+            Some(PostExitAction::Wait)
+        } else {
+            None
+        };
+
         Ok(ExitArgs {
             force: matches.is_present("force"),
-            monitor: matches.is_present("monitor"),
+            post_exit_action,
             min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
             max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
             skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
@@ -55,6 +73,12 @@ pub fn command<'a>() -> App<'a, 'a> {
                 .takes_value(false)
                 .help("Monitor the validator after sending the exit request"),
         )
+        .arg(
+            Arg::with_name("wait_for_exit")
+                .long("wait-for-exit")
+                .conflicts_with("monitor")
+                .help("Wait for the validator to terminate after sending the exit request"),
+        )
         .arg(
             Arg::with_name("min_idle_time")
                 .long("min-idle-time")
@@ -101,17 +125,99 @@ pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
         )?;
     }
 
-    let admin_client = admin_rpc_service::connect(ledger_path);
-    admin_rpc_service::runtime().block_on(async move { admin_client.await?.exit().await })?;
+    // Grab the pid from the process before initiating exit as the running
+    // validator will be unable to respond after exit has returned.
+    //
+    // Additionally, only check the pid() RPC call result if it will be used.
+    // In an upgrade scenario, it is possible that a binary that calls pid()
+    // will be initating exit against a process that doesn't support pid().
+    // Since PostExitAction::Wait case is opt-in (via --wait-for-exit), the
+    // result is checked ONLY in that case to provide a friendlier upgrade
+    // path for users who are NOT using --wait-for-exit
+    const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str =
+        "remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process";
+    let post_exit_action = exit_args.post_exit_action.clone();
+    let validator_pid = admin_rpc_service::runtime().block_on(async move {
+        let admin_client = admin_rpc_service::connect(ledger_path).await?;
+        let validator_pid = match post_exit_action {
+            Some(PostExitAction::Wait) => admin_client
+                .pid()
+                .await
+                .map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
+            _ => 0,
+        };
+        admin_client.exit().await?;
+
+        Ok::<u32, Error>(validator_pid)
+    })?;
+
     println!("Exit request sent");
 
-    if exit_args.monitor {
-        monitor::execute(matches, ledger_path)?;
+    match exit_args.post_exit_action {
+        None => Ok(()),
+        Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
+        Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
+    }?;
+
+    Ok(())
+}
+
+#[cfg(target_os = "linux")]
+fn poll_until_pid_terminates(pid: u32) -> Result<()> {
+    let pid = i32::try_from(pid)?;
+
+    println!("Waiting for agave-validator process {pid} to terminate");
+    loop {
+        // From man kill(2)
+        //
+        // If sig is 0, then no signal is sent, but existence and permission
+        // checks are still performed; this can be used to check for the
+        // existence of a process ID or process group ID that the caller is
+        // permitted to signal.
+        let result = unsafe {
+            libc::kill(pid, /*sig:*/ 0)
+        };
+        if result >= 0 {
+            // Give the process some time to exit before checking again
+            thread::sleep(Duration::from_millis(500));
+        } else {
+            let errno = io::Error::last_os_error()
+                .raw_os_error()
+                .ok_or(Error::Dynamic("unable to read raw os error".into()))?;
+            match errno {
+                libc::ESRCH => {
+                    println!("Done, agave-validator process {pid} has terminated");
+                    break;
+                }
+                libc::EINVAL => {
+                    // An invalid signal was specified, we only pass sig=0 so
+                    // this should not be possible
+                    Err(Error::Dynamic(
+                        format!("unexpected invalid signal error for kill({pid}, 0)").into(),
+                    ))?;
+                }
+                libc::EPERM => {
+                    Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
+                }
+                unknown => {
+                    Err(Error::Dynamic(
+                        format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
+                    ))?;
+                }
+            }
+        }
     }
 
     Ok(())
 }
 
+#[cfg(not(target_os = "linux"))]
+fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
+    Err(Error::Dynamic(
+        "Unable to wait for agave-validator process termination on this platform".into(),
+    ))
+}
+
 #[cfg(test)]
 mod tests {
     use {super::*, crate::commands::tests::verify_args_struct_by_command};
@@ -126,7 +232,7 @@ mod tests {
                     .parse()
                     .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE"),
                 force: false,
-                monitor: false,
+                post_exit_action: None,
                 skip_new_snapshot_check: false,
                 skip_health_check: false,
             }
@@ -151,12 +257,21 @@ mod tests {
     }
 
     #[test]
-    fn verify_args_struct_by_command_exit_with_monitor() {
+    fn verify_args_struct_by_command_exit_with_post_exit_action() {
         verify_args_struct_by_command(
             command(),
             vec![COMMAND, "--monitor"],
             ExitArgs {
-                monitor: true,
+                post_exit_action: Some(PostExitAction::Monitor),
+                ..ExitArgs::default()
+            },
+        );
+
+        verify_args_struct_by_command(
+            command(),
+            vec![COMMAND, "--wait-for-exit"],
+            ExitArgs {
+                post_exit_action: Some(PostExitAction::Wait),
                 ..ExitArgs::default()
             },
         );

@@ -27,6 +27,9 @@ pub enum Error {
 
     #[error(transparent)]
     Io(#[from] std::io::Error),
+
+    #[error(transparent)]
+    TryFromInt(#[from] std::num::TryFromIntError),
 }
 pub type Result<T> = std::result::Result<T, Error>;