Skip to content

Commit 9d8bf1f

Browse files
vtjnashKristofferC
authored andcommitted
add pending state back to jl_thread_suspend_and_get_state-machine (#55622)
Fixes an issue with #55500, where signals may abruptly abort the process as they observe it is still processing the resume SIGUSR2 message and are not able to wait for that processing to end before setting the new message to exit.
1 parent 399512f commit 9d8bf1f

File tree

1 file changed

+55
-10
lines changed

1 file changed

+55
-10
lines changed

src/signals-unix.c

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ static int signal_caught_cond = -1;
448448

449449
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
450450
{
451+
int err;
451452
pthread_mutex_lock(&in_signal_lock);
452453
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
453454
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
@@ -456,22 +457,51 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
456457
pthread_mutex_unlock(&in_signal_lock);
457458
return 0;
458459
}
459-
sig_atomic_t request = 0;
460-
if (!jl_atomic_cmpswap(&ptls2->signal_request, &request, 1)) {
460+
if (jl_atomic_load(&ptls2->signal_request) != 0) {
461461
// something is wrong, or there is already a usr2 in flight elsewhere
462+
// try to wait for it to finish or wait for timeout
463+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
464+
do {
465+
err = poll(&event, 1, timeout * 1000);
466+
} while (err == -1 && errno == EINTR);
467+
if (err == -1 || (event.revents & POLLIN) == 0) {
468+
// not ready after timeout: cancel this request
469+
pthread_mutex_unlock(&in_signal_lock);
470+
return 0;
471+
}
472+
}
473+
// check for any stale signal_caught_cond events
474+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
475+
do {
476+
err = poll(&event, 1, 0);
477+
} while (err == -1 && errno == EINTR);
478+
if (err == -1) {
462479
pthread_mutex_unlock(&in_signal_lock);
463480
return 0;
464481
}
482+
if ((event.revents & POLLIN) != 0) {
483+
// consume it before continuing
484+
eventfd_t got;
485+
do {
486+
err = read(signal_caught_cond, &got, sizeof(eventfd_t));
487+
} while (err == -1 && errno == EINTR);
488+
if (err != sizeof(eventfd_t)) abort();
489+
assert(got == 1); (void) got;
490+
}
491+
sig_atomic_t request = jl_atomic_exchange(&ptls2->signal_request, 1);
492+
assert(request == 0 || request == -1);
465493
request = 1;
466-
int err = pthread_kill(ptls2->system_id, SIGUSR2);
467-
// wait for thread to acknowledge or timeout
468-
struct pollfd event = {signal_caught_cond, POLLIN, 0};
494+
err = pthread_kill(ptls2->system_id, SIGUSR2);
469495
if (err == 0) {
496+
// wait for thread to acknowledge or timeout
497+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
470498
do {
471499
err = poll(&event, 1, timeout * 1000);
472500
} while (err == -1 && errno == EINTR);
501+
if (err != 1 || (event.revents & POLLIN) == 0)
502+
err = -1;
473503
}
474-
if ((event.revents & POLLIN) == 0) {
504+
if (err == -1) {
475505
// not ready after timeout: try to cancel this request
476506
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
477507
pthread_mutex_unlock(&in_signal_lock);
@@ -487,7 +517,7 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
487517
// Now the other thread is waiting on exit_signal_cond (verify that here by
488518
// checking it is 0, and add an acquire barrier for good measure)
489519
request = jl_atomic_load_acquire(&ptls2->signal_request);
490-
assert(request == 0); (void) request;
520+
assert(request == 0 || request == -1); (void) request;
491521
jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this
492522
*ctx = *signal_context;
493523
return 1;
@@ -546,6 +576,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
546576
}
547577

548578
// request:
579+
// -1: processing
549580
// 0: nothing [not from here]
550581
// 1: get state & wait for request
551582
// 2: throw sigint if `!defer_signal && io_wait` or if force throw threshold
@@ -561,22 +592,36 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
561592
if (ptls == NULL)
562593
return;
563594
int errno_save = errno;
564-
// acknowledge that we saw the signal_request
565-
sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, 0);
595+
sig_atomic_t request = jl_atomic_load(&ptls->signal_request);
596+
if (request == 0)
597+
return;
598+
if (!jl_atomic_cmpswap(&ptls->signal_request, &request, -1))
599+
return;
566600
if (request == 1) {
567601
signal_context = jl_to_bt_context(ctx);
602+
// acknowledge that we saw the signal_request and set signal_context
568603
int err;
569604
eventfd_t got = 1;
570605
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
571606
if (err != sizeof(eventfd_t)) abort();
607+
sig_atomic_t processing = -1;
608+
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
609+
// wait for exit signal
572610
do {
573611
err = read(exit_signal_cond, &got, sizeof(eventfd_t));
574612
} while (err == -1 && errno == EINTR);
575613
if (err != sizeof(eventfd_t)) abort();
576614
assert(got == 1);
577-
request = jl_atomic_exchange(&ptls->signal_request, 0);
615+
request = jl_atomic_exchange(&ptls->signal_request, -1);
616+
signal_context = NULL;
578617
assert(request == 2 || request == 3 || request == 4);
579618
}
619+
int err;
620+
eventfd_t got = 1;
621+
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
622+
if (err != sizeof(eventfd_t)) abort();
623+
sig_atomic_t processing = -1;
624+
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
580625
if (request == 2) {
581626
int force = jl_check_force_sigint();
582627
if (force || (!ptls->defer_signal && ptls->io_wait)) {

0 commit comments

Comments
 (0)