Skip to content

Commit d4bd540

Browse files
authored
make jl_thread_suspend_and_get_state safe (#55500)
Fixes async safety, thread safety, FreeBSD safety.
1 parent a218e82 commit d4bd540

File tree

1 file changed

+55
-54
lines changed

1 file changed

+55
-54
lines changed

src/signals-unix.c

Lines changed: 55 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,8 @@ int exc_reg_is_write_fault(uintptr_t esr) {
314314
#if defined(HAVE_MACH)
315315
#include "signals-mach.c"
316316
#else
317+
#include <poll.h>
318+
#include <sys/eventfd.h>
317319

318320
int jl_lock_stackwalk(void)
319321
{
@@ -439,17 +441,13 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context)
439441
}
440442
}
441443

442-
#if !defined(JL_DISABLE_LIBUNWIND)
443-
static bt_context_t *signal_context;
444-
pthread_mutex_t in_signal_lock;
445-
static pthread_cond_t exit_signal_cond;
446-
static pthread_cond_t signal_caught_cond;
444+
pthread_mutex_t in_signal_lock; // shared with jl_delete_thread
445+
static bt_context_t *signal_context; // protected by in_signal_lock
446+
static int exit_signal_cond = -1;
447+
static int signal_caught_cond = -1;
447448

448449
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
449450
{
450-
struct timespec ts;
451-
clock_gettime(CLOCK_REALTIME, &ts);
452-
ts.tv_sec += timeout;
453451
pthread_mutex_lock(&in_signal_lock);
454452
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
455453
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
@@ -458,48 +456,51 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
458456
pthread_mutex_unlock(&in_signal_lock);
459457
return 0;
460458
}
461-
jl_atomic_store_release(&ptls2->signal_request, 1);
462-
pthread_kill(ptls2->system_id, SIGUSR2);
463-
// wait for thread to acknowledge
464-
int err = pthread_cond_timedwait(&signal_caught_cond, &in_signal_lock, &ts);
465-
if (err == ETIMEDOUT) {
466-
sig_atomic_t request = 1;
459+
sig_atomic_t request = 0;
460+
if (!jl_atomic_cmpswap(&ptls2->signal_request, &request, 1)) {
461+
// something is wrong, or there is already a usr2 in flight elsewhere
462+
pthread_mutex_unlock(&in_signal_lock);
463+
return 0;
464+
}
465+
request = 1;
466+
int err = pthread_kill(ptls2->system_id, SIGUSR2);
467+
// wait for thread to acknowledge or timeout
468+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
469+
if (err == 0) {
470+
do {
471+
err = poll(&event, 1, timeout * 1000);
472+
} while (err == -1 && errno == EINTR);
473+
}
474+
if ((event.revents & POLLIN) == 0) {
475+
// not ready after timeout: try to cancel this request
467476
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
468477
pthread_mutex_unlock(&in_signal_lock);
469478
return 0;
470479
}
471-
// Request is either now 0 (meaning the other thread is waiting for
472-
// exit_signal_cond already),
473-
// Or it is now -1 (meaning the other thread
474-
// is waiting for in_signal_lock, and we need to release that lock
475-
// here for a bit, until the other thread has a chance to get to the
476-
// exit_signal_cond)
477-
if (request == -1) {
478-
err = pthread_cond_wait(&signal_caught_cond, &in_signal_lock);
479-
assert(!err);
480-
}
481480
}
481+
eventfd_t got;
482+
do {
483+
err = read(signal_caught_cond, &got, sizeof(eventfd_t));
484+
} while (err == -1 && errno == EINTR);
485+
if (err != sizeof(eventfd_t)) abort();
486+
assert(got == 1); (void) got;
482487
// Now the other thread is waiting on exit_signal_cond (verify that here by
483488
// checking it is 0, and add an acquire barrier for good measure)
484-
int request = jl_atomic_load_acquire(&ptls2->signal_request);
489+
request = jl_atomic_load_acquire(&ptls2->signal_request);
485490
assert(request == 0); (void) request;
486-
jl_atomic_store_release(&ptls2->signal_request, 1); // prepare to resume normally
491+
jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this
487492
*ctx = *signal_context;
488493
return 1;
489494
}
490495

491496
void jl_thread_resume(int tid)
492497
{
493-
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
494-
pthread_cond_broadcast(&exit_signal_cond);
495-
pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge (so that signal_request doesn't get mixed up)
496-
// The other thread is waiting to leave exit_signal_cond (verify that here by
497-
// checking it is 0, and add an acquire barrier for good measure)
498-
int request = jl_atomic_load_acquire(&ptls2->signal_request);
499-
assert(request == 0); (void) request;
498+
int err;
499+
eventfd_t got = 1;
500+
err = write(exit_signal_cond, &got, sizeof(eventfd_t));
501+
if (err != sizeof(eventfd_t)) abort();
500502
pthread_mutex_unlock(&in_signal_lock);
501503
}
502-
#endif
503504

504505
// Throw jl_interrupt_exception if the master thread is in a signal async region
505506
// or if SIGINT happens too often.
@@ -508,9 +509,11 @@ static void jl_try_deliver_sigint(void)
508509
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0];
509510
jl_safepoint_enable_sigint();
510511
jl_wake_libuv();
512+
pthread_mutex_lock(&in_signal_lock);
511513
jl_atomic_store_release(&ptls2->signal_request, 2);
512514
// This also makes sure `sleep` is aborted.
513515
pthread_kill(ptls2->system_id, SIGUSR2);
516+
pthread_mutex_unlock(&in_signal_lock);
514517
}
515518

516519
// Write only by signal handling thread, read only by main thread
@@ -543,12 +546,12 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
543546
}
544547

545548
// request:
546-
// -1: beginning processing [invalid outside here]
547549
// 0: nothing [not from here]
548-
// 1: get state
550+
// 1: get state & wait for request
549551
// 2: throw sigint if `!defer_signal && io_wait` or if force throw threshold
550552
// is reached
551553
// 3: raise `thread0_exit_signo` and try to exit
554+
// 4: no-op
552555
void usr2_handler(int sig, siginfo_t *info, void *ctx)
553556
{
554557
jl_task_t *ct = jl_get_current_task();
@@ -559,25 +562,21 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
559562
return;
560563
int errno_save = errno;
561564
// acknowledge that we saw the signal_request
562-
sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, -1);
563-
#if !defined(JL_DISABLE_LIBUNWIND)
565+
sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, 0);
564566
if (request == 1) {
565-
pthread_mutex_lock(&in_signal_lock);
566567
signal_context = jl_to_bt_context(ctx);
567-
// acknowledge that we set the signal_caught_cond broadcast
568+
int err;
569+
eventfd_t got = 1;
570+
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
571+
if (err != sizeof(eventfd_t)) abort();
572+
do {
573+
err = read(exit_signal_cond, &got, sizeof(eventfd_t));
574+
} while (err == -1 && errno == EINTR);
575+
if (err != sizeof(eventfd_t)) abort();
576+
assert(got == 1);
568577
request = jl_atomic_exchange(&ptls->signal_request, 0);
569-
assert(request == -1); (void) request;
570-
pthread_cond_broadcast(&signal_caught_cond);
571-
pthread_cond_wait(&exit_signal_cond, &in_signal_lock);
572-
request = jl_atomic_exchange(&ptls->signal_request, 0);
573-
assert(request == 1 || request == 3);
574-
// acknowledge that we got the resume signal
575-
pthread_cond_broadcast(&signal_caught_cond);
576-
pthread_mutex_unlock(&in_signal_lock);
578+
assert(request == 2 || request == 3 || request == 4);
577579
}
578-
else
579-
#endif
580-
jl_atomic_exchange(&ptls->signal_request, 0); // returns -1
581580
if (request == 2) {
582581
int force = jl_check_force_sigint();
583582
if (force || (!ptls->defer_signal && ptls->io_wait)) {
@@ -1038,10 +1037,12 @@ void restore_signals(void)
10381037
jl_sigsetset(&sset);
10391038
pthread_sigmask(SIG_SETMASK, &sset, 0);
10401039

1041-
#if !defined(HAVE_MACH) && !defined(JL_DISABLE_LIBUNWIND)
1040+
#if !defined(HAVE_MACH)
1041+
exit_signal_cond = eventfd(0, EFD_CLOEXEC);
1042+
signal_caught_cond = eventfd(0, EFD_CLOEXEC);
10421043
if (pthread_mutex_init(&in_signal_lock, NULL) != 0 ||
1043-
pthread_cond_init(&exit_signal_cond, NULL) != 0 ||
1044-
pthread_cond_init(&signal_caught_cond, NULL) != 0) {
1044+
exit_signal_cond == -1 ||
1045+
signal_caught_cond == -1) {
10451046
jl_error("SIGUSR pthread init failed");
10461047
}
10471048
#endif

0 commit comments

Comments
 (0)