@@ -448,6 +448,7 @@ static int signal_caught_cond = -1;
448448
449449int jl_thread_suspend_and_get_state (int tid , int timeout , bt_context_t * ctx )
450450{
451+ int err ;
451452 pthread_mutex_lock (& in_signal_lock );
452453 jl_ptls_t ptls2 = jl_atomic_load_relaxed (& jl_all_tls_states )[tid ];
453454 jl_task_t * ct2 = ptls2 ? jl_atomic_load_relaxed (& ptls2 -> current_task ) : NULL ;
@@ -456,22 +457,51 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
456457 pthread_mutex_unlock (& in_signal_lock );
457458 return 0 ;
458459 }
459- sig_atomic_t request = 0 ;
460- if (!jl_atomic_cmpswap (& ptls2 -> signal_request , & request , 1 )) {
460+ if (jl_atomic_load (& ptls2 -> signal_request ) != 0 ) {
461461 // something is wrong, or there is already a usr2 in flight elsewhere
462+ // try to wait for it to finish or wait for timeout
463+ struct pollfd event = {signal_caught_cond , POLLIN , 0 };
464+ do {
465+ err = poll (& event , 1 , timeout * 1000 );
466+ } while (err == -1 && errno == EINTR );
467+ if (err == -1 || (event .revents & POLLIN ) == 0 ) {
468+ // not ready after timeout: cancel this request
469+ pthread_mutex_unlock (& in_signal_lock );
470+ return 0 ;
471+ }
472+ }
473+ // check for any stale signal_caught_cond events
474+ struct pollfd event = {signal_caught_cond , POLLIN , 0 };
475+ do {
476+ err = poll (& event , 1 , 0 );
477+ } while (err == -1 && errno == EINTR );
478+ if (err == -1 ) {
462479 pthread_mutex_unlock (& in_signal_lock );
463480 return 0 ;
464481 }
482+ if ((event .revents & POLLIN ) != 0 ) {
483+ // consume it before continuing
484+ eventfd_t got ;
485+ do {
486+ err = read (signal_caught_cond , & got , sizeof (eventfd_t ));
487+ } while (err == -1 && errno == EINTR );
488+ if (err != sizeof (eventfd_t )) abort ();
489+ assert (got == 1 ); (void ) got ;
490+ }
491+ sig_atomic_t request = jl_atomic_exchange (& ptls2 -> signal_request , 1 );
492+ assert (request == 0 || request == -1 );
465493 request = 1 ;
466- int err = pthread_kill (ptls2 -> system_id , SIGUSR2 );
467- // wait for thread to acknowledge or timeout
468- struct pollfd event = {signal_caught_cond , POLLIN , 0 };
494+ err = pthread_kill (ptls2 -> system_id , SIGUSR2 );
469495 if (err == 0 ) {
496+ // wait for thread to acknowledge or timeout
497+ struct pollfd event = {signal_caught_cond , POLLIN , 0 };
470498 do {
471499 err = poll (& event , 1 , timeout * 1000 );
472500 } while (err == -1 && errno == EINTR );
501+ if (err != 1 || (event .revents & POLLIN ) == 0 )
502+ err = -1 ;
473503 }
474- if (( event . revents & POLLIN ) == 0 ) {
504+ if (err == -1 ) {
475505 // not ready after timeout: try to cancel this request
476506 if (jl_atomic_cmpswap (& ptls2 -> signal_request , & request , 0 )) {
477507 pthread_mutex_unlock (& in_signal_lock );
@@ -487,7 +517,7 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
487517 // Now the other thread is waiting on exit_signal_cond (verify that here by
488518 // checking it is 0, and add an acquire barrier for good measure)
489519 request = jl_atomic_load_acquire (& ptls2 -> signal_request );
490- assert (request == 0 ); (void ) request ;
520+ assert (request == 0 || request == -1 ); (void ) request ;
491521 jl_atomic_store_release (& ptls2 -> signal_request , 4 ); // prepare to resume normally, but later code may change this
492522 * ctx = * signal_context ;
493523 return 1 ;
@@ -546,6 +576,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
546576}
547577
548578// request:
579+ // -1: processing
549580// 0: nothing [not from here]
550581// 1: get state & wait for request
551582// 2: throw sigint if `!defer_signal && io_wait` or if force throw threshold
@@ -561,21 +592,35 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
561592 if (ptls == NULL )
562593 return ;
563594 int errno_save = errno ;
595+ sig_atomic_t request = jl_atomic_load (& ptls -> signal_request );
596+ if (request == 0 )
597+ return ;
598+ if (!jl_atomic_cmpswap (& ptls -> signal_request , & request , -1 ))
599+ return ;
564600 // acknowledge that we saw the signal_request
565- sig_atomic_t request = jl_atomic_exchange (& ptls -> signal_request , 0 );
601+ int err ;
602+ eventfd_t got = 1 ;
603+ err = write (signal_caught_cond , & got , sizeof (eventfd_t ));
604+ if (err != sizeof (eventfd_t )) abort ();
605+ int processing = -1 ;
606+ jl_atomic_cmpswap (& ptls -> signal_request , & processing , 0 );
566607 if (request == 1 ) {
567608 signal_context = jl_to_bt_context (ctx );
568- int err ;
569- eventfd_t got = 1 ;
570- err = write (signal_caught_cond , & got , sizeof (eventfd_t ));
571- if (err != sizeof (eventfd_t )) abort ();
572609 do {
573610 err = read (exit_signal_cond , & got , sizeof (eventfd_t ));
574611 } while (err == -1 && errno == EINTR );
575612 if (err != sizeof (eventfd_t )) abort ();
576613 assert (got == 1 );
577- request = jl_atomic_exchange (& ptls -> signal_request , 0 );
614+ request = jl_atomic_exchange (& ptls -> signal_request , -1 );
615+ if (request != 0 ) {
616+ int err ;
617+ eventfd_t got = 1 ;
618+ err = write (signal_caught_cond , & got , sizeof (eventfd_t ));
619+ if (err != sizeof (eventfd_t )) abort ();
620+ }
578621 assert (request == 2 || request == 3 || request == 4 );
622+ int processing = -1 ;
623+ jl_atomic_cmpswap (& ptls -> signal_request , & processing , 0 );
579624 }
580625 if (request == 2 ) {
581626 int force = jl_check_force_sigint ();
0 commit comments