Skip to content

Commit cda095f

Browse files
committed
Bug#28019228 CPCD FAILS TO START PROCESSES DUE TO INVALID TEMPORARY PID FILES
Problem: When CPCD starts processes, sometimes there is a pid file left over. When CPCD starts a new process, it waits for the new process to write a pid file. If there is an old pid file, CPCD may misread the old one as a new pid file. If the pid in the file and the expected pid differ, CPCD left the newly started process running and ignored the pid file. This made things worse: the processes left running typically interfered with other processes started later, since they might request the same system resources, such as TCP ports. Solution: If the found pid file has no running process with that pid, remove the pid file. If there is a process running with the pid from the pid file, we at least do not start new processes that will be forgotten.
1 parent 6b71fae commit cda095f

File tree

2 files changed

+81
-9
lines changed

2 files changed

+81
-9
lines changed

storage/ndb/src/cw/cpcd/Process.cpp

Lines changed: 80 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -16,6 +16,7 @@
1616
*/
1717

1818
#include <ndb_global.h>
19+
#include "NdbSleep.h"
1920

2021
#ifdef _WIN32
2122
#include <io.h>
@@ -144,6 +145,7 @@ void CPCD::Process::monitor()
144145
{
145146
m_pid = bad_pid;
146147
m_status = STOPPED;
148+
removePid();
147149
}
148150
else if (time(NULL) > m_stopping_time + m_stop_timeout)
149151
{
@@ -589,6 +591,32 @@ int CPCD::Process::start() {
589591
* take care of that.
590592
*/
591593
logger.info("Starting %d: %s", m_id, m_name.c_str());
594+
595+
/* Check if there is a left over pid file.
596+
* If so and process runs with written pid, let it run and fail starting new process.
597+
* If no process runs with written pid, remove pid file.
598+
*/
599+
if (readPid() >= 0) {
600+
if (isRunning()) {
601+
logger.error("Fail starting %d. Old pid file found. Leave running "
602+
"process (pid %d) running.\n",
603+
m_id,
604+
m_pid);
605+
m_status = STOPPED;
606+
m_pid = bad_pid;
607+
return -1;
608+
}
609+
else {
610+
logger.info("While starting %d. Found old pid file with no running "
611+
"process (pid %d). Removing pid file!\n",
612+
m_id,
613+
m_pid);
614+
m_status = STOPPED;
615+
m_pid = bad_pid;
616+
removePid();
617+
}
618+
}
619+
592620
m_status = STARTING;
593621

594622
int pid = -1;
@@ -670,15 +698,59 @@ int CPCD::Process::start() {
670698
return -1;
671699
}
672700

673-
while (readPid() < 0) {
674-
sched_yield();
675-
}
701+
const int max_retries = 3;
702+
for (int retries = max_retries; retries > 0; retries--) {
703+
while (readPid() < 0) {
704+
sched_yield();
705+
}
676706

677-
errno = 0;
678-
pid_t pgid = IF_WIN(-1, getpgid(pid));
707+
errno = 0;
708+
pid_t pgid = IF_WIN(-1, getpgid(pid));
709+
710+
if (pgid == -1 || pgid == m_pid) {
711+
if (retries < max_retries)
712+
{
713+
logger.info("Retry reading pid file succeeded: cpcd pid %d: forked "
714+
"pgid %d pid %d: file m_pid %d",
715+
getpid(),
716+
pgid,
717+
pid,
718+
m_pid);
719+
}
720+
break;
721+
}
722+
723+
/* retry */
724+
725+
// For processtype PERMANENT pid and pgid must be -1 so never enter here.
726+
require(m_processType == TEMPORARY);
727+
logger.error("pgid and m_pid don't match: cpcd pid %d: forked pgid %d "
728+
"pid %d: file m_pid %d",
729+
getpid(),
730+
pgid,
731+
pid,
732+
m_pid);
733+
734+
if (retries == 1) {
735+
/* Last try reading pid file failed.
736+
* For TEMPORARY where pid of started process is known, kill it.
737+
*/
738+
#ifndef _WIN32
739+
logger.error("After pid file mismatch, forced kill of forked process "
740+
"group (pgid %d).",
741+
pgid);
742+
kill(-pgid, 9);
743+
#endif
744+
logger.error("After pid file mismatch, stop started process %d "
745+
"(pid %d).",
746+
m_id,
747+
m_pid);
748+
stop();
749+
return -1;
750+
}
679751

680-
if (pgid != -1 && pgid != m_pid) {
681-
logger.error("pgid and m_pid don't match: %d %d (%d)", pgid, m_pid, pid);
752+
m_pid = bad_pid;
753+
NdbSleep_SecSleep(1);
682754
}
683755

684756
if (isRunning())

storage/ndb/test/run-test/autotest-run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
##############
2727

2828
save_args=$*
29-
VERSION="autotest-run.sh version 1.21"
29+
VERSION="autotest-run.sh version 1.22"
3030

3131
DATE=`date '+%Y-%m-%d'`
3232
if [ `uname -s` != "SunOS" ]

0 commit comments

Comments
 (0)