Skip to content

Commit dca2a6d

Browse files
frazerclementsreedhars
authored and committed
Bug #28698831 7.5.11/12: NODE RESTART GOES TO HANG STATE ..
Backport of bug fix pushed to 7.6 as part of pLCP WL. (BUG#26166901 WL#10302 (8069): NODE RESTART GOES TO HANG STATE ...) Created new testcase showing at least one aspect of the problem which is fixed. Testcase added to daily-devel--07 (cherry picked from commit 92b919993fc5d94236e98fb0f01ab4e63fc8717d)
1 parent 49ae3ac commit dca2a6d

File tree

6 files changed

+121
-9
lines changed

6 files changed

+121
-9
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# along with this program; if not, write to the Free Software
1414
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
1515

16-
Next QMGR 948
16+
Next QMGR 949
1717
Next NDBCNTR 1006
1818
Next NDBFS 2002
1919
Next DBACC 3002
@@ -259,6 +259,7 @@ QMGR
259259
3) ndb_mgm -e "2 error 946"
260260
4) ndb_mgm -e "all start"
261261
947: Treat restart error due to a node disconnection as CRASH_INSERTION.
262+
948: Cause node to crash when first hearing of a node failure
262263

263264
ERROR CODES FOR TESTING NODE FAILURE, FAILURE IN COPY FRAGMENT PROCESS:
264265
-----------------------------------------------------------------------

storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1471,7 +1471,7 @@ class Dbdih: public SimulatedBlock {
14711471

14721472
void packFragIntoPagesLab(Signal *, RWFragment* wf);
14731473
void startNextChkpt(Signal *);
1474-
void failedNodeLcpHandling(Signal*, NodeRecordPtr failedNodePtr);
1474+
void failedNodeLcpHandling(Signal*, NodeRecordPtr failedNodePtr, bool &);
14751475
void failedNodeSynchHandling(Signal *, NodeRecordPtr failedNodePtr);
14761476
void checkCopyTab(Signal*, NodeRecordPtr failedNodePtr);
14771477

storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9183,6 +9183,7 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
91839183

91849184
const bool masterTakeOver = (oldMasterId != newMasterId);
91859185

9186+
bool check_more_start_lcp = false;
91869187
for(i = 0; i < noOfFailedNodes; i++) {
91879188
NodeRecordPtr failedNodePtr;
91889189
failedNodePtr.i = failedNodes[i];
@@ -9216,7 +9217,7 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
92169217
// Functions that need to be called for all nodes.
92179218
/*--------------------------------------------------*/
92189219
checkStopMe(signal, failedNodePtr);
9219-
failedNodeLcpHandling(signal, failedNodePtr);
9220+
failedNodeLcpHandling(signal, failedNodePtr, check_more_start_lcp);
92209221
startRemoveFailedNode(signal, failedNodePtr);
92219222

92229223
/**
@@ -9244,6 +9245,23 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
92449245
}//if
92459246

92469247
setGCPStopTimeouts();
9248+
9249+
/**
9250+
* Need to check if a node failed that was part of LCP. In this
9251+
* case we need to ensure that we don't get LCP hang by checking
9252+
* for sending of LCP_FRAG_ORD with last fragment flag set.
9253+
*
9254+
* This code cannot be called in master takeover case, in this
9255+
* case we restart the LCP in DIH entirely, so no need to worry
9256+
* here.
9257+
*/
9258+
if (check_more_start_lcp &&
9259+
c_lcpMasterTakeOverState.state == LMTOS_IDLE)
9260+
{
9261+
jam();
9262+
ndbrequire(isMaster());
9263+
startNextChkpt(signal);
9264+
}
92479265
}//Dbdih::execNODE_FAILREP()
92489266

92499267
void Dbdih::checkCopyTab(Signal* signal, NodeRecordPtr failedNodePtr)
@@ -9509,7 +9527,9 @@ Dbdih::findTakeOver(Ptr<TakeOverRecord> & ptr, Uint32 failedNodeId)
95099527
return false;
95109528
}//Dbdih::findTakeOver()
95119529

9512-
void Dbdih::failedNodeLcpHandling(Signal* signal, NodeRecordPtr failedNodePtr)
9530+
void Dbdih::failedNodeLcpHandling(Signal* signal,
9531+
NodeRecordPtr failedNodePtr,
9532+
bool& check_more_start_lcp)
95139533
{
95149534
jam();
95159535
const Uint32 nodeId = failedNodePtr.i;
@@ -9554,6 +9574,18 @@ void Dbdih::failedNodeLcpHandling(Signal* signal, NodeRecordPtr failedNodePtr)
95549574
ndbrequire(false);
95559575
break;
95569576
}//switch
9577+
9578+
jam();
9579+
9580+
/**
9581+
* It could be that the ongoing LCP is only waiting for our node, so
9582+
* it is important to here call checkStartMoreLcp. We need to go
9583+
* through all nodes first though to ensure that we don't call
9584+
* this and start checkpoints towards nodes already failed.
9585+
*/
9586+
failedNodePtr.p->noOfQueuedChkpt = 0;
9587+
failedNodePtr.p->noOfStartedChkpt = 0;
9588+
check_more_start_lcp = true;
95579589
}//if
95589590

95599591
c_lcpState.m_participatingDIH.clear(failedNodePtr.i);
@@ -12820,6 +12852,11 @@ Dbdih::execDROP_TAB_REQ(Signal* signal)
1282012852
}
1282112853
}
1282212854
nodePtr.p->noOfQueuedChkpt = count;
12855+
if (nodePtr.p->noOfStartedChkpt == 0)
12856+
{
12857+
jam();
12858+
checkStartMoreLcp(signal, nodePtr.i);
12859+
}
1282312860
}
1282412861
}
1282512862
}
@@ -18296,6 +18333,7 @@ void Dbdih::startLcpRoundLoopLab(Signal* signal,
1829618333
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
1829718334
ptrAss(nodePtr, nodeRecord);
1829818335
if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
18336+
jamLine(nodePtr.i);
1829918337
ndbrequire(nodePtr.p->noOfStartedChkpt == 0);
1830018338
ndbrequire(nodePtr.p->noOfQueuedChkpt == 0);
1830118339
}//if
@@ -18926,6 +18964,11 @@ void Dbdih::execLCP_FRAG_REP(Signal* signal)
1892618964
{
1892718965
jam();
1892818966
nodePtr.p->noOfQueuedChkpt--;
18967+
if (nodePtr.p->noOfStartedChkpt == 0)
18968+
{
18969+
jam();
18970+
checkStartMoreLcp(signal, nodePtr.i);
18971+
}
1892918972
return;
1893018973
}
1893118974
}

storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -217,6 +217,8 @@ void Qmgr::execFAIL_REP(Signal* signal)
217217
failSource = refToNode(signal->getSendersBlockRef());
218218
}
219219

220+
CRASH_INSERTION(948);
221+
220222
jamEntry();
221223
failReportLab(signal, failNodeId, failCause, failSource);
222224
return;

storage/ndb/test/ndbapi/testNodeRestart.cpp

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -7343,8 +7343,6 @@ runGcpStop(NDBT_Context* ctx, NDBT_Step* step)
73437343
}
73447344

73457345

7346-
static const Uint32 numTables = 20;
7347-
73487346
int CMT_createTableHook(Ndb* ndb,
73497347
NdbDictionary::Table& table,
73507348
int when,
@@ -7370,6 +7368,7 @@ int CMT_createTableHook(Ndb* ndb,
73707368
int createManyTables(NDBT_Context* ctx, NDBT_Step* step)
73717369
{
73727370
Ndb* pNdb = GETNDB(step);
7371+
const Uint32 numTables=ctx->getProperty("NumTables", Uint32(20));
73737372

73747373
for (Uint32 tn = 0; tn < numTables; tn++)
73757374
{
@@ -7396,6 +7395,8 @@ int dropManyTables(NDBT_Context* ctx, NDBT_Step* step)
73967395

73977396
char buf[100];
73987397

7398+
const Uint32 numTables=ctx->getProperty("NumTables", Uint32(20));
7399+
73997400
for (Uint32 tn = 0; tn < numTables; tn++)
74007401
{
74017402
BaseString::snprintf(buf, sizeof(buf),
@@ -7786,6 +7787,58 @@ int runRestartandCheckLCPRestored(NDBT_Context* ctx, NDBT_Step* step)
77867787
return NDBT_OK;
77877788
}
77887789

7790+
7791+
/**
 * Test step: verify that a node failure during an LCP does not leave the
 * LCP (and thereby the subsequent node restart) hanging.
 *
 * Sequence (error/dump code meanings are taken from the log messages below):
 *  1. Pick the master and a random other node in the master's node group.
 *  2. Insert error 10039 in the other node to stall its scans.
 *  3. Send DUMP 7099 to the master to trigger an LCP.
 *  4. Give the LCP time to stall, then insert error 7178 in the master so
 *     that it kills the other node once the master's own LCP part is done.
 *  5. Release the stalled scans and wait for the other node to fail.
 *  6. Clear the master's error insert and wait for the other node to
 *     restart.
 *
 * Every restarter call is checked, and error inserts are cleared again on
 * the failure paths so that a failed run does not leave faults armed in the
 * cluster for whatever test runs next.
 *
 * @param ctx   Test context (unused here).
 * @param step  Test step (unused here).
 * @return NDBT_OK when the other node failed and restarted, otherwise
 *         NDBT_FAILED.
 */
int runNodeFailLcpStall(NDBT_Context* ctx, NDBT_Step* step)
{
  NdbRestarter restarter;

  int master = restarter.getMasterNodeId();
  if (master < 0)
  {
    ndbout_c("Failed to determine master node");
    return NDBT_FAILED;
  }

  int other = restarter.getRandomNodeSameNodeGroup(master, rand());
  if (other < 0)
  {
    /* The scenario needs a second node in the master's node group */
    ndbout_c("No other node in same node group as master %d", master);
    return NDBT_FAILED;
  }

  /* Node ids are int, so print them with %d rather than %u */
  ndbout_c("Master %d Other %d",
           master, other);

  ndbout_c("Stalling scans in node %d", other);
  if (restarter.insertErrorInNode(other, 10039) != 0)
  {
    ndbout_c("Failed to insert error 10039 in node %d", other);
    return NDBT_FAILED;
  }

  int dump[] = { 7099 };
  ndbout_c("Triggering LCP");
  if (restarter.dumpStateOneNode(master, dump, 1) != 0)
  {
    ndbout_c("Failed to trigger LCP on node %d", master);
    restarter.insertErrorInNode(other, 0);
    return NDBT_FAILED;
  }

  ndbout_c("Giving time for things to stall");
  NdbSleep_MilliSleep(10000);

  ndbout_c("Getting Master to kill other when Master LCP complete %d", master);
  if (restarter.insertErrorInNode(master, 7178) != 0)
  {
    ndbout_c("Failed to insert error 7178 in node %d", master);
    restarter.insertErrorInNode(other, 0);
    return NDBT_FAILED;
  }

  ndbout_c("Releasing scans in node %d", other);
  if (restarter.insertErrorInNode(other, 0) != 0)
  {
    ndbout_c("Failed to clear error insert in node %d", other);
    restarter.insertErrorInNode(master, 0);
    return NDBT_FAILED;
  }

  ndbout_c("Expect other node failure");
  Uint32 retries=100;
  while (restarter.getNodeStatus(other) == NDB_MGM_NODE_STATUS_STARTED)
  {
    if ((--retries) == 0)
    {
      ndbout_c("Timeout waiting for other node to restart");
      /* Do not leave the kill-on-LCP-complete fault armed in the master */
      restarter.insertErrorInNode(master, 0);
      return NDBT_FAILED;
    }
    NdbSleep_MilliSleep(500);
  }

  ndbout_c("Other node failed, now wait for it to restart");
  if (restarter.insertErrorInNode(master, 0) != 0)
  {
    ndbout_c("Failed to clear error insert in node %d", master);
    return NDBT_FAILED;
  }

  if (restarter.waitNodesStarted(&other, 1) != 0)
  {
    ndbout_c("Timed out waiting for restart");
    return NDBT_FAILED;
  }

  ndbout_c("Restart succeeded");

  return NDBT_OK;
}
7841+
77897842
NDBT_TESTSUITE(testNodeRestart);
77907843
TESTCASE("NoLoad",
77917844
"Test that one node at a time can be stopped and then restarted "\
@@ -7951,8 +8004,9 @@ TESTCASE("RestartMasterNodeError",
79518004
TESTCASE("GetTabInfoOverload",
79528005
"Test behaviour of GET_TABINFOREQ overload + LCP + restart")
79538006
{
8007+
TC_PROPERTY("NumTables", 20);
79548008
INITIALIZER(createManyTables);
7955-
STEPS(runGetTabInfo, (int) numTables);
8009+
STEPS(runGetTabInfo, 20);
79568010
STEP(runLCPandRestart);
79578011
FINALIZER(dropManyTables);
79588012
};
@@ -8478,6 +8532,14 @@ TESTCASE("RestoreOlderLCP",
84788532
FINALIZER(runScanReadVerify);
84798533
FINALIZER(runClearTable);
84808534
}
8535+
TESTCASE("NodeFailLcpStall",
8536+
"Check that node failure does not result in LCP stall")
8537+
{
8538+
TC_PROPERTY("NumTables", Uint32(100));
8539+
INITIALIZER(createManyTables);
8540+
STEP(runNodeFailLcpStall);
8541+
FINALIZER(dropManyTables);
8542+
}
84818543

84828544
NDBT_TESTSUITE_END(testNodeRestart);
84838545

storage/ndb/test/run-test/daily-devel--07-tests.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,7 @@ cmd: test_event
9393
args: -n checkParallelTriggerDropReqHandling T1
9494
max-time: 120
9595

96+
cmd: testNodeRestart
97+
args: -n NodeFailLcpStall T1
98+
max-time: 360
99+

0 commit comments

Comments
 (0)