Skip to content

Commit 76d2002

Browse files
frazerclementdahlerlend
authored andcommitted
Bug#36028828 testNodeRestart -n MixedReadUpdateScan failures
Testcase failure exposed regression from fix to Bug#22602898 NDB : CURIOUS STATE OF TC COMMIT_SENT / COMPLETE_SENT TIMEOUT HANDLING Node failure handling in TC performs 1 pass through the local active transactions to find those affected by a node failure. In this pass, all transactions affected by the node failure are queued for processing, e.g. rollback, commit, complete, via e.g. the serial abort/commit or complete protocols. The exceptions are transactions in transient internal states such as CS_PREPARE_TO_COMMIT, CS_COMMITTING, CS_COMPLETING, which are then followed by stable 'wait' states such as CS_COMMIT_SENT, CS_COMPLETE_SENT. Transactions in these states were handled by doing nothing in the node failure handling pass, and relying on the timeout handling in the subsequent stable states to queue transactions for processing. The fix to Bug#22602898 removed this stable state handling to avoid it accidentally triggering, but also stopped it from triggering when needed in this case where node failure handling found a transaction in a transient state. This is solved by modifying the CS_COMMIT_SENT and CS_COMPLETE_SENT stable state handling to also perform node failure processing if a timeout has occurred for a transaction with a failure number different to the current latest failure number. This ensures that all transactions involving the failed node are handled eventually. A new testcase testNodeRestart -n TransientStatesNF T1 is added to the AT testsuite to give coverage. Change-Id: I0c0d4b6f75a97a3a7ff892cc4eafd2351491a8ff
1 parent 9fa9ff6 commit 76d2002

File tree

4 files changed

+117
-20
lines changed

4 files changed

+117
-20
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Next DBTUP 4039
2828
Next DBLQH 5112
2929
Next DBDICT 6223
3030
Next DBDIH 7251
31-
Next DBTC 8123
31+
Next DBTC 8124
3232
Next CMVMI 9000
3333
Next TRPMAN 9501
3434
Next BACKUP 10057

storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6081,8 +6081,10 @@ void Dbtc::commit020Lab(Signal *signal,
60816081
Tcount += sendCommitLqh(signal, localTcConnectptr.p, apiConnectptr.p);
60826082

60836083
if (tcConList.next(localTcConnectptr)) {
6084-
if (Tcount < 16 && !(ERROR_INSERTED(8057) || ERROR_INSERTED(8073) ||
6085-
ERROR_INSERTED(8089))) {
6084+
if (Tcount < 16 &&
6085+
!(ERROR_INSERTED(8057) || ERROR_INSERTED(8073) ||
6086+
ERROR_INSERTED(8089) ||
6087+
(ERROR_INSERTED(8123) && ((apiConnectptr.i & 0x1) == 0)))) {
60866088
jam();
60876089
continue;
60886090
} else {
@@ -6101,7 +6103,7 @@ void Dbtc::commit020Lab(Signal *signal,
61016103
signal->theData[0] = TcContinueB::ZSEND_COMMIT_LOOP;
61026104
signal->theData[1] = apiConnectptr.i;
61036105
signal->theData[2] = localTcConnectptr.i;
6104-
if (ERROR_INSERTED(8089)) {
6106+
if (ERROR_INSERTED(8089) || ERROR_INSERTED(8123)) {
61056107
sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 100, 3);
61066108
return;
61076109
}
@@ -6513,7 +6515,8 @@ void Dbtc::complete010Lab(Signal *signal,
65136515
/* ************ */
65146516
Tcount += sendCompleteLqh(signal, localTcConnectptr.p, apiConnectptr.p);
65156517
if (tcConList.next(localTcConnectptr)) {
6516-
if (Tcount < 16 && !ERROR_INSERTED(8112)) {
6518+
if (Tcount < 16 && !ERROR_INSERTED(8112) &&
6519+
!(ERROR_INSERTED(8123) && ((apiConnectptr.i & 0x1) != 0))) {
65176520
jamDebug();
65186521
continue;
65196522
} else {
@@ -6525,7 +6528,7 @@ void Dbtc::complete010Lab(Signal *signal,
65256528
signal->theData[0] = TcContinueB::ZSEND_COMPLETE_LOOP;
65266529
signal->theData[1] = apiConnectptr.i;
65276530
signal->theData[2] = localTcConnectptr.i;
6528-
if (ERROR_INSERTED(8112)) {
6531+
if (ERROR_INSERTED(8112) || ERROR_INSERTED(8123)) {
65296532
sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 100, 3);
65306533
return;
65316534
}
@@ -8995,7 +8998,8 @@ void Dbtc::timeOutFoundLab(Signal *signal, Uint32 TapiConPtr, Uint32 errCode) {
89958998
/* or more LQH instances. We cannot speed this up. */
89968999
/* Only in confirmed node failure situations do we take action. */
89979000
/*------------------------------------------------------------------*/
8998-
if (errCode == ZNODEFAIL_BEFORE_COMMIT) {
9001+
if (errCode == ZNODEFAIL_BEFORE_COMMIT ||
9002+
apiConnectptr.p->failureNr != cfailure_nr) {
89999003
jam();
90009004
/**
90019005
* Node failure handling, switch to serial commit handling
@@ -9022,7 +9026,8 @@ void Dbtc::timeOutFoundLab(Signal *signal, Uint32 TapiConPtr, Uint32 errCode) {
90229026
/* or more LQH instances. We cannot speed this up. */
90239027
/* Only in confirmed node failure situations do we take action. */
90249028
/*--------------------------------------------------------------------*/
9025-
if (errCode == ZNODEFAIL_BEFORE_COMMIT) {
9029+
if (errCode == ZNODEFAIL_BEFORE_COMMIT ||
9030+
apiConnectptr.p->failureNr != cfailure_nr) {
90269031
jam();
90279032
/**
90289033
* Node failure handling, switch to serial complete handling

storage/ndb/test/ndbapi/testNodeRestart.cpp

Lines changed: 100 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,14 +65,15 @@ static int changeStartPartitionedTimeout(NDBT_Context *ctx, NDBT_Step *step) {
6565
g_err << "Setting StartPartitionedTimeout to " << startPartitionedTimeout
6666
<< endl;
6767
ConfigValues::Iterator iter(conf.m_configuration->m_config_values);
68-
for(int idx=0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
68+
for (int idx = 0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
6969
Uint32 oldValue = 0;
7070
if (iter.get(CFG_DB_START_PARTITION_TIMEOUT, oldValue)) {
7171
if (defaultValue == Uint32(~0)) {
7272
defaultValue = oldValue;
7373
} else if (oldValue != defaultValue) {
7474
g_err << "StartPartitionedTimeout is not consistent across data node"
75-
"sections" << endl;
75+
"sections"
76+
<< endl;
7677
break;
7778
}
7879
}
@@ -2832,7 +2833,8 @@ int runCreateBigTable(NDBT_Context *ctx, NDBT_Step *step) {
28322833
do {
28332834
hugoTrans.loadTableStartFrom(GETNDB(step), cnt, 10000);
28342835
cnt += 10000;
2835-
} while (cnt < rows && (NdbTick_CurrentMillisecond() - now) < 180000); // 180s
2836+
} while (cnt < rows &&
2837+
(NdbTick_CurrentMillisecond() - now) < 180000); // 180s
28362838
ndbout_c("Loaded %u rows in %llums", cnt, NdbTick_CurrentMillisecond() - now);
28372839

28382840
return NDBT_OK;
@@ -7870,10 +7872,10 @@ int run_PLCP_many_parts(NDBT_Context *ctx, NDBT_Step *step) {
78707872
}
78717873
ConfigValues::Iterator iter(conf.m_configuration->m_config_values);
78727874
Uint32 enabledPartialLCP = 1;
7873-
for(int idx=0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
7874-
Uint32 nodeId=0;
7875-
if(iter.get(CFG_NODE_ID, &nodeId)) {
7876-
if(nodeId == (Uint32) node_1) {
7875+
for (int idx = 0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
7876+
Uint32 nodeId = 0;
7877+
if (iter.get(CFG_NODE_ID, &nodeId)) {
7878+
if (nodeId == (Uint32)node_1) {
78777879
iter.get(CFG_DB_ENABLE_PARTIAL_LCP, &enabledPartialLCP);
78787880
iter.closeSection();
78797881
break;
@@ -7882,8 +7884,7 @@ int run_PLCP_many_parts(NDBT_Context *ctx, NDBT_Step *step) {
78827884
iter.closeSection();
78837885
}
78847886

7885-
if (enabledPartialLCP == 0)
7886-
{
7887+
if (enabledPartialLCP == 0) {
78877888
g_err << "[SKIPPED] Test skipped. Needs EnablePartialLcp=1" << endl;
78887889
iter.closeSection();
78897890
return NDBT_SKIPPED;
@@ -9456,7 +9457,8 @@ int runTestStallTimeoutAndNF(NDBT_Context *ctx, NDBT_Step *step) {
94569457

94579458
CHECK(hugoOps.startTransaction(pNdb, rowNum) == 0,
94589459
"Start transaction failed");
9459-
CHECK(hugoOps.pkUpdateRecord(pNdb, rowNum, 1) == 0, "Define Update failed");
9460+
CHECK(hugoOps.pkUpdateRecord(pNdb, rowNum, 1) == 0,
9461+
"Define Update failed");
94609462
CHECK(hugoOps.execute_NoCommit(pNdb) == 0, "Execute NoCommit failed");
94619463

94629464
NdbTransaction *trans = hugoOps.getTransaction();
@@ -9534,8 +9536,7 @@ int runTestStallTimeoutAndNF(NDBT_Context *ctx, NDBT_Step *step) {
95349536

95359537
if (trans->getNdbError().code != 0) {
95369538
ndbout_c("Got unexpected failure code : %u : %s",
9537-
trans->getNdbError().code,
9538-
trans->getNdbError().message);
9539+
trans->getNdbError().code, trans->getNdbError().message);
95399540
return NDBT_FAILED;
95409541
}
95419542

@@ -9557,6 +9558,86 @@ int runTestStallTimeoutAndNF(NDBT_Context *ctx, NDBT_Step *step) {
95579558
return NDBT_OK;
95589559
}
95599560

9561+
int runLargeLockingReads(NDBT_Context *ctx, NDBT_Step *step) {
9562+
int result = NDBT_OK;
9563+
int readsize = MIN(100, ctx->getNumRecords());
9564+
int i = 0;
9565+
HugoTransactions hugoTrans(*ctx->getTab());
9566+
while (ctx->isTestStopped() == false) {
9567+
g_info << i << ": ";
9568+
if (hugoTrans.pkReadRecords(GETNDB(step), readsize, readsize,
9569+
NdbOperation::LM_Read) != 0) {
9570+
return NDBT_FAILED;
9571+
}
9572+
i++;
9573+
}
9574+
return result;
9575+
}
9576+
9577+
int runRestartsWithSlowCommitComplete(NDBT_Context *ctx, NDBT_Step *step) {
9578+
int result = NDBT_OK;
9579+
NdbRestarter restarter;
9580+
const int numRestarts = 4;
9581+
9582+
if (restarter.getNumDbNodes() < 2) {
9583+
g_err << "Too few nodes" << endl;
9584+
ctx->stopTest();
9585+
return NDBT_SKIPPED;
9586+
}
9587+
9588+
for (int i = 0; i < numRestarts && !ctx->isTestStopped(); i++) {
9589+
int errorCode = 8123; // Slow commit and complete sending at TC
9590+
ndbout << "Injecting error " << errorCode << " for slow commits + completes"
9591+
<< endl;
9592+
restarter.insertErrorInAllNodes(errorCode);
9593+
9594+
/* Give some time for things to get stuck in slowness */
9595+
NdbSleep_MilliSleep(1000);
9596+
9597+
const int id = restarter.getNode(NdbRestarter::NS_RANDOM);
9598+
ndbout << "Restart node " << id << endl;
9599+
9600+
if (restarter.restartOneDbNode(id, false, true, true) != 0) {
9601+
g_err << "Failed to restart Db node" << endl;
9602+
result = NDBT_FAILED;
9603+
break;
9604+
}
9605+
9606+
if (restarter.waitNodesNoStart(&id, 1)) {
9607+
g_err << "Failed to waitNodesNoStart" << endl;
9608+
result = NDBT_FAILED;
9609+
break;
9610+
}
9611+
9612+
restarter.insertErrorInAllNodes(0);
9613+
9614+
if (restarter.startNodes(&id, 1)) {
9615+
g_err << "Failed to start node" << endl;
9616+
result = NDBT_FAILED;
9617+
break;
9618+
}
9619+
9620+
if (restarter.waitClusterStarted() != 0) {
9621+
g_err << "Cluster failed to start" << endl;
9622+
result = NDBT_FAILED;
9623+
break;
9624+
}
9625+
9626+
/* Ensure connected */
9627+
if (GETNDB(step)->get_ndb_cluster_connection().wait_until_ready(30, 30) !=
9628+
0) {
9629+
g_err << "Timeout waiting for NdbApi reconnect" << endl;
9630+
result = NDBT_FAILED;
9631+
break;
9632+
}
9633+
}
9634+
9635+
restarter.insertErrorInAllNodes(0);
9636+
ctx->stopTest();
9637+
9638+
return result;
9639+
}
9640+
95609641
NDBT_TESTSUITE(testNodeRestart);
95619642
TESTCASE("NoLoad",
95629643
"Test that one node at a time can be stopped and then restarted "
@@ -10329,6 +10410,13 @@ TESTCASE("TransStallTimeoutNF", "") {
1032910410
STEP(runTestStallTimeoutAndNF);
1033010411
FINALIZER(runClearTable);
1033110412
}
10413+
TESTCASE("TransientStatesNF",
10414+
"Test node failure handling with transactions in transient states") {
10415+
INITIALIZER(runLoadTable);
10416+
STEPS(runLargeLockingReads, 5);
10417+
STEP(runRestartsWithSlowCommitComplete);
10418+
FINALIZER(runClearTable);
10419+
}
1033210420
NDBT_TESTSUITE_END(testNodeRestart)
1033310421

1033410422
int main(int argc, const char **argv) {

storage/ndb/test/run-test/daily-devel--07-tests.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,3 +324,7 @@ cmd: test_event
324324
args: -n SubscribeEventsNRAF T1
325325
max-time: 240
326326

327+
cmd: testNodeRestart
328+
args: -n TransientStatesNF T1
329+
max-time: 360
330+

0 commit comments

Comments
 (0)