Skip to content

Commit 428b48d

Browse files
committed
Bug#34013385 TUP require failure
The combination of:
 - Batching with multiple in-flight operations per key
 - IgnoreError execution mode
 - Transient errors occurring on non-primary replicas
can lead to TUP level inconsistencies which result in replica misalignment and other require assertion failures.

This combination and its side-effects are made impossible by detecting when operations are failing on non-primary replicas, and forcing AbortOnError handling (e.g. rollback) for the containing transaction.

An assertion checking for this condition in ACC is upgraded to a require.

This behavioural change is verified with a new testcase: testBasic -n AbortIgnoreError

This test is added to daily-devel tests.

In addition, the test framework UtilTransactions class has a number of replica and index consistency check mechanisms added, which are also made available via the enhanced verify_index tool, allowing replica and index consistency to be verified from any programmatic or scripted testcase.

Approved by : Maitrayi Sabaratnam <[email protected]>
1 parent 4db7913 commit 428b48d

File tree

13 files changed

+1347
-109
lines changed

13 files changed

+1347
-109
lines changed

storage/ndb/include/kernel/signaldata/LqhKey.hpp

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2022, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -853,7 +853,8 @@ class LqhKeyRef {
853853
friend bool printLQHKEYREF(FILE * output, const Uint32 * theData, Uint32 len, Uint16 receiverBlockNo);
854854

855855
public:
856-
STATIC_CONST( SignalLength = 5 );
856+
STATIC_CONST( SignalLengthWithoutFlags = 5 );
857+
STATIC_CONST( SignalLength = 6 );
857858

858859
private:
859860

@@ -865,8 +866,30 @@ class LqhKeyRef {
865866
Uint32 errorCode;
866867
Uint32 transId1;
867868
Uint32 transId2;
869+
Uint32 flags;
870+
871+
static Uint32 getReplicaErrorFlag(const Uint32& flags);
872+
static void setReplicaErrorFlag(Uint32& flags, Uint32 val);
873+
874+
enum Flags {
875+
LKR_REPLICA_ERROR_SHIFT = 0
876+
};
868877
};
869878

879+
inline
880+
Uint32
881+
LqhKeyRef::getReplicaErrorFlag(const Uint32& flags)
882+
{
883+
return ((flags >> LKR_REPLICA_ERROR_SHIFT) & 0x1);
884+
}
885+
886+
inline
887+
void
888+
LqhKeyRef::setReplicaErrorFlag(Uint32& flags, Uint32 val)
889+
{
890+
ASSERT_BOOL(val, "LqhKeyRef::setReplicaErrorFlag");
891+
flags |= (val << LKR_REPLICA_ERROR_SHIFT);
892+
}
870893

871894
#undef JAM_FILE_ID
872895

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2003, 2021, Oracle and/or its affiliates.
1+
# Copyright (c) 2003, 2022, Oracle and/or its affiliates.
22
#
33
# This program is free software; you can redistribute it and/or modify
44
# it under the terms of the GNU General Public License, version 2.0,
@@ -25,7 +25,7 @@ Next NDBCNTR 1023
2525
Next NDBFS 2003
2626
Next DBACC 3005
2727
Next DBTUP 4039
28-
Next DBLQH 5098
28+
Next DBLQH 5109
2929
Next DBDICT 6223
3030
Next DBDIH 7249
3131
Next DBTC 8115
@@ -352,6 +352,9 @@ Error insertions used to test ACCKEYCONF/ABORT interleaving
352352
ACCKEYCONF signal clears ERROR Insertion
353353
5097: ndbrequire(false)
354354

355+
5107: Cause SimulatedBlock::allocChunks() to return chunks with gaps
356+
5108: Cause request to fail with error 1218 SendBuffer overload (temp error)
357+
355358

356359

357360
ERROR CODES FOR TESTING TIME-OUT HANDLING IN DBTC

storage/ndb/src/kernel/blocks/dbacc/DbaccMain.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4987,7 +4987,7 @@ void Dbacc::commitOperation(Signal* signal)
49874987
Uint32 opbits = operationRecPtr.p->m_op_bits;
49884988
Uint32 op = opbits & Operationrec::OP_MASK;
49894989
ndbrequire((opbits & Operationrec::OP_STATE_MASK) == Operationrec::OP_STATE_EXECUTED);
4990-
ndbassert((opbits & Operationrec::OP_PENDING_ABORT) == 0);
4990+
ndbrequire((opbits & Operationrec::OP_PENDING_ABORT) == 0);
49914991
if ((opbits & Operationrec::OP_COMMIT_DELETE_CHECK) == 0 &&
49924992
(op != ZREAD && op != ZSCAN_OP))
49934993
{

storage/ndb/src/kernel/blocks/dblqh/Dblqh.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2003, 2022, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -2193,7 +2193,8 @@ class Dblqh
21932193
NEW_FROM_TC = 2,
21942194
REQ_FROM_TC = 3,
21952195
ABORT_FROM_TC = 4,
2196-
ABORT_FROM_LQH = 5
2196+
ABORT_FROM_LQH = 5,
2197+
ABORT_FROM_LQH_REPLICA = 6
21972198
};
21982199
enum TransactionState {
21992200
IDLE = 0,

storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3320,6 +3320,7 @@ void Dblqh::earlyKeyReqAbort(Signal* signal,
33203320
ref->errorCode = errCode;
33213321
ref->transId1 = transid1;
33223322
ref->transId2 = transid2;
3323+
ref->flags = 0;
33233324
sendSignal(signal->senderBlockRef(), GSN_LQHKEYREF, signal,
33243325
LqhKeyRef::SignalLength, JBB);
33253326
}//if
@@ -3497,7 +3498,10 @@ void Dblqh::execLQHKEYREF(Signal* signal)
34973498
warningReport(signal, 15);
34983499
return;
34993500
}//if
3500-
abortErrorLab(signal);
3501+
/* Mark abort due to replica issue */
3502+
regTcPtr->abortState = TcConnectionrec::ABORT_FROM_LQH_REPLICA;
3503+
regTcPtr->errorCode = terrorCode;
3504+
abortCommonLab(signal);
35013505
return;
35023506
break;
35033507
case TcConnectionrec::LOG_CONNECTED:
@@ -4965,6 +4969,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal)
49654969
}
49664970

49674971
if (ERROR_INSERTED_CLEAR(5047) ||
4972+
ERROR_INSERTED_CLEAR(5108) ||
49684973
ERROR_INSERTED(5079) ||
49694974
(ERROR_INSERTED(5102) &&
49704975
LqhKeyReq::getNoTriggersFlag(Treqinfo)) ||
@@ -10262,7 +10267,9 @@ void Dblqh::continueAfterLogAbortWriteLab(Signal* signal)
1026210267
cleanUp(signal);
1026310268
return;
1026410269
}//if
10265-
if (regTcPtr->abortState == TcConnectionrec::ABORT_FROM_LQH) {
10270+
if ((regTcPtr->abortState == TcConnectionrec::ABORT_FROM_LQH) ||
10271+
(regTcPtr->abortState == TcConnectionrec::ABORT_FROM_LQH_REPLICA))
10272+
{
1026610273
LqhKeyRef * const lqhKeyRef = (LqhKeyRef *)signal->getDataPtrSend();
1026710274

1026810275
jam();
@@ -10271,6 +10278,12 @@ void Dblqh::continueAfterLogAbortWriteLab(Signal* signal)
1027110278
lqhKeyRef->errorCode = regTcPtr->errorCode;
1027210279
lqhKeyRef->transId1 = regTcPtr->transid[0];
1027310280
lqhKeyRef->transId2 = regTcPtr->transid[1];
10281+
lqhKeyRef->flags = 0;
10282+
if (regTcPtr->abortState == TcConnectionrec::ABORT_FROM_LQH_REPLICA)
10283+
{
10284+
jam();
10285+
LqhKeyRef::setReplicaErrorFlag(lqhKeyRef->flags, 1);
10286+
}
1027410287
sendSignal(regTcPtr->clientBlockref, GSN_LQHKEYREF, signal,
1027510288
LqhKeyRef::SignalLength, JBB);
1027610289
} else if (regTcPtr->abortState == TcConnectionrec::ABORT_FROM_TC) {

storage/ndb/src/kernel/blocks/dbspj/DbspjMain.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2012, 2021, Oracle and/or its affiliates.
2+
Copyright (c) 2012, 2022, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -1108,6 +1108,7 @@ Dbspj::handle_early_lqhkey_ref(Signal* signal,
11081108
ref->errorCode = err;
11091109
ref->transId1 = transid[0];
11101110
ref->transId2 = transid[1];
1111+
ref->flags = 0;
11111112
sendSignal(returnref, GSN_LQHKEYREF, signal,
11121113
LqhKeyRef::SignalLength, JBB);
11131114
}

storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7489,15 +7489,26 @@ void Dbtc::execLQHKEYREF(Signal* signal)
74897489
warningReport(signal, 25);
74907490
return;
74917491
}//if
7492-
7492+
Uint32 flags = 0;
7493+
if (signal->getLength() >= LqhKeyRef::SignalLength)
7494+
{
7495+
jam();
7496+
flags = lqhKeyRef->flags;
7497+
}
7498+
/**
7499+
* If the error came from a backup replica rather than the primary
7500+
* then we will abort the transaction in all cases.
7501+
*/
7502+
const bool needAbort = (LqhKeyRef::getReplicaErrorFlag(flags) != 0);
74937503
const Uint32 triggeringOp = regTcPtr->triggeringOperation;
74947504
ConnectionState TapiConnectstate = regApiPtr->apiConnectstate;
74957505

74967506
time_track_complete_key_operation_error(regTcPtr,
74977507
refToNode(regApiPtr->ndbapiBlockref),
74987508
regTcPtr->tcNodedata[0]);
74997509

7500-
if (unlikely(TapiConnectstate == CS_ABORTING))
7510+
if (unlikely(TapiConnectstate == CS_ABORTING ||
7511+
needAbort))
75017512
{
75027513
jam();
75037514
goto do_abort;
@@ -7667,7 +7678,7 @@ void Dbtc::execLQHKEYREF(Signal* signal)
76677678
}
76687679

76697680
const Uint32 abort = regTcPtr->m_execAbortOption;
7670-
if (abort == TcKeyReq::AbortOnError || triggeringOp != RNIL) {
7681+
if (abort == TcKeyReq::AbortOnError || triggeringOp != RNIL || needAbort) {
76717682
/**
76727683
* No error is allowed on this operation
76737684
*/

0 commit comments

Comments
 (0)