Skip to content

Commit f15556a

Browse files
committed
SERVER-37560 Add core functionality for speculative majority reads
This patch adds functionality for "speculative" majority reads. These are reads that can satisfy "majority" read concern guarantees without support from the storage engine for reading from a historical snapshot. Queries of this nature will, by default, wait on the most recent lastApplied optime to majority commit after they complete, but before returning to the client. They can also optionally set a custom optime T to wait on, if they know that they did not read any data that reflects the effects of operations newer than optime T.
1 parent 7b85fb0 commit f15556a

23 files changed

+545
-2
lines changed

src/mongo/db/SConscript

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,7 @@ env.Library(
10001000
"curop",
10011001
"repl/read_concern_args",
10021002
"repl/repl_coordinator_interface",
1003+
"repl/speculative_majority_read_info",
10031004
"stats/timer_stats",
10041005
"storage/storage_options",
10051006
],
@@ -1019,6 +1020,7 @@ env.Library(
10191020
"catalog_raii",
10201021
"curop",
10211022
"repl/repl_coordinator_interface",
1023+
"repl/speculative_majority_read_info",
10221024
"s/sharding_api_d",
10231025
],
10241026
)

src/mongo/db/read_concern.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,6 @@ namespace mongo {
3434

3535
MONGO_DEFINE_SHIM(waitForReadConcern);
3636
MONGO_DEFINE_SHIM(waitForLinearizableReadConcern);
37+
MONGO_DEFINE_SHIM(waitForSpeculativeMajorityReadConcern);
3738

3839
} // namespace mongo

src/mongo/db/read_concern.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ template <typename T>
4141
class StatusWith;
4242
namespace repl {
4343
class ReadConcernArgs;
44+
class SpeculativeMajorityReadInfo;
4445
}
4546

4647

@@ -61,4 +62,13 @@ extern MONGO_DECLARE_SHIM((OperationContext * opCtx,
6162
*/
6263
extern MONGO_DECLARE_SHIM((OperationContext * opCtx)->Status) waitForLinearizableReadConcern;
6364

65+
/**
66+
* Waits to satisfy a "speculative" majority read.
67+
*
68+
* This method must only be called if the operation is a speculative majority read.
* The mongod implementation enforces this with an invariant on
* 'speculativeReadInfo.isSpeculativeRead()'.
*
* 'speculativeReadInfo' is taken by value. If it carries a speculative read optime, that optime
* is the one waited on; otherwise the node's lastApplied optime is used.
*
* Returns the Status of the wait for the chosen optime to become majority committed.
69+
*/
70+
extern MONGO_DECLARE_SHIM((OperationContext * opCtx,
71+
repl::SpeculativeMajorityReadInfo speculativeReadInfo)
72+
->Status) waitForSpeculativeMajorityReadConcern;
73+
6474
} // namespace mongo

src/mongo/db/read_concern_mongod.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "mongo/db/op_observer.h"
3939
#include "mongo/db/operation_context.h"
4040
#include "mongo/db/repl/repl_client_info.h"
41+
#include "mongo/db/repl/speculative_majority_read_info.h"
4142
#include "mongo/db/s/sharding_state.h"
4243
#include "mongo/db/server_options.h"
4344
#include "mongo/db/server_parameters.h"
@@ -295,6 +296,17 @@ MONGO_REGISTER_SHIM(waitForReadConcern)
295296
// It is not used for atClusterTime because waitUntilOpTimeForRead handles waiting for
296297
// the majority snapshot in that case.
297298

299+
// Handle speculative majority reads.
300+
if (readConcernArgs.getMajorityReadMechanism() ==
301+
repl::ReadConcernArgs::MajorityReadMechanism::kSpeculative) {
302+
// We read from a local snapshot, so there is no need to set an explicit read source.
303+
// Mark down that we need to block after the command is done to satisfy majority read
304+
// concern, though.
305+
auto& speculativeReadInfo = repl::SpeculativeMajorityReadInfo::get(opCtx);
306+
speculativeReadInfo.setIsSpeculativeRead();
307+
return Status::OK();
308+
}
309+
298310
const int debugLevel = serverGlobalParams.clusterRole == ClusterRole::ConfigServer ? 1 : 2;
299311

300312
LOG(debugLevel) << "Waiting for 'committed' snapshot to be available for reading: "
@@ -365,4 +377,44 @@ MONGO_REGISTER_SHIM(waitForLinearizableReadConcern)(OperationContext* opCtx)->St
365377
return awaitReplResult.status;
366378
}
367379

380+
// Implementation of the waitForSpeculativeMajorityReadConcern shim.
//
// Chooses the optime to wait on (the one recorded in 'speculativeReadInfo' if present, otherwise
// this node's lastApplied), then calls ReplicationCoordinator::awaitOpTimeCommitted() and returns
// the Status of that call. If the operation has no deadline, a 15 second deadline is installed
// before waiting; an operation that already has a deadline keeps it.
MONGO_REGISTER_SHIM(waitForSpeculativeMajorityReadConcern)
381+
(OperationContext* opCtx, repl::SpeculativeMajorityReadInfo speculativeReadInfo)->Status {
382+
// Callers must have marked the operation as a speculative read before calling this.
invariant(speculativeReadInfo.isSpeculativeRead());
383+
384+
// Select the optime to wait on. A command may have selected a specific optime to wait on. If
385+
// not, then we just wait on the most recent optime written on this node i.e. lastApplied.
386+
auto replCoord = repl::ReplicationCoordinator::get(opCtx);
387+
repl::OpTime waitOpTime;
388+
auto lastApplied = replCoord->getMyLastAppliedOpTime();
389+
auto speculativeReadOpTime = speculativeReadInfo.getSpeculativeReadOpTime();
390+
if (speculativeReadOpTime) {
391+
// The optime provided must not be greater than the current lastApplied.
392+
invariant(*speculativeReadOpTime <= lastApplied);
393+
waitOpTime = *speculativeReadOpTime;
394+
} else {
395+
waitOpTime = lastApplied;
396+
}
397+
398+
// Block to make sure returned data is majority committed.
399+
LOG(1) << "Servicing speculative majority read, waiting for optime " << waitOpTime
400+
<< " to become committed, current commit point: " << replCoord->getLastCommittedOpTime();
401+
402+
// Only install the fallback deadline when the operation does not already have one.
if (!opCtx->hasDeadline()) {
403+
// TODO (SERVER-38727): Replace this with a user specified timeout value, to address the
404+
// fact that getMore commands do not respect maxTimeMS properly. Currently, this hard-coded
405+
// value represents the maximum time we are ever willing to wait for an optime to majority
406+
// commit when doing a speculative majority read. We make this value rather conservative.
407+
auto timeout = Seconds(15);
408+
// Expiry of this deadline is reported with the MaxTimeMSExpired error code.
opCtx->setDeadlineAfterNowBy(timeout, ErrorCodes::MaxTimeMSExpired);
409+
}
410+
// Timer is used only to report the elapsed wait in the debug log on success.
Timer t;
411+
auto waitStatus = replCoord->awaitOpTimeCommitted(opCtx, waitOpTime);
412+
if (waitStatus.isOK()) {
413+
LOG(1) << "Optime " << waitOpTime << " became majority committed, waited " << t.millis()
414+
<< "ms for speculative majority read to be satisfied.";
415+
}
416+
// A non-OK status is returned to the caller unchanged, without additional logging here.
return waitStatus;
417+
}
418+
419+
368420
} // namespace mongo

src/mongo/db/repl/SConscript

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,6 +1006,15 @@ env.Library('read_concern_args',
10061006
'optime',
10071007
])
10081008

1009+
# Library target built from speculative_majority_read_info.cpp; depends on 'base' and 'optime'.
env.Library('speculative_majority_read_info',
1010+
[
1011+
'speculative_majority_read_info.cpp'
1012+
],
1013+
LIBDEPS=[
1014+
'$BUILD_DIR/mongo/base',
1015+
'optime',
1016+
])
1017+
10091018
env.Library('replica_set_messages',
10101019
[
10111020
'is_master_response.cpp',
@@ -1373,6 +1382,16 @@ env.CppUnitTest(
13731382
],
13741383
)
13751384

1385+
# Unit test target for the 'speculative_majority_read_info' library, which is its only LIBDEP.
env.CppUnitTest(
1386+
target='speculative_majority_read_info_test',
1387+
source=[
1388+
'speculative_majority_read_info_test.cpp',
1389+
],
1390+
LIBDEPS=[
1391+
'speculative_majority_read_info',
1392+
],
1393+
)
1394+
13761395
env.Library(target='optime',
13771396
source=[
13781397
'bson_extract_optime.cpp',

src/mongo/db/repl/read_concern_args.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,11 @@ ReadConcernArgs::MajorityReadMechanism ReadConcernArgs::getMajorityReadMechanism
272272
return _majorityReadMechanism;
273273
}
274274

275+
// Returns true only when all three hold: a read concern level has been set, that level is
// kMajorityReadConcern, and the majority read mechanism is kSpeculative. An unset level
// therefore yields false regardless of the mechanism.
bool ReadConcernArgs::isSpeculativeMajority() const {
276+
return _level && *_level == ReadConcernLevel::kMajorityReadConcern &&
277+
_majorityReadMechanism == MajorityReadMechanism::kSpeculative;
278+
}
279+
275280
Status ReadConcernArgs::upconvertReadConcernLevelToSnapshot() {
276281
if (_level && *_level != ReadConcernLevel::kSnapshotReadConcern &&
277282
*_level != ReadConcernLevel::kMajorityReadConcern &&

src/mongo/db/repl/read_concern_args.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,11 @@ class ReadConcernArgs {
124124
*/
125125
MajorityReadMechanism getMajorityReadMechanism() const;
126126

127+
/**
128+
* Returns whether the read concern is speculative 'majority'.
129+
*/
130+
bool isSpeculativeMajority() const;
131+
127132
/**
128133
* Appends level and afterOpTime.
129134
*/

src/mongo/db/repl/replication_coordinator.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,19 @@ class ReplicationCoordinator : public SyncSourceSelector {
369369
const ReadConcernArgs& settings,
370370
boost::optional<Date_t> deadline) = 0;
371371

372+
/**
373+
* Wait until the given optime is known to be majority committed.
374+
*
375+
* The given optime is expected to be an optime in this node's local oplog. This method cannot
376+
* determine correctly whether an arbitrary optime is majority committed within a replica set.
377+
* It is expected that the execution of this method is contained within the span of one user
378+
* operation, and thus, should not span rollbacks.
379+
*
380+
* Returns a Status indicating whether the wait was successful. Will respect the deadline on the
381+
* given OperationContext, if one has been set.
382+
*/
383+
virtual Status awaitOpTimeCommitted(OperationContext* opCtx, OpTime opTime) = 0;
384+
372385
/**
373386
* Retrieves and returns the current election id, which is a unique id that is local to
374387
* this node and changes every time we become primary.

src/mongo/db/repl/replication_coordinator_impl.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1253,7 +1253,12 @@ Status ReplicationCoordinatorImpl::_validateReadConcern(OperationContext* opCtx,
12531253
"readConcern level 'snapshot' is required when specifying atClusterTime"};
12541254
}
12551255

1256+
// We cannot support read concern 'majority' by means of reading from a historical snapshot if
1257+
// the storage layer doesn't support it. In this case, we can support it by using "speculative"
1258+
// majority reads instead.
12561259
if (readConcern.getLevel() == ReadConcernLevel::kMajorityReadConcern &&
1260+
readConcern.getMajorityReadMechanism() ==
1261+
ReadConcernArgs::MajorityReadMechanism::kMajoritySnapshot &&
12571262
!_externalState->isReadCommittedSupportedByStorageEngine(opCtx)) {
12581263
return {ErrorCodes::ReadConcernMajorityNotEnabled,
12591264
str::stream() << "Storage engine does not support read concern: "
@@ -1422,8 +1427,15 @@ Status ReplicationCoordinatorImpl::_waitUntilClusterTimeForRead(OperationContext
14221427

14231428
// We don't set isMajorityCommittedRead for kSnapshotReadConcern because snapshots are always
14241429
// speculative; we wait for majority when the transaction commits.
1430+
//
1431+
// Speculative majority reads do not need to wait for the commit point to advance to satisfy
1432+
// afterClusterTime reads. Waiting for the lastApplied to advance past the given target optime
1433+
// ensures the recency guarantee for the afterClusterTime read. At the end of the command, we
1434+
// will wait for the lastApplied optime to become majority committed, which then satisfies the
1435+
// durability guarantee.
14251436
const bool isMajorityCommittedRead =
1426-
readConcern.getLevel() == ReadConcernLevel::kMajorityReadConcern;
1437+
readConcern.getLevel() == ReadConcernLevel::kMajorityReadConcern &&
1438+
!readConcern.isSpeculativeMajority();
14271439

14281440
return _waitUntilOpTime(opCtx, isMajorityCommittedRead, targetOpTime, deadline);
14291441
}
@@ -1439,6 +1451,17 @@ Status ReplicationCoordinatorImpl::_waitUntilOpTimeForReadDeprecated(
14391451
return _waitUntilOpTime(opCtx, isMajorityCommittedRead, targetOpTime);
14401452
}
14411453

1454+
// Waits for 'opTime' to become majority committed by delegating to _waitUntilOpTime() with
// isMajorityCommittedRead forced to true. No explicit deadline argument is passed to
// _waitUntilOpTime() here. Returns the Status produced by that call.
Status ReplicationCoordinatorImpl::awaitOpTimeCommitted(OperationContext* opCtx, OpTime opTime) {
1455+
// The optime given to this method is required to be an optime in this node's local oplog.
1456+
// Furthermore, the execution of this method must not span rollbacks, so the oplog at the start
1457+
// of the waiting period will be a prefix of the oplog at the end of the waiting period. This
1458+
// makes it valid to compare optimes from this node's oplog based on their timestamps alone, and
1459+
// so allows this method to determine if an optime is committed by comparing its timestamp to
1460+
// the timestamp of the last committed optime.
1461+
const bool isMajorityCommittedRead = true;
1462+
return _waitUntilOpTime(opCtx, isMajorityCommittedRead, opTime);
1463+
}
1464+
14421465
OpTime ReplicationCoordinatorImpl::_getMyLastAppliedOpTime_inlock() const {
14431466
return _topCoord->getMyLastAppliedOpTime();
14441467
}

src/mongo/db/repl/replication_coordinator_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ class ReplicationCoordinatorImpl : public ReplicationCoordinator {
165165

166166
virtual Status waitUntilOpTimeForRead(OperationContext* opCtx,
167167
const ReadConcernArgs& readConcern) override;
168-
168+
Status awaitOpTimeCommitted(OperationContext* opCtx, OpTime opTime) override;
169169
virtual OID getElectionId() override;
170170

171171
virtual int getMyId() const override;

0 commit comments

Comments
 (0)