lruihan
diff --git a/‎src/mongo/db/SConscript‎
Lines changed: 2 additions & 0 deletions b/‎src/mongo/db/SConscript‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/mongo/db/read_concern.cpp‎
Lines changed: 1 addition & 0 deletions b/‎src/mongo/db/read_concern.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/mongo/db/read_concern.h‎
Lines changed: 10 additions & 0 deletions b/‎src/mongo/db/read_concern.h‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/mongo/db/read_concern_mongod.cpp‎
Lines changed: 52 additions & 0 deletions b/‎src/mongo/db/read_concern_mongod.cpp‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎src/mongo/db/repl/SConscript‎
Lines changed: 19 additions & 0 deletions b/‎src/mongo/db/repl/SConscript‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎src/mongo/db/repl/read_concern_args.cpp‎
Lines changed: 5 additions & 0 deletions b/‎src/mongo/db/repl/read_concern_args.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/mongo/db/repl/read_concern_args.h‎
Lines changed: 5 additions & 0 deletions b/‎src/mongo/db/repl/read_concern_args.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/mongo/db/repl/replication_coordinator.h‎
Lines changed: 13 additions & 0 deletions b/‎src/mongo/db/repl/replication_coordinator.h‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/mongo/db/repl/replication_coordinator_impl.cpp‎
Lines changed: 24 additions & 1 deletion b/‎src/mongo/db/repl/replication_coordinator_impl.cpp‎
Lines changed: 24 additions & 1 deletion
diff --git a/‎src/mongo/db/repl/replication_coordinator_impl.h‎
Lines changed: 1 addition & 1 deletion b/‎src/mongo/db/repl/replication_coordinator_impl.h‎
Lines changed: 1 addition & 1 deletion
@@ -1000,6 +1000,7 @@ env.Library(
         "curop",
         "repl/read_concern_args",
         "repl/repl_coordinator_interface",
+        "repl/speculative_majority_read_info",
         "stats/timer_stats",
         "storage/storage_options",
     ],
@@ -1019,6 +1020,7 @@ env.Library(
         "catalog_raii",
         "curop",
         "repl/repl_coordinator_interface",
+        "repl/speculative_majority_read_info",
         "s/sharding_api_d",
     ],
 )
 
@@ -34,5 +34,6 @@ namespace mongo {
 
 MONGO_DEFINE_SHIM(waitForReadConcern);
 MONGO_DEFINE_SHIM(waitForLinearizableReadConcern);
+MONGO_DEFINE_SHIM(waitForSpeculativeMajorityReadConcern);
 
 }  // namespace mongo
@@ -41,6 +41,7 @@ template <typename T>
 class StatusWith;
 namespace repl {
 class ReadConcernArgs;
+class SpeculativeMajorityReadInfo;
 }
 
 
@@ -61,4 +62,13 @@ extern MONGO_DECLARE_SHIM((OperationContext * opCtx,
  */
 extern MONGO_DECLARE_SHIM((OperationContext * opCtx)->Status) waitForLinearizableReadConcern;
 
+/**
+ * Waits to satisfy a "speculative" majority read.
+ *
+ * 'speculativeReadInfo' is taken by value. It carries the flag marking the operation as a
+ * speculative read and, optionally, a specific optime that the caller wants to wait on.
+ *
+ * This method must only be called if the operation is a speculative majority read. Returns a
+ * Status describing the outcome of the wait.
+ */
+extern MONGO_DECLARE_SHIM((OperationContext * opCtx,
+                           repl::SpeculativeMajorityReadInfo speculativeReadInfo)
+                              ->Status) waitForSpeculativeMajorityReadConcern;
+
 }  // namespace mongo
@@ -38,6 +38,7 @@
 #include "mongo/db/op_observer.h"
 #include "mongo/db/operation_context.h"
 #include "mongo/db/repl/repl_client_info.h"
+#include "mongo/db/repl/speculative_majority_read_info.h"
 #include "mongo/db/s/sharding_state.h"
 #include "mongo/db/server_options.h"
 #include "mongo/db/server_parameters.h"
@@ -295,6 +296,17 @@ MONGO_REGISTER_SHIM(waitForReadConcern)
         // It is not used for atClusterTime because waitUntilOpTimeForRead handles waiting for
         // the majority snapshot in that case.
 
+        // Handle speculative majority reads.
+        if (readConcernArgs.getMajorityReadMechanism() ==
+            repl::ReadConcernArgs::MajorityReadMechanism::kSpeculative) {
+            // We read from a local snapshot, so there is no need to set an explicit read source.
+            // Mark down that we need to block after the command is done to satisfy majority read
+            // concern, though.
+            auto& speculativeReadInfo = repl::SpeculativeMajorityReadInfo::get(opCtx);
+            speculativeReadInfo.setIsSpeculativeRead();
+            return Status::OK();
+        }
+
         const int debugLevel = serverGlobalParams.clusterRole == ClusterRole::ConfigServer ? 1 : 2;
 
         LOG(debugLevel) << "Waiting for 'committed' snapshot to be available for reading: "
@@ -365,4 +377,44 @@ MONGO_REGISTER_SHIM(waitForLinearizableReadConcern)(OperationContext* opCtx)->St
     return awaitReplResult.status;
 }
 
+MONGO_REGISTER_SHIM(waitForSpeculativeMajorityReadConcern)
+(OperationContext* opCtx, repl::SpeculativeMajorityReadInfo speculativeReadInfo)->Status {
+    invariant(speculativeReadInfo.isSpeculativeRead());
+
+    // Pick the optime whose majority commit we block on. The default is lastApplied, i.e. the
+    // most recent optime written on this node; if the command recorded a specific optime in
+    // 'speculativeReadInfo', that one takes precedence.
+    auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+    const auto lastApplied = replCoord->getMyLastAppliedOpTime();
+    repl::OpTime waitOpTime = lastApplied;
+    if (auto requestedOpTime = speculativeReadInfo.getSpeculativeReadOpTime()) {
+        // A command-selected optime may never be ahead of the current lastApplied.
+        invariant(*requestedOpTime <= lastApplied);
+        waitOpTime = *requestedOpTime;
+    }
+
+    // The returned data only counts as majority committed once 'waitOpTime' is committed.
+    LOG(1) << "Servicing speculative majority read, waiting for optime " << waitOpTime
+           << " to become committed, current commit point: " << replCoord->getLastCommittedOpTime();
+
+    // TODO (SERVER-38727): Replace this with a user specified timeout value, to address the
+    // fact that getMore commands do not respect maxTimeMS properly. Until then, an operation
+    // without a deadline gets this hard-coded, deliberately conservative cap on how long we are
+    // ever willing to wait for an optime to majority commit in a speculative majority read.
+    if (!opCtx->hasDeadline()) {
+        opCtx->setDeadlineAfterNowBy(Seconds(15), ErrorCodes::MaxTimeMSExpired);
+    }
+
+    Timer waitTimer;
+    auto waitStatus = replCoord->awaitOpTimeCommitted(opCtx, waitOpTime);
+    if (!waitStatus.isOK()) {
+        return waitStatus;
+    }
+
+    LOG(1) << "Optime " << waitOpTime << " became majority committed, waited "
+           << waitTimer.millis() << "ms for speculative majority read to be satisfied.";
+    return waitStatus;
+}
+
+
 }  // namespace mongo
@@ -1006,6 +1006,15 @@ env.Library('read_concern_args',
                 'optime',
             ])
 
+env.Library('speculative_majority_read_info',
+            [
+                'speculative_majority_read_info.cpp'
+            ],
+            LIBDEPS=[
+                '$BUILD_DIR/mongo/base',
+                'optime',
+            ])
+
 env.Library('replica_set_messages',
             [
                 'is_master_response.cpp',
@@ -1373,6 +1382,16 @@ env.CppUnitTest(
    ],
 )
 
+env.CppUnitTest(
+    target='speculative_majority_read_info_test',
+    source=[
+        'speculative_majority_read_info_test.cpp',
+    ],
+    LIBDEPS=[
+        'speculative_majority_read_info',
+   ],
+)
+
 env.Library(target='optime',
             source=[
                 'bson_extract_optime.cpp',
 
@@ -272,6 +272,11 @@ ReadConcernArgs::MajorityReadMechanism ReadConcernArgs::getMajorityReadMechanism
     return _majorityReadMechanism;
 }
 
+bool ReadConcernArgs::isSpeculativeMajority() const {
+    const bool levelIsMajority = _level && *_level == ReadConcernLevel::kMajorityReadConcern;
+    return levelIsMajority && _majorityReadMechanism == MajorityReadMechanism::kSpeculative;
+}
+
 Status ReadConcernArgs::upconvertReadConcernLevelToSnapshot() {
     if (_level && *_level != ReadConcernLevel::kSnapshotReadConcern &&
         *_level != ReadConcernLevel::kMajorityReadConcern &&
 
@@ -124,6 +124,11 @@ class ReadConcernArgs {
      */
     MajorityReadMechanism getMajorityReadMechanism() const;
 
+    /**
+     * Returns true only when a read concern level has been set, that level is 'majority', and
+     * the majority read mechanism is kSpeculative.
+     */
+    bool isSpeculativeMajority() const;
+
     /**
      * Appends level and afterOpTime.
      */
 
@@ -369,6 +369,19 @@ class ReplicationCoordinator : public SyncSourceSelector {
                                                const ReadConcernArgs& settings,
                                                boost::optional<Date_t> deadline) = 0;
 
+    /**
+     * Wait until the given optime is known to be majority committed.
+     *
+     * The given optime is expected to be an optime in this node's local oplog. This method cannot
+     * determine correctly whether an arbitrary optime is majority committed within a replica set.
+     * It is expected that the execution of this method is contained within the span of one user
+     * operation, and thus, should not span rollbacks.
+     *
+     * Returns an OK Status if the wait was successful and a non-OK Status otherwise. Will respect
+     * the deadline on the given OperationContext, if one has been set.
+     */
+    virtual Status awaitOpTimeCommitted(OperationContext* opCtx, OpTime opTime) = 0;
+
     /**
      * Retrieves and returns the current election id, which is a unique id that is local to
      * this node and changes every time we become primary.
 
@@ -1253,7 +1253,12 @@ Status ReplicationCoordinatorImpl::_validateReadConcern(OperationContext* opCtx,
                 "readConcern level 'snapshot' is required when specifying atClusterTime"};
     }
 
+    // We cannot support read concern 'majority' by means of reading from a historical snapshot if
+    // the storage layer doesn't support it. In this case, we can support it by using "speculative"
+    // majority reads instead.
     if (readConcern.getLevel() == ReadConcernLevel::kMajorityReadConcern &&
+        readConcern.getMajorityReadMechanism() ==
+            ReadConcernArgs::MajorityReadMechanism::kMajoritySnapshot &&
         !_externalState->isReadCommittedSupportedByStorageEngine(opCtx)) {
         return {ErrorCodes::ReadConcernMajorityNotEnabled,
                 str::stream() << "Storage engine does not support read concern: "
@@ -1422,8 +1427,15 @@ Status ReplicationCoordinatorImpl::_waitUntilClusterTimeForRead(OperationContext
 
     // We don't set isMajorityCommittedRead for kSnapshotReadConcern because snapshots are always
     // speculative; we wait for majority when the transaction commits.
+    //
+    // Speculative majority reads do not need to wait for the commit point to advance to satisfy
+    // afterClusterTime reads. Waiting for the lastApplied to advance past the given target optime
+    // ensures the recency guarantee for the afterClusterTime read. At the end of the command, we
+    // will wait for the lastApplied optime to become majority committed, which then satisfies the
+    // durability guarantee.
     const bool isMajorityCommittedRead =
-        readConcern.getLevel() == ReadConcernLevel::kMajorityReadConcern;
+        readConcern.getLevel() == ReadConcernLevel::kMajorityReadConcern &&
+        !readConcern.isSpeculativeMajority();
 
     return _waitUntilOpTime(opCtx, isMajorityCommittedRead, targetOpTime, deadline);
 }
@@ -1439,6 +1451,17 @@ Status ReplicationCoordinatorImpl::_waitUntilOpTimeForReadDeprecated(
     return _waitUntilOpTime(opCtx, isMajorityCommittedRead, targetOpTime);
 }
 
+Status ReplicationCoordinatorImpl::awaitOpTimeCommitted(OperationContext* opCtx, OpTime opTime) {
+    // Contract: 'opTime' comes from this node's local oplog and the call does not span a
+    // rollback. Under those two conditions the oplog at the start of the wait is a prefix of
+    // the oplog at the end of it, which makes it valid to order optimes from this node's oplog
+    // by timestamp alone. Whether 'opTime' is committed can then be decided by comparing its
+    // timestamp with the timestamp of the last committed optime. No explicit deadline argument
+    // is forwarded to _waitUntilOpTime here.
+    constexpr bool kIsMajorityCommittedRead = true;
+    return _waitUntilOpTime(opCtx, kIsMajorityCommittedRead, opTime);
+}
+
 OpTime ReplicationCoordinatorImpl::_getMyLastAppliedOpTime_inlock() const {
     return _topCoord->getMyLastAppliedOpTime();
 }
 
@@ -165,7 +165,7 @@ class ReplicationCoordinatorImpl : public ReplicationCoordinator {
 
     virtual Status waitUntilOpTimeForRead(OperationContext* opCtx,
                                           const ReadConcernArgs& readConcern) override;
-
+    Status awaitOpTimeCommitted(OperationContext* opCtx, OpTime opTime) override;
     virtual OID getElectionId() override;
 
     virtual int getMyId() const override;