Skip to content

Commit edc3dfc

Browse files
committed
SERVER-31268 Make updates_in_heterogeneous_repl_set.js more robust.
The updates_in_heterogeneous_repl_set.js test calls stepUp() on each of its replica set members to ensure that each one gets an opportunity to handle inserts and updates. However, we found that the stepUp()-triggered election occasionally failed: even though a call to awaitReplication() ensured that all members were caught up on their oplogs, some members were not yet aware that their peers were caught up. The new awaitNodesAgreeOnAppliedOpTime() function uses assert.soon() to wait until every member has an up-to-date view of its peers' applied optimes, so that the election always succeeds.
1 parent 232f798 commit edc3dfc

File tree

2 files changed

+78
-1
lines changed

2 files changed

+78
-1
lines changed

jstests/multiVersion/updates_in_heterogeneous_repl_set.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ const testName = "updates_in_heterogeneous_repl_set";
2929
// Give each member a chance to be primary while updating documents.
3030
let collIndex = 0;
3131
replTest.nodes.forEach(function(node) {
32-
replTest.awaitReplication();
3332
replTest.stepUp(node);
3433

3534
let coll = node.getDB("test")["coll" + (collIndex++)];

src/mongo/shell/replsettest.js

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,82 @@ var ReplSetTest = function(opts) {
548548
timeout);
549549
};
550550

551+
/**
 * Polls replSetGetStatus on every node until each node reports the same applied optime for
 * itself and for every non-arbiter member listed in its status.
 *
 * @param timeout  how long assert.soon() may keep retrying; self.kDefaultTimeoutMS is used
 *                 when this is falsy.
 * @param nodes    the connections to poll; self.nodes is used when this is falsy.
 */
this.awaitNodesAgreeOnAppliedOpTime = function(timeout, nodes) {
    timeout = timeout || self.kDefaultTimeoutMS;
    nodes = nodes || self.nodes;

    // One polling pass over all nodes. Returns true only when every node's view matches the
    // reference optime picked up from the first node polled in this pass.
    const allNodesAgree = function() {
        let referenceOpTime;

        for (const conn of nodes) {
            let status;
            try {
                status = conn.adminCommand({replSetGetStatus: 1});
            } catch (err) {
                print("AwaitNodesAgreeOnAppliedOpTime: Retrying because node " + conn.name +
                      " failed to execute replSetGetStatus: " + tojson(err));
                return false;
            }
            assert.commandWorked(status);

            // Seed the reference value from the first status response of this pass.
            if (referenceOpTime === undefined) {
                if (status.optimes) {
                    referenceOpTime = status.optimes.appliedOpTime;
                } else {
                    // A response without 'optimes' comes from an older mongod. Any member
                    // optime will do as a starting point, since every other optime gets
                    // compared against it below.
                    const membersWithOptime = status.members.filter(m => m.optime);
                    assert(membersWithOptime.length > 0,
                           "AwaitNodesAgreeOnAppliedOpTime: replSetGetStatus did not " +
                               "include optimes for any members: " + tojson(status));
                    referenceOpTime = membersWithOptime[0].optime;
                }

                assert(referenceOpTime,
                       "AwaitNodesAgreeOnAppliedOpTime: missing appliedOpTime in " +
                           "replSetGetStatus: " + tojson(status));
            }

            // The node's own applied optime, when it reports one, must match the reference.
            const ownOptimes = status.optimes;
            if (ownOptimes && !friendlyEqual(ownOptimes.appliedOpTime, referenceOpTime)) {
                print("AwaitNodesAgreeOnAppliedOpTime: Retrying because node " + conn.name +
                      " has appliedOpTime " + tojson(status.optimes.appliedOpTime) +
                      " that does not match the previously observed appliedOpTime " +
                      tojson(referenceOpTime));
                return false;
            }

            // The node's view of each peer must match the reference as well.
            for (const member of status.members) {
                // Arbiters apply no oplog entries and carry no 'optime' field.
                if (member.state == ReplSetTest.State.ARBITER) {
                    continue;
                }

                if (friendlyEqual(member.optime, referenceOpTime)) {
                    continue;
                }

                print("AwaitNodesAgreeOnAppliedOpTime: Retrying because node " + conn.name +
                      " sees optime " + tojson(member.optime) + " on node " + member.name +
                      " but expects to see optime " + tojson(referenceOpTime));
                return false;
            }
        }

        print("AwaitNodesAgreeOnAppliedOpTime: All nodes agree that all ops are applied up to " +
              tojson(referenceOpTime));
        return true;
    };

    assert.soon(allNodesAgree,
                "Awaiting nodes to agree that all ops are applied across replica set",
                timeout);
};
626+
551627
/**
552628
* Blocks until all nodes agree on who the primary is.
553629
* If 'expectedPrimaryNodeId' is provided, ensure that every node is seeing this node as the
@@ -896,6 +972,7 @@ var ReplSetTest = function(opts) {
896972
*/
897973
this.stepUp = function(node) {
898974
this.awaitReplication();
975+
this.awaitNodesAgreeOnAppliedOpTime();
899976
this.awaitNodesAgreeOnPrimary();
900977
if (this.getPrimary() === node) {
901978
return;
@@ -917,6 +994,7 @@ var ReplSetTest = function(opts) {
917994
"': " + tojson(ex));
918995
}
919996
this.awaitReplication();
997+
this.awaitNodesAgreeOnAppliedOpTime();
920998
this.awaitNodesAgreeOnPrimary();
921999
}
9221000

0 commit comments

Comments
 (0)