@@ -2037,6 +2037,204 @@ TEST_F(ReplCoordReconfigTest, ForceReconfigShouldThrowIfArbiterNodesHaveNewlyAdd
20372037 getReplCoord ()->processReplSetReconfig (opCtx.get (), args, &result));
20382038}
20392039
2040+ TEST_F (ReplCoordTest, StepUpReconfigConcurrentWithHeartbeatReconfig) {
2041+ auto severityGuard = unittest::MinimumLoggedSeverityGuard{logv2::LogComponent::kReplication ,
2042+ logv2::LogSeverity::Debug (2 )};
2043+ assertStartSuccess (BSON (" _id"
2044+ << " mySet"
2045+ << " version" << 2 << " term" << 0 << " members"
2046+ << BSON_ARRAY (BSON (" _id" << 1 << " host"
2047+ << " node1:12345" )
2048+ << BSON (" _id" << 2 << " host"
2049+ << " node2:12345" ))),
2050+ HostAndPort (" node1" , 12345 ));
2051+ ASSERT_OK (getReplCoord ()->setFollowerMode (MemberState::RS_SECONDARY));
2052+ ASSERT_EQUALS (getReplCoord ()->getTerm (), 0 );
2053+ replCoordSetMyLastAppliedOpTime (OpTime (Timestamp (100 , 1 ), 0 ), Date_t () + Seconds (100 ));
2054+ replCoordSetMyLastDurableOpTime (OpTime (Timestamp (100 , 1 ), 0 ), Date_t () + Seconds (100 ));
2055+
2056+ // Win election but don't exit drain mode.
2057+ auto electionTimeoutWhen = getReplCoord ()->getElectionTimeout_forTest ();
2058+ const auto opCtx = makeOperationContext ();
2059+ simulateSuccessfulV1ElectionWithoutExitingDrainMode (electionTimeoutWhen, opCtx.get ());
2060+
2061+ // Receive a heartbeat that should NOT schedule a new heartbeat to fetch a newer config.
2062+ ReplSetHeartbeatArgsV1 hbArgs;
2063+ auto rsConfig = getReplCoord ()->getConfig ();
2064+ hbArgs.setConfigVersion (3 ); // simulate a newer config version.
2065+ hbArgs.setConfigTerm (rsConfig.getConfigTerm ());
2066+ hbArgs.setSetName (rsConfig.getReplSetName ());
2067+ hbArgs.setSenderHost (HostAndPort (" node2" , 12345 ));
2068+ hbArgs.setSenderId (2 );
2069+ hbArgs.setTerm (0 );
2070+ ASSERT (hbArgs.isInitialized ());
2071+
2072+ ReplSetHeartbeatResponse response;
2073+ ASSERT_OK (getReplCoord ()->processHeartbeatV1 (hbArgs, &response));
2074+
2075+ // No requests should have been scheduled.
2076+ getNet ()->enterNetwork ();
2077+ ASSERT_FALSE (getNet ()->hasReadyRequests ());
2078+ getNet ()->exitNetwork ();
2079+
2080+ // Receive a heartbeat that schedules a new heartbeat to fetch a newer config. We simulate a
2081+ // newer config version and an uninitialized term, so that a heartbeat will be scheduled to
2082+ // fetch a new config. When we mock the heartbeat response below, we will respond with a
2083+ // non-force config, which is to test the case where the sending node installed a non force
2084+ // config after we scheduled a heartbeat to it to fetch a force config. For safety, the
2085+ // important aspect is that we don't accept/install configs during drain mode, even if we try to
2086+ // fetch them.
2087+ hbArgs.setConfigVersion (3 );
2088+ hbArgs.setConfigTerm (OpTime::kUninitializedTerm );
2089+ hbArgs.setSetName (rsConfig.getReplSetName ());
2090+ hbArgs.setSenderHost (HostAndPort (" node2" , 12345 ));
2091+ hbArgs.setSenderId (2 );
2092+ hbArgs.setTerm (0 );
2093+ ASSERT (hbArgs.isInitialized ());
2094+
2095+ ASSERT_OK (getReplCoord ()->processHeartbeatV1 (hbArgs, &response));
2096+
2097+ // Schedule a response with a newer config.
2098+ auto newerConfigVersion = 3 ;
2099+ auto newerConfig = BSON (" _id"
2100+ << " mySet"
2101+ << " version" << newerConfigVersion << " term" << 0 << " members"
2102+ << BSON_ARRAY (BSON (" _id" << 1 << " host"
2103+ << " node1:12345" )
2104+ << BSON (" _id" << 2 << " host"
2105+ << " node2:12345" )));
2106+ auto net = getNet ();
2107+ net->enterNetwork ();
2108+ auto noi = net->getNextReadyRequest ();
2109+ auto & request = noi->getRequest ();
2110+
2111+ ReplSetHeartbeatArgsV1 args;
2112+ ASSERT_OK (args.initialize (request.cmdObj ));
2113+
2114+ startCapturingLogMessages ();
2115+ OpTime lastApplied (Timestamp (100 , 1 ), 0 );
2116+ ReplSetHeartbeatResponse hbResp;
2117+ ASSERT_OK (rsConfig.initialize (newerConfig));
2118+ hbResp.setConfig (rsConfig);
2119+ hbResp.setSetName (rsConfig.getReplSetName ());
2120+ hbResp.setState (MemberState::RS_SECONDARY);
2121+ hbResp.setConfigVersion (rsConfig.getConfigVersion ());
2122+ hbResp.setConfigTerm (rsConfig.getConfigTerm ());
2123+ hbResp.setAppliedOpTimeAndWallTime ({lastApplied, Date_t () + Seconds (lastApplied.getSecs ())});
2124+ hbResp.setDurableOpTimeAndWallTime ({lastApplied, Date_t () + Seconds (lastApplied.getSecs ())});
2125+ net->scheduleResponse (noi, net->now (), makeResponseStatus (hbResp.toBSON ()));
2126+ net->runReadyNetworkOperations ();
2127+ net->exitNetwork ();
2128+ stopCapturingLogMessages ();
2129+
2130+ // Make sure the heartbeat reconfig has not been scheduled.
2131+ ASSERT_EQUALS (1 , countTextFormatLogLinesContaining (" Not scheduling a heartbeat reconfig" ));
2132+
2133+ // Let drain mode complete.
2134+ signalDrainComplete (opCtx.get ());
2135+
2136+ // We should have moved to a new term in the election, and our config should have the same term.
2137+ ASSERT_EQUALS (getReplCoord ()->getTerm (), 1 );
2138+ ASSERT_EQUALS (getReplCoord ()->getConfig ().getConfigTerm (), 1 );
2139+ }
2140+
2141+ TEST_F (ReplCoordTest, StepUpReconfigConcurrentWithForceHeartbeatReconfig) {
2142+ auto severityGuard = unittest::MinimumLoggedSeverityGuard{logv2::LogComponent::kReplication ,
2143+ logv2::LogSeverity::Debug (2 )};
2144+ assertStartSuccess (BSON (" _id"
2145+ << " mySet"
2146+ << " version" << 2 << " term" << 0 << " members"
2147+ << BSON_ARRAY (BSON (" _id" << 1 << " host"
2148+ << " node1:12345" )
2149+ << BSON (" _id" << 2 << " host"
2150+ << " node2:12345" ))),
2151+ HostAndPort (" node1" , 12345 ));
2152+ ASSERT_OK (getReplCoord ()->setFollowerMode (MemberState::RS_SECONDARY));
2153+ ASSERT_EQUALS (getReplCoord ()->getTerm (), 0 );
2154+ replCoordSetMyLastAppliedOpTime (OpTime (Timestamp (100 , 1 ), 0 ), Date_t () + Seconds (100 ));
2155+ replCoordSetMyLastDurableOpTime (OpTime (Timestamp (100 , 1 ), 0 ), Date_t () + Seconds (100 ));
2156+
2157+ // Win election but don't exit drain mode.
2158+ auto electionTimeoutWhen = getReplCoord ()->getElectionTimeout_forTest ();
2159+ const auto opCtx = makeOperationContext ();
2160+ simulateSuccessfulV1ElectionWithoutExitingDrainMode (electionTimeoutWhen, opCtx.get ());
2161+
2162+ // Receive a heartbeat that schedules a new heartbeat to fetch a newer config.
2163+ ReplSetHeartbeatArgsV1 hbArgs;
2164+ auto rsConfig = getReplCoord ()->getConfig ();
2165+ hbArgs.setConfigVersion (3 ); // simulate a newer config version.
2166+ hbArgs.setConfigTerm (OpTime::kUninitializedTerm ); // force config.
2167+ hbArgs.setSetName (rsConfig.getReplSetName ());
2168+ hbArgs.setSenderHost (HostAndPort (" node2" , 12345 ));
2169+ hbArgs.setSenderId (2 );
2170+ hbArgs.setTerm (0 );
2171+ ASSERT (hbArgs.isInitialized ());
2172+
2173+ ReplSetHeartbeatResponse response;
2174+ ASSERT_OK (getReplCoord ()->processHeartbeatV1 (hbArgs, &response));
2175+
2176+ // Schedule a response with a newer config.
2177+ auto newerConfigVersion = 3 ;
2178+ auto newerConfig =
2179+ BSON (" _id"
2180+ << " mySet"
2181+ << " version" << newerConfigVersion << " term" << OpTime::kUninitializedTerm << " members"
2182+ << BSON_ARRAY (BSON (" _id" << 1 << " host"
2183+ << " node1:12345" )
2184+ << BSON (" _id" << 2 << " host"
2185+ << " node2:12345" )));
2186+ auto net = getNet ();
2187+ net->enterNetwork ();
2188+ auto noi = net->getNextReadyRequest ();
2189+ auto & request = noi->getRequest ();
2190+
2191+ ReplSetHeartbeatArgsV1 args;
2192+ ASSERT_OK (args.initialize (request.cmdObj ));
2193+
2194+ OpTime lastApplied (Timestamp (100 , 1 ), 0 );
2195+ ReplSetHeartbeatResponse hbResp;
2196+ ASSERT_OK (rsConfig.initialize (newerConfig));
2197+ hbResp.setConfig (rsConfig);
2198+ hbResp.setSetName (rsConfig.getReplSetName ());
2199+ hbResp.setState (MemberState::RS_SECONDARY);
2200+ hbResp.setConfigVersion (rsConfig.getConfigVersion ());
2201+ hbResp.setConfigTerm (rsConfig.getConfigTerm ());
2202+ hbResp.setAppliedOpTimeAndWallTime ({lastApplied, Date_t () + Seconds (lastApplied.getSecs ())});
2203+ hbResp.setDurableOpTimeAndWallTime ({lastApplied, Date_t () + Seconds (lastApplied.getSecs ())});
2204+ net->scheduleResponse (noi, net->now (), makeResponseStatus (hbResp.toBSON ()));
2205+ net->exitNetwork ();
2206+
2207+ {
2208+ // Prevent the heartbeat reconfig from completing.
2209+ FailPointEnableBlock fpb (" blockHeartbeatReconfigFinish" );
2210+
2211+ // Let the heartbeat reconfig begin.
2212+ net->enterNetwork ();
2213+ net->runReadyNetworkOperations ();
2214+ net->exitNetwork ();
2215+
2216+ // For force reconfigs, we do allow them to proceed even if we are in drain mode, so make
2217+ // sure it is in progress, stuck at the failpoint before completion.
2218+ fpb->waitForTimesEntered (1 );
2219+
2220+ // At this point the heartbeat reconfig should be in progress but blocked from completion by
2221+ // the failpoint. We now let drain mode complete. The step up reconfig should be interrupted
2222+ // by the in progress heartbeat reconfig.
2223+ signalDrainComplete (opCtx.get ());
2224+ }
2225+
2226+ // The failpoint should be released now, allowing the heartbeat reconfig to complete. We run the
2227+ // clock forward so the re-scheduled heartbeat reconfig will complete.
2228+ net->enterNetwork ();
2229+ net->runUntil (net->now () + Milliseconds (100 ));
2230+ net->exitNetwork ();
2231+
2232+ // We should have moved to a new term in the election, but our config should have the term from
2233+ // the force config.
2234+ ASSERT_EQUALS (getReplCoord ()->getTerm (), 1 );
2235+ ASSERT_EQUALS (getReplCoord ()->getConfig ().getConfigTerm (), OpTime::kUninitializedTerm );
2236+ }
2237+
20402238} // anonymous namespace
20412239} // namespace repl
20422240} // namespace mongo
0 commit comments