Skip to content

Commit 6adc71f

Browse files
committed
SERVER-26990 Unify tracking of secondary state between replication and topology coordinators
This removes the slaveInfo structure and consolidates secondary state into the topology coordinator memberHeartbeatData.
1 parent f62ed9c commit 6adc71f

16 files changed

+1655
-2164
lines changed

src/mongo/db/repl/SConscript

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,7 @@ env.CppUnitTest(
504504
env.Library('topology_coordinator',
505505
[
506506
'heartbeat_response_action.cpp',
507+
'member_heartbeat_data.cpp',
507508
'topology_coordinator.cpp',
508509
],
509510
LIBDEPS=[
@@ -513,7 +514,6 @@ env.Library('topology_coordinator',
513514

514515
env.Library('topology_coordinator_impl',
515516
[
516-
'member_heartbeat_data.cpp',
517517
'topology_coordinator_impl.cpp',
518518
],
519519
LIBDEPS=[

src/mongo/db/repl/heartbeat_response_action.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,9 @@ void HeartbeatResponseAction::setNextHeartbeatStartDate(Date_t when) {
7575
_nextHeartbeatStartDate = when;
7676
}
7777

78+
void HeartbeatResponseAction::setAdvancedOpTime(bool advanced) {
79+
_advancedOpTime = advanced;
80+
}
81+
7882
} // namespace repl
7983
} // namespace mongo

src/mongo/db/repl/heartbeat_response_action.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,11 @@ class HeartbeatResponseAction {
100100
*/
101101
void setNextHeartbeatStartDate(Date_t when);
102102

103+
/**
104+
* Sets whether or not the heartbeat response advanced the member's opTime.
105+
*/
106+
void setAdvancedOpTime(bool advanced);
107+
103108
/**
104109
* Gets the action type of this action.
105110
*/
@@ -123,10 +128,19 @@ class HeartbeatResponseAction {
123128
return _primaryIndex;
124129
}
125130

131+
/*
132+
* Returns true if the heartbeat response resulted in our conception of the
133+
* member's optime moving forward, so we need to recalculate lastCommittedOpTime.
134+
*/
135+
bool getAdvancedOpTime() const {
136+
return _advancedOpTime;
137+
}
138+
126139
private:
127140
Action _action;
128141
int _primaryIndex;
129142
Date_t _nextHeartbeatStartDate;
143+
bool _advancedOpTime = false;
130144
};
131145

132146
} // namespace repl

src/mongo/db/repl/member_heartbeat_data.cpp

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,14 @@
3939
namespace mongo {
4040
namespace repl {
4141

42-
MemberHeartbeatData::MemberHeartbeatData() : _health(-1), _authIssue(false) {
42+
MemberHeartbeatData::MemberHeartbeatData()
43+
: _health(-1), _authIssue(false), _configIndex(-1), _isSelf(false) {
4344
_lastResponse.setState(MemberState::RS_UNKNOWN);
4445
_lastResponse.setElectionTime(Timestamp());
4546
_lastResponse.setAppliedOpTime(OpTime());
4647
}
4748

48-
void MemberHeartbeatData::setUpValues(Date_t now,
49+
bool MemberHeartbeatData::setUpValues(Date_t now,
4950
const HostAndPort& host,
5051
ReplSetHeartbeatResponse&& hbResponse) {
5152
_health = 1;
@@ -54,6 +55,8 @@ void MemberHeartbeatData::setUpValues(Date_t now,
5455
}
5556
_authIssue = false;
5657
_lastHeartbeat = now;
58+
_lastUpdate = now;
59+
_lastUpdateStale = false;
5760
_updatedSinceRestart = true;
5861

5962
if (!hbResponse.hasState()) {
@@ -71,7 +74,11 @@ void MemberHeartbeatData::setUpValues(Date_t now,
7174
<< hbResponse.getState().toString() << rsLog;
7275
}
7376

77+
bool opTimeAdvanced = advanceLastAppliedOpTime(hbResponse.getAppliedOpTime(), now);
78+
auto durableOpTime = hbResponse.hasDurableOpTime() ? hbResponse.getDurableOpTime() : OpTime();
79+
opTimeAdvanced = advanceLastDurableOpTime(durableOpTime, now) || opTimeAdvanced;
7480
_lastResponse = std::move(hbResponse);
81+
return opTimeAdvanced;
7582
}
7683

7784
void MemberHeartbeatData::setDownValues(Date_t now, const std::string& heartbeatMessage) {
@@ -87,6 +94,9 @@ void MemberHeartbeatData::setDownValues(Date_t now, const std::string& heartbeat
8794
_lastResponse.setAppliedOpTime(OpTime());
8895
_lastResponse.setHbMsg(heartbeatMessage);
8996
_lastResponse.setSyncingTo(HostAndPort());
97+
98+
// The _lastAppliedOpTime/_lastDurableOpTime fields don't get cleared merely by missing a
99+
// heartbeat.
90100
}
91101

92102
void MemberHeartbeatData::setAuthIssue(Date_t now) {
@@ -104,5 +114,47 @@ void MemberHeartbeatData::setAuthIssue(Date_t now) {
104114
_lastResponse.setSyncingTo(HostAndPort());
105115
}
106116

117+
void MemberHeartbeatData::setLastAppliedOpTime(OpTime opTime, Date_t now) {
118+
_lastUpdate = now;
119+
_lastUpdateStale = false;
120+
_lastAppliedOpTime = opTime;
121+
}
122+
123+
void MemberHeartbeatData::setLastDurableOpTime(OpTime opTime, Date_t now) {
124+
_lastUpdate = now;
125+
_lastUpdateStale = false;
126+
if (_lastAppliedOpTime < opTime) {
127+
// TODO(russotto): We think this should never happen, rollback or no rollback. Make this an
128+
// invariant and see what happens.
129+
log() << "Durable progress (" << opTime << ") is ahead of the applied progress ("
130+
<< _lastAppliedOpTime << ". This is likely due to a "
131+
"rollback."
132+
<< " memberid: " << _memberId << " rid: " << _rid << " host "
133+
<< _hostAndPort.toString() << " previous durable progress: " << _lastDurableOpTime;
134+
} else {
135+
_lastDurableOpTime = opTime;
136+
}
137+
}
138+
139+
bool MemberHeartbeatData::advanceLastAppliedOpTime(OpTime opTime, Date_t now) {
140+
_lastUpdate = now;
141+
_lastUpdateStale = false;
142+
if (_lastAppliedOpTime < opTime) {
143+
setLastAppliedOpTime(opTime, now);
144+
return true;
145+
}
146+
return false;
147+
}
148+
149+
bool MemberHeartbeatData::advanceLastDurableOpTime(OpTime opTime, Date_t now) {
150+
_lastUpdate = now;
151+
_lastUpdateStale = false;
152+
if (_lastDurableOpTime < opTime) {
153+
setLastDurableOpTime(opTime, now);
154+
return true;
155+
}
156+
return false;
157+
}
158+
107159
} // namespace repl
108160
} // namespace mongo

src/mongo/db/repl/member_heartbeat_data.h

Lines changed: 134 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,10 @@ class MemberHeartbeatData {
6868
const HostAndPort& getSyncSource() const {
6969
return _lastResponse.getSyncingTo();
7070
}
71-
OpTime getAppliedOpTime() const {
71+
OpTime getHeartbeatAppliedOpTime() const {
7272
return _lastResponse.getAppliedOpTime();
7373
}
74-
OpTime getDurableOpTime() const {
74+
OpTime getHeartbeatDurableOpTime() const {
7575
return _lastResponse.hasDurableOpTime() ? _lastResponse.getDurableOpTime() : OpTime();
7676
}
7777
int getConfigVersion() const {
@@ -105,10 +105,49 @@ class MemberHeartbeatData {
105105
return _health != 0;
106106
}
107107

108+
OpTime getLastAppliedOpTime() const {
109+
return _lastAppliedOpTime;
110+
}
111+
112+
OpTime getLastDurableOpTime() const {
113+
return _lastDurableOpTime;
114+
}
115+
116+
// When was the last time this data was updated via any means?
117+
Date_t getLastUpdate() const {
118+
return _lastUpdate;
119+
}
120+
// Was the last update stale as of the last check?
121+
bool lastUpdateStale() const {
122+
return _lastUpdateStale;
123+
}
124+
125+
// Index of this member in the replica set config member list.
126+
int getConfigIndex() const {
127+
return _configIndex;
128+
}
129+
130+
int getMemberId() const {
131+
return _memberId;
132+
}
133+
134+
OID getRid() const {
135+
return _rid;
136+
}
137+
138+
bool isSelf() const {
139+
return _isSelf;
140+
}
141+
142+
HostAndPort getHostAndPort() const {
143+
return _hostAndPort;
144+
}
145+
108146
/**
109147
* Sets values in this object from the results of a successful heartbeat command.
148+
* Returns whether or not the optimes advanced as a result of this heartbeat response.
110149
*/
111-
void setUpValues(Date_t now, const HostAndPort& host, ReplSetHeartbeatResponse&& hbResponse);
150+
bool setUpValues(Date_t now, const HostAndPort& host, ReplSetHeartbeatResponse&& hbResponse);
112151

113152
/**
114153
* Sets values in this object from the results of an erroring/failed heartbeat command.
@@ -134,6 +173,66 @@ class MemberHeartbeatData {
134173
return _updatedSinceRestart;
135174
}
136175

176+
/**
177+
* Sets the last applied op time (not the heartbeat applied op time) and updates the
178+
* lastUpdate time.
179+
*/
180+
void setLastAppliedOpTime(OpTime opTime, Date_t now);
181+
182+
/**
183+
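* Sets the last durable op time (not the heartbeat durable op time) and updates the lastUpdate time. If the new optime is ahead of the last applied op time, a message is logged and the last durable op time is left unchanged.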
184+
*/
185+
void setLastDurableOpTime(OpTime opTime, Date_t now);
186+
187+
/**
188+
* Sets the last applied op time (not the heartbeat applied op time) iff the new optime is
189+
* later than the current optime, and updates the lastUpdate time. Returns true if the
190+
* optime was advanced.
191+
*/
192+
bool advanceLastAppliedOpTime(OpTime opTime, Date_t now);
193+
194+
/**
195+
* Sets the last durable op time (not the heartbeat durable op time) iff the new optime is
196+
* later than the current optime, and updates the lastUpdate time. Returns true if the
197+
* optime was advanced.
198+
*/
199+
bool advanceLastDurableOpTime(OpTime opTime, Date_t now);
200+
201+
/*
202+
* Indicates that this data is stale, based on _lastUpdate.
203+
*/
204+
void markLastUpdateStale() {
205+
_lastUpdateStale = true;
206+
}
207+
208+
/*
209+
* Updates _lastUpdate and clears staleness without changing anything else.
210+
*/
211+
void updateLiveness(Date_t now) {
212+
_lastUpdate = now;
213+
_lastUpdateStale = false;
214+
}
215+
216+
void setConfigIndex(int configIndex) {
217+
_configIndex = configIndex;
218+
}
219+
220+
void setIsSelf(bool isSelf) {
221+
_isSelf = isSelf;
222+
}
223+
224+
void setHostAndPort(HostAndPort hostAndPort) {
225+
_hostAndPort = hostAndPort;
226+
}
227+
228+
void setMemberId(int memberId) {
229+
_memberId = memberId;
230+
}
231+
232+
void setRid(OID rid) {
233+
_rid = rid;
234+
}
235+
137236
private:
138237
// -1 = not checked yet, 0 = member is down/unreachable, 1 = member is up
139238
int _health;
@@ -153,6 +252,38 @@ class MemberHeartbeatData {
153252

154253
// Have we received heartbeats since the last restart?
155254
bool _updatedSinceRestart = false;
255+
256+
// Last time we got any information about this member, whether heartbeat
257+
// or replSetUpdatePosition.
258+
Date_t _lastUpdate;
259+
260+
// Set when lastUpdate time exceeds the election timeout. Implies that the member is down
261+
// on the primary, but not the secondaries.
262+
bool _lastUpdateStale = false;
263+
264+
// Last known OpTime that the replica has applied and journaled to.
265+
OpTime _lastDurableOpTime;
266+
267+
// Last known OpTime that the replica has applied, whether journaled or unjournaled.
268+
OpTime _lastAppliedOpTime;
269+
270+
// TODO(russotto): Since memberHeartbeatData is kept in config order, _configIndex
271+
// and _isSelf may not be necessary.
272+
// Index of this member in the replica set configuration.
273+
int _configIndex;
274+
275+
// Is this the data for this member?
276+
bool _isSelf;
277+
278+
// This member's RID, used only in master/slave replication.
279+
OID _rid;
280+
281+
// This member's member ID. memberId and hostAndPort duplicate information in the
282+
// configuration for replica sets, but are required to be here for master/slave replication.
283+
int _memberId = -1;
284+
285+
// Client address of this member.
286+
HostAndPort _hostAndPort;
156287
};
157288

158289
} // namespace repl

src/mongo/db/repl/repl_set_html_summary.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ const std::string ReplSetHtmlSummary::toHtmlString() const {
187187
// TODO(dannenberg): change timestamp to optime in V1
188188
memberTable << td(memberHB.getLastHeartbeat() == Date_t()
189189
? "?"
190-
: memberHB.getAppliedOpTime().toString());
190+
: memberHB.getHeartbeatAppliedOpTime().toString());
191191
}
192192
memberTable << _tr();
193193
}
@@ -201,7 +201,7 @@ const std::string ReplSetHtmlSummary::toHtmlString() const {
201201
const MemberConfig& selfConfig = _config.getMemberAt(_selfIndex);
202202

203203
if (_primaryIndex >= 0 && _primaryIndex != _selfIndex && !selfConfig.isArbiter()) {
204-
int lag = _hbData[_primaryIndex].getAppliedOpTime().getTimestamp().getSecs() -
204+
int lag = _hbData[_primaryIndex].getHeartbeatAppliedOpTime().getTimestamp().getSecs() -
205205
_selfOptime.getTimestamp().getSecs();
206206
s << tr("Lag: ", str::stream() << lag << " secs");
207207
}

0 commit comments

Comments
 (0)