Skip to content
This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Commit c6afa9e

Browse files
BillyDonahueEvergreen Agent
authored andcommitted
SERVER-54227 ReshardingMetrics fix "remaining time" estimate
1 parent dc96088 commit c6afa9e

File tree

2 files changed

+105
-19
lines changed

2 files changed

+105
-19
lines changed

src/mongo/db/s/resharding/resharding_metrics.cpp

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,20 @@ const auto getMetrics = ServiceContext::declareDecoration<MetricsPtr>();
6767
const auto reshardingMetricsRegisterer = ServiceContext::ConstructorActionRegisterer{
6868
"ReshardingMetrics",
6969
[](ServiceContext* ctx) { getMetrics(ctx) = std::make_unique<ReshardingMetrics>(ctx); }};
70+
71+
/**
72+
* Given a constant rate of time per unit of work:
73+
* totalTime / totalWork == elapsedTime / elapsedWork
74+
* Solve for remaining time.
75+
* remainingTime := totalTime - elapsedTime
76+
* == (totalWork * (elapsedTime / elapsedWork)) - elapsedTime
77+
* == elapsedTime * (totalWork / elapsedWork - 1)
78+
*/
79+
Milliseconds remainingTime(Milliseconds elapsedTime, double elapsedWork, double totalWork) {
80+
elapsedWork = std::min(elapsedWork, totalWork);
81+
double remainingMsec = 1.0 * elapsedTime.count() * (totalWork / elapsedWork - 1);
82+
return Milliseconds(Milliseconds::rep(remainingMsec));
83+
}
7084
} // namespace
7185

7286
ReshardingMetrics* ReshardingMetrics::get(ServiceContext* ctx) noexcept {
@@ -242,28 +256,28 @@ void ReshardingMetrics::OperationMetrics::append(BSONObjBuilder* bob, Role role)
242256
return durationCount<Seconds>(interval.duration());
243257
};
244258

245-
auto estimateRemainingOperationTime = [&]() -> int64_t {
246-
if (bytesCopied == 0 && oplogEntriesApplied == 0)
247-
return -1;
248-
else if (oplogEntriesApplied == 0) {
249-
invariant(bytesCopied > 0);
259+
auto remainingMsec = [&]() -> boost::optional<Milliseconds> {
260+
if (oplogEntriesApplied > 0) {
261+
// All fetched oplogEntries must be applied. Some of them already have been.
262+
return remainingTime(
263+
applyingOplogEntries.duration(), oplogEntriesApplied, oplogEntriesFetched);
264+
}
265+
if (bytesCopied > 0) {
250266
// Until the time to apply batches of oplog entries is measured, we assume that applying
251267
// all of them will take as long as copying did.
252-
const auto elapsedCopyTime = getElapsedTime(copyingDocuments);
253-
const auto approxTimeToCopy =
254-
elapsedCopyTime * std::max((int64_t)0, bytesToCopy / bytesCopied - 1);
255-
return elapsedCopyTime + 2 * approxTimeToCopy;
256-
} else {
257-
invariant(oplogEntriesApplied > 0);
258-
const auto approxTimeToApply = getElapsedTime(applyingOplogEntries) *
259-
std::max((int64_t)0, oplogEntriesFetched / oplogEntriesApplied - 1);
260-
return approxTimeToApply;
268+
return remainingTime(copyingDocuments.duration(), bytesCopied, 2 * bytesToCopy);
261269
}
262-
};
270+
return {};
271+
}();
272+
263273

264274
const std::string kIntervalSuffix = role == Role::kAll ? "Millis" : "";
265275
bob->append(kOpTimeElapsed + kIntervalSuffix, getElapsedTime(runningOperation));
266-
bob->append(kOpTimeRemaining + kIntervalSuffix, estimateRemainingOperationTime());
276+
277+
bob->append(kOpTimeRemaining + kIntervalSuffix,
278+
!remainingMsec ? int64_t{-1} /** -1 is a specified integer null value */
279+
: role == Role::kAll ? durationCount<Milliseconds>(*remainingMsec)
280+
: durationCount<Seconds>(*remainingMsec));
267281

268282
if (role == Role::kAll || role == Role::kRecipient) {
269283
bob->append(kDocumentsToCopy, documentsToCopy);

src/mongo/db/s/resharding/resharding_metrics_test.cpp

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,22 +27,30 @@
2727
* it in the license file.
2828
*/
2929

30+
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kTest
31+
3032
#include <fmt/format.h>
3133

3234
#include "mongo/bson/bsonobjbuilder.h"
3335
#include "mongo/bson/json.h"
3436
#include "mongo/db/s/resharding/resharding_metrics.h"
3537
#include "mongo/db/service_context_test_fixture.h"
38+
#include "mongo/logv2/log.h"
3639
#include "mongo/unittest/death_test.h"
3740
#include "mongo/unittest/unittest.h"
3841
#include "mongo/util/clock_source_mock.h"
3942
#include "mongo/util/uuid.h"
4043

4144
namespace mongo {
45+
namespace {
46+
47+
using namespace fmt::literals;
48+
49+
constexpr auto kOpTimeRemaining = "remainingOperationTimeEstimatedMillis"_sd;
4250

4351
class ReshardingMetricsTest : public ServiceContextTest {
4452
public:
45-
void setUp() {
53+
void setUp() override {
4654
auto clockSource = std::make_unique<ClockSourceMock>();
4755
_clockSource = clockSource.get();
4856
getGlobalServiceContext()->setFastClockSource(std::move(clockSource));
@@ -55,8 +63,8 @@ class ReshardingMetricsTest : public ServiceContextTest {
5563
// Timer step in milliseconds
5664
static constexpr auto kTimerStep = 100;
5765

58-
void advanceTime(Milliseconds interval = Milliseconds(kTimerStep)) {
59-
_clockSource->advance(interval);
66+
void advanceTime(Milliseconds step = Milliseconds{kTimerStep}) {
67+
_clockSource->advance(step);
6068
}
6169

6270
auto getReport() {
@@ -371,4 +379,68 @@ TEST_F(ReshardingMetricsTest, CurrentOpReportForCoordinator) {
371379
ASSERT_BSONOBJ_EQ(expected, report);
372380
}
373381

382+
TEST_F(ReshardingMetricsTest, EstimatedRemainingOperationTimeCloning) {
383+
// Copy N docs @ timePerDoc. Check the progression of the estimated time remaining.
384+
auto m = getMetrics();
385+
m->onStart();
386+
m->setRecipientState(RecipientStateEnum::kCloning);
387+
auto timePerDocument = Milliseconds{123};
388+
int64_t bytesPerDocument = 1024;
389+
int64_t documentsToCopy = 409;
390+
int64_t bytesToCopy = bytesPerDocument * documentsToCopy;
391+
m->setDocumentsToCopy(documentsToCopy, bytesToCopy);
392+
auto remainingTime = 2 * timePerDocument * documentsToCopy;
393+
double maxAbsRelErr = 0;
394+
for (int64_t copied = 0; copied < documentsToCopy; ++copied) {
395+
double output = getReport()[kOpTimeRemaining].Number();
396+
if (copied == 0) {
397+
ASSERT_EQ(output, -1);
398+
} else {
399+
ASSERT_GTE(output, 0);
400+
auto expected = durationCount<Milliseconds>(remainingTime);
401+
// Check that error is pretty small (it should get better as the operation progresses)
402+
double absRelErr = std::abs((output - expected) / expected);
403+
ASSERT_LT(absRelErr, 0.05)
404+
<< "output={}, expected={}, copied={}"_format(output, expected, copied);
405+
maxAbsRelErr = std::max(maxAbsRelErr, absRelErr);
406+
}
407+
m->onDocumentsCopied(1, bytesPerDocument);
408+
advanceTime(timePerDocument);
409+
remainingTime -= timePerDocument;
410+
}
411+
LOGV2_DEBUG(
412+
5422700, 3, "Max absolute relative error observed", "maxAbsRelErr"_attr = maxAbsRelErr);
413+
}
414+
415+
TEST_F(ReshardingMetricsTest, EstimatedRemainingOperationTimeApplying) {
416+
// Perform N ops @ timePerOp. Check the progression of the estimated time remaining.
417+
auto m = getMetrics();
418+
m->onStart();
419+
m->setRecipientState(RecipientStateEnum::kApplying);
420+
auto timePerOp = Milliseconds{123};
421+
int64_t fetched = 10000;
422+
m->onOplogEntriesFetched(fetched);
423+
auto remainingTime = timePerOp * fetched;
424+
double maxAbsRelErr = 0;
425+
for (int64_t applied = 0; applied < fetched; ++applied) {
426+
double output = getReport()[kOpTimeRemaining].Number();
427+
if (applied == 0) {
428+
ASSERT_EQ(output, -1);
429+
} else {
430+
auto expected = durationCount<Milliseconds>(remainingTime);
431+
// Check that error is pretty small (it should get better as the operation progresses)
432+
double absRelErr = std::abs((output - expected) / expected);
433+
ASSERT_LT(absRelErr, 0.05)
434+
<< "output={}, expected={}, applied={}"_format(output, expected, applied);
435+
maxAbsRelErr = std::max(maxAbsRelErr, absRelErr);
436+
}
437+
advanceTime(timePerOp);
438+
m->onOplogEntriesApplied(1);
439+
remainingTime -= timePerOp;
440+
}
441+
LOGV2_DEBUG(
442+
5422701, 3, "Max absolute relative error observed", "maxAbsRelErr"_attr = maxAbsRelErr);
443+
}
444+
445+
} // namespace
374446
} // namespace mongo

0 commit comments

Comments
 (0)