Skip to content

Commit 91e4999

Browse files
author
Sally McNichols
committed
SERVER-24152 add granularity option to $bucketAuto
1 parent 6ab3768 commit 91e4999

12 files changed

+1759
-21
lines changed

src/mongo/db/pipeline/SConscript

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,21 @@ env.Library(
175175
]
176176
)
177177

178+
env.Library(
179+
target='granularity_rounder',
180+
source=[
181+
'granularity_rounder.cpp',
182+
'granularity_rounder_powers_of_two.cpp',
183+
'granularity_rounder_preferred_numbers.cpp',
184+
],
185+
LIBDEPS=[
186+
'document_value',
187+
'document_value_test_util',
188+
'expression',
189+
'field_path',
190+
]
191+
)
192+
178193
docSourceEnv = env.Clone()
179194
docSourceEnv.InjectThirdPartyIncludePaths(libraries=['snappy'])
180195
docSourceEnv.Library(
@@ -205,6 +220,7 @@ docSourceEnv.Library(
205220
],
206221
LIBDEPS=[
207222
'accumulator',
223+
'granularity_rounder',
208224
'dependencies',
209225
'document_value',
210226
'expression',
@@ -380,6 +396,17 @@ env.CppUnitTest(
380396
],
381397
)
382398

399+
env.CppUnitTest(
400+
target='granularity_rounder_test',
401+
source=[
402+
'granularity_rounder_powers_of_two_test.cpp',
403+
'granularity_rounder_preferred_numbers_test.cpp',
404+
],
405+
LIBDEPS=[
406+
'granularity_rounder',
407+
],
408+
)
409+
383410
env.Library(
384411
target='serveronly',
385412
source=[

src/mongo/db/pipeline/document_source.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include "mongo/db/pipeline/document.h"
5050
#include "mongo/db/pipeline/expression.h"
5151
#include "mongo/db/pipeline/expression_context.h"
52+
#include "mongo/db/pipeline/granularity_rounder.h"
5253
#include "mongo/db/pipeline/lookup_set_cache.h"
5354
#include "mongo/db/pipeline/parsed_aggregation_projection.h"
5455
#include "mongo/db/pipeline/pipeline.h"
@@ -2028,7 +2029,7 @@ class DocumentSourceBucketAuto final : public DocumentSource {
20282029
/**
20292030
* Adds 'newBucket' to _buckets and updates any boundaries if necessary.
20302031
*/
2031-
void addBucket(const Bucket& newBucket);
2032+
void addBucket(Bucket& newBucket);
20322033

20332034
/**
20342035
* Makes a document using the information from bucket. This is what is returned when getNext()
@@ -2038,6 +2039,8 @@ class DocumentSourceBucketAuto final : public DocumentSource {
20382039

20392040
void parseGroupByExpression(const BSONElement& groupByField, const VariablesParseState& vps);
20402041

2042+
void setGranularity(std::string granularity);
2043+
20412044
std::unique_ptr<Sorter<Value, Document>> _sorter;
20422045
std::unique_ptr<Sorter<Value, Document>::Iterator> _sortedInput;
20432046

@@ -2056,6 +2059,7 @@ class DocumentSourceBucketAuto final : public DocumentSource {
20562059
std::vector<Bucket>::iterator _bucketsIterator;
20572060
std::unique_ptr<Variables> _variables;
20582061
boost::intrusive_ptr<Expression> _groupByExpression;
2062+
boost::intrusive_ptr<GranularityRounder> _granularityRounder;
20592063
long long _nDocuments = 0;
20602064
};
20612065
} // namespace mongo

src/mongo/db/pipeline/document_source_bucket_auto.cpp

Lines changed: 97 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,15 @@
2626
* it in the license file.
2727
*/
2828

29+
#include "mongo/platform/basic.h"
30+
2931
#include "mongo/db/pipeline/document_source.h"
3032

3133
namespace mongo {
3234

3335
using boost::intrusive_ptr;
3436
using std::pair;
37+
using std::string;
3538
using std::vector;
3639

3740
REGISTER_DOCUMENT_SOURCE(bucketAuto, DocumentSourceBucketAuto::createFromBson);
@@ -103,6 +106,25 @@ Value DocumentSourceBucketAuto::extractKey(const Document& doc) {
103106
_variables->setRoot(doc);
104107
Value key = _groupByExpression->evaluate(_variables.get());
105108

109+
if (_granularityRounder) {
110+
uassert(40258,
111+
str::stream() << "$bucketAuto can specify a 'granularity' with numeric boundaries "
112+
"only, but found a value with type: "
113+
<< typeName(key.getType()),
114+
key.numeric());
115+
116+
double keyValue = key.coerceToDouble();
117+
uassert(
118+
40259,
119+
"$bucketAuto can specify a 'granularity' with numeric boundaries only, but found a NaN",
120+
!std::isnan(keyValue));
121+
122+
uassert(40260,
123+
"$bucketAuto can specify a 'granularity' with non-negative numbers only, but found "
124+
"a negative number",
125+
keyValue >= 0.0);
126+
}
127+
106128
// To be consistent with the $group stage, we consider "missing" to be equivalent to null when
107129
// grouping values into buckets.
108130
return key.missing() ? Value(BSONNULL) : std::move(key);
@@ -186,21 +208,48 @@ void DocumentSourceBucketAuto::populateBuckets() {
186208
? boost::optional<pair<Value, Document>>(_sortedInput->next())
187209
: boost::none;
188210

189-
// If there are any more values that are equal to the boundary value, then absorb them
190-
// into the current bucket too.
191-
while (nextValue &&
192-
pExpCtx->getValueComparator().evaluate(currentBucket._max == nextValue->first)) {
193-
addDocumentToBucket(*nextValue, currentBucket);
194-
nextValue = _sortedInput->more()
195-
? boost::optional<pair<Value, Document>>(_sortedInput->next())
196-
: boost::none;
211+
if (_granularityRounder) {
212+
Value boundaryValue = _granularityRounder->roundUp(currentBucket._max);
213+
// If there are any values that now fall into this bucket after we round the
214+
// boundary, absorb them into this bucket too.
215+
while (nextValue &&
216+
pExpCtx->getValueComparator().evaluate(boundaryValue > nextValue->first)) {
217+
addDocumentToBucket(*nextValue, currentBucket);
218+
nextValue = _sortedInput->more()
219+
? boost::optional<pair<Value, Document>>(_sortedInput->next())
220+
: boost::none;
221+
}
222+
if (nextValue) {
223+
currentBucket._max = boundaryValue;
224+
}
225+
} else {
226+
// If there are any more values that are equal to the boundary value, then absorb
227+
// them into the current bucket too.
228+
while (nextValue &&
229+
pExpCtx->getValueComparator().evaluate(currentBucket._max ==
230+
nextValue->first)) {
231+
addDocumentToBucket(*nextValue, currentBucket);
232+
nextValue = _sortedInput->more()
233+
? boost::optional<pair<Value, Document>>(_sortedInput->next())
234+
: boost::none;
235+
}
197236
}
198237
firstEntryInNextBucket = nextValue;
199238
}
200239

201240
// Add the current bucket to the vector of buckets.
202241
addBucket(currentBucket);
203242
}
243+
244+
if (!_buckets.empty() && _granularityRounder) {
245+
// If we we have a granularity, we round the first bucket's minimum down and the last
246+
// bucket's maximum up. This way all of the bucket boundaries are rounded to numbers in the
247+
// granularity specification.
248+
Bucket& firstBucket = _buckets.front();
249+
Bucket& lastBucket = _buckets.back();
250+
firstBucket._min = _granularityRounder->roundDown(firstBucket._min);
251+
lastBucket._max = _granularityRounder->roundUp(lastBucket._max);
252+
}
204253
}
205254

206255
DocumentSourceBucketAuto::Bucket::Bucket(Value min,
@@ -213,14 +262,33 @@ DocumentSourceBucketAuto::Bucket::Bucket(Value min,
213262
}
214263
}
215264

216-
void DocumentSourceBucketAuto::addBucket(const Bucket& newBucket) {
217-
// If there is a bucket that comes before the new bucket being added, then the previous bucket's
218-
// max boundary is updated to the new bucket's min. This is makes it so that buckets' min
219-
// boundaries are inclusive and max boundaries are exclusive (except for the last bucket, which
220-
// has an inclusive max).
265+
void DocumentSourceBucketAuto::addBucket(Bucket& newBucket) {
221266
if (!_buckets.empty()) {
222267
Bucket& previous = _buckets.back();
223-
previous._max = newBucket._min;
268+
if (_granularityRounder) {
269+
// If we have a granularity specified and if there is a bucket that comes before the new
270+
// bucket being added, then the new bucket's min boundary is updated to be the
271+
// previous bucket's max boundary. This makes it so that bucket boundaries follow the
272+
// granularity, have inclusive minimums, and have exclusive maximums.
273+
274+
double prevMax = previous._max.coerceToDouble();
275+
if (prevMax == 0.0) {
276+
// Handle the special case where the largest value in the first bucket is zero. In
277+
// this case, we take the minimum boundary of the second bucket and round it down.
278+
// We then set the maximum boundary of the first bucket to be the rounded down
279+
// value. This maintains that the maximum boundary of the first bucket is exclusive
280+
// and the minimum boundary of the second bucket is inclusive.
281+
previous._max = _granularityRounder->roundDown(newBucket._min);
282+
}
283+
284+
newBucket._min = previous._max;
285+
} else {
286+
// If there is a bucket that comes before the new bucket being added, then the previous
287+
// bucket's max boundary is updated to the new bucket's min. This makes it so that
288+
// buckets' min boundaries are inclusive and max boundaries are exclusive (except for
289+
// the last bucket, which has an inclusive max).
290+
previous._max = newBucket._min;
291+
}
224292
}
225293
_buckets.push_back(newBucket);
226294
}
@@ -254,6 +322,10 @@ Value DocumentSourceBucketAuto::serialize(bool explain) const {
254322
insides["groupBy"] = _groupByExpression->serialize(explain);
255323
insides["buckets"] = Value(_nBuckets);
256324

325+
if (_granularityRounder) {
326+
insides["granularity"] = Value(_granularityRounder->getName());
327+
}
328+
257329
const size_t nOutputFields = _fieldNames.size();
258330
MutableDocument outputSpec(nOutputFields);
259331
for (size_t i = 0; i < nOutputFields; i++) {
@@ -263,8 +335,6 @@ Value DocumentSourceBucketAuto::serialize(bool explain) const {
263335
}
264336
insides["output"] = outputSpec.freezeToValue();
265337

266-
// TODO SERVER-24152: handle granularity field
267-
268338
return Value{Document{{getSourceName(), insides.freezeToValue()}}};
269339
}
270340

@@ -304,6 +374,10 @@ void DocumentSourceBucketAuto::addAccumulator(StringData fieldName,
304374
_expressions.push_back(expression);
305375
}
306376

377+
void DocumentSourceBucketAuto::setGranularity(string granularity) {
378+
_granularityRounder = GranularityRounder::getGranularityRounder(std::move(granularity));
379+
}
380+
307381
intrusive_ptr<DocumentSource> DocumentSourceBucketAuto::createFromBson(
308382
BSONElement elem, const intrusive_ptr<ExpressionContext>& pExpCtx) {
309383
uassert(40240,
@@ -363,11 +437,16 @@ intrusive_ptr<DocumentSource> DocumentSourceBucketAuto::createFromBson(
363437

364438
bucketAuto->addAccumulator(fieldName, factory, accExpression);
365439
}
440+
} else if ("granularity" == argName) {
441+
uassert(40261,
442+
str::stream()
443+
<< "The $bucketAuto 'granularity' field must be a string, but found type: "
444+
<< typeName(argument.type()),
445+
argument.type() == BSONType::String);
446+
bucketAuto->setGranularity(argument.str());
366447
} else {
367448
uasserted(40245, str::stream() << "Unrecognized option to $bucketAuto: " << argName);
368449
}
369-
370-
// TODO SERVER-24152: handle granularity field
371450
}
372451

373452
uassert(40246,

0 commit comments

Comments
 (0)