Skip to content

Commit 6c82e1b

Browse files
author
Andrea Lattuada
committed
SERVER-7592 descriptive statistics estimators
1 parent 7485530 commit 6c82e1b

File tree

4 files changed

+570
-0
lines changed

4 files changed

+570
-0
lines changed

src/mongo/SConscript

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ env.CppUnitTest('namespacestring_test', ['db/namespacestring_test.cpp'],
9292

9393
env.CppUnitTest('bson_extract_test', ['bson/util/bson_extract_test.cpp'], LIBDEPS=['bson'])
9494

95+
env.CppUnitTest('descriptive_stats_test',
96+
['util/descriptive_stats_test.cpp'],
97+
LIBDEPS=['foundation', 'bson']);
98+
9599
commonFiles = [ "pch.cpp",
96100
"buildinfo.cpp",
97101
"db/hasher.cpp",
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
/* Copyright 2012 10gen Inc.
2+
*
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*
15+
*
16+
* Based upon boost.accumulators (www.boost.org/libs/accumulators/),
17+
* distributed under the Boost Software License, Version 1.0.
18+
* See distrc/THIRD_PARTY_NOTICES for the full License Notice for Boost.
19+
*
20+
*/
21+
22+
#pragma once
23+
24+
#include <algorithm>
25+
#include <limits>
26+
27+
#include "mongo/util/mongoutils/str.h"
28+
29+
namespace mongo {
30+
31+
template <class Sample>
32+
BasicEstimators<Sample>::BasicEstimators() :
33+
_count(0),
34+
_sum(0),
35+
_diff(0),
36+
_min(std::numeric_limits<Sample>::max()),
37+
_max(std::numeric_limits<Sample>::min()) {
38+
39+
}
40+
41+
template <class Sample>
42+
BasicEstimators<Sample>& BasicEstimators<Sample>::operator <<(const Sample sample) {
43+
const double oldMean = (_count > 0) ? _sum / _count : 0;
44+
const double delta = oldMean - static_cast<double>(sample);
45+
const double weight = static_cast<double>(_count) / (_count + 1);
46+
_diff += delta * delta * weight;
47+
_sum += static_cast<double>(sample);
48+
_count++;
49+
_min = std::min(sample, _min);
50+
_max = std::max(sample, _max);
51+
return *this;
52+
}
53+
54+
template <class Sample>
55+
void BasicEstimators<Sample>::appendBasicToBSONObjBuilder(BSONObjBuilder& b) const {
56+
b << "count" << static_cast<long long>(count())
57+
<< "mean" << mean()
58+
<< "stddev" << stddev()
59+
<< "min" << min()
60+
<< "max" << max();
61+
}
62+
63+
template <std::size_t NumQuantiles>
64+
DistributionEstimators<NumQuantiles>::DistributionEstimators() :
65+
_count(0) {
66+
67+
for(std::size_t i = 0; i < NumMarkers; i++) {
68+
_actual_positions[i] = i + 1;
69+
}
70+
71+
for(std::size_t i = 0; i < NumMarkers; i++) {
72+
_desired_positions[i] = 1.0 + (2.0 * (NumQuantiles + 1.0) * _positions_increments(i));
73+
}
74+
}
75+
76+
/*
77+
* The quantile estimation follows the extended_p_square implementation in boost.accumulators.
78+
* It differs by removing the ability to request arbitrary quantiles and computing exactly
79+
* 'NumQuantiles' equidistant quantiles (plus minimum and maximum) instead.
80+
* See http://www.boost.org/doc/libs/1_51_0/doc/html/boost/accumulators/impl/extended_p_square_impl.html ,
81+
* R. Jain and I. Chlamtac, The P^2 algorithmus for dynamic calculation of quantiles and histograms without storing observations, Communications of the ACM, Volume 28 (October), Number 10, 1985, p. 1076-1085. and
82+
* K. E. E. Raatikainen, Simultaneous estimation of several quantiles, Simulation, Volume 49, Number 4 (October), 1986, p. 159-164.
83+
*/
84+
template <std::size_t NumQuantiles>
85+
DistributionEstimators<NumQuantiles>&
86+
DistributionEstimators<NumQuantiles>::operator <<(const double sample) {
87+
88+
// first accumulate num_markers samples
89+
if (_count++ < NumMarkers) {
90+
_heights[_count - 1] = sample;
91+
92+
if (_count == NumMarkers)
93+
{
94+
std::sort(_heights, _heights + NumMarkers);
95+
}
96+
}
97+
else {
98+
std::size_t sample_cell = 1;
99+
100+
// find cell k = sample_cell such that heights[k-1] <= sample < heights[k]
101+
if(sample < _heights[0])
102+
{
103+
_heights[0] = sample;
104+
sample_cell = 1;
105+
}
106+
else if (sample >= _heights[NumMarkers - 1])
107+
{
108+
_heights[NumMarkers - 1] = sample;
109+
sample_cell = NumMarkers - 1;
110+
}
111+
else {
112+
double* it = std::upper_bound(_heights,
113+
_heights + NumMarkers,
114+
sample);
115+
116+
sample_cell = std::distance(_heights, it);
117+
}
118+
119+
// update actual positions of all markers above sample_cell index
120+
for(std::size_t i = sample_cell; i < NumMarkers; i++) {
121+
_actual_positions[i]++;
122+
}
123+
124+
// update desired positions of all markers
125+
for(std::size_t i = 0; i < NumMarkers; i++) {
126+
_desired_positions[i] += _positions_increments(i);
127+
}
128+
129+
// adjust heights and actual positions of markers 1 to num_markers-2 if necessary
130+
for(std::size_t i = 1; i <= NumMarkers - 2; i++) {
131+
// offset to desired position
132+
double d = _desired_positions[i] - _actual_positions[i];
133+
134+
// offset to next position
135+
double dp = _actual_positions[i + 1] - _actual_positions[i];
136+
137+
// offset to previous position
138+
double dm = _actual_positions[i - 1] - _actual_positions[i];
139+
140+
// height ds
141+
double hp = (_heights[i + 1] - _heights[i]) / dp;
142+
double hm = (_heights[i - 1] - _heights[i]) / dm;
143+
144+
if((d >= 1 && dp > 1) || (d <= -1 && dm < -1))
145+
{
146+
short sign_d = static_cast<short>(d / std::abs(d));
147+
148+
double h = _heights[i] + sign_d / (dp - dm) * ((sign_d - dm)*hp
149+
+ (dp - sign_d) * hm);
150+
151+
// try adjusting heights[i] using p-squared formula
152+
if(_heights[i - 1] < h && h < _heights[i + 1])
153+
{
154+
_heights[i] = h;
155+
}
156+
else
157+
{
158+
// use linear formula
159+
if(d > 0)
160+
{
161+
_heights[i] += hp;
162+
}
163+
if(d < 0)
164+
{
165+
_heights[i] -= hm;
166+
}
167+
}
168+
_actual_positions[i] += sign_d;
169+
}
170+
}
171+
}
172+
173+
return *this;
174+
}
175+
176+
template <std::size_t NumQuantiles>
177+
void DistributionEstimators<NumQuantiles>::appendQuantilesToBSONArrayBuilder(
178+
BSONArrayBuilder& arr) const {
179+
180+
verify(quantilesReady());
181+
for (std::size_t i = 0; i <= NumQuantiles + 1; i++) {
182+
arr << quantile(i);
183+
}
184+
}
185+
186+
template <std::size_t NumQuantiles>
187+
inline double DistributionEstimators<NumQuantiles>::_positions_increments(std::size_t i) const {
188+
return static_cast<double>(i) / (2 * (NumQuantiles + 1));
189+
}
190+
191+
template <class Sample, std::size_t NumQuantiles>
192+
BSONObj SummaryEstimators<Sample, NumQuantiles>::statisticSummaryToBSONObj() const {
193+
BSONObjBuilder b;
194+
this->BasicEstimators<Sample>::appendBasicToBSONObjBuilder(b);
195+
if (this->DistributionEstimators<NumQuantiles>::quantilesReady()) {
196+
// Not using appendQuantiles to be explicit about which probability each quantile
197+
// refers to. This way the user does not need to count the quantiles or know in
198+
// advance how many quantiles were computed to figure out their meaning.
199+
BSONObjBuilder quantilesBuilder(b.subobjStart("quantiles"));
200+
for (size_t i = 1; i <= NumQuantiles; i++) {
201+
const double probability =
202+
this->DistributionEstimators<NumQuantiles>::probability(i);
203+
const double quantile =
204+
this->DistributionEstimators<NumQuantiles>::quantile(i);
205+
quantilesBuilder.append(std::string(mongoutils::str::stream() << probability),
206+
quantile);
207+
}
208+
quantilesBuilder.doneFast();
209+
}
210+
return b.obj();
211+
}
212+
213+
} // namespace mongo

0 commit comments

Comments
 (0)