Skip to content

Commit f96144e

Browse files
remove stats correction from ES|QL sample (elastic#129319)
* remove stats correction from ES|QL sample * [CI] Auto commit changes from spotless --------- Co-authored-by: elasticsearchmachine <[email protected]>
1 parent 9494873 commit f96144e

File tree

12 files changed

+46
-183
lines changed

12 files changed

+46
-183
lines changed

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/RestSampleTestCase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ public class RestSampleTestCase extends ESRestTestCase {
3333
public void skipWhenSampleDisabled() throws IOException {
3434
assumeTrue(
3535
"Requires SAMPLE capability",
36-
EsqlSpecTestCase.hasCapabilities(adminClient(), List.of(EsqlCapabilities.Cap.SAMPLE_V2.capabilityName()))
36+
EsqlSpecTestCase.hasCapabilities(adminClient(), List.of(EsqlCapabilities.Cap.SAMPLE_V3.capabilityName()))
3737
);
3838
}
3939

x-pack/plugin/esql/qa/testFixtures/src/main/resources/sample.csv-spec

Lines changed: 32 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
// because the CSV tests don't support such assertions.
1010

1111
row
12-
required_capability: sample_v2
12+
required_capability: sample_v3
1313

1414
ROW x = 1 | SAMPLE .999999999
1515
;
@@ -20,7 +20,7 @@ x:integer
2020

2121

2222
row and mv_expand
23-
required_capability: sample_v2
23+
required_capability: sample_v3
2424

2525
ROW x = [1,2,3,4,5] | MV_EXPAND x | SAMPLE .999999999
2626
;
@@ -35,15 +35,14 @@ x:integer
3535

3636

3737
adjust stats for sampling
38-
required_capability: sample_v2
38+
required_capability: sample_v3
3939

4040
FROM employees
4141
| SAMPLE 0.5
42-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no), sum_emp_no = SUM(emp_no)
43-
| EVAL is_expected = count >= 20 AND count <= 180 AND
44-
values_count >= 10 AND values_count <= 90 AND
42+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no), sum_emp_no = SUM(emp_no)
43+
| EVAL is_expected = count >= 10 AND count <= 90 AND
4544
avg_emp_no > 10010 AND avg_emp_no < 10090 AND
46-
sum_emp_no > 20*10010 AND sum_emp_no < 180*10090
45+
sum_emp_no > 10*10010 AND sum_emp_no < 90*10090
4746
| KEEP is_expected
4847
;
4948

@@ -53,14 +52,13 @@ true
5352

5453

5554
before where
56-
required_capability: sample_v2
55+
required_capability: sample_v3
5756

5857
FROM employees
5958
| SAMPLE 0.5
6059
| WHERE emp_no > 10050
61-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
62-
| EVAL is_expected = count >= 5 AND count <= 95 AND
63-
values_count >= 2 AND values_count <= 48 AND
60+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
61+
| EVAL is_expected = count >= 2 AND count <= 48 AND
6462
avg_emp_no > 10055 AND avg_emp_no < 10095
6563
| KEEP is_expected
6664
;
@@ -71,14 +69,13 @@ true
7169

7270

7371
after where
74-
required_capability: sample_v2
72+
required_capability: sample_v3
7573

7674
FROM employees
7775
| WHERE emp_no <= 10050
7876
| SAMPLE 0.5
79-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
80-
| EVAL is_expected = count >= 5 AND count <= 95 AND
81-
values_count >= 2 AND values_count <= 48 AND
77+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
78+
| EVAL is_expected = count >= 2 AND count <= 48 AND
8279
avg_emp_no > 10005 AND avg_emp_no < 10045
8380
| KEEP is_expected
8481
;
@@ -89,14 +86,13 @@ true
8986

9087

9188
before sort
92-
required_capability: sample_v2
89+
required_capability: sample_v3
9390

9491
FROM employees
9592
| SAMPLE 0.5
9693
| SORT emp_no
97-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
98-
| EVAL is_expected = count >= 20 AND count <= 180 AND
99-
values_count >= 10 AND values_count <= 90 AND
94+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
95+
| EVAL is_expected = count >= 10 AND count <= 90 AND
10096
avg_emp_no > 10010 AND avg_emp_no < 10090
10197
| KEEP is_expected
10298
;
@@ -107,14 +103,13 @@ true
107103

108104

109105
after sort
110-
required_capability: sample_v2
106+
required_capability: sample_v3
111107

112108
FROM employees
113109
| SORT emp_no
114110
| SAMPLE 0.5
115-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
116-
| EVAL is_expected = count >= 20 AND count <= 180 AND
117-
values_count >= 10 AND values_count <= 90 AND
111+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
112+
| EVAL is_expected = count >= 10 AND count <= 90 AND
118113
avg_emp_no > 10010 AND avg_emp_no < 10090
119114
| KEEP is_expected
120115
;
@@ -125,13 +120,13 @@ true
125120

126121

127122
before limit
128-
required_capability: sample_v2
123+
required_capability: sample_v3
129124

130125
FROM employees
131126
| SAMPLE 0.5
132127
| LIMIT 10
133-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no))
134-
| EVAL is_expected = count == 10 AND values_count == 10
128+
| STATS count = COUNT(emp_no)
129+
| EVAL is_expected = count == 10
135130
| KEEP is_expected
136131
;
137132

@@ -141,14 +136,13 @@ true
141136

142137

143138
after limit
144-
required_capability: sample_v2
139+
required_capability: sample_v3
145140

146141
FROM employees
147142
| LIMIT 50
148143
| SAMPLE 0.5
149-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no))
150-
| EVAL is_expected = count >= 5 AND count <= 95 AND
151-
values_count >= 2 AND values_count <= 48
144+
| STATS count = COUNT(emp_no)
145+
| EVAL is_expected = count >= 2 AND count <= 48
152146
| KEEP is_expected
153147
;
154148

@@ -158,7 +152,7 @@ true
158152

159153

160154
before mv_expand
161-
required_capability: sample_v2
155+
required_capability: sample_v3
162156

163157
ROW x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50], y = [1,2]
164158
| MV_EXPAND x
@@ -176,7 +170,7 @@ true
176170

177171

178172
after mv_expand
179-
required_capability: sample_v2
173+
required_capability: sample_v3
180174

181175
ROW x = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50], y = [1,2]
182176
| MV_EXPAND x
@@ -194,15 +188,14 @@ true
194188

195189

196190
multiple samples
197-
required_capability: sample_v2
191+
required_capability: sample_v3
198192

199193
FROM employees
200194
| SAMPLE 0.7
201195
| SAMPLE 0.8
202196
| SAMPLE 0.9
203-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(emp_no)), avg_emp_no = AVG(emp_no)
204-
| EVAL is_expected = count >= 20 AND count <= 180 AND
205-
values_count >= 10 AND values_count <= 90 AND
197+
| STATS count = COUNT(), avg_emp_no = AVG(emp_no)
198+
| EVAL is_expected = count >= 10 AND count <= 90 AND
206199
avg_emp_no > 10010 AND avg_emp_no < 10090
207200
| KEEP is_expected
208201
;
@@ -213,15 +206,14 @@ true
213206

214207

215208
after stats
216-
required_capability: sample_v2
209+
required_capability: sample_v3
217210

218211
FROM employees
219212
| SAMPLE 0.5
220213
| STATS avg_salary = AVG(salary) BY job_positions
221214
| SAMPLE 0.8
222-
| STATS count = COUNT(), values_count = MV_COUNT(VALUES(avg_salary)), avg_avg_salary = AVG(avg_salary)
223-
| EVAL is_expected = count >= 1 AND count <= 20 AND
224-
values_count >= 1 AND values_count <= 16 AND
215+
| STATS count = COUNT(), avg_avg_salary = AVG(avg_salary)
216+
| EVAL is_expected = count >= 1 AND count <= 16 AND
225217
avg_avg_salary > 25000 AND avg_avg_salary < 75000
226218
| KEEP is_expected
227219
;

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,7 @@ public enum Cap {
965965
/**
966966
* Support for the SAMPLE command
967967
*/
968-
SAMPLE_V2(Build.current().isSnapshot());
968+
SAMPLE_V3(Build.current().isSnapshot());
969969

970970
private final boolean enabled;
971971

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Count.java

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,8 @@
2424
import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
2525
import org.elasticsearch.xpack.esql.expression.function.Param;
2626
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.FromAggregateMetricDouble;
27-
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong;
2827
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvCount;
2928
import org.elasticsearch.xpack.esql.expression.function.scalar.nulls.Coalesce;
30-
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Div;
3129
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Mul;
3230
import org.elasticsearch.xpack.esql.planner.ToAggregator;
3331

@@ -38,11 +36,9 @@
3836
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.DEFAULT;
3937
import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isType;
4038

41-
public class Count extends AggregateFunction implements ToAggregator, SurrogateExpression, HasSampleCorrection {
39+
public class Count extends AggregateFunction implements ToAggregator, SurrogateExpression {
4240
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Count", Count::new);
4341

44-
private final boolean isSampleCorrected;
45-
4642
@FunctionInfo(
4743
returnType = "long",
4844
description = "Returns the total number (count) of input values.",
@@ -97,20 +93,11 @@ public Count(
9793
}
9894

9995
public Count(Source source, Expression field, Expression filter) {
100-
this(source, field, filter, false);
101-
}
102-
103-
private Count(Source source, Expression field, Expression filter, boolean isSampleCorrected) {
10496
super(source, field, filter, emptyList());
105-
this.isSampleCorrected = isSampleCorrected;
10697
}
10798

10899
private Count(StreamInput in) throws IOException {
109100
super(in);
110-
// isSampleCorrected is only used during query optimization to mark
111-
// whether this function has been processed. Hence there's no need to
112-
// serialize it.
113-
this.isSampleCorrected = false;
114101
}
115102

116103
@Override
@@ -181,14 +168,4 @@ public Expression surrogate() {
181168

182169
return null;
183170
}
184-
185-
@Override
186-
public boolean isSampleCorrected() {
187-
return isSampleCorrected;
188-
}
189-
190-
@Override
191-
public Expression sampleCorrection(Expression sampleProbability) {
192-
return new ToLong(source(), new Div(source(), new Count(source(), field(), filter(), true), sampleProbability));
193-
}
194171
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/HasSampleCorrection.java

Lines changed: 0 additions & 21 deletions
This file was deleted.

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/Sum.java

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@
2525
import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
2626
import org.elasticsearch.xpack.esql.expression.function.Param;
2727
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.FromAggregateMetricDouble;
28-
import org.elasticsearch.xpack.esql.expression.function.scalar.convert.ToLong;
2928
import org.elasticsearch.xpack.esql.expression.function.scalar.multivalue.MvSum;
30-
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Div;
3129
import org.elasticsearch.xpack.esql.expression.predicate.operator.arithmetic.Mul;
3230

3331
import java.io.IOException;
@@ -44,11 +42,9 @@
4442
/**
4543
* Sum all values of a field in matching documents.
4644
*/
47-
public class Sum extends NumericAggregate implements SurrogateExpression, HasSampleCorrection {
45+
public class Sum extends NumericAggregate implements SurrogateExpression {
4846
public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Sum", Sum::new);
4947

50-
private final boolean isSampleCorrected;
51-
5248
@FunctionInfo(
5349
returnType = { "long", "double" },
5450
description = "The sum of a numeric expression.",
@@ -68,20 +64,11 @@ public Sum(Source source, @Param(name = "number", type = { "aggregate_metric_dou
6864
}
6965

7066
public Sum(Source source, Expression field, Expression filter) {
71-
this(source, field, filter, false);
72-
}
73-
74-
private Sum(Source source, Expression field, Expression filter, boolean isSampleCorrected) {
7567
super(source, field, filter, emptyList());
76-
this.isSampleCorrected = isSampleCorrected;
7768
}
7869

7970
private Sum(StreamInput in) throws IOException {
8071
super(in);
81-
// isSampleCorrected is only used during query optimization to mark
82-
// whether this function has been processed. Hence there's no need to
83-
// serialize it.
84-
this.isSampleCorrected = false;
8572
}
8673

8774
@Override
@@ -159,19 +146,4 @@ public Expression surrogate() {
159146
? new Mul(s, new MvSum(s, field), new Count(s, new Literal(s, StringUtils.WILDCARD, DataType.KEYWORD)))
160147
: null;
161148
}
162-
163-
@Override
164-
public boolean isSampleCorrected() {
165-
return isSampleCorrected;
166-
}
167-
168-
@Override
169-
public Expression sampleCorrection(Expression sampleProbability) {
170-
Expression correctedSum = new Div(source(), new Sum(source(), field(), filter(), true), sampleProbability);
171-
return switch (dataType()) {
172-
case DOUBLE -> correctedSum;
173-
case LONG -> new ToLong(source(), correctedSum);
174-
default -> throw new IllegalStateException("unexpected data type [" + dataType() + "]");
175-
};
176-
}
177149
}

x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/LogicalPlanOptimizer.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import org.elasticsearch.xpack.esql.VerificationException;
1111
import org.elasticsearch.xpack.esql.common.Failures;
1212
import org.elasticsearch.xpack.esql.core.type.DataType;
13-
import org.elasticsearch.xpack.esql.optimizer.rules.logical.ApplySampleCorrections;
1413
import org.elasticsearch.xpack.esql.optimizer.rules.logical.BooleanFunctionEqualsElimination;
1514
import org.elasticsearch.xpack.esql.optimizer.rules.logical.BooleanSimplification;
1615
import org.elasticsearch.xpack.esql.optimizer.rules.logical.CombineBinaryComparisons;
@@ -128,7 +127,6 @@ protected static Batch<LogicalPlan> substitutions() {
128127
return new Batch<>(
129128
"Substitutions",
130129
Limiter.ONCE,
131-
new ApplySampleCorrections(),
132130
new SubstituteSurrogatePlans(),
133131
// Translate filtered expressions into aggregate with filters - can't use surrogate expressions because it was
134132
// retrofitted for constant folding - this needs to be fixed.

0 commit comments

Comments
 (0)