ES|QL: fix join masking eval #126614
Changes from 14 commits
@@ -0,0 +1,5 @@
+pr: 126614
+summary: Fix join masking eval
+area: ES|QL
+type: bug
+issues: []
@@ -990,7 +990,13 @@ public enum Cap {
     /**
      * Support avg_over_time aggregation that gets evaluated per time-series
      */
-    AVG_OVER_TIME(Build.current().isSnapshot());
+    AVG_OVER_TIME(Build.current().isSnapshot()),
+
+    /**
+     * During resolution (pre-analysis) we have to consider that joins can override EVALuated values
+     * https://github.com/elastic/elasticsearch/issues/126419
+     */
+    FIX_JOIN_MASKING_EVAL;
 
     private final boolean enabled;
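To make the scenario behind this capability concrete, here is a hedged sketch of the kind of query it is about (the index, lookup index, and field names are invented; whether the alias is actually masked depends on the lookup mapping, which is only known after index resolution):

class JoinMaskingExample {
    // Illustrative only: "language_name" is EVALuated here, but the LOOKUP JOIN may emit a
    // field of the same name from the lookup index, shadowing the alias. Pre-analysis must
    // therefore keep "language_name" in the field_caps request instead of pruning it.
    static final String QUERY = """
        FROM employees
        | EVAL language_name = "unknown"
        | LOOKUP JOIN languages_lookup ON language_code
        | KEEP emp_no, language_name
        """;
}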
@@ -60,12 +60,22 @@
import org.elasticsearch.xpack.esql.parser.QueryParams;
import org.elasticsearch.xpack.esql.plan.IndexPattern;
import org.elasticsearch.xpack.esql.plan.logical.Aggregate;
import org.elasticsearch.xpack.esql.plan.logical.Drop;
import org.elasticsearch.xpack.esql.plan.logical.Enrich;
import org.elasticsearch.xpack.esql.plan.logical.Eval;
import org.elasticsearch.xpack.esql.plan.logical.Filter;
import org.elasticsearch.xpack.esql.plan.logical.Fork;
import org.elasticsearch.xpack.esql.plan.logical.InlineStats;
import org.elasticsearch.xpack.esql.plan.logical.Insist;
import org.elasticsearch.xpack.esql.plan.logical.Keep;
import org.elasticsearch.xpack.esql.plan.logical.Limit;
import org.elasticsearch.xpack.esql.plan.logical.LogicalPlan;
import org.elasticsearch.xpack.esql.plan.logical.MvExpand;
import org.elasticsearch.xpack.esql.plan.logical.OrderBy;
import org.elasticsearch.xpack.esql.plan.logical.Project;
import org.elasticsearch.xpack.esql.plan.logical.RegexExtract;
import org.elasticsearch.xpack.esql.plan.logical.Rename;
import org.elasticsearch.xpack.esql.plan.logical.TopN;
import org.elasticsearch.xpack.esql.plan.logical.UnresolvedRelation;
import org.elasticsearch.xpack.esql.plan.logical.inference.InferencePlan;
import org.elasticsearch.xpack.esql.plan.logical.join.InlineJoin;
@@ -500,6 +510,7 @@ private void preAnalyzeMainIndices(

    /**
     * Check if there are any clusters to search.
     *
     * @return true if there are no clusters to search, false otherwise
     */
    private boolean allCCSClustersSkipped(
@@ -612,7 +623,12 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
     var keepJoinRefsBuilder = AttributeSet.builder();
     Set<String> wildcardJoinIndices = new java.util.HashSet<>();
 
+    boolean[] canRemoveAliases = new boolean[] { true };
+
     parsed.forEachDown(p -> {// go over each plan top-down
+        if (couldOverrideAliases(p)) {
+            canRemoveAliases[0] = false;
+        }
         if (p instanceof RegexExtract re) { // for Grok and Dissect
             // remove other down-the-tree references to the extracted fields
             for (Attribute extracted : re.extractedFields()) {

Review note on the couldOverrideAliases(p) check: I would do this check closer to the actual logic that uses it; I find the code easier to reason about that way (there is a lot of code between this check and the place where its result is actually used, code that doesn't use the result of this check).
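The single-element boolean array is the usual workaround for mutating a flag from inside a lambda, since captured locals must be effectively final. A minimal, self-contained sketch of the idiom (plain Java, unrelated to ES|QL plans):

import java.util.List;

class MutableFlagInLambda {
    public static void main(String[] args) {
        // The array reference stays effectively final, but its single element can be flipped
        // from inside the lambda, just like canRemoveAliases[0] in the plan traversal above.
        boolean[] canRemove = new boolean[] { true };
        List.of("eval", "lookup join", "keep").forEach(cmd -> {
            if (cmd.contains("join")) {
                canRemove[0] = false;
            }
        });
        System.out.println(canRemove[0]); // prints false
    }
}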
@@ -663,14 +679,19 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
     AttributeSet planRefs = p.references();
     Set<String> fieldNames = planRefs.names();
     p.forEachExpressionDown(Alias.class, alias -> {
-        // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id = id"
-        // or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
-        if (fieldNames.contains(alias.name())) {
-            return;
-        }
-        referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), keepCommandRefsBuilder.contains(attr)));
+        // If there are joins/enriches in the middle, these could override some of these fields.
+        // We don't know at this stage, so we have to keep all of them.
+        if (canRemoveAliases[0]) {
+            // do not remove the UnresolvedAttribute that has the same name as its alias, ie "rename id = id"
+            // or the UnresolvedAttributes that are used in Functions that have aliases "STATS id = MAX(id)"
+            if (fieldNames.contains(alias.name())) {
+                return;
+            }
+            referencesBuilder.removeIf(attr -> matchByName(attr, alias.name(), keepCommandRefsBuilder.contains(attr)));
+        }
     });
 
     // Add JOIN ON column references afterward to avoid Alias removal
     referencesBuilder.addAll(keepJoinRefsBuilder);
     // If any JOIN commands need wildcard field-caps calls, persist the index names
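For reference, the "same name as its alias" cases mentioned in the code comments ("rename id = id", "STATS id = MAX(id)") correspond to query shapes like the one below (a hedged illustration reusing the employees test fixture; it is not part of this diff):

class AliasSameNameExample {
    // The alias reuses the name of the field it is computed from, so the underlying field must
    // still be requested via field_caps even though an alias of the same name shadows it later.
    static final String STATS_SAME_NAME = "FROM employees | STATS languages = MAX(languages)";
}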
@@ -694,6 +715,31 @@ static PreAnalysisResult fieldNames(LogicalPlan parsed, Set<String> enrichPolicy
         }
     }
 
+    /**
+     * Could a plan "accidentally" override aliases?
+     * Examples are JOIN and ENRICH, that _could_ produce fields with the same
+     * name of an existing alias, based on their index mapping.
+     * Here we just have to consider commands where this information is not available before index resolution,
+     * eg. EVAL, GROK, DISSECT can override an alias, but we know it in advance, ie. we don't need to resolve indices to know.
+     */
+    private static boolean couldOverrideAliases(LogicalPlan p) {
+        return (p instanceof Aggregate
+            || p instanceof Drop
+            || p instanceof Eval
+            || p instanceof Filter
+            || p instanceof Fork
+            || p instanceof InlineStats
+            || p instanceof Insist
+            || p instanceof Keep
+            || p instanceof Limit
+            || p instanceof MvExpand
+            || p instanceof OrderBy
+            || p instanceof Project
+            || p instanceof RegexExtract
+            || p instanceof Rename
+            || p instanceof TopN) == false;
+    }
+
     private static boolean matchByName(Attribute attr, String other, boolean skipIfPattern) {
         boolean isPattern = Regex.isSimpleMatchPattern(attr.name());
         if (skipIfPattern && isPattern) {

Review discussion on couldOverrideAliases:

Why are you listing all of the commands that cannot define an additional "hidden" Attribute instead of checking those that can?

To avoid it breaking when we add new commands that behave like JOIN.

If we add a command that doesn't behave like JOIN, then it could break the other way around. IMO it makes much more sense to list the commands that are "special".

This is interesting; I'm not sure I completely understand the implications of this aspect. If I got it right, we use these field names to limit the scope of field_caps; if we add a field that does not exist in any of the involved indices, will field_caps fail? My understanding is that it will work fine, just ignoring the additional fields. If that weren't the case, we would be in a catch-22: we couldn't know which fields to send before validating the query, and we couldn't validate the query before sending the fields to field_caps. The fact that everything works fine makes me think it's safe, but maybe I'm missing something.

How about Completion? I think Completion is safe (the target attribute is defined at parsing time), but I have to double-check.

Yep, I see your point. Adding more fields is safer; adding fewer fields is problematic. Better more fields than fewer.
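For comparison with the deny-list above, the alternative raised in the thread would enumerate only the "special" commands whose output columns depend on a mapping that is not resolved yet. A hedged sketch (not part of this PR; LookupJoin and Enrich are named here as the obvious candidates, and the exact set was left open in the discussion):

// Allow-list formulation suggested in review: return true only for commands that can
// introduce mapping-dependent columns; newly added commands then default to "cannot
// override", which is the opposite trade-off of the deny-list in the PR.
private static boolean couldOverrideAliases(LogicalPlan p) {
    return p instanceof LookupJoin || p instanceof Enrich;
}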
@@ -478,13 +478,16 @@ public void testDropAllColumns_WithStats() {
 }
 
 public void testEnrichOn() {
-    assertFieldNames("""
-        from employees
-        | sort emp_no
-        | limit 1
-        | eval x = to_string(languages)
-        | enrich languages_policy on x
-        | keep emp_no, language_name""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*"));
+    assertFieldNames(
+        """
+            from employees
+            | sort emp_no
+            | limit 1
+            | eval x = to_string(languages)
+            | enrich languages_policy on x
+            | keep emp_no, language_name""",
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")
+    );
 }
 
 public void testEnrichOn2() {
@@ -494,7 +497,7 @@ public void testEnrichOn2() {
         | enrich languages_policy on x
         | keep emp_no, language_name
         | sort emp_no
-        | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*"));
+        | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*"));
 }
 
 public void testUselessEnrich() {
@@ -512,15 +515,15 @@ public void testSimpleSortLimit() {
         | enrich languages_policy on x
         | keep emp_no, language_name
         | sort emp_no
-        | limit 1""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*"));
+        | limit 1""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*"));
 }
 
 public void testWith() {
     assertFieldNames(
         """
             from employees | eval x = to_string(languages) | keep emp_no, x | sort emp_no | limit 1
             | enrich languages_policy on x with language_name""",
-        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")
     );
 }
@@ -529,7 +532,7 @@ public void testWithAlias() {
         """
             from employees | sort emp_no | limit 3 | eval x = to_string(languages) | keep emp_no, x
             | enrich languages_policy on x with lang = language_name""",
-        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")
     );
 }
 
@@ -538,7 +541,7 @@ public void testWithAliasSort() {
         """
             from employees | eval x = to_string(languages) | keep emp_no, x | sort emp_no | limit 3
             | enrich languages_policy on x with lang = language_name""",
-        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")
     );
 }
 
@@ -547,7 +550,7 @@ public void testWithAliasAndPlain() {
         """
             from employees | sort emp_no desc | limit 3 | eval x = to_string(languages) | keep emp_no, x
             | enrich languages_policy on x with lang = language_name, language_name""",
-        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")
     );
 }
 
@@ -556,7 +559,7 @@ public void testWithTwoAliasesSameProp() {
         """
             from employees | sort emp_no | limit 1 | eval x = to_string(languages) | keep emp_no, x
             | enrich languages_policy on x with lang = language_name, lang2 = language_name""",
-        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")
     );
 }
 
@@ -565,7 +568,7 @@ public void testRedundantWith() {
         """
             from employees | sort emp_no | limit 1 | eval x = to_string(languages) | keep emp_no, x
             | enrich languages_policy on x with language_name, language_name""",
-        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*")
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "x.*")
     );
 }
@@ -588,28 +591,34 @@ public void testConstantNullInput() {
         | eval x = to_string(languages)
         | keep emp_no, x
         | enrich languages_policy on x with language_name, language_name""",
-        Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*")
+        Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*")
     );
 }
 
 public void testEnrichEval() {
-    assertFieldNames("""
-        from employees
-        | eval x = to_string(languages)
-        | enrich languages_policy on x with lang = language_name
-        | eval language = concat(x, "-", lang)
-        | keep emp_no, x, lang, language
-        | sort emp_no desc | limit 3""", Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*"));
+    assertFieldNames(
+        """
+            from employees
+            | eval x = to_string(languages)
+            | enrich languages_policy on x with lang = language_name
+            | eval language = concat(x, "-", lang)
+            | keep emp_no, x, lang, language
+            | sort emp_no desc | limit 3""",
+        Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*", "lang", "lang.*")
+    );
 }
 
 public void testSimple() {
-    assertFieldNames("""
-        from employees
-        | eval x = 1, y = to_string(languages)
-        | enrich languages_policy on y
-        | where x > 1
-        | keep emp_no, language_name
-        | limit 1""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*"));
+    assertFieldNames(
+        """
+            from employees
+            | eval x = 1, y = to_string(languages)
+            | enrich languages_policy on y
+            | where x > 1
+            | keep emp_no, language_name
+            | limit 1""",
+        Set.of("emp_no", "emp_no.*", "languages", "languages.*", "language_name", "language_name.*", "x", "y", "x.*", "y.*")
+    );
 }
 
 public void testEvalNullSort() {

Review note on the new expected set in testSimple: we don't know the structure of the enrich index yet; after the ENRICH, x could become something else (e.g. a keyword) and the WHERE could be invalid, which is why x (and y) now appear in the expected field names.
Please add this test, and others (be creative), to IndexResolverFieldNamesTests.
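A hedged sketch of one such additional case for IndexResolverFieldNamesTests (the lookup index name is invented, it relies on the class's existing assertFieldNames helper, and the expected set is a guess that would need to be confirmed against the resolver's actual output):

public void testEvalThenLookupJoinKeepsAlias() {
    // With this PR, the EVAL'd alias "x" should stay in the collected field names because the
    // LOOKUP JOIN that follows could bring in a column with the same name and mask it.
    assertFieldNames("""
        from employees
        | eval x = to_string(languages)
        | lookup join languages_lookup on x
        | keep emp_no, x""", Set.of("emp_no", "emp_no.*", "languages", "languages.*", "x", "x.*"));
}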