diff --git a/docs/changelog/126237.yaml b/docs/changelog/126237.yaml new file mode 100644 index 0000000000000..24f245a528795 --- /dev/null +++ b/docs/changelog/126237.yaml @@ -0,0 +1,5 @@ +pr: 126237 +summary: Use `FallbackSyntheticSourceBlockLoader` for text fields +area: Mapping +type: enhancement +issues: [] diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java index 5242b33666b21..3a60da9d742ee 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TextFieldMapper.java @@ -74,6 +74,7 @@ import org.elasticsearch.search.aggregations.support.CoreValuesSourceType; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParser; import java.io.IOException; import java.util.ArrayList; @@ -1019,10 +1020,53 @@ protected String delegatingTo() { if (isStored()) { return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name()); } + + // _ignored_source field will only be present if text field is not stored + // and there is no syntheticSourceDelegate + if (isSyntheticSource && syntheticSourceDelegate == null) { + return fallbackSyntheticSourceBlockLoader(); + } + SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name())); return new BlockSourceReader.BytesRefsBlockLoader(fetcher, blockReaderDisiLookup(blContext)); } + FallbackSyntheticSourceBlockLoader fallbackSyntheticSourceBlockLoader() { + var reader = new FallbackSyntheticSourceBlockLoader.SingleValueReader(null) { + @Override + public void convertValue(Object value, List accumulator) { + if (value != null) { + accumulator.add(new BytesRef(value.toString())); + } + } + + @Override + protected void parseNonNullValue(XContentParser parser, List accumulator) throws IOException { + var text = parser.textOrNull(); + + if (text != null) { + accumulator.add(new BytesRef(text)); + } + } + + @Override + public void writeToBlock(List values, BlockLoader.Builder blockBuilder) { + var bytesRefBuilder = (BlockLoader.BytesRefBuilder) blockBuilder; + + for (var value : values) { + bytesRefBuilder.appendBytesRef(value); + } + } + }; + + return new FallbackSyntheticSourceBlockLoader(reader, name()) { + @Override + public Builder builder(BlockFactory factory, int expectedCount) { + return factory.bytesRefs(expectedCount); + } + }; + } + /** * Build an iterator of documents that have the field. This mirrors parseCreateField, * using whatever diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java index e2a7b4fc49a4b..6e1b177fb218d 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/KeywordFieldBlockLoaderTests.java @@ -24,9 +24,13 @@ public KeywordFieldBlockLoaderTests(Params params) { super(FieldType.KEYWORD.toString(), params); } - @SuppressWarnings("unchecked") @Override protected Object expected(Map fieldMapping, Object value, TestContext testContext) { + return expectedValue(fieldMapping, value, params, testContext); + } + + @SuppressWarnings("unchecked") + public static Object expectedValue(Map fieldMapping, Object value, Params params, TestContext testContext) { var nullValue = (String) fieldMapping.get("null_value"); var ignoreAbove = fieldMapping.get("ignore_above") == null @@ -59,7 +63,7 @@ protected Object expected(Map fieldMapping, Object value, TestCo return maybeFoldList(resultList); } - private BytesRef convert(String value, String nullValue, int ignoreAbove) { + private static BytesRef convert(String value, String nullValue, int ignoreAbove) { if (value == null) { if (nullValue != null) { value = nullValue; diff --git a/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java new file mode 100644 index 0000000000000..10d8ce748aec1 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/blockloader/TextFieldBlockLoaderTests.java @@ -0,0 +1,135 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.mapper.blockloader; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.index.mapper.BlockLoaderTestCase; +import org.elasticsearch.logsdb.datageneration.FieldType; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +public class TextFieldBlockLoaderTests extends BlockLoaderTestCase { + public TextFieldBlockLoaderTests(Params params) { + super(FieldType.TEXT.toString(), params); + } + + @SuppressWarnings("unchecked") + @Override + protected Object expected(Map fieldMapping, Object value, TestContext testContext) { + if (fieldMapping.getOrDefault("store", false).equals(true)) { + return valuesInSourceOrder(value); + } + + var fields = (Map) fieldMapping.get("fields"); + if (fields != null) { + var keywordMultiFieldMapping = (Map) fields.get("kwd"); + boolean docValues = hasDocValues(keywordMultiFieldMapping, true); + boolean index = keywordMultiFieldMapping.getOrDefault("index", true).equals(true); + boolean store = keywordMultiFieldMapping.getOrDefault("store", false).equals(true); + Object ignoreAbove = keywordMultiFieldMapping.get("ignore_above"); + + // See TextFieldMapper.SyntheticSourceHelper#usingSyntheticSourceDelegate + // and TextFieldMapper#canUseSyntheticSourceDelegateForLoading(). + boolean usingSyntheticSourceDelegate = docValues || store; + boolean canUseSyntheticSourceDelegateForLoading = usingSyntheticSourceDelegate && ignoreAbove == null && (index || store); + if (canUseSyntheticSourceDelegateForLoading) { + return KeywordFieldBlockLoaderTests.expectedValue(keywordMultiFieldMapping, value, params, testContext); + } + + // Even if multi-field is not eligible for loading it can still be used to produce synthetic source + // and then we load from the synthetic source. + // Synthetic source is actually different from keyword field block loader results + // because synthetic source includes values exceeding ignore_above and block loader doesn't. + // TODO ideally this logic should be in some kind of KeywordFieldSyntheticSourceTest that uses same infra as + // KeywordFieldBlockLoaderTest + // It is here since KeywordFieldBlockLoaderTest does not really need it + if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) { + var nullValue = (String) keywordMultiFieldMapping.get("null_value"); + + // Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated. + // If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate, + // parse it and see null_value inside. + // But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist). + // Same goes for lookupFromFieldNames(). + boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true); + + if (value == null) { + if (textFieldIndexed == false + && nullValue != null + && (ignoreAbove == null || nullValue.length() <= (int) ignoreAbove)) { + return new BytesRef(nullValue); + } + + return null; + } + + if (value instanceof String s) { + return new BytesRef(s); + } + + var values = (List) value; + + // See note above about TextFieldMapper#blockReaderDisiLookup. + if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) { + return null; + } + + var indexed = values.stream() + .map(s -> s == null ? nullValue : s) + .filter(Objects::nonNull) + .filter(s -> ignoreAbove == null || s.length() <= (int) ignoreAbove) + .map(BytesRef::new) + .collect(Collectors.toList()); + + if (store == false) { + // using doc_values for synthetic source + indexed = new ArrayList<>(new HashSet<>(indexed)); + indexed.sort(BytesRef::compareTo); + } + + // ignored values always come last + List ignored = ignoreAbove == null + ? List.of() + : values.stream() + .map(s -> s == null ? nullValue : s) + .filter(Objects::nonNull) + .filter(s -> s.length() > (int) ignoreAbove) + .map(BytesRef::new) + .toList(); + + indexed.addAll(ignored); + + return maybeFoldList(indexed); + } + } + + // Loading from _ignored_source or stored _source + return valuesInSourceOrder(value); + } + + @SuppressWarnings("unchecked") + private Object valuesInSourceOrder(Object value) { + if (value == null) { + return null; + } + + if (value instanceof String s) { + return new BytesRef(s); + } + + var resultList = ((List) value).stream().filter(Objects::nonNull).map(BytesRef::new).toList(); + return maybeFoldList(resultList); + } +} diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/FieldType.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/FieldType.java index 127e4a4fc75e3..cb2b443658af4 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/FieldType.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/FieldType.java @@ -23,6 +23,7 @@ import org.elasticsearch.logsdb.datageneration.fields.leaf.LongFieldDataGenerator; import org.elasticsearch.logsdb.datageneration.fields.leaf.ScaledFloatFieldDataGenerator; import org.elasticsearch.logsdb.datageneration.fields.leaf.ShortFieldDataGenerator; +import org.elasticsearch.logsdb.datageneration.fields.leaf.TextFieldDataGenerator; import org.elasticsearch.logsdb.datageneration.fields.leaf.UnsignedLongFieldDataGenerator; /** @@ -42,7 +43,8 @@ public enum FieldType { COUNTED_KEYWORD("counted_keyword"), BOOLEAN("boolean"), DATE("date"), - GEO_POINT("geo_point"); + GEO_POINT("geo_point"), + TEXT("text"); private final String name; @@ -66,6 +68,7 @@ public FieldDataGenerator generator(String fieldName, DataSource dataSource) { case BOOLEAN -> new BooleanFieldDataGenerator(dataSource); case DATE -> new DateFieldDataGenerator(dataSource); case GEO_POINT -> new GeoPointFieldDataGenerator(dataSource); + case TEXT -> new TextFieldDataGenerator(dataSource); }; } @@ -85,6 +88,7 @@ public static FieldType tryParse(String name) { case "boolean" -> FieldType.BOOLEAN; case "date" -> FieldType.DATE; case "geo_point" -> FieldType.GEO_POINT; + case "text" -> FieldType.TEXT; default -> null; }; } diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java index bf87285b9c730..2dc2082f9ce60 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/datasource/DefaultMappingParametersHandler.java @@ -35,10 +35,7 @@ public DataSourceResponse.LeafMappingParametersGenerator handle(DataSourceReques return null; } - var map = new HashMap(); - map.put("store", ESTestCase.randomBoolean()); - map.put("index", ESTestCase.randomBoolean()); - map.put("doc_values", ESTestCase.randomBoolean()); + var map = commonMappingParameters(); if (ESTestCase.randomBoolean()) { map.put(Mapper.SYNTHETIC_SOURCE_KEEP_PARAM, ESTestCase.randomFrom("none", "arrays", "all")); } @@ -51,6 +48,7 @@ public DataSourceResponse.LeafMappingParametersGenerator handle(DataSourceReques case BOOLEAN -> booleanMapping(map); case DATE -> dateMapping(map); case GEO_POINT -> geoPointMapping(map); + case TEXT -> textMapping(request, new HashMap<>()); }); } @@ -190,6 +188,35 @@ private Supplier> geoPointMapping(Map inject }; } + private Supplier> textMapping( + DataSourceRequest.LeafMappingParametersGenerator request, + Map injected + ) { + return () -> { + injected.put("store", ESTestCase.randomBoolean()); + injected.put("index", ESTestCase.randomBoolean()); + + if (ESTestCase.randomDouble() <= 0.1) { + var keywordMultiFieldMapping = keywordMapping(request, commonMappingParameters()).get(); + keywordMultiFieldMapping.put("type", "keyword"); + keywordMultiFieldMapping.remove("copy_to"); + + injected.put("fields", Map.of("kwd", keywordMultiFieldMapping)); + + } + + return injected; + }; + } + + private static HashMap commonMappingParameters() { + var map = new HashMap(); + map.put("store", ESTestCase.randomBoolean()); + map.put("index", ESTestCase.randomBoolean()); + map.put("doc_values", ESTestCase.randomBoolean()); + return map; + } + @Override public DataSourceResponse.ObjectMappingParametersGenerator handle(DataSourceRequest.ObjectMappingParametersGenerator request) { if (request.isNested()) { diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/fields/leaf/TextFieldDataGenerator.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/fields/leaf/TextFieldDataGenerator.java new file mode 100644 index 0000000000000..ba107cd844fe1 --- /dev/null +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/fields/leaf/TextFieldDataGenerator.java @@ -0,0 +1,34 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.logsdb.datageneration.fields.leaf; + +import org.elasticsearch.logsdb.datageneration.FieldDataGenerator; +import org.elasticsearch.logsdb.datageneration.datasource.DataSource; +import org.elasticsearch.logsdb.datageneration.datasource.DataSourceRequest; + +import java.util.Map; +import java.util.function.Supplier; + +public class TextFieldDataGenerator implements FieldDataGenerator { + private final Supplier valueGenerator; + + public TextFieldDataGenerator(DataSource dataSource) { + var strings = dataSource.get(new DataSourceRequest.StringGenerator()); + var nulls = dataSource.get(new DataSourceRequest.NullWrapper()); + var arrays = dataSource.get(new DataSourceRequest.ArrayWrapper()); + + this.valueGenerator = arrays.wrapper().compose(nulls.wrapper()).apply(() -> strings.generator().get()); + } + + @Override + public Object generateValue(Map fieldMapping) { + return valueGenerator.get(); + } +} diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java index e484b955b0517..cd9d39ec3637a 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/FieldSpecificMatcher.java @@ -61,6 +61,7 @@ static Map matchers( put("geo_shape", new ExactMatcher("geo_shape", actualMappings, actualSettings, expectedMappings, expectedSettings)); put("shape", new ExactMatcher("shape", actualMappings, actualSettings, expectedMappings, expectedSettings)); put("geo_point", new GeoPointMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings)); + put("text", new TextMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings)); } }; } @@ -601,6 +602,73 @@ public MatchResult match( } } + class TextMatcher implements FieldSpecificMatcher { + private final XContentBuilder actualMappings; + private final Settings.Builder actualSettings; + private final XContentBuilder expectedMappings; + private final Settings.Builder expectedSettings; + + TextMatcher( + XContentBuilder actualMappings, + Settings.Builder actualSettings, + XContentBuilder expectedMappings, + Settings.Builder expectedSettings + ) { + this.actualMappings = actualMappings; + this.actualSettings = actualSettings; + this.expectedMappings = expectedMappings; + this.expectedSettings = expectedSettings; + } + + @Override + @SuppressWarnings("unchecked") + public MatchResult match( + List actual, + List expected, + Map actualMapping, + Map expectedMapping + ) { + var expectedNormalized = normalize(expected); + var actualNormalized = normalize(actual); + + // Match simply as text first. + if (actualNormalized.equals(expectedNormalized)) { + return MatchResult.match(); + } + + // In some cases synthetic source for text fields is synthesized using the keyword multi field. + // So in this case it's appropriate to match it using keyword matching logic (mainly to cover `null_value`). + var multiFields = (Map) getMappingParameter("fields", actualMapping, expectedMapping); + if (multiFields != null) { + var keywordMatcher = new KeywordMatcher(actualMappings, actualSettings, expectedMappings, expectedSettings); + + var keywordFieldMapping = (Map) multiFields.get("kwd"); + var keywordMatchResult = keywordMatcher.match(actual, expected, keywordFieldMapping, keywordFieldMapping); + if (keywordMatchResult.isMatch()) { + return MatchResult.match(); + } + } + + return MatchResult.noMatch( + formatErrorMessage( + actualMappings, + actualSettings, + expectedMappings, + expectedSettings, + "Values of type [text] don't match, " + prettyPrintCollections(actual, expected) + ) + ); + } + + private Set normalize(List values) { + if (values == null) { + return Set.of(); + } + + return values.stream().filter(Objects::nonNull).collect(Collectors.toSet()); + } + } + /** * Generic matcher that supports common matching logic like null values. */ diff --git a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/MappingTransforms.java b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/MappingTransforms.java index 2d961f1f1408e..dcae8ea07993d 100644 --- a/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/MappingTransforms.java +++ b/test/framework/src/main/java/org/elasticsearch/logsdb/datageneration/matchers/source/MappingTransforms.java @@ -46,7 +46,7 @@ private static void descend(String pathFromRoot, Map currentLeve if (entry.getKey().equals("_doc") || entry.getKey().equals("properties")) { descend(pathFromRoot, (Map) entry.getValue(), flattened); } else { - if (entry.getValue() instanceof Map map) { + if (entry.getKey().equals("fields") == false && entry.getValue() instanceof Map map) { var pathToField = pathFromRoot == null ? entry.getKey() : pathFromRoot + "." + entry.getKey(); // Descending to subobject, we need to remember parent mapping