Skip to content

[8.x] Use FallbackSyntheticSourceBlockLoader for text fields (#126237) #126430

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/126237.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 126237
summary: Use `FallbackSyntheticSourceBlockLoader` for text fields
area: Mapping
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;

import java.io.IOException;
import java.util.ArrayList;
Expand Down Expand Up @@ -1019,10 +1020,53 @@ protected String delegatingTo() {
if (isStored()) {
return new BlockStoredFieldsReader.BytesFromStringsBlockLoader(name());
}

// _ignored_source field will only be present if text field is not stored
// and there is no syntheticSourceDelegate
if (isSyntheticSource && syntheticSourceDelegate == null) {
return fallbackSyntheticSourceBlockLoader();
}

SourceValueFetcher fetcher = SourceValueFetcher.toString(blContext.sourcePaths(name()));
return new BlockSourceReader.BytesRefsBlockLoader(fetcher, blockReaderDisiLookup(blContext));
}

FallbackSyntheticSourceBlockLoader fallbackSyntheticSourceBlockLoader() {
var reader = new FallbackSyntheticSourceBlockLoader.SingleValueReader<BytesRef>(null) {
@Override
public void convertValue(Object value, List<BytesRef> accumulator) {
if (value != null) {
accumulator.add(new BytesRef(value.toString()));
}
}

@Override
protected void parseNonNullValue(XContentParser parser, List<BytesRef> accumulator) throws IOException {
var text = parser.textOrNull();

if (text != null) {
accumulator.add(new BytesRef(text));
}
}

@Override
public void writeToBlock(List<BytesRef> values, BlockLoader.Builder blockBuilder) {
var bytesRefBuilder = (BlockLoader.BytesRefBuilder) blockBuilder;

for (var value : values) {
bytesRefBuilder.appendBytesRef(value);
}
}
};

return new FallbackSyntheticSourceBlockLoader(reader, name()) {
@Override
public Builder builder(BlockFactory factory, int expectedCount) {
return factory.bytesRefs(expectedCount);
}
};
}

/**
* Build an iterator of documents that have the field. This mirrors parseCreateField,
* using whatever
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,13 @@ public KeywordFieldBlockLoaderTests(Params params) {
super(FieldType.KEYWORD.toString(), params);
}

@SuppressWarnings("unchecked")
@Override
protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) {
return expectedValue(fieldMapping, value, params, testContext);
}

@SuppressWarnings("unchecked")
public static Object expectedValue(Map<String, Object> fieldMapping, Object value, Params params, TestContext testContext) {
var nullValue = (String) fieldMapping.get("null_value");

var ignoreAbove = fieldMapping.get("ignore_above") == null
Expand Down Expand Up @@ -59,7 +63,7 @@ protected Object expected(Map<String, Object> fieldMapping, Object value, TestCo
return maybeFoldList(resultList);
}

private BytesRef convert(String value, String nullValue, int ignoreAbove) {
private static BytesRef convert(String value, String nullValue, int ignoreAbove) {
if (value == null) {
if (nullValue != null) {
value = nullValue;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.mapper.blockloader;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.mapper.BlockLoaderTestCase;
import org.elasticsearch.logsdb.datageneration.FieldType;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

public class TextFieldBlockLoaderTests extends BlockLoaderTestCase {
public TextFieldBlockLoaderTests(Params params) {
super(FieldType.TEXT.toString(), params);
}

@SuppressWarnings("unchecked")
@Override
protected Object expected(Map<String, Object> fieldMapping, Object value, TestContext testContext) {
if (fieldMapping.getOrDefault("store", false).equals(true)) {
return valuesInSourceOrder(value);
}

var fields = (Map<String, Object>) fieldMapping.get("fields");
if (fields != null) {
var keywordMultiFieldMapping = (Map<String, Object>) fields.get("kwd");
boolean docValues = hasDocValues(keywordMultiFieldMapping, true);
boolean index = keywordMultiFieldMapping.getOrDefault("index", true).equals(true);
boolean store = keywordMultiFieldMapping.getOrDefault("store", false).equals(true);
Object ignoreAbove = keywordMultiFieldMapping.get("ignore_above");

// See TextFieldMapper.SyntheticSourceHelper#usingSyntheticSourceDelegate
// and TextFieldMapper#canUseSyntheticSourceDelegateForLoading().
boolean usingSyntheticSourceDelegate = docValues || store;
boolean canUseSyntheticSourceDelegateForLoading = usingSyntheticSourceDelegate && ignoreAbove == null && (index || store);
if (canUseSyntheticSourceDelegateForLoading) {
return KeywordFieldBlockLoaderTests.expectedValue(keywordMultiFieldMapping, value, params, testContext);
}

// Even if multi-field is not eligible for loading it can still be used to produce synthetic source
// and then we load from the synthetic source.
// Synthetic source is actually different from keyword field block loader results
// because synthetic source includes values exceeding ignore_above and block loader doesn't.
// TODO ideally this logic should be in some kind of KeywordFieldSyntheticSourceTest that uses same infra as
// KeywordFieldBlockLoaderTest
// It is here since KeywordFieldBlockLoaderTest does not really need it
if (params.syntheticSource() && testContext.forceFallbackSyntheticSource() == false && usingSyntheticSourceDelegate) {
var nullValue = (String) keywordMultiFieldMapping.get("null_value");

// Due to how TextFieldMapper#blockReaderDisiLookup works this is complicated.
// If we are using lookupMatchingAll() then we'll see all docs, generate synthetic source using syntheticSourceDelegate,
// parse it and see null_value inside.
// But if we are using lookupFromNorms() we will skip the document (since the text field itself does not exist).
// Same goes for lookupFromFieldNames().
boolean textFieldIndexed = (boolean) fieldMapping.getOrDefault("index", true);

if (value == null) {
if (textFieldIndexed == false
&& nullValue != null
&& (ignoreAbove == null || nullValue.length() <= (int) ignoreAbove)) {
return new BytesRef(nullValue);
}

return null;
}

if (value instanceof String s) {
return new BytesRef(s);
}

var values = (List<String>) value;

// See note above about TextFieldMapper#blockReaderDisiLookup.
if (textFieldIndexed && values.stream().allMatch(Objects::isNull)) {
return null;
}

var indexed = values.stream()
.map(s -> s == null ? nullValue : s)
.filter(Objects::nonNull)
.filter(s -> ignoreAbove == null || s.length() <= (int) ignoreAbove)
.map(BytesRef::new)
.collect(Collectors.toList());

if (store == false) {
// using doc_values for synthetic source
indexed = new ArrayList<>(new HashSet<>(indexed));
indexed.sort(BytesRef::compareTo);
}

// ignored values always come last
List<BytesRef> ignored = ignoreAbove == null
? List.of()
: values.stream()
.map(s -> s == null ? nullValue : s)
.filter(Objects::nonNull)
.filter(s -> s.length() > (int) ignoreAbove)
.map(BytesRef::new)
.toList();

indexed.addAll(ignored);

return maybeFoldList(indexed);
}
}

// Loading from _ignored_source or stored _source
return valuesInSourceOrder(value);
}

@SuppressWarnings("unchecked")
private Object valuesInSourceOrder(Object value) {
if (value == null) {
return null;
}

if (value instanceof String s) {
return new BytesRef(s);
}

var resultList = ((List<String>) value).stream().filter(Objects::nonNull).map(BytesRef::new).toList();
return maybeFoldList(resultList);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.elasticsearch.logsdb.datageneration.fields.leaf.LongFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.ScaledFloatFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.ShortFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.TextFieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.fields.leaf.UnsignedLongFieldDataGenerator;

/**
Expand All @@ -42,7 +43,8 @@ public enum FieldType {
COUNTED_KEYWORD("counted_keyword"),
BOOLEAN("boolean"),
DATE("date"),
GEO_POINT("geo_point");
GEO_POINT("geo_point"),
TEXT("text");

private final String name;

Expand All @@ -66,6 +68,7 @@ public FieldDataGenerator generator(String fieldName, DataSource dataSource) {
case BOOLEAN -> new BooleanFieldDataGenerator(dataSource);
case DATE -> new DateFieldDataGenerator(dataSource);
case GEO_POINT -> new GeoPointFieldDataGenerator(dataSource);
case TEXT -> new TextFieldDataGenerator(dataSource);
};
}

Expand All @@ -85,6 +88,7 @@ public static FieldType tryParse(String name) {
case "boolean" -> FieldType.BOOLEAN;
case "date" -> FieldType.DATE;
case "geo_point" -> FieldType.GEO_POINT;
case "text" -> FieldType.TEXT;
default -> null;
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@ public DataSourceResponse.LeafMappingParametersGenerator handle(DataSourceReques
return null;
}

var map = new HashMap<String, Object>();
map.put("store", ESTestCase.randomBoolean());
map.put("index", ESTestCase.randomBoolean());
map.put("doc_values", ESTestCase.randomBoolean());
var map = commonMappingParameters();
if (ESTestCase.randomBoolean()) {
map.put(Mapper.SYNTHETIC_SOURCE_KEEP_PARAM, ESTestCase.randomFrom("none", "arrays", "all"));
}
Expand All @@ -51,6 +48,7 @@ public DataSourceResponse.LeafMappingParametersGenerator handle(DataSourceReques
case BOOLEAN -> booleanMapping(map);
case DATE -> dateMapping(map);
case GEO_POINT -> geoPointMapping(map);
case TEXT -> textMapping(request, new HashMap<>());
});
}

Expand Down Expand Up @@ -190,6 +188,35 @@ private Supplier<Map<String, Object>> geoPointMapping(Map<String, Object> inject
};
}

private Supplier<Map<String, Object>> textMapping(
DataSourceRequest.LeafMappingParametersGenerator request,
Map<String, Object> injected
) {
return () -> {
injected.put("store", ESTestCase.randomBoolean());
injected.put("index", ESTestCase.randomBoolean());

if (ESTestCase.randomDouble() <= 0.1) {
var keywordMultiFieldMapping = keywordMapping(request, commonMappingParameters()).get();
keywordMultiFieldMapping.put("type", "keyword");
keywordMultiFieldMapping.remove("copy_to");

injected.put("fields", Map.of("kwd", keywordMultiFieldMapping));

}

return injected;
};
}

private static HashMap<String, Object> commonMappingParameters() {
var map = new HashMap<String, Object>();
map.put("store", ESTestCase.randomBoolean());
map.put("index", ESTestCase.randomBoolean());
map.put("doc_values", ESTestCase.randomBoolean());
return map;
}

@Override
public DataSourceResponse.ObjectMappingParametersGenerator handle(DataSourceRequest.ObjectMappingParametersGenerator request) {
if (request.isNested()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.logsdb.datageneration.fields.leaf;

import org.elasticsearch.logsdb.datageneration.FieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.datasource.DataSource;
import org.elasticsearch.logsdb.datageneration.datasource.DataSourceRequest;

import java.util.Map;
import java.util.function.Supplier;

public class TextFieldDataGenerator implements FieldDataGenerator {
private final Supplier<Object> valueGenerator;

public TextFieldDataGenerator(DataSource dataSource) {
var strings = dataSource.get(new DataSourceRequest.StringGenerator());
var nulls = dataSource.get(new DataSourceRequest.NullWrapper());
var arrays = dataSource.get(new DataSourceRequest.ArrayWrapper());

this.valueGenerator = arrays.wrapper().compose(nulls.wrapper()).apply(() -> strings.generator().get());
}

@Override
public Object generateValue(Map<String, Object> fieldMapping) {
return valueGenerator.get();
}
}
Loading