Skip to content

Skip UTF8 to UTF16 conversion during document indexing #126492

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 43 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
2153322
Prototype avoid UTF8 to UTF16 conversion
jordan-powers Apr 7, 2025
eaa66bb
Rename to ESBytesRef
jordan-powers Apr 8, 2025
c5d71f7
Apply spotless
jordan-powers Apr 8, 2025
a277aea
Some cleanup and comments
jordan-powers Apr 8, 2025
74822c2
Remove unnecessary throws IOException
jordan-powers Apr 8, 2025
a9ee991
Fix missing bytesValue call
jordan-powers Apr 8, 2025
ae432e1
Fix subsequent calls to parser.getText() after a call to parser.getVa…
jordan-powers Apr 8, 2025
3d1bec4
Use cached stringEnd on subsequent calls to getValueAsByteRef()
jordan-powers Apr 8, 2025
702ada2
Spotless
jordan-powers Apr 8, 2025
e2ebb92
Add textRefOrNull to DotExpandingXContentParser
jordan-powers Apr 8, 2025
4a6fe60
Add textRefOrNull() override to MultiFieldParser
jordan-powers Apr 8, 2025
240fbdb
Avoid cloning ByteSourceJsonBootstrapper
jordan-powers Apr 15, 2025
c1affcf
Rename ESBytesRef to XBytesRef
jordan-powers Apr 15, 2025
ddf7495
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 15, 2025
e285349
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 16, 2025
b0f3336
Add tests for ESJsonFactory
jordan-powers Apr 16, 2025
f2f106f
Add tests for ESUTF8StreamJsonParser
jordan-powers Apr 16, 2025
b0f701c
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 16, 2025
20616e6
Move RawString class into separate file and rename to EncodedString
jordan-powers Apr 17, 2025
1f66ff2
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 17, 2025
fd4ec6c
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 18, 2025
8913ca5
Combine XBytesRef and EncodedString into XContentString
jordan-powers Apr 22, 2025
082ffeb
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 22, 2025
3526556
Add missing override for new xContentText()
jordan-powers Apr 22, 2025
0ca2b60
Fix override in DotExpandingXContentParser
jordan-powers Apr 23, 2025
c12f1b1
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 23, 2025
9c88362
Fix GeoPointFieldMapper geohash
jordan-powers Apr 24, 2025
0d7ff66
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 24, 2025
d413cd3
Add some more tests
jordan-powers Apr 28, 2025
deefb81
Split Text and BytesReference and move base api to libs/core
jordan-powers Apr 28, 2025
681ce38
Use new BaseText and BaseBytesReference types
jordan-powers Apr 28, 2025
b31c2e0
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 28, 2025
603af45
Implement TODO UTF8 to UTF16 conversion
jordan-powers Apr 28, 2025
68d6c2d
Rename xContentText to optimizedText
jordan-powers Apr 30, 2025
1364683
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers Apr 30, 2025
70202da
Rename xContentText in tests too
jordan-powers Apr 30, 2025
b3f4e04
Revert "Split Text and BytesReference and move base api to libs/core"
jordan-powers May 1, 2025
a40bee3
Move Text to :libs:x-content
jordan-powers May 1, 2025
84921aa
Use Text instead of XContentString
jordan-powers May 1, 2025
dbbdbb1
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers May 1, 2025
9380a0b
Fix missed reference to XContentString
jordan-powers May 1, 2025
ca03f87
Fix CI
jordan-powers May 5, 2025
180078c
Merge remote-tracking branch 'upstream/main' into prototype-skip-utf16
jordan-powers May 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent.provider.json;

import com.fasterxml.jackson.core.JsonEncoding;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonFactoryBuilder;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.io.IOContext;
import com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper;
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;

import java.io.IOException;

public class ESJsonFactory extends JsonFactory {
ESJsonFactory(JsonFactoryBuilder b) {
super(b);
}

@Override
protected JsonParser _createParser(byte[] data, int offset, int len, IOContext ctxt) throws IOException {
if (len > 0
&& Feature.CHARSET_DETECTION.enabledIn(_factoryFeatures)
&& Feature.CANONICALIZE_FIELD_NAMES.enabledIn(_factoryFeatures)) {
var bootstrap = new ByteSourceJsonBootstrapper(ctxt, data, offset, len);
var encoding = bootstrap.detectEncoding();
if (encoding == JsonEncoding.UTF8) {
boolean invalidBom = false;
int ptr = offset;
// Skip over the BOM if present
if ((data[ptr] & 0xFF) == 0xEF) {
if (len < 3) {
invalidBom = true;
} else if ((data[ptr + 1] & 0xFF) != 0xBB) {
invalidBom = true;
} else if ((data[ptr + 2] & 0xFF) != 0xBF) {
invalidBom = true;
} else {
ptr += 3;
}
}
if (invalidBom == false) {
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
return new ESUTF8StreamJsonParser(
ctxt,
_parserFeatures,
null,
_objectCodec,
can,
data,
ptr,
offset + len,
ptr - offset,
false
);
}
}
}
return new ByteSourceJsonBootstrapper(ctxt, data, offset, len).constructParser(
_parserFeatures,
_objectCodec,
_byteSymbolCanonicalizer,
_rootCharSymbols,
_factoryFeatures
);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent.provider.json;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonFactoryBuilder;

public class ESJsonFactoryBuilder extends JsonFactoryBuilder {
@Override
public JsonFactory build() {
return new ESJsonFactory(this);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent.provider.json;

import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.ObjectCodec;
import com.fasterxml.jackson.core.SerializableString;
import com.fasterxml.jackson.core.io.IOContext;
import com.fasterxml.jackson.core.json.UTF8StreamJsonParser;
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;

import org.elasticsearch.xcontent.Text;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
protected int stringEnd = -1;

public ESUTF8StreamJsonParser(
IOContext ctxt,
int features,
InputStream in,
ObjectCodec codec,
ByteQuadsCanonicalizer sym,
byte[] inputBuffer,
int start,
int end,
int bytesPreProcessed,
boolean bufferRecyclable
) {
super(ctxt, features, in, codec, sym, inputBuffer, start, end, bytesPreProcessed, bufferRecyclable);
}

/**
* Method that will try to get underlying UTF-8 encoded bytes of the current string token.
* This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
* Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect we need to support escaped characters? I don't think we have a test for this. Maybe a rest test (yaml or java) that tries to index a escaped json string field using keyword field mapper?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll definitely add a test.

I was thinking that support for escaped characters and/or possibly full unicode (not just ascii) could happen in a follow-up after this is merged. The current implementation does still function with such characters, it just won't be optimized (since this method will return null so the KeywordFieldMapper will fall back to getValueAsString()).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current implementation does still function with such characters, it just won't be optimized (since this method will return null so the KeywordFieldMapper will fall back to getValueAsString()).

That makes sense to me.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

++ it makes sense to have it in a follow-up, this PR is already big enough :)
The important thing is to test nothing is broken, and it takes the non-optimized path.

*/
public Text getValueAsText() throws IOException {
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
if (stringEnd > 0) {
final int len = stringEnd - 1 - _inputPtr;
// For now, we can use `len` for `charCount` because we only support ascii-encoded unescaped strings,
// which means each character uses exactly 1 byte.
return new Text(ByteBuffer.wrap(_inputBuffer, _inputPtr, len), len);
}
return _finishAndReturnText();
}
return null;
}

protected Text _finishAndReturnText() throws IOException {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
_loadMoreGuaranteed();
ptr = _inputPtr;
}

int startPtr = ptr;
final int[] codes = INPUT_CODES_UTF8;
final int max = _inputEnd;
final byte[] inputBuffer = _inputBuffer;
while (ptr < max) {
int c = inputBuffer[ptr] & 0xFF;
if (codes[c] != 0) {
if (c == INT_QUOTE) {
stringEnd = ptr + 1;
final int len = ptr - startPtr;
// For now, we can use `len` for `charCount` because we only support ascii-encoded unescaped strings,
// which means each character uses exactly 1 byte.
return new Text(ByteBuffer.wrap(inputBuffer, startPtr, len), len);
}
return null;
}
++ptr;
}
return null;
}

@Override
public JsonToken nextToken() throws IOException {
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
_inputPtr = stringEnd;
_tokenIncomplete = false;
}
stringEnd = -1;
return super.nextToken();
}

@Override
public boolean nextFieldName(SerializableString str) throws IOException {
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
_inputPtr = stringEnd;
_tokenIncomplete = false;
}
stringEnd = -1;
return super.nextFieldName(str);
}

@Override
public String nextFieldName() throws IOException {
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
_inputPtr = stringEnd;
_tokenIncomplete = false;
}
stringEnd = -1;
return super.nextFieldName();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import com.fasterxml.jackson.core.JsonEncoding;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonFactoryBuilder;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;

Expand Down Expand Up @@ -47,7 +46,7 @@ public static final XContent jsonXContent() {
}

static {
jsonFactory = XContentImplUtils.configure(new JsonFactoryBuilder());
jsonFactory = XContentImplUtils.configure(new ESJsonFactoryBuilder());
jsonFactory.configure(JsonGenerator.Feature.QUOTE_FIELD_NAMES, true);
jsonFactory.configure(JsonParser.Feature.ALLOW_COMMENTS, true);
jsonFactory.configure(JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW, false); // this trips on many mappings now...
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import com.fasterxml.jackson.core.io.JsonEOFException;

import org.elasticsearch.core.IOUtils;
import org.elasticsearch.xcontent.Text;
import org.elasticsearch.xcontent.XContentEOFException;
import org.elasticsearch.xcontent.XContentLocation;
import org.elasticsearch.xcontent.XContentParseException;
Expand Down Expand Up @@ -115,6 +116,20 @@ public String text() throws IOException {
}
}

@Override
public Text optimizedText() throws IOException {
if (currentToken().isValue() == false) {
throwOnNoText();
}
if (parser instanceof ESUTF8StreamJsonParser esParser) {
var bytesRef = esParser.getValueAsText();
if (bytesRef != null) {
return bytesRef;
}
}
return new Text(text());
}

private void throwOnNoText() {
throw new IllegalArgumentException("Expected text at " + getTokenLocation() + " but found " + currentToken());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.xcontent.provider.json;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.json.ReaderBasedJsonParser;
import com.fasterxml.jackson.core.json.UTF8StreamJsonParser;

import org.elasticsearch.test.ESTestCase;
import org.hamcrest.Matchers;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class ESJsonFactoryTests extends ESTestCase {

public void testCreateParser() throws IOException {
JsonFactory factory = new ESJsonFactoryBuilder().build();
assertThat(factory, Matchers.instanceOf(ESJsonFactory.class));

// \ufeff is the BOM
String[] inputs = { "{\"foo\": \"bar\"}", "\ufeff{\"foo\": \"bar\"}" };
Charset[] charsets = { StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE };
Class<?>[] expectedParsers = { ESUTF8StreamJsonParser.class, ReaderBasedJsonParser.class, ReaderBasedJsonParser.class };

for (String input : inputs) {
for (int i = 0; i < charsets.length; i++) {
ByteBuffer encoded = charsets[i].encode(input);
JsonParser parser = factory.createParser(encoded.array());
assertThat(parser, Matchers.instanceOf(expectedParsers[i]));
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
assertThat(parser.getValueAsString(), Matchers.equalTo("bar"));
}
}

// Valid BOM
{
JsonParser parser = factory.createParser(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, '{', '}' });
assertThat(parser, Matchers.instanceOf(ESUTF8StreamJsonParser.class));
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
}

// Invalid BOMs
{
JsonParser parser = factory.createParser(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBB, '{', '}' });
assertThat(parser, Matchers.instanceOf(UTF8StreamJsonParser.class));
assertThrows("Invalid UTF-8 start byte 0xbb", JsonParseException.class, parser::nextToken);
}

{
JsonParser parser = factory.createParser(new byte[] { (byte) 0xEF, '{', '}' });
assertThat(parser, Matchers.instanceOf(UTF8StreamJsonParser.class));
assertThrows("Invalid UTF-8 start byte 0x7b", JsonParseException.class, parser::nextToken);
}
}
}
Loading