elastic · jordan-powers · Jun 6, 2025 · Apr 7, 2025 · Apr 8, 2025 · Apr 8, 2025
diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java
@@ -0,0 +1,108 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.benchmark.xcontent;
+
+import org.elasticsearch.benchmark.index.mapper.MapperServiceFactory;
+import org.elasticsearch.common.UUIDs;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.common.logging.LogConfigurator;
+import org.elasticsearch.index.mapper.MapperService;
+import org.elasticsearch.index.mapper.SourceToParse;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentFactory;
+import org.elasticsearch.xcontent.XContentType;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.io.IOException;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Benchmark to measure indexing performance of keyword fields. Used to measure performance impact of skipping
+ * UTF-8 to UTF-16 conversion during document parsing.
+ * <p>
+ * All documents are serialized once per trial in {@link #setup()}; the measured method only parses them.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Fork(1)
+@Threads(1)
+@Warmup(iterations = 1)
+@Measurement(iterations = 5)
+public class OptimizedTextBenchmark {
+    static {
+        // For Elasticsearch900Lucene101Codec:
+        LogConfigurator.loadLog4jPlugins();
+        LogConfigurator.configureESLogging();
+        LogConfigurator.setNodeName("test");
+    }
+
+    /** Characters that generated field values are drawn from. */
+    private static final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+    /** Seed for the value generator, fixed so that every run parses the same sequence of field values. */
+    private static final long SEED = 42L;
+
+    /**
+     * Total number of documents to index.
+     */
+    @Param("1048576")
+    private int nDocs;
+
+    private MapperService mapperService;
+    private SourceToParse[] sources;
+
+    // A single generator per state instance, rather than a fresh unseeded Random for every generated value.
+    private final Random random = new Random(SEED);
+
+    /**
+     * Builds a string of {@code length} characters, each picked from {@link #CHARS} using the shared generator.
+     *
+     * @param length number of characters to generate
+     * @return the generated value
+     */
+    private String randomValue(int length) {
+        StringBuilder builder = new StringBuilder(length);
+        for (int i = 0; i < length; i++) {
+            builder.append(CHARS.charAt(random.nextInt(CHARS.length())));
+        }
+        return builder.toString();
+    }
+
+    /**
+     * Creates the mapper service with a single keyword field named {@code field} and pre-builds {@link #nDocs}
+     * JSON documents, each holding one 8-character generated value, so that document construction is not part
+     * of the measured work.
+     *
+     * @throws IOException if building a JSON document fails
+     */
+    @Setup(Level.Trial)
+    public void setup() throws IOException {
+        mapperService = MapperServiceFactory.create("""
+            {
+                "_doc": {
+                    "dynamic": false,
+                    "properties": {
+                        "field": {
+                            "type": "keyword"
+                        }
+                    }
+                }
+            }
+            """);
+
+        sources = new SourceToParse[nDocs];
+        for (int i = 0; i < nDocs; i++) {
+            XContentBuilder b = XContentFactory.jsonBuilder();
+            b.startObject().field("field", randomValue(8)).endObject();
+            sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON);
+        }
+    }
+
+    /**
+     * Parses every pre-built document with the current document mapper and hands each result to the blackhole
+     * so the work cannot be optimized away.
+     *
+     * @param bh sink for the parsed documents
+     */
+    @Benchmark
+    public void indexDocuments(final Blackhole bh) {
+        final var mapper = mapperService.documentMapper();
+        for (int i = 0; i < nDocs; i++) {
+            bh.consume(mapper.parse(sources[i]));
+        }
+    }
+}
@@ -0,0 +1,74 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xcontent.provider.json;
+
+import com.fasterxml.jackson.core.JsonEncoding;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonFactoryBuilder;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.io.IOContext;
+import com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper;
+import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
+
+import java.io.IOException;
+
+/**
+ * A {@link JsonFactory} that returns an {@link ESUTF8StreamJsonParser} for byte-array input detected as UTF-8,
+ * and otherwise lets {@link ByteSourceJsonBootstrapper} construct the parser.
+ */
+public class ESJsonFactory extends JsonFactory {
+    /** Marker returned by {@link #contentStartAfterBom} when the input starts like a BOM but is not a complete one. */
+    private static final int INVALID_BOM = -1;
+
+    ESJsonFactory(JsonFactoryBuilder b) {
+        super(b);
+    }
+
+    /**
+     * Creates a parser over {@code data[offset, offset + len)}.
+     * <p>
+     * The custom UTF-8 parser is only used when the input is non-empty, both charset detection and field name
+     * canonicalization are enabled, the detected encoding is UTF-8, and any leading byte-order mark is well formed.
+     * In every other case the bootstrapper constructs the parser.
+     */
+    @Override
+    protected JsonParser _createParser(byte[] data, int offset, int len, IOContext ctxt) throws IOException {
+        final boolean eligible = len > 0
+            && Feature.CHARSET_DETECTION.enabledIn(_factoryFeatures)
+            && Feature.CANONICALIZE_FIELD_NAMES.enabledIn(_factoryFeatures);
+        if (eligible && new ByteSourceJsonBootstrapper(ctxt, data, offset, len).detectEncoding() == JsonEncoding.UTF8) {
+            final int contentStart = contentStartAfterBom(data, offset, len);
+            if (contentStart != INVALID_BOM) {
+                final ByteQuadsCanonicalizer symbols = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
+                return new ESUTF8StreamJsonParser(
+                    ctxt,
+                    _parserFeatures,
+                    null,
+                    _objectCodec,
+                    symbols,
+                    data,
+                    contentStart,
+                    offset + len,
+                    contentStart - offset,
+                    false
+                );
+            }
+        }
+        return new ByteSourceJsonBootstrapper(ctxt, data, offset, len).constructParser(
+            _parserFeatures,
+            _objectCodec,
+            _byteSymbolCanonicalizer,
+            _rootCharSymbols,
+            _factoryFeatures
+        );
+    }
+
+    /**
+     * Returns the index of the first content byte, skipping a leading {@code EF BB BF} byte-order mark if present.
+     * Returns {@link #INVALID_BOM} when the first byte is {@code 0xEF} but the following bytes do not complete
+     * the mark. Requires {@code len > 0}.
+     */
+    private static int contentStartAfterBom(byte[] data, int offset, int len) {
+        if ((data[offset] & 0xFF) != 0xEF) {
+            return offset;
+        }
+        final boolean completeBom = len >= 3 && (data[offset + 1] & 0xFF) == 0xBB && (data[offset + 2] & 0xFF) == 0xBF;
+        return completeBom ? offset + 3 : INVALID_BOM;
+    }
+}
@@ -0,0 +1,20 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xcontent.provider.json;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonFactoryBuilder;
+
+/**
+ * A {@link JsonFactoryBuilder} whose {@link #build()} produces an {@link ESJsonFactory} configured from this builder.
+ */
+public class ESJsonFactoryBuilder extends JsonFactoryBuilder {
+    /**
+     * @return a new {@link ESJsonFactory} created from the current state of this builder
+     */
+    @Override
+    public JsonFactory build() {
+        final ESJsonFactory factory = new ESJsonFactory(this);
+        return factory;
+    }
+}
@@ -0,0 +1,118 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xcontent.provider.json;
+
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.ObjectCodec;
+import com.fasterxml.jackson.core.SerializableString;
+import com.fasterxml.jackson.core.io.IOContext;
+import com.fasterxml.jackson.core.json.UTF8StreamJsonParser;
+import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
+
+import org.elasticsearch.xcontent.Text;
+import org.elasticsearch.xcontent.XContentString;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * A {@link UTF8StreamJsonParser} that can expose the raw bytes of the current string value without decoding it
+ * into a {@link String} first.
+ */
+public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
+    /**
+     * Index in the input buffer just past the closing quote of the current string value, as found by
+     * {@link #_finishAndReturnText()}; {@code -1} when no such position has been recorded for the current token.
+     */
+    protected int stringEnd = -1;
+
+    public ESUTF8StreamJsonParser(
+        IOContext ctxt,
+        int features,
+        InputStream in,
+        ObjectCodec codec,
+        ByteQuadsCanonicalizer sym,
+        byte[] inputBuffer,
+        int start,
+        int end,
+        int bytesPreProcessed,
+        boolean bufferRecyclable
+    ) {
+        super(ctxt, features, in, codec, sym, inputBuffer, start, end, bytesPreProcessed, bufferRecyclable);
+    }
+
+    /**
+     * Method that will try to get underlying UTF-8 encoded bytes of the current string token.
+     * This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
+     * Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
+     *
+     * @return a {@link Text} backed by the parser's input buffer, or {@code null} if the current token is not an
+     *         incomplete string value or the string is not supported by this fast path
+     */
+    public Text getValueAsText() throws IOException {
+        if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
+            if (stringEnd > 0) {
+                // The string was already scanned by an earlier call; stringEnd - 1 is the closing quote.
+                return asciiText(_inputBuffer, _inputPtr, stringEnd - 1 - _inputPtr);
+            }
+            return _finishAndReturnText();
+        }
+        return null;
+    }
+
+    /**
+     * Scans forward from the current input position for the closing quote of the string value. On success the
+     * position after the quote is remembered in {@link #stringEnd} and the bytes in between are returned; the
+     * input position itself is not advanced here.
+     *
+     * @return the string contents, or {@code null} if a byte flagged by {@code INPUT_CODES_UTF8} other than the
+     *         closing quote is encountered, or the end of the buffered input is reached first
+     */
+    protected Text _finishAndReturnText() throws IOException {
+        int ptr = _inputPtr;
+        if (ptr >= _inputEnd) {
+            _loadMoreGuaranteed();
+            ptr = _inputPtr;
+        }
+
+        int startPtr = ptr;
+        final int[] codes = INPUT_CODES_UTF8;
+        final int max = _inputEnd;
+        final byte[] inputBuffer = _inputBuffer;
+        while (ptr < max) {
+            int c = inputBuffer[ptr] & 0xFF;
+            if (codes[c] != 0) {
+                if (c == INT_QUOTE) {
+                    stringEnd = ptr + 1;
+                    return asciiText(inputBuffer, startPtr, ptr - startPtr);
+                }
+                // Any other flagged byte means the string is not plain unescaped ascii, which is unsupported here.
+                return null;
+            }
+            ++ptr;
+        }
+        return null;
+    }
+
+    /**
+     * Wraps {@code len} bytes of {@code buffer} starting at {@code start} as a {@link Text}.
+     * For now, we can use {@code len} for the string length because we only support ascii-encoded unescaped strings,
+     * which means each character uses exactly 1 byte.
+     */
+    private static Text asciiText(byte[] buffer, int start, int len) {
+        return new Text(new XContentString.UTF8Bytes(buffer, start, len), len);
+    }
+
+    /**
+     * If the current string value was already scanned by {@link #getValueAsText()} but not consumed, moves the input
+     * position past its closing quote and marks the token complete. Always clears {@link #stringEnd}, since the
+     * parser is about to move to another token.
+     */
+    private void skipScannedString() {
+        if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
+            _inputPtr = stringEnd;
+            _tokenIncomplete = false;
+        }
+        stringEnd = -1;
+    }
+
+    @Override
+    public JsonToken nextToken() throws IOException {
+        skipScannedString();
+        return super.nextToken();
+    }
+
+    @Override
+    public boolean nextFieldName(SerializableString str) throws IOException {
+        skipScannedString();
+        return super.nextFieldName(str);
+    }
+
+    @Override
+    public String nextFieldName() throws IOException {
+        skipScannedString();
+        return super.nextFieldName();
+    }
+}
@@ -11,7 +11,6 @@
 
 import com.fasterxml.jackson.core.JsonEncoding;
 import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonFactoryBuilder;
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.core.JsonParser;
 
@@ -47,7 +46,7 @@ public static final XContent jsonXContent() {
     }
 
     static {
-        jsonFactory = XContentImplUtils.configure(new JsonFactoryBuilder());
+        jsonFactory = XContentImplUtils.configure(new ESJsonFactoryBuilder());
         jsonFactory.configure(JsonGenerator.Feature.QUOTE_FIELD_NAMES, true);
         jsonFactory.configure(JsonParser.Feature.ALLOW_COMMENTS, true);
         jsonFactory.configure(JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW, false); // this trips on many mappings now...

@@ -18,10 +18,12 @@
 import com.fasterxml.jackson.core.io.JsonEOFException;
 
 import org.elasticsearch.core.IOUtils;
+import org.elasticsearch.xcontent.Text;
 import org.elasticsearch.xcontent.XContentEOFException;
 import org.elasticsearch.xcontent.XContentLocation;
 import org.elasticsearch.xcontent.XContentParseException;
 import org.elasticsearch.xcontent.XContentParserConfiguration;
+import org.elasticsearch.xcontent.XContentString;
 import org.elasticsearch.xcontent.XContentType;
 import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl;
 import org.elasticsearch.xcontent.support.AbstractXContentParser;
@@ -115,6 +117,20 @@ public String text() throws IOException {
         }
     }
 
+    /**
+     * Returns the current value as an {@link XContentString}. When the underlying parser is an
+     * {@link ESUTF8StreamJsonParser} and it can supply the value directly, that result is returned;
+     * otherwise the value of {@link #text()} is wrapped in a new {@link Text}.
+     *
+     * @throws IllegalArgumentException if the current token is not a value token
+     */
+    @Override
+    public XContentString optimizedText() throws IOException {
+        if (currentToken().isValue() == false) {
+            throwOnNoText();
+        }
+        final Text direct = parser instanceof ESUTF8StreamJsonParser utf8Parser ? utf8Parser.getValueAsText() : null;
+        return direct != null ? direct : new Text(text());
+    }
+
     /** Always throws: reports that a text value was expected at the current token location. */
     private void throwOnNoText() {
         final String message = "Expected text at " + getTokenLocation() + " but found " + currentToken();
         throw new IllegalArgumentException(message);
     }