From d267af9af700d526a120f37307e4adb7b00bbaf6 Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Wed, 23 Apr 2025 10:40:48 -0700 Subject: [PATCH 1/7] Add binary field type to ES819 merge tests --- .../es819/ES819TSDBDocValuesFormatTests.java | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java index 13eea6dbdd6d3..66636ec5475ac 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java @@ -11,6 +11,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; @@ -86,9 +87,11 @@ public void testForceMergeDenseCase() throws Exception { int numTags = 1 + random().nextInt(8); for (int j = 0; j < numTags; j++) { - d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[j]))); + d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[(i + j) % tags.length]))); } + d.add(new BinaryDocValuesField("bytes_1", new BytesRef(tags[i % tags.length]))); + iw.addDocument(d); if (i % 100 == 0) { iw.commit(); @@ -119,6 +122,8 @@ public void testForceMergeDenseCase() throws Exception { assertNotNull(gaugeTwoDV); var tagsDV = leaf.getSortedSetDocValues("tags"); assertNotNull(tagsDV); + var bytesOneDV = leaf.getBinaryDocValues("bytes_1"); + assertNotNull(bytesOneDV); for (int i = 0; i < numDocs; i++) { assertEquals(i, hostNameDV.nextDoc()); int batchIndex = i / numHosts; @@ -161,6 +166,10 @@ public void testForceMergeDenseCase() throws Exception { String actualTag = tagsDV.lookupOrd(ordinal).utf8ToString(); assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); } + + assertEquals(i, bytesOneDV.nextDoc()); + BytesRef bytesOneValue = bytesOneDV.binaryValue(); + assertTrue("unexpected bytes " + bytesOneValue, Arrays.binarySearch(tags, bytesOneValue.utf8ToString()) >= 0); } } } @@ -182,6 +191,7 @@ public void testTwoSegmentsTwoDifferentFields() throws Exception { d.add(new SortedNumericDocValuesField(timestampField, timestamp - 1)); d.add(new NumericDocValuesField("counter_1", counter1)); d.add(new SortedNumericDocValuesField("gauge_1", 2)); + d.add(new BinaryDocValuesField("binary_1", new BytesRef("foo"))); iw.addDocument(d); iw.commit(); } @@ -191,6 +201,7 @@ public void testTwoSegmentsTwoDifferentFields() throws Exception { d.add(new SortedNumericDocValuesField(timestampField, timestamp)); d.add(new SortedNumericDocValuesField("counter_2", counter2)); d.add(new SortedNumericDocValuesField("gauge_2", -2)); + d.add(new BinaryDocValuesField("binary_2", new BytesRef("bar"))); iw.addDocument(d); iw.commit(); } @@ -213,6 +224,10 @@ public void testTwoSegmentsTwoDifferentFields() throws Exception { assertNotNull(gaugeOneDV); var gaugeTwoDV = leaf.getSortedNumericDocValues("gauge_2"); assertNotNull(gaugeTwoDV); + var binaryOneDV = leaf.getBinaryDocValues("binary_1"); + assertNotNull(binaryOneDV); + var binaryTwoDv = leaf.getBinaryDocValues("binary_2"); + assertNotNull(binaryTwoDv); for (int i = 0; i < 2; i++) { assertEquals(i, hostNameDV.nextDoc()); assertEquals("host-001", hostNameDV.lookupOrd(hostNameDV.ordValue()).utf8ToString()); @@ -243,6 +258,16 @@ public void testTwoSegmentsTwoDifferentFields() throws Exception { long gaugeTwoValue = gaugeTwoDV.nextValue(); assertEquals(-2, gaugeTwoValue); } + + if (binaryOneDV.advanceExact(i)) { + BytesRef binaryOneValue = binaryOneDV.binaryValue(); + assertEquals(new BytesRef("foo"), binaryOneValue); + } + + if (binaryTwoDv.advanceExact(i)) { + BytesRef binaryTwoValue = binaryTwoDv.binaryValue(); + assertEquals(new BytesRef("bar"), binaryTwoValue); + } } } } @@ -299,6 +324,10 @@ public void testForceMergeSparseCase() throws Exception { int randomIndex = random().nextInt(tags.length); d.add(new SortedDocValuesField("other_tag", new BytesRef(tags[randomIndex]))); } + if (random().nextBoolean()) { + int randomIndex = random().nextInt(tags.length); + d.add(new BinaryDocValuesField("bytes_1", new BytesRef(tags[randomIndex]))); + } iw.addDocument(d); if (i % 100 == 0) { @@ -332,6 +361,8 @@ public void testForceMergeSparseCase() throws Exception { assertNotNull(tagsDV); var otherTagDV = leaf.getSortedDocValues("other_tag"); assertNotNull(otherTagDV); + var bytesOneDV = leaf.getBinaryDocValues("bytes_1"); + assertNotNull(bytesOneDV); for (int i = 0; i < numDocs; i++) { assertEquals(i, hostNameDV.nextDoc()); int batchIndex = i / numHosts; @@ -384,6 +415,11 @@ public void testForceMergeSparseCase() throws Exception { String actualTag = otherTagDV.lookupOrd(ordinal).utf8ToString(); assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); } + + if (bytesOneDV.advanceExact(i)) { + BytesRef bytesOneValue = bytesOneDV.binaryValue(); + assertTrue("unexpected bytes " + bytesOneValue, Arrays.binarySearch(tags, bytesOneValue.utf8ToString()) >= 0); + } } } } From 4a237601f28f8829531d54393bb222c285045289 Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Wed, 23 Apr 2025 10:54:48 -0700 Subject: [PATCH 2/7] Add MergeStats support for binary doc values --- .../tsdb/es819/DocValuesConsumerUtil.java | 16 ++- .../es819/ES819TSDBDocValuesConsumer.java | 10 ++ .../es819/ES819TSDBDocValuesProducer.java | 4 +- .../codec/tsdb/es819/XDocValuesConsumer.java | 97 +++++++++++++++++++ 4 files changed, 122 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java index fb2218494affd..c29585f173316 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/DocValuesConsumerUtil.java @@ -20,9 +20,9 @@ */ class DocValuesConsumerUtil { - static final MergeStats UNSUPPORTED = new MergeStats(false, -1, -1); + static final MergeStats UNSUPPORTED = new MergeStats(false, -1, -1, -1, -1); - record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField) {} + record MergeStats(boolean supported, long sumNumValues, int sumNumDocsWithField, int minLength, int maxLength) {} static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, MergeState mergeState, FieldInfo fieldInfo) { if (optimizedMergeEnabled == false || mergeState.needsIndexSort == false) { @@ -38,6 +38,8 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me long sumNumValues = 0; int sumNumDocsWithField = 0; + int minLength = Integer.MAX_VALUE; + int maxLength = 0; for (int i = 0; i < mergeState.docValuesProducers.length; i++) { DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; @@ -86,6 +88,14 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me } } } + case BINARY -> { + var entry = tsdbDocValuesProducer.binaries.get(fieldInfo.number); + if (entry != null) { + sumNumDocsWithField += entry.numDocsWithField; + minLength = Math.min(minLength, entry.minLength); + maxLength = Math.max(maxLength, entry.maxLength); + } + } default -> throw new IllegalStateException("unexpected doc values producer type: " + fieldInfo.getDocValuesType()); } } else { @@ -96,7 +106,7 @@ static MergeStats compatibleWithOptimizedMerge(boolean optimizedMergeEnabled, Me } } - return new MergeStats(true, sumNumValues, sumNumDocsWithField); + return new MergeStats(true, sumNumValues, sumNumDocsWithField, minLength, maxLength); } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java index bf50ebc598e07..efdc5d5852ab8 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java @@ -258,6 +258,16 @@ public void mergeNumericField(FieldInfo mergeFieldInfo, MergeState mergeState) t } } + @Override + public void mergeBinaryField(FieldInfo mergeFieldInfo, MergeState mergeState) throws IOException { + var result = compatibleWithOptimizedMerge(enableOptimizedMerge, mergeState, mergeFieldInfo); + if (result.supported()) { + mergeBinaryField(result, mergeFieldInfo, mergeState); + } else { + super.mergeBinaryField(mergeFieldInfo, mergeState); + } + } + @Override public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { meta.writeInt(field.number); diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java index 22172268add5f..31d65bde1be0e 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesProducer.java @@ -52,7 +52,7 @@ final class ES819TSDBDocValuesProducer extends DocValuesProducer { final IntObjectHashMap numerics; - private final IntObjectHashMap binaries; + final IntObjectHashMap binaries; final IntObjectHashMap sorted; final IntObjectHashMap sortedSets; final IntObjectHashMap sortedNumerics; @@ -1445,7 +1445,7 @@ static class NumericEntry { long valuesLength; } - private static class BinaryEntry { + static class BinaryEntry { long dataOffset; long dataLength; long docsWithFieldOffset; diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java index af6fc2587a49a..f4d8eccdfe2e3 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/XDocValuesConsumer.java @@ -11,6 +11,7 @@ import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesType; @@ -152,6 +153,102 @@ public long longValue() throws IOException { }; } + /** Tracks state of one binary sub-reader that we are merging */ + private static class BinaryDocValuesSub extends DocIDMerger.Sub { + + final BinaryDocValues values; + + BinaryDocValuesSub(MergeState.DocMap docMap, BinaryDocValues values) { + super(docMap); + this.values = values; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + return values.nextDoc(); + } + } + + /** + * Merges the binary docvalues from MergeState. + * + *

The default implementation calls {@link #addBinaryField}, passing a DocValuesProducer that + * merges and filters deleted documents on the fly. + */ + public void mergeBinaryField(MergeStats mergeStats, FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { + addBinaryField(mergeFieldInfo, new TsdbDocValuesProducer(mergeStats) { + @Override + public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException { + if (fieldInfo != mergeFieldInfo) { + throw new IllegalArgumentException("wrong fieldInfo"); + } + + List subs = new ArrayList<>(); + + long cost = 0; + for (int i = 0; i < mergeState.docValuesProducers.length; i++) { + BinaryDocValues values = null; + DocValuesProducer docValuesProducer = mergeState.docValuesProducers[i]; + if (docValuesProducer != null) { + FieldInfo readerFieldInfo = mergeState.fieldInfos[i].fieldInfo(mergeFieldInfo.name); + if (readerFieldInfo != null && readerFieldInfo.getDocValuesType() == DocValuesType.BINARY) { + values = docValuesProducer.getBinary(readerFieldInfo); + } + } + if (values != null) { + cost += values.cost(); + subs.add(new BinaryDocValuesSub(mergeState.docMaps[i], values)); + } + } + + final DocIDMerger docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); + final long finalCost = cost; + + return new BinaryDocValues() { + private BinaryDocValuesSub current; + private int docID = -1; + + @Override + public int docID() { + return docID; + } + + @Override + public int nextDoc() throws IOException { + current = docIDMerger.next(); + if (current == null) { + docID = NO_MORE_DOCS; + } else { + docID = current.mappedDocID; + } + return docID; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + return finalCost; + } + + @Override + public BytesRef binaryValue() throws IOException { + return current.values.binaryValue(); + } + }; + } + }); + } + /** Tracks state of one sorted numeric sub-reader that we are merging */ private static class SortedNumericDocValuesSub extends DocIDMerger.Sub { From 78a56dedfb436b32565bc3630a1d925850248a4c Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Wed, 23 Apr 2025 10:55:57 -0700 Subject: [PATCH 3/7] Apply address offset calculation optimization to binary doc values merges --- .../es819/ES819TSDBDocValuesConsumer.java | 171 ++++++++++++------ 1 file changed, 116 insertions(+), 55 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java index efdc5d5852ab8..886a66fd5155e 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java @@ -273,66 +273,127 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th meta.writeInt(field.number); meta.writeByte(ES819TSDBDocValuesFormat.BINARY); - BinaryDocValues values = valuesProducer.getBinary(field); - long start = data.getFilePointer(); - meta.writeLong(start); // dataOffset - int numDocsWithField = 0; - int minLength = Integer.MAX_VALUE; - int maxLength = 0; - for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { - numDocsWithField++; - BytesRef v = values.binaryValue(); - int length = v.length; - data.writeBytes(v.bytes, v.offset, v.length); - minLength = Math.min(length, minLength); - maxLength = Math.max(length, maxLength); - } - assert numDocsWithField <= maxDoc; - meta.writeLong(data.getFilePointer() - start); // dataLength - - if (numDocsWithField == 0) { - meta.writeLong(-2); // docsWithFieldOffset - meta.writeLong(0L); // docsWithFieldLength - meta.writeShort((short) -1); // jumpTableEntryCount - meta.writeByte((byte) -1); // denseRankPower - } else if (numDocsWithField == maxDoc) { - meta.writeLong(-1); // docsWithFieldOffset - meta.writeLong(0L); // docsWithFieldLength - meta.writeShort((short) -1); // jumpTableEntryCount - meta.writeByte((byte) -1); // denseRankPower - } else { - long offset = data.getFilePointer(); - meta.writeLong(offset); // docsWithFieldOffset - values = valuesProducer.getBinary(field); - final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); - meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength - meta.writeShort(jumpTableEntryCount); - meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); - } + if (valuesProducer instanceof TsdbDocValuesProducer tsdbValuesProducer && tsdbValuesProducer.mergeStats.supported()) { + int numDocsWithField = tsdbValuesProducer.mergeStats.sumNumDocsWithField(); + int minLength = tsdbValuesProducer.mergeStats.minLength(); + int maxLength = tsdbValuesProducer.mergeStats.maxLength(); - meta.writeInt(numDocsWithField); - meta.writeInt(minLength); - meta.writeInt(maxLength); - if (maxLength > minLength) { - start = data.getFilePointer(); - meta.writeLong(start); - meta.writeVInt(ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT); + assert numDocsWithField <= maxDoc; - final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance( - meta, - data, - numDocsWithField + 1, - ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT - ); - long addr = 0; - writer.add(addr); - values = valuesProducer.getBinary(field); + BinaryDocValues values = valuesProducer.getBinary(field); + long start = data.getFilePointer(); + meta.writeLong(start); // dataOffset + + OffsetsAccumulator offsetsAccumulator = null; + try { + if (maxLength > minLength) { + offsetsAccumulator = new OffsetsAccumulator(dir, context, data, numDocsWithField); + } + + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + BytesRef v = values.binaryValue(); + data.writeBytes(v.bytes, v.offset, v.length); + if (offsetsAccumulator != null) { + offsetsAccumulator.addDoc(v.length); + } + } + meta.writeLong(data.getFilePointer() - start); // dataLength + + // TODO: This is verbatim from the unoptimized path, and it still has a valuesProducer.getBinary() call in it. + // Need to optimize this part too. + if (numDocsWithField == 0) { + meta.writeLong(-2); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + meta.writeShort((short) -1); // jumpTableEntryCount + meta.writeByte((byte) -1); // denseRankPower + } else if (numDocsWithField == maxDoc) { + meta.writeLong(-1); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + meta.writeShort((short) -1); // jumpTableEntryCount + meta.writeByte((byte) -1); // denseRankPower + } else { + long offset = data.getFilePointer(); + meta.writeLong(offset); // docsWithFieldOffset + values = valuesProducer.getBinary(field); + final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); + meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength + meta.writeShort(jumpTableEntryCount); + meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); + } + + meta.writeInt(numDocsWithField); + meta.writeInt(minLength); + meta.writeInt(maxLength); + if (offsetsAccumulator != null) { + offsetsAccumulator.build(meta, data); + } + } finally { + if (offsetsAccumulator != null) { + offsetsAccumulator.close(); + } + } + } else { + BinaryDocValues values = valuesProducer.getBinary(field); + long start = data.getFilePointer(); + meta.writeLong(start); // dataOffset + int numDocsWithField = 0; + int minLength = Integer.MAX_VALUE; + int maxLength = 0; for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { - addr += values.binaryValue().length; + numDocsWithField++; + BytesRef v = values.binaryValue(); + int length = v.length; + data.writeBytes(v.bytes, v.offset, v.length); + minLength = Math.min(length, minLength); + maxLength = Math.max(length, maxLength); + } + assert numDocsWithField <= maxDoc; + meta.writeLong(data.getFilePointer() - start); // dataLength + + if (numDocsWithField == 0) { + meta.writeLong(-2); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + meta.writeShort((short) -1); // jumpTableEntryCount + meta.writeByte((byte) -1); // denseRankPower + } else if (numDocsWithField == maxDoc) { + meta.writeLong(-1); // docsWithFieldOffset + meta.writeLong(0L); // docsWithFieldLength + meta.writeShort((short) -1); // jumpTableEntryCount + meta.writeByte((byte) -1); // denseRankPower + } else { + long offset = data.getFilePointer(); + meta.writeLong(offset); // docsWithFieldOffset + values = valuesProducer.getBinary(field); + final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); + meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength + meta.writeShort(jumpTableEntryCount); + meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); + } + + meta.writeInt(numDocsWithField); + meta.writeInt(minLength); + meta.writeInt(maxLength); + if (maxLength > minLength) { + start = data.getFilePointer(); + meta.writeLong(start); + meta.writeVInt(ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT); + + final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance( + meta, + data, + numDocsWithField + 1, + ES819TSDBDocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT + ); + long addr = 0; writer.add(addr); + values = valuesProducer.getBinary(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + addr += values.binaryValue().length; + writer.add(addr); + } + writer.finish(); + meta.writeLong(data.getFilePointer() - start); } - writer.finish(); - meta.writeLong(data.getFilePointer() - start); } } From fa2e1d59f007f51ac9e102b634115fe78b94bc43 Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Wed, 23 Apr 2025 11:32:45 -0700 Subject: [PATCH 4/7] Use DISIAccumulator for binary doc values merges --- .../es819/ES819TSDBDocValuesConsumer.java | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java index 886a66fd5155e..6083f65bb52ed 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java @@ -285,22 +285,30 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th meta.writeLong(start); // dataOffset OffsetsAccumulator offsetsAccumulator = null; + DISIAccumulator disiAccumulator = null; try { + if (numDocsWithField > 0 && numDocsWithField < maxDoc) { + disiAccumulator = new DISIAccumulator(dir, context, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); + } + + assert maxLength >= minLength; if (maxLength > minLength) { offsetsAccumulator = new OffsetsAccumulator(dir, context, data, numDocsWithField); + } for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { BytesRef v = values.binaryValue(); data.writeBytes(v.bytes, v.offset, v.length); + if (disiAccumulator != null) { + disiAccumulator.addDocId(doc); + } if (offsetsAccumulator != null) { offsetsAccumulator.addDoc(v.length); } } meta.writeLong(data.getFilePointer() - start); // dataLength - // TODO: This is verbatim from the unoptimized path, and it still has a valuesProducer.getBinary() call in it. - // Need to optimize this part too. if (numDocsWithField == 0) { meta.writeLong(-2); // docsWithFieldOffset meta.writeLong(0L); // docsWithFieldLength @@ -314,8 +322,13 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th } else { long offset = data.getFilePointer(); meta.writeLong(offset); // docsWithFieldOffset - values = valuesProducer.getBinary(field); - final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); + final short jumpTableEntryCount; + if (disiAccumulator != null) { + jumpTableEntryCount = disiAccumulator.build(data); + } else { + values = valuesProducer.getBinary(field); + jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); + } meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength meta.writeShort(jumpTableEntryCount); meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); @@ -328,9 +341,7 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th offsetsAccumulator.build(meta, data); } } finally { - if (offsetsAccumulator != null) { - offsetsAccumulator.close(); - } + IOUtils.close(disiAccumulator, offsetsAccumulator); } } else { BinaryDocValues values = valuesProducer.getBinary(field); From e850ec47a5dd233f3b90bda1271cd5f6695cd78c Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Wed, 23 Apr 2025 11:41:46 -0700 Subject: [PATCH 5/7] Remove extra whitespace --- .../index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java index 6083f65bb52ed..61d6201e5c6e1 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java @@ -294,7 +294,6 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th assert maxLength >= minLength; if (maxLength > minLength) { offsetsAccumulator = new OffsetsAccumulator(dir, context, data, numDocsWithField); - } for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { From 5ab5937d47f3a4c524137281e0e5fa0e0104e9c2 Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Thu, 24 Apr 2025 08:54:39 -0700 Subject: [PATCH 6/7] Remove redundant null check --- .../tsdb/es819/ES819TSDBDocValuesConsumer.java | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java index 61d6201e5c6e1..3651be472051f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesConsumer.java @@ -274,9 +274,9 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th meta.writeByte(ES819TSDBDocValuesFormat.BINARY); if (valuesProducer instanceof TsdbDocValuesProducer tsdbValuesProducer && tsdbValuesProducer.mergeStats.supported()) { - int numDocsWithField = tsdbValuesProducer.mergeStats.sumNumDocsWithField(); - int minLength = tsdbValuesProducer.mergeStats.minLength(); - int maxLength = tsdbValuesProducer.mergeStats.maxLength(); + final int numDocsWithField = tsdbValuesProducer.mergeStats.sumNumDocsWithField(); + final int minLength = tsdbValuesProducer.mergeStats.minLength(); + final int maxLength = tsdbValuesProducer.mergeStats.maxLength(); assert numDocsWithField <= maxDoc; @@ -321,13 +321,7 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th } else { long offset = data.getFilePointer(); meta.writeLong(offset); // docsWithFieldOffset - final short jumpTableEntryCount; - if (disiAccumulator != null) { - jumpTableEntryCount = disiAccumulator.build(data); - } else { - values = valuesProducer.getBinary(field); - jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER); - } + final short jumpTableEntryCount = disiAccumulator.build(data); meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength meta.writeShort(jumpTableEntryCount); meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER); From f15a9d3f8684245488d7f8ece5573058472446de Mon Sep 17 00:00:00 2001 From: Jordan Powers Date: Thu, 24 Apr 2025 09:03:11 -0700 Subject: [PATCH 7/7] Rename bytes_1 to tags_as_bytes --- .../es819/ES819TSDBDocValuesFormatTests.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java index 66636ec5475ac..368d6f23d0fa1 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/tsdb/es819/ES819TSDBDocValuesFormatTests.java @@ -90,7 +90,7 @@ public void testForceMergeDenseCase() throws Exception { d.add(new SortedSetDocValuesField("tags", new BytesRef(tags[(i + j) % tags.length]))); } - d.add(new BinaryDocValuesField("bytes_1", new BytesRef(tags[i % tags.length]))); + d.add(new BinaryDocValuesField("tags_as_bytes", new BytesRef(tags[i % tags.length]))); iw.addDocument(d); if (i % 100 == 0) { @@ -122,8 +122,8 @@ public void testForceMergeDenseCase() throws Exception { assertNotNull(gaugeTwoDV); var tagsDV = leaf.getSortedSetDocValues("tags"); assertNotNull(tagsDV); - var bytesOneDV = leaf.getBinaryDocValues("bytes_1"); - assertNotNull(bytesOneDV); + var tagBytesDV = leaf.getBinaryDocValues("tags_as_bytes"); + assertNotNull(tagBytesDV); for (int i = 0; i < numDocs; i++) { assertEquals(i, hostNameDV.nextDoc()); int batchIndex = i / numHosts; @@ -167,9 +167,9 @@ public void testForceMergeDenseCase() throws Exception { assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); } - assertEquals(i, bytesOneDV.nextDoc()); - BytesRef bytesOneValue = bytesOneDV.binaryValue(); - assertTrue("unexpected bytes " + bytesOneValue, Arrays.binarySearch(tags, bytesOneValue.utf8ToString()) >= 0); + assertEquals(i, tagBytesDV.nextDoc()); + BytesRef tagBytesValue = tagBytesDV.binaryValue(); + assertTrue("unexpected bytes " + tagBytesValue, Arrays.binarySearch(tags, tagBytesValue.utf8ToString()) >= 0); } } } @@ -326,7 +326,7 @@ public void testForceMergeSparseCase() throws Exception { } if (random().nextBoolean()) { int randomIndex = random().nextInt(tags.length); - d.add(new BinaryDocValuesField("bytes_1", new BytesRef(tags[randomIndex]))); + d.add(new BinaryDocValuesField("tags_as_bytes", new BytesRef(tags[randomIndex]))); } iw.addDocument(d); @@ -361,8 +361,8 @@ public void testForceMergeSparseCase() throws Exception { assertNotNull(tagsDV); var otherTagDV = leaf.getSortedDocValues("other_tag"); assertNotNull(otherTagDV); - var bytesOneDV = leaf.getBinaryDocValues("bytes_1"); - assertNotNull(bytesOneDV); + var tagBytesDV = leaf.getBinaryDocValues("tags_as_bytes"); + assertNotNull(tagBytesDV); for (int i = 0; i < numDocs; i++) { assertEquals(i, hostNameDV.nextDoc()); int batchIndex = i / numHosts; @@ -416,9 +416,9 @@ public void testForceMergeSparseCase() throws Exception { assertTrue("unexpected tag [" + actualTag + "]", Arrays.binarySearch(tags, actualTag) >= 0); } - if (bytesOneDV.advanceExact(i)) { - BytesRef bytesOneValue = bytesOneDV.binaryValue(); - assertTrue("unexpected bytes " + bytesOneValue, Arrays.binarySearch(tags, bytesOneValue.utf8ToString()) >= 0); + if (tagBytesDV.advanceExact(i)) { + BytesRef tagBytesValue = tagBytesDV.binaryValue(); + assertTrue("unexpected bytes " + tagBytesValue, Arrays.binarySearch(tags, tagBytesValue.utf8ToString()) >= 0); } } }