Skip to content

Add support for 512-bit vectors in utf-8 validator #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/main/java/org/simdjson/CharactersClassifier.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ class CharactersClassifier {

private static final ByteVector WHITESPACE_TABLE =
ByteVector.fromArray(
StructuralIndexer.SPECIES,
repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.SPECIES.vectorByteSize() / 4),
StructuralIndexer.BYTE_SPECIES,
repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
0);

private static final ByteVector OP_TABLE =
ByteVector.fromArray(
StructuralIndexer.SPECIES,
repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.SPECIES.vectorByteSize() / 4),
StructuralIndexer.BYTE_SPECIES,
repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
0);

private static byte[] repeat(byte[] array, int n) {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/simdjson/JsonStringScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class JsonStringScanner {
private long prevEscaped = 0;

JsonStringScanner() {
this.backslashMask = ByteVector.broadcast(StructuralIndexer.SPECIES, (byte) '\\');
this.quoteMask = ByteVector.broadcast(StructuralIndexer.SPECIES, (byte) '"');
this.backslashMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '\\');
this.quoteMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '"');
}

JsonStringBlock next(ByteVector chunk0) {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/simdjson/StringParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class StringParser {

private static final byte BACKSLASH = '\\';
private static final byte QUOTE = '"';
private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
private static final int BYTES_PROCESSED = StructuralIndexer.BYTE_SPECIES.vectorByteSize();
private static final int MIN_HIGH_SURROGATE = 0xD800;
private static final int MAX_HIGH_SURROGATE = 0xDBFF;
private static final int MIN_LOW_SURROGATE = 0xDC00;
Expand All @@ -31,7 +31,7 @@ void parseString(byte[] buffer, int idx) {
int src = idx + 1;
int dst = stringBufferIdx + Integer.BYTES;
while (true) {
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, buffer, src);
srcVec.intoArray(stringBuffer, dst);
long backslashBits = srcVec.eq(BACKSLASH).toLong();
long quoteBits = srcVec.eq(QUOTE).toLong();
Expand Down
40 changes: 28 additions & 12 deletions src/main/java/org/simdjson/StructuralIndexer.java
Original file line number Diff line number Diff line change
@@ -1,27 +1,43 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorShape;
import jdk.incubator.vector.VectorSpecies;
import java.lang.invoke.MethodType;

import static jdk.incubator.vector.VectorOperators.UNSIGNED_LE;

class StructuralIndexer {

static final VectorSpecies<Byte> SPECIES;
static final VectorSpecies<Integer> INT_SPECIES;
static final VectorSpecies<Byte> BYTE_SPECIES;
static final int N_CHUNKS;

static {
String species = System.getProperty("org.simdjson.species", "preferred");
SPECIES = switch(species) {
case "preferred" -> ByteVector.SPECIES_PREFERRED;
case "512" -> ByteVector.SPECIES_512;
case "256" -> ByteVector.SPECIES_256;
switch (species) {
case "preferred" -> {
BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
INT_SPECIES = IntVector.SPECIES_PREFERRED;
}
case "512" -> {
BYTE_SPECIES = ByteVector.SPECIES_512;
INT_SPECIES = IntVector.SPECIES_512;
}
case "256" -> {
BYTE_SPECIES = ByteVector.SPECIES_256;
INT_SPECIES = IntVector.SPECIES_256;
}
default -> throw new IllegalArgumentException("Unsupported vector species: " + species);
};
N_CHUNKS = 64 / SPECIES.vectorByteSize();
if (SPECIES != ByteVector.SPECIES_256 && SPECIES != ByteVector.SPECIES_512) {
throw new IllegalArgumentException("Unsupported vector species: " + SPECIES);
}
N_CHUNKS = 64 / BYTE_SPECIES.vectorByteSize();
assertSupportForSpecies(BYTE_SPECIES);
assertSupportForSpecies(INT_SPECIES);
}

/**
 * Verifies that the given species has a vector shape this class accepts.
 *
 * @param species the vector species to check
 * @throws IllegalArgumentException if the species' shape is neither 256-bit nor 512-bit
 */
private static void assertSupportForSpecies(VectorSpecies<?> species) {
    VectorShape shape = species.vectorShape();
    boolean supported = shape == VectorShape.S_256_BIT || shape == VectorShape.S_512_BIT;
    if (!supported) {
        throw new IllegalArgumentException("Unsupported vector species: " + species);
    }
}

Expand All @@ -48,7 +64,7 @@ void step(byte[] buffer, int offset, int blockIndex) {
}

private void step1(byte[] buffer, int offset, int blockIndex) {
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset);
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset);
JsonStringBlock strings = stringScanner.next(chunk0);
JsonCharacterBlock characters = classifier.classify(chunk0);
long unescaped = lteq(chunk0, (byte) 0x1F);
Expand All @@ -75,7 +91,7 @@ private void finishStep(JsonCharacterBlock characters, JsonStringBlock strings,
bitIndexes.write(blockIndex, prevStructurals);
prevStructurals = potentialStructuralStart & ~strings.stringTail();
unescapedCharsError |= strings.nonQuoteInsideString(unescaped);
}
}

private long lteq(ByteVector chunk0, byte scalar) {
long r = chunk0.compare(UNSIGNED_LE, scalar).toLong();
Expand Down
11 changes: 6 additions & 5 deletions src/main/java/org/simdjson/Utf8Validator.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import java.util.Arrays;

public class Utf8Validator {
private static final VectorSpecies<Byte> VECTOR_SPECIES = ByteVector.SPECIES_256;
class Utf8Validator {

private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES;
private static final ByteVector INCOMPLETE_CHECK = getIncompleteCheck();
private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(IntVector.SPECIES_256,
IntVector.SPECIES_256.elementSize() - 1, 1, true);
private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(StructuralIndexer.INT_SPECIES,
StructuralIndexer.INT_SPECIES.elementSize() - 1, 1, true);
private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111);
private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000);

Expand Down Expand Up @@ -39,7 +40,7 @@ static void validate(byte[] inputBytes) {

errors |= secondCheck.compare(VectorOperators.NE, 0).toLong();
}
previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(IntVector.SPECIES_256.length() - 1);
previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(StructuralIndexer.INT_SPECIES.length() - 1);
}

// if the input file doesn't align with the vector width, pad the missing bytes with zero
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/org/simdjson/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ static String padWithSpaces(String str) {
}

static ByteVector chunk(String str, int n) {
return ByteVector.fromArray(StructuralIndexer.SPECIES, str.getBytes(UTF_8), n * StructuralIndexer.SPECIES.vectorByteSize());
return ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, str.getBytes(UTF_8), n * StructuralIndexer.BYTE_SPECIES.vectorByteSize());
}

static byte[] toUtf8(String str) {
Expand Down
4 changes: 1 addition & 3 deletions src/test/java/org/simdjson/Utf8ValidatorTest.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorSpecies;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;

import static org.assertj.core.api.Assertions.*;

class Utf8ValidatorTest {
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.SPECIES;
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES;


/* ASCII / 1 BYTE TESTS */
Expand Down