Skip to content

Add Apache Arrow as a bulk ingestion format #125040

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ public void execute(Task t) {
"--add-opens=java.base/java.nio.file=ALL-UNNAMED",
"--add-opens=java.base/java.time=ALL-UNNAMED",
"--add-opens=java.management/java.lang.management=ALL-UNNAMED",
// Arrow: open java.nio for direct memory access (ALL-UNNAMED may need to be replaced by org.apache.arrow.memory.core once modularized)
"--add-opens=java.base/java.nio=ALL-UNNAMED",
// Define the allocation manager type to avoid classpath scanning to locate one.
"-Darrow.allocation.manager.type=Unsafe",
"-XX:+HeapDumpOnOutOfMemoryError"
);

Expand Down
9 changes: 9 additions & 0 deletions distribution/src/config/jvm.options
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,12 @@

## GC logging
-Xlog:gc*,gc+age=trace,safepoint:file=gc.log:utctime,level,pid,tags:filecount=32,filesize=64m

## Arrow
# Allow accessing a private field of java.nio.Buffer for direct memory access.
# See org.apache.arrow.memory.util.MemoryUtil and https://arrow.apache.org/docs/java/install.html
# See also libs/arrow/src/main/java/module-info.java-disabled for why we open to ALL-UNNAMED
# instead of limiting to org.apache.arrow.memory.core
--add-opens=java.base/java.nio=ALL-UNNAMED
# Define the allocation manager type to avoid classpath scanning to locate one.
-Darrow.allocation.manager.type=Unsafe
5 changes: 5 additions & 0 deletions docs/changelog/125040.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 125040
summary: Add Apache Arrow as a bulk ingestion format
area: CRUD
type: enhancement
issues: []
45 changes: 30 additions & 15 deletions gradle/verification-metadata.xml
Original file line number Diff line number Diff line change
Expand Up @@ -676,9 +676,14 @@
<sha256 value="baf7d6ea97ce606c53e11b6854ba5f2ce7ef5c24dddf0afa18d1260bd25b002c" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="com.google.flatbuffers" name="flatbuffers-java" version="23.5.26">
<artifact name="flatbuffers-java-23.5.26.jar">
<sha256 value="8d10cac2ea9878896077ba437d76fdb1b9a07f55a863c560bb8a024b04103f8b" origin="Generated by Gradle"/>
<component group="com.google.errorprone" name="error_prone_annotations" version="2.31.0">
<artifact name="error_prone_annotations-2.31.0.jar">
<sha256 value="ba8d20fb1fc181672552b323f3c7549b30be1d57c49dd5835e2729e7647d9cfa" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="com.google.flatbuffers" name="flatbuffers-java" version="24.3.25">
<artifact name="flatbuffers-java-24.3.25.jar">
<sha256 value="f5b50034a53debda980aca803b8b06949f93a40163bc1db6cd69581d3718e355" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="com.google.googlejavaformat" name="google-java-format" version="1.19.2">
Expand Down Expand Up @@ -2043,24 +2048,24 @@
<sha256 value="5c8551990307a032336d98ddaed549a39a689f07d4d4c6b950601bf22b3d6a1b" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.apache.arrow" name="arrow-format" version="16.1.0">
<artifact name="arrow-format-16.1.0.jar">
<sha256 value="ad97e0fc72e193b1de3cbce4818d1ff16e81673fd523d001e8d2774bde40ee6c" origin="Generated by Gradle"/>
<component group="org.apache.arrow" name="arrow-format" version="18.2.0">
<artifact name="arrow-format-18.2.0.jar">
<sha256 value="6d977352a232559c97a38dfdd786f014e87489d3bb3f33f12cfed30bdd164669" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.apache.arrow" name="arrow-memory-core" version="16.1.0">
<artifact name="arrow-memory-core-16.1.0.jar">
<sha256 value="da7af1a1a899bd5a1b6c71284243b9f3c0e1098f0cb10cd7be4b8b455ced79dd" origin="Generated by Gradle"/>
<component group="org.apache.arrow" name="arrow-memory-core" version="18.2.0">
<artifact name="arrow-memory-core-18.2.0.jar">
<sha256 value="f2867e5267d0ae4eb97ea5cac47bba90ca73522c53aa702a23258f9754ca8b8e" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.apache.arrow" name="arrow-memory-unsafe" version="16.1.0">
<artifact name="arrow-memory-unsafe-16.1.0.jar">
<sha256 value="6534eded25f2c30593416a294c1047f0b017baa9906d98f6f3270737b076c745" origin="Generated by Gradle"/>
<component group="org.apache.arrow" name="arrow-memory-unsafe" version="18.2.0">
<artifact name="arrow-memory-unsafe-18.2.0.jar">
<sha256 value="78efe5893361e3b2493b242d9d0550e6558a85d95103352925267825da748874" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.apache.arrow" name="arrow-vector" version="16.1.0">
<artifact name="arrow-vector-16.1.0.jar">
<sha256 value="c5837b3aa24dfd93759f57bc5759b9a8fbb5bf3912d55994d70cabb904436aab" origin="Generated by Gradle"/>
<component group="org.apache.arrow" name="arrow-vector" version="18.2.0">
<artifact name="arrow-vector-18.2.0.jar">
<sha256 value="29fe15fac68e30fdd59364bcd8b5058490cc85269fd47fb460abaff1fb61243d" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.apache.avro" name="avro" version="1.7.4">
Expand Down Expand Up @@ -3514,6 +3519,11 @@
<sha256 value="ccaedd33af0b7894d9f2f3b644f4d19e43928e32902e61ac4d10777830f5aac7" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.checkerframework" name="checker-qual" version="3.48.1">
<artifact name="checker-qual-3.48.1.jar">
<sha256 value="21e8dfe8103e125d96a329653ca81e87ac430326dbdbf299cea3dc1ae3f039a2" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.checkerframework" name="checker-qual" version="3.49.0">
<artifact name="checker-qual-3.49.0.jar">
<sha256 value="8b9d9a36eaaf7c0fc26503c83cd97d8c9c0f9e2913cc2a6e92ac26c735d4dcbe" origin="Generated by Gradle"/>
Expand Down Expand Up @@ -4009,6 +4019,11 @@
<sha256 value="95d40913be28dfd439cefea9170c40898ea84f11f25e6ff8de50339b8a7b5e3e" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.immutables" name="value-annotations" version="2.10.1">
<artifact name="value-annotations-2.10.1.jar">
<sha256 value="9ef9629d2b710d9d705aa154457e1ba33b8c12118129b7c400bf65d923b46f26" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="org.ini4j" name="ini4j" version="0.5.2">
<artifact name="ini4j-0.5.2.jar">
<sha256 value="631656eb38639b0ae41161f706ff7fbe04313b5b8f42892da5ec656390031fc6" origin="Generated by Gradle"/>
Expand Down
82 changes: 82 additions & 0 deletions libs/arrow/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

// Notes:
// - additional JVM arguments are added to distribution/src/config/jvm.options and ElasticsearchTestBasePlugin
// - additional permissions are added to server/src/main/resources/org/elasticsearch/bootstrap/security.policy

import org.elasticsearch.gradle.internal.precommit.CheckForbiddenApisTask

apply plugin: 'elasticsearch.build'
apply plugin: 'elasticsearch.publish'

// Single version used for every org.apache.arrow artifact declared in the dependencies block below
// (arrow-vector, arrow-format, arrow-memory-core, arrow-memory-unsafe).
// gradle/verification-metadata.xml lists checksums for these artifacts at this exact version,
// so the two must be updated together.
var arrowVersion = "18.2.0"

// Declares the four Arrow artifacts together with the libraries listed for each of them.
// The section comments (arrow-vector, arrow-format, arrow-memory-core, arrow-memory-unsafe)
// name the Arrow artifact that each group of libraries belongs to. Everything is exposed as `api`.
dependencies {
//implementation(project(":libs:x-content"))

// jackson-core is provided by :libs:x-content:impl. If declared here, there's a module issue that prevents ES from starting:
// NOTE(review): "declared here" appears to refer to the commented-out :libs:x-content:impl project dependency
// below the stack trace (jackson-core itself IS declared directly further down) — confirm.
//
// fatal exception while booting Elasticsearch java.lang.IllegalAccessError: class org.elasticsearch.xcontent.provider.json.JsonXContentImpl (in module org.elasticsearch.xcontent.impl) cannot access class com.fasterxml.jackson.core.JsonFactoryBuilder (in unnamed module @0x4727e5fc) because module org.elasticsearch.xcontent.impl does not read unnamed module @0x4727e5fc
// at [email protected]/org.elasticsearch.xcontent.provider.json.JsonXContentImpl.<clinit>(JsonXContentImpl.java:50)
// at [email protected]/org.elasticsearch.xcontent.provider.XContentProviderImpl$2.XContent(XContentProviderImpl.java:54)
// at [email protected]/org.elasticsearch.xcontent.json.JsonXContent.<clinit>(JsonXContent.java:37)
// at [email protected]/org.elasticsearch.xcontent.XContentType.<clinit>(XContentType.java:28)
// at [email protected]/org.elasticsearch.common.settings.Setting.arrayToParsableString(Setting.java:1883)
//implementation(project(":libs:x-content:impl"))

// arrow-vector
api("org.apache.arrow:arrow-vector:${arrowVersion}")
// Jackson artifacts all use the project-wide Jackson version (versions.jackson).
api("com.fasterxml.jackson.core:jackson-core:${versions.jackson}")
api("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}")
api("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}")
api("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${versions.jackson}")

// Hard-coded versions in this block (flatbuffers, value-annotations, error_prone_annotations, checker-qual)
// have matching checksum entries in gradle/verification-metadata.xml.
api("com.google.flatbuffers:flatbuffers-java:24.3.25")
// Arrow 18 itself declares commons-codec 1.17.1; the project-wide version (versions.commonscodec) is used here.
api("commons-codec:commons-codec:${versions.commonscodec}") // Arrow 18 -> commons-codec 1.17.1
api("org.slf4j:slf4j-api:${versions.slf4j}")
api("org.immutables:value-annotations:2.10.1") // provided dependency

// arrow-format
api("org.apache.arrow:arrow-format:${arrowVersion}")
// also depends on flatbuffers

// arrow-memory-core
api("org.apache.arrow:arrow-memory-core:${arrowVersion}")
api("com.google.errorprone:error_prone_annotations:2.31.0") // provided dependency
api('org.checkerframework:checker-qual:3.48.1') // provided dependency
// also depends on value-annotations (provided dependency)

// arrow-memory-unsafe
api("org.apache.arrow:arrow-memory-unsafe:${arrowVersion}")
// also depends on value-annotations (provided dependency)

// Test framework, minus the org.elasticsearch:arrow module.
// NOTE(review): presumably that module is this library itself and the exclusion avoids pulling
// a second copy of it through the test framework — confirm.
testImplementation(project(":test:framework")) {
exclude group: 'org.elasticsearch', module: 'arrow'
}
}

// Configures the dependencyLicenses check: artifact names matching each regex on the left
// are mapped to the single name on the right.
// NOTE(review): presumably this lets all jackson-*, arrow-* and value-annotations jars share one set of
// license/notice files under libs/arrow/licenses instead of one per artifact — confirm against the task.
tasks.named("dependencyLicenses").configure {
mapping from: /jackson-.*/, to: 'jackson'
mapping from: /arrow-.*/, to: 'arrow'
mapping from: /value-annotations.*/, to: 'org-immutables'
}

// Tells the thirdPartyAudit task to ignore violations reported for Arrow's MemoryUtil and its
// first anonymous/synthetic inner class (MemoryUtil$1).
// NOTE(review): presumably these are flagged because MemoryUtil accesses a private field of
// java.nio.Buffer for direct memory access (see the Arrow section of
// distribution/src/config/jvm.options) — confirm the exact violation.
tasks.named("thirdPartyAudit").configure {
ignoreViolations(
'org.apache.arrow.memory.util.MemoryUtil',
'org.apache.arrow.memory.util.MemoryUtil$1',
)
}

// Applies to every forbidden-APIs check task in this project.
tasks.withType(CheckForbiddenApisTask).configureEach {
// Remove server signatures as they will fail on classes missing in this lib's classpath,
// like org.apache.lucene.util.IOUtils
// Only the 'jdk-signatures' file is passed to the replacement call below.
replaceSignatureFiles('jdk-signatures')
}
17 changes: 17 additions & 0 deletions libs/arrow/licenses/commons-codec-NOTICE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Apache Commons Codec
Copyright 2002-2015 The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java
contains test data from http://aspell.net/test/orig/batch0.tab.
Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org)

===============================================================================

The content of package org.apache.commons.codec.language.bm has been translated
from the original php source code available at http://stevemorse.org/phoneticinfo.htm
with permission from the original authors.
Original source copyright:
Copyright (c) 2008 Alexander Beider & Stephen P. Morse.
Loading
Loading