Skip to content

Commit 3a93166

Browse files
authored
perf: Use Cow in get_format_string in FFI_ArrowSchema (#6853) (#6937)
* add cast_decimal bench
* format
* save
* revert
* criterion disable default features
* address feedback
1 parent e4cb337 commit 3a93166

File tree

3 files changed

+98
-52
lines changed

3 files changed

+98
-52
lines changed

arrow-schema/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,8 @@ features = ["ffi"]
4747
[dev-dependencies]
4848
serde_json = "1.0"
4949
bincode = { version = "1.3.3", default-features = false }
50+
criterion = { version = "0.5", default-features = false }
51+
52+
[[bench]]
53+
name = "ffi"
54+
harness = false

arrow-schema/benches/ffi.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow_schema::ffi::FFI_ArrowSchema;
19+
use arrow_schema::{DataType, Field};
20+
use criterion::*;
21+
use std::sync::Arc;
22+
23+
/// Registers the `ffi_arrow_schema_try_from` benchmark.
///
/// The benchmark calls `FFI_ArrowSchema::try_from` on a reference to a
/// `DataType::Struct` built from five `Utf8` fields named `c1` through `c5`.
fn criterion_benchmark(c: &mut Criterion) {
24+
let fields = vec![
25+
Arc::new(Field::new("c1", DataType::Utf8, false)),
26+
Arc::new(Field::new("c2", DataType::Utf8, false)),
27+
Arc::new(Field::new("c3", DataType::Utf8, false)),
28+
Arc::new(Field::new("c4", DataType::Utf8, false)),
29+
Arc::new(Field::new("c5", DataType::Utf8, false)),
30+
];
// The struct type is constructed here, outside the closure handed to
// `b.iter`, so the closure body contains only the `try_from` call.
31+
let data_type = DataType::Struct(fields.into());
32+
c.bench_function("ffi_arrow_schema_try_from", |b| {
33+
b.iter(|| FFI_ArrowSchema::try_from(&data_type));
34+
});
35+
}
36+
37+
criterion_group!(benches, criterion_benchmark);
38+
criterion_main!(benches);

arrow-schema/src/ffi.rs

Lines changed: 55 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use crate::{
3838
ArrowError, DataType, Field, FieldRef, IntervalUnit, Schema, TimeUnit, UnionFields, UnionMode,
3939
};
4040
use bitflags::bitflags;
41+
use std::borrow::Cow;
4142
use std::sync::Arc;
4243
use std::{
4344
collections::HashMap,
@@ -685,66 +686,68 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
685686
}
686687
}
687688

688-
fn get_format_string(dtype: &DataType) -> Result<String, ArrowError> {
689+
fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError> {
689690
match dtype {
690-
DataType::Null => Ok("n".to_string()),
691-
DataType::Boolean => Ok("b".to_string()),
692-
DataType::Int8 => Ok("c".to_string()),
693-
DataType::UInt8 => Ok("C".to_string()),
694-
DataType::Int16 => Ok("s".to_string()),
695-
DataType::UInt16 => Ok("S".to_string()),
696-
DataType::Int32 => Ok("i".to_string()),
697-
DataType::UInt32 => Ok("I".to_string()),
698-
DataType::Int64 => Ok("l".to_string()),
699-
DataType::UInt64 => Ok("L".to_string()),
700-
DataType::Float16 => Ok("e".to_string()),
701-
DataType::Float32 => Ok("f".to_string()),
702-
DataType::Float64 => Ok("g".to_string()),
703-
DataType::BinaryView => Ok("vz".to_string()),
704-
DataType::Binary => Ok("z".to_string()),
705-
DataType::LargeBinary => Ok("Z".to_string()),
706-
DataType::Utf8View => Ok("vu".to_string()),
707-
DataType::Utf8 => Ok("u".to_string()),
708-
DataType::LargeUtf8 => Ok("U".to_string()),
709-
DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")),
710-
DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")),
711-
DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")),
712-
DataType::Decimal256(precision, scale) => Ok(format!("d:{precision},{scale},256")),
713-
DataType::Date32 => Ok("tdD".to_string()),
714-
DataType::Date64 => Ok("tdm".to_string()),
715-
DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()),
716-
DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".to_string()),
717-
DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".to_string()),
718-
DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".to_string()),
719-
DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".to_string()),
720-
DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".to_string()),
721-
DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".to_string()),
722-
DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".to_string()),
723-
DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{tz}")),
724-
DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{tz}")),
725-
DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{tz}")),
726-
DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{tz}")),
727-
DataType::Duration(TimeUnit::Second) => Ok("tDs".to_string()),
728-
DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".to_string()),
729-
DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()),
730-
DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".to_string()),
731-
DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".to_string()),
732-
DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".to_string()),
733-
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".to_string()),
734-
DataType::List(_) => Ok("+l".to_string()),
735-
DataType::LargeList(_) => Ok("+L".to_string()),
736-
DataType::Struct(_) => Ok("+s".to_string()),
737-
DataType::Map(_, _) => Ok("+m".to_string()),
738-
DataType::RunEndEncoded(_, _) => Ok("+r".to_string()),
691+
DataType::Null => Ok("n".into()),
692+
DataType::Boolean => Ok("b".into()),
693+
DataType::Int8 => Ok("c".into()),
694+
DataType::UInt8 => Ok("C".into()),
695+
DataType::Int16 => Ok("s".into()),
696+
DataType::UInt16 => Ok("S".into()),
697+
DataType::Int32 => Ok("i".into()),
698+
DataType::UInt32 => Ok("I".into()),
699+
DataType::Int64 => Ok("l".into()),
700+
DataType::UInt64 => Ok("L".into()),
701+
DataType::Float16 => Ok("e".into()),
702+
DataType::Float32 => Ok("f".into()),
703+
DataType::Float64 => Ok("g".into()),
704+
DataType::BinaryView => Ok("vz".into()),
705+
DataType::Binary => Ok("z".into()),
706+
DataType::LargeBinary => Ok("Z".into()),
707+
DataType::Utf8View => Ok("vu".into()),
708+
DataType::Utf8 => Ok("u".into()),
709+
DataType::LargeUtf8 => Ok("U".into()),
710+
DataType::FixedSizeBinary(num_bytes) => Ok(Cow::Owned(format!("w:{num_bytes}"))),
711+
DataType::FixedSizeList(_, num_elems) => Ok(Cow::Owned(format!("+w:{num_elems}"))),
712+
DataType::Decimal128(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale}"))),
713+
DataType::Decimal256(precision, scale) => {
714+
Ok(Cow::Owned(format!("d:{precision},{scale},256")))
715+
}
716+
DataType::Date32 => Ok("tdD".into()),
717+
DataType::Date64 => Ok("tdm".into()),
718+
DataType::Time32(TimeUnit::Second) => Ok("tts".into()),
719+
DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".into()),
720+
DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".into()),
721+
DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".into()),
722+
DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".into()),
723+
DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".into()),
724+
DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".into()),
725+
DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".into()),
726+
DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(Cow::Owned(format!("tss:{tz}"))),
727+
DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(Cow::Owned(format!("tsm:{tz}"))),
728+
DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(Cow::Owned(format!("tsu:{tz}"))),
729+
DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(Cow::Owned(format!("tsn:{tz}"))),
730+
DataType::Duration(TimeUnit::Second) => Ok("tDs".into()),
731+
DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".into()),
732+
DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".into()),
733+
DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".into()),
734+
DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".into()),
735+
DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".into()),
736+
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
737+
DataType::List(_) => Ok("+l".into()),
738+
DataType::LargeList(_) => Ok("+L".into()),
739+
DataType::Struct(_) => Ok("+s".into()),
740+
DataType::Map(_, _) => Ok("+m".into()),
741+
DataType::RunEndEncoded(_, _) => Ok("+r".into()),
739742
DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type),
740743
DataType::Union(fields, mode) => {
741744
let formats = fields
742745
.iter()
743746
.map(|(t, _)| t.to_string())
744747
.collect::<Vec<_>>();
745748
match mode {
746-
UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))),
747-
UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))),
749+
UnionMode::Dense => Ok(Cow::Owned(format!("{}:{}", "+ud", formats.join(",")))),
750+
UnionMode::Sparse => Ok(Cow::Owned(format!("{}:{}", "+us", formats.join(",")))),
748751
}
749752
}
750753
other => Err(ArrowError::CDataInterface(format!(

0 commit comments

Comments
 (0)