From ceaf64f4e3028629d9d91af91f0e35f07a47f3f8 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 8 Mar 2026 09:00:38 -0400 Subject: [PATCH 1/3] chore: Add substr benchmarks, refactor --- datafusion/functions/benches/substr.rs | 152 +++++---- datafusion/functions/src/unicode/substr.rs | 309 ++++++------------ .../spark/src/function/string/substring.rs | 8 +- .../test_files/string/string_literal.slt | 4 +- .../source/user-guide/sql/scalar_functions.md | 2 +- 5 files changed, 199 insertions(+), 276 deletions(-) diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs index 37a1e178f5612..182a3848d7eaf 100644 --- a/datafusion/functions/benches/substr.rs +++ b/datafusion/functions/benches/substr.rs @@ -21,47 +21,42 @@ use arrow::util::bench_util::{ create_string_array_with_len, create_string_view_array_with_len, }; use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; -use datafusion_common::DataFusionError; use datafusion_common::config::ConfigOptions; +use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::unicode; use std::hint::black_box; use std::sync::Arc; +fn make_i64_arg(value: i64, size: usize, as_scalar: bool) -> ColumnarValue { + if as_scalar { + ColumnarValue::Scalar(ScalarValue::from(value)) + } else { + ColumnarValue::Array(Arc::new(Int64Array::from(vec![value; size]))) + } +} + fn create_args_without_count( size: usize, str_len: usize, start_half_way: bool, force_view_types: bool, + scalar_start: bool, ) -> Vec { - let start_array = Arc::new(Int64Array::from( - (0..size) - .map(|_| { - if start_half_way { - (str_len / 2) as i64 - } else { - 1i64 - } - }) - .collect::>(), - )); - - if force_view_types { - let string_array = - Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(start_array), - ] + let start_val = if start_half_way { + (str_len / 2) as i64 } else { - let string_array = - Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + 1i64 + }; + let start = make_i64_arg(start_val, size, scalar_start); - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), - ] - } + let string_array: ArrayRef = if force_view_types { + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)) + } else { + Arc::new(create_string_array_with_len::(size, 0.1, str_len)) + }; + + vec![ColumnarValue::Array(string_array), start] } fn create_args_with_count( @@ -69,32 +64,19 @@ fn create_args_with_count( str_len: usize, count_max: usize, force_view_types: bool, + scalar_args: bool, ) -> Vec { - let start_array = - Arc::new(Int64Array::from((0..size).map(|_| 1).collect::>())); let count = count_max.min(str_len) as i64; - let count_array = Arc::new(Int64Array::from( - (0..size).map(|_| count).collect::>(), - )); - - if force_view_types { - let string_array = - Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(start_array), - ColumnarValue::Array(count_array), - ] + let start = make_i64_arg(1i64, size, scalar_args); + let count = make_i64_arg(count, size, scalar_args); + + let string_array: ArrayRef = if force_view_types { + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)) } else { - let string_array = - Arc::new(create_string_array_with_len::(size, 0.1, str_len)); - - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), - ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef), - ] - } + Arc::new(create_string_array_with_len::(size, 0.1, str_len)) + }; + + vec![ColumnarValue::Array(string_array), start, count] } #[expect(clippy::needless_pass_by_value)] @@ -122,22 +104,22 @@ fn criterion_benchmark(c: &mut Criterion) { for size in [1024, 4096] { // string_len = 12, substring_len=6 (see `create_args_without_count`) let len = 12; - let mut group = c.benchmark_group("SHORTER THAN 12"); + let mut group = c.benchmark_group("substr, no count, short strings"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - let args = create_args_without_count::(size, len, true, true); + let args = create_args_without_count::(size, len, true, true, false); group.bench_function( format!("substr_string_view [size={size}, strlen={len}]"), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), ); - let args = create_args_without_count::(size, len, false, false); + let args = create_args_without_count::(size, len, false, false, false); group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| { b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))) }); - let args = create_args_without_count::(size, len, true, false); + let args = create_args_without_count::(size, len, true, false, false); group.bench_function( format!("substr_large_string [size={size}, strlen={len}]"), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), @@ -148,23 +130,23 @@ fn criterion_benchmark(c: &mut Criterion) { // string_len = 128, start=1, count=64, substring_len=64 let len = 128; let count = 64; - let mut group = c.benchmark_group("LONGER THAN 12"); + let mut group = c.benchmark_group("substr, with count, long strings"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - let args = create_args_with_count::(size, len, count, true); + let args = create_args_with_count::(size, len, count, true, false); group.bench_function( format!("substr_string_view [size={size}, count={count}, strlen={len}]",), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), ); - let args = create_args_with_count::(size, len, count, false); + let args = create_args_with_count::(size, len, count, false, false); group.bench_function( format!("substr_string [size={size}, count={count}, strlen={len}]",), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), ); - let args = create_args_with_count::(size, len, count, false); + let args = create_args_with_count::(size, len, count, false, false); group.bench_function( format!("substr_large_string [size={size}, count={count}, strlen={len}]",), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), @@ -175,29 +157,75 @@ fn criterion_benchmark(c: &mut Criterion) { // string_len = 128, start=1, count=6, substring_len=6 let len = 128; let count = 6; - let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12"); + let mut group = c.benchmark_group("substr, short count, long strings"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - let args = create_args_with_count::(size, len, count, true); + let args = create_args_with_count::(size, len, count, true, false); group.bench_function( format!("substr_string_view [size={size}, count={count}, strlen={len}]",), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), ); - let args = create_args_with_count::(size, len, count, false); + let args = create_args_with_count::(size, len, count, false, false); group.bench_function( format!("substr_string [size={size}, count={count}, strlen={len}]",), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), ); - let args = create_args_with_count::(size, len, count, false); + let args = create_args_with_count::(size, len, count, false, false); group.bench_function( format!("substr_large_string [size={size}, count={count}, strlen={len}]",), |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), ); group.finish(); + + // Scalar start, no count + let len = 12; + let mut group = c.benchmark_group("substr, scalar start, short strings"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_without_count::(size, len, true, true, true); + group.bench_function( + format!("substr_string_view [size={size}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_without_count::(size, len, false, false, true); + group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| { + b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))) + }); + + group.finish(); + + // Scalar start and count, longer strings + let len = 128; + let count = 64; + let mut group = c.benchmark_group("substr, scalar args, long strings"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true, true); + group.bench_function( + format!("substr_string_view [size={size}, count={count}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false, true); + group.bench_function( + format!("substr_string [size={size}, count={count}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false, true); + group.bench_function( + format!("substr_large_string [size={size}, count={count}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); } } diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 505388089f198..5641e86898765 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use crate::strings::make_and_append_view; use crate::utils::make_scalar_function; use arrow::array::{ - Array, ArrayIter, ArrayRef, AsArray, Int64Array, NullBufferBuilder, StringArrayType, + Array, ArrayRef, AsArray, Int64Array, NullBufferBuilder, StringArrayType, StringViewArray, StringViewBuilder, }; use arrow::buffer::ScalarBuffer; @@ -53,7 +53,7 @@ use datafusion_macros::user_doc; standard_argument(name = "str", prefix = "String"), argument( name = "start_pos", - description = "Character position to start the substring at. The first character in the string has a position of 1." + description = "Character position to start the substring at. The first character in the string has a position of 1. If the start position is less than 1, it is treated as if it is before the start of the string and the (absolute) number of characters before position 1 is subtracted from `length` (if given). For example, `substr('abc', -3, 6)` returns `'ab'`." ), argument( name = "length", @@ -137,10 +137,7 @@ impl ScalarUDFImpl for SubstrFunc { } } -/// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) -/// substr('alphabet', 3) = 'phabet' -/// substr('alphabet', 3, 2) = 'ph' -/// The implementation uses UTF-8 code points as characters +/// Dispatches `substr` to the appropriate string array implementation. fn substr(args: &[ArrayRef]) -> Result { match args[0].data_type() { DataType::Utf8 => { @@ -162,70 +159,74 @@ fn substr(args: &[ArrayRef]) -> Result { } } -// Convert the given `start` and `count` to valid byte indices within `input` string -// -// Input `start` and `count` are equivalent to PostgreSQL's `substr(s, start, count)` -// `start` is 1-based, if `count` is not provided count to the end of the string -// Input indices are character-based, and return values are byte indices -// The input bounds can be outside string bounds, this function will return -// the intersection between input bounds and valid string bounds -// `input_ascii_only` is used to optimize this function if `input` is ASCII-only -// -// * Example -// 'Hi🌏' in-mem (`[]` for one char, `x` for one byte): [x][x][xxxx] -// `get_true_start_end('Hi🌏', 1, None) -> (0, 6)` -// `get_true_start_end('Hi🌏', 1, 1) -> (0, 1)` -// `get_true_start_end('Hi🌏', -10, 2) -> (0, 0)` +/// Convert the given `start` and `count` to valid byte indices within `input` string. +/// +/// Input `start` and `count` are equivalent to PostgreSQL's `substr(s, start, count)`. +/// `start` is 1-based; if `count` is not provided, returns indices to the end of the string. +/// Input indices are character-based, and return values are byte indices. +/// The input bounds can be outside string bounds; this function will return +/// the intersection between input bounds and valid string bounds. +/// `is_input_ascii_only` is used to optimize this function if `input` is ASCII-only. +/// +/// # Example +/// ```text +/// 'Hi🌏' in-mem (`[]` for one char, `x` for one byte): [x][x][xxxx] +/// get_true_start_end('Hi🌏', 1, None) -> Ok((0, 6)) +/// get_true_start_end('Hi🌏', 1, Some(1)) -> Ok((0, 1)) +/// get_true_start_end('Hi🌏', -10, Some(2)) -> Ok((0, 0)) +/// ``` pub fn get_true_start_end( input: &str, start: i64, - count: Option, + count: Option, is_input_ascii_only: bool, -) -> (usize, usize) { - let start = start.checked_sub(1).unwrap_or(start); +) -> Result<(usize, usize)> { + if let Some(count) = count + && count < 0 + { + return exec_err!("negative count not allowed: {count}"); + } + + // The caller-provided `start` is 1-indexed. + let Some(start) = start.checked_sub(1) else { + return exec_err!("start position overflow: {start}"); + }; let end = match count { - Some(count) => { - let count_i64 = i64::try_from(count).unwrap_or(i64::MAX); - start.saturating_add(count_i64) - } + Some(count) => start.saturating_add(count), None => input.len() as i64, }; - let count_to_end = count.is_some(); let start = start.clamp(0, input.len() as i64) as usize; let end = end.clamp(0, input.len() as i64) as usize; - let count = end - start; - // If input is ASCII-only, byte-based indices equals to char-based indices + // If input is ASCII-only, byte-based indices equal char-based indices if is_input_ascii_only { - return (start, end); + return Ok((start, end)); } - // Otherwise, calculate byte indices from char indices - // Note this decoding is relatively expensive for this simple `substr` function,, - // so the implementation attempts to decode in one pass (and caused the complexity) - let (mut st, mut ed) = (input.len(), input.len()); - let mut start_counting = false; - let mut cnt = 0; - for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() { - if char_cnt == start { - st = byte_cnt; - if count_to_end { - start_counting = true; - } else { + // Otherwise, calculate byte indices from char indices. We initialize both + // `byte_start` and `byte_end` to the string length to handle cases where + // the requested 'start' or 'end' positions are at or beyond the end of the + // string (resulting in an empty substring). + let mut byte_start = input.len(); + let mut byte_end = input.len(); + + for (char_idx, (byte_idx, _)) in input.char_indices().enumerate() { + if char_idx == start { + byte_start = byte_idx; + // If no length is specified, we only need the start offset. + if count.is_none() { break; } } - if start_counting { - if cnt == count { - ed = byte_cnt; - break; - } - cnt += 1; + if char_idx == end { + byte_end = byte_idx; + break; } } - (st, ed) + + Ok((byte_start, byte_end)) } // String characters are variable length encoded in UTF-8, `substr()` function's @@ -275,100 +276,45 @@ pub fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>( } } -// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44 -// From for ByteView fn string_view_substr( string_view_array: &StringViewArray, args: &[ArrayRef], ) -> Result { - let mut views_buf = Vec::with_capacity(string_view_array.len()); - let mut null_builder = NullBufferBuilder::new(string_view_array.len()); - let start_array = as_int64_array(&args[0])?; - let count_array_opt = if args.len() == 2 { - Some(as_int64_array(&args[1])?) - } else { - None - }; + let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?; let enable_ascii_fast_path = enable_ascii_fast_path(&string_view_array, start_array, count_array_opt); - // In either case of `substr(s, i)` or `substr(s, i, cnt)` - // If any of input argument is `NULL`, the result is `NULL` - match args.len() { - 1 => { - for ((str_opt, raw_view), start_opt) in string_view_array - .iter() - .zip(string_view_array.views().iter()) - .zip(start_array.iter()) - { - if let (Some(str), Some(start)) = (str_opt, start_opt) { - let (start, end) = - get_true_start_end(str, start, None, enable_ascii_fast_path); - let substr = &str[start..end]; - - make_and_append_view( - &mut views_buf, - &mut null_builder, - raw_view, - substr, - start as u32, - ); - } else { - null_builder.append_null(); - views_buf.push(0); - } - } - } - 2 => { - let count_array = count_array_opt.unwrap(); - for (((str_opt, raw_view), start_opt), count_opt) in string_view_array - .iter() - .zip(string_view_array.views().iter()) - .zip(start_array.iter()) - .zip(count_array.iter()) - { - if let (Some(str), Some(start), Some(count)) = - (str_opt, start_opt, count_opt) - { - if count < 0 { - return exec_err!( - "negative substring length not allowed: substr(, {start}, {count})" - ); - } else { - if start == i64::MIN { - return exec_err!( - "negative overflow when calculating skip value" - ); - } - let (start, end) = get_true_start_end( - str, - start, - Some(count as u64), - enable_ascii_fast_path, - ); - let substr = &str[start..end]; - - make_and_append_view( - &mut views_buf, - &mut null_builder, - raw_view, - substr, - start as u32, - ); - } - } else { - null_builder.append_null(); - views_buf.push(0); - } - } - } - other => { - return exec_err!( - "substr was called with {other} arguments. It requires 2 or 3." - ); + let mut views_buf = Vec::with_capacity(string_view_array.len()); + let mut null_builder = NullBufferBuilder::new(string_view_array.len()); + + for i in 0..string_view_array.len() { + if string_view_array.is_null(i) + || start_array.is_null(i) + || count_array_opt.map(|a| a.is_null(i)).unwrap_or(false) + { + null_builder.append_null(); + views_buf.push(0); + continue; } + + let string = string_view_array.value(i); + let start = start_array.value(i); + let count = count_array_opt.map(|a| a.value(i)); + let raw_view = string_view_array.views()[i]; + + let (start, end) = + get_true_start_end(string, start, count, enable_ascii_fast_path)?; + let substr = &string[start..end]; + + make_and_append_view( + &mut views_buf, + &mut null_builder, + &raw_view, + substr, + start as u32, + ); } let views_buf = ScalarBuffer::from(views_buf); @@ -390,82 +336,35 @@ fn string_view_substr( fn string_substr<'a, V>(string_array: V, args: &[ArrayRef]) -> Result where - V: StringArrayType<'a>, + V: StringArrayType<'a> + Copy, { let start_array = as_int64_array(&args[0])?; - let count_array_opt = if args.len() == 2 { - Some(as_int64_array(&args[1])?) - } else { - None - }; + let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?; let enable_ascii_fast_path = enable_ascii_fast_path(&string_array, start_array, count_array_opt); - match args.len() { - 1 => { - let iter = ArrayIter::new(string_array); - let mut result_builder = StringViewBuilder::new(); - for (string, start) in iter.zip(start_array.iter()) { - match (string, start) { - (Some(string), Some(start)) => { - let (start, end) = get_true_start_end( - string, - start, - None, - enable_ascii_fast_path, - ); // start, end is byte-based - let substr = &string[start..end]; - result_builder.append_value(substr); - } - _ => { - result_builder.append_null(); - } - } - } - Ok(Arc::new(result_builder.finish()) as ArrayRef) - } - 2 => { - let iter = ArrayIter::new(string_array); - let count_array = count_array_opt.unwrap(); - let mut result_builder = StringViewBuilder::new(); - - for ((string, start), count) in - iter.zip(start_array.iter()).zip(count_array.iter()) - { - match (string, start, count) { - (Some(string), Some(start), Some(count)) => { - if count < 0 { - return exec_err!( - "negative substring length not allowed: substr(, {start}, {count})" - ); - } else { - if start == i64::MIN { - return exec_err!( - "negative overflow when calculating skip value" - ); - } - let (start, end) = get_true_start_end( - string, - start, - Some(count as u64), - enable_ascii_fast_path, - ); // start, end is byte-based - let substr = &string[start..end]; - result_builder.append_value(substr); - } - } - _ => { - result_builder.append_null(); - } - } - } - Ok(Arc::new(result_builder.finish()) as ArrayRef) - } - other => { - exec_err!("substr was called with {other} arguments. It requires 2 or 3.") + let mut result_builder = StringViewBuilder::new(); + + for i in 0..string_array.len() { + if string_array.is_null(i) + || start_array.is_null(i) + || count_array_opt.map(|a| a.is_null(i)).unwrap_or(false) + { + result_builder.append_null(); + continue; } + + let string = string_array.value(i); + let start = start_array.value(i); + let count = count_array_opt.map(|a| a.value(i)); + + let (start, end) = + get_true_start_end(string, start, count, enable_ascii_fast_path)?; + result_builder.append_value(&string[start..end]); } + + Ok(Arc::new(result_builder.finish()) as ArrayRef) } #[cfg(test)] @@ -778,7 +677,7 @@ mod tests { ColumnarValue::Scalar(ScalarValue::from(1i64)), ColumnarValue::Scalar(ScalarValue::from(-1i64)), ], - exec_err!("negative substring length not allowed: substr(, 1, -1)"), + exec_err!("negative count not allowed: -1"), &str, Utf8View, StringViewArray @@ -815,7 +714,7 @@ mod tests { ColumnarValue::Scalar(ScalarValue::from("abc")), ColumnarValue::Scalar(ScalarValue::from(i64::MIN)), ], - Ok(Some("abc")), + exec_err!("start position overflow: -9223372036854775808"), &str, Utf8View, StringViewArray @@ -827,7 +726,7 @@ mod tests { ColumnarValue::Scalar(ScalarValue::from(i64::MIN)), ColumnarValue::Scalar(ScalarValue::from(1i64)), ], - exec_err!("negative overflow when calculating skip value"), + exec_err!("start position overflow: -9223372036854775808"), &str, Utf8View, StringViewArray diff --git a/datafusion/spark/src/function/string/substring.rs b/datafusion/spark/src/function/string/substring.rs index 524262b12f193..76abba68cdc36 100644 --- a/datafusion/spark/src/function/string/substring.rs +++ b/datafusion/spark/src/function/string/substring.rs @@ -244,12 +244,8 @@ where let adjusted_start = spark_start_to_datafusion_start(start, string_len); - let (byte_start, byte_end) = get_true_start_end( - string, - adjusted_start, - len_opt.map(|l| l as u64), - is_ascii, - ); + let (byte_start, byte_end) = + get_true_start_end(string, adjusted_start, len_opt, is_ascii)?; let substr = &string[byte_start..byte_end]; builder.append_value(substr); } diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 6cf02218872df..0100ac1fe7396 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -138,10 +138,10 @@ SELECT substr(1, 3) statement error Function 'substr' failed to match any signature SELECT substr(1, 3, 4) -statement error Execution error: negative substring length not allowed +statement error Execution error: negative count not allowed select substr(arrow_cast('foo', 'Utf8View'), 1, -1); -statement error Execution error: negative substring length not allowed +statement error Execution error: negative count not allowed select substr('', 1, -1); # StringView scalar to StringView scalar diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 254151c2c20eb..8ae6f05f8d7bf 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1974,7 +1974,7 @@ substr(str, start_pos[, length]) #### Arguments - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. -- **start_pos**: Character position to start the substring at. The first character in the string has a position of 1. +- **start_pos**: Character position to start the substring at. The first character in the string has a position of 1. If the start position is less than 1, it is treated as if it is before the start of the string and the (absolute) number of characters before position 1 is subtracted from `length` (if given). For example, `substr('abc', -3, 6)` returns `'ab'`. - **length**: Number of characters to extract. If not specified, returns the rest of the string after the start position. #### Example From c21399f753638ac9febd18a31a80d3ea38f9c303 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 8 Mar 2026 11:10:22 -0400 Subject: [PATCH 2/3] Improve benchmark coverage --- datafusion/functions/benches/substr.rs | 46 ++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs index 182a3848d7eaf..350f700817dd4 100644 --- a/datafusion/functions/benches/substr.rs +++ b/datafusion/functions/benches/substr.rs @@ -181,9 +181,9 @@ fn criterion_benchmark(c: &mut Criterion) { group.finish(); - // Scalar start, no count + // Scalar start, no count, short strings let len = 12; - let mut group = c.benchmark_group("substr, scalar start, short strings"); + let mut group = c.benchmark_group("substr, scalar start, no count, short strings"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); @@ -200,7 +200,47 @@ fn criterion_benchmark(c: &mut Criterion) { group.finish(); - // Scalar start and count, longer strings + // Scalar start, no count, long strings + let len = 128; + let mut group = c.benchmark_group("substr, scalar start, no count, long strings"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_without_count::(size, len, true, true, true); + group.bench_function( + format!("substr_string_view [size={size}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_without_count::(size, len, false, false, true); + group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| { + b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))) + }); + + group.finish(); + + // Scalar start and count, short strings + let len = 12; + let count = 6; + let mut group = c.benchmark_group("substr, scalar args, short strings"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true, true); + group.bench_function( + format!("substr_string_view [size={size}, count={count}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false, true); + group.bench_function( + format!("substr_string [size={size}, count={count}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); + + // Scalar start and count, long strings let len = 128; let count = 64; let mut group = c.benchmark_group("substr, scalar args, long strings"); From 06566cf813174364d1e9d24ebeccf6bbb1c464bc Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 8 Mar 2026 11:16:29 -0400 Subject: [PATCH 3/3] cargo fmt --- datafusion/functions/benches/substr.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs index 350f700817dd4..4ea90c3708ca8 100644 --- a/datafusion/functions/benches/substr.rs +++ b/datafusion/functions/benches/substr.rs @@ -183,7 +183,8 @@ fn criterion_benchmark(c: &mut Criterion) { // Scalar start, no count, short strings let len = 12; - let mut group = c.benchmark_group("substr, scalar start, no count, short strings"); + let mut group = + c.benchmark_group("substr, scalar start, no count, short strings"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10);