From c124860e94edd8e43a7641a354e0724a161b740b Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 11 Jan 2026 22:44:42 -0500 Subject: [PATCH 1/6] Extract out date time parsing to a DateTimeParser trait with chrono & jiff implementations. Overall jiff seems slightly faster in the case where parsing has no errors but slower when using multiple formats (higher cost for Error handling). --- Cargo.lock | 32 + .../examples/builtin_functions/date_time.rs | 2 +- datafusion/common/src/config.rs | 6 + datafusion/core/Cargo.toml | 1 + datafusion/functions/Cargo.toml | 7 +- datafusion/functions/benches/to_timestamp.rs | 731 ++++++++++++------ datafusion/functions/src/datetime/common.rs | 246 +----- datafusion/functions/src/datetime/mod.rs | 9 +- datafusion/functions/src/datetime/parser.rs | 702 +++++++++++++++++ datafusion/functions/src/datetime/to_date.rs | 41 +- .../functions/src/datetime/to_timestamp.rs | 111 ++- .../test_files/datetime/dates.slt | 7 +- .../test_files/datetime/timestamps.slt | 15 +- .../test_files/information_schema.slt | 2 + .../test_files/to_timestamp_timezone.slt | 2 +- docs/source/user-guide/configs.md | 1 + 16 files changed, 1430 insertions(+), 485 deletions(-) create mode 100644 datafusion/functions/src/datetime/parser.rs diff --git a/Cargo.lock b/Cargo.lock index 9c8e47cb6e140..6cba1d87f6762 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2220,9 +2220,12 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", + "dyn-eq", + "dyn-hash", "env_logger", "hex", "itertools 0.14.0", + "jiff", "log", "md-5", "num-traits", @@ -2783,6 +2786,18 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "dyn-eq" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c2d035d21af5cde1a6f5c7b444a5bf963520a9f142e5d06931178433d7d5388" + +[[package]] +name = "dyn-hash" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fdab65db9274e0168143841eb8f864a0a21f8b1b8d2ba6812bbe6024346e99e" + [[package]] name = "educe" version = "0.6.0" @@ -3748,10 +3763,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e67e8da4c49d6d9909fe03361f9b620f58898859f5c7aded68351e85e71ecf50" dependencies = [ "jiff-static", + "jiff-tzdb-platform", "log", "portable-atomic", "portable-atomic-util", "serde_core", + "windows-sys 0.61.2", ] [[package]] @@ -3765,6 +3782,21 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "jiff-tzdb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jobserver" version = "0.1.34" diff --git a/datafusion-examples/examples/builtin_functions/date_time.rs b/datafusion-examples/examples/builtin_functions/date_time.rs index 08d4bc6e29978..7b430f1d2d168 100644 --- a/datafusion-examples/examples/builtin_functions/date_time.rs +++ b/datafusion-examples/examples/builtin_functions/date_time.rs @@ -417,7 +417,7 @@ async fn query_to_timestamp() -> Result<()> { .collect() .await; - let expected = "Execution error: Error parsing timestamp from '01-14-2023 
01/01/30' using format '%d-%m-%Y %H:%M:%S': input is out of range"; + let expected = "Error parsing timestamp from '01-14-2023 01/01/30' using formats: [\"%d-%m-%Y %H:%M:%S\"]: input is out of range"; assert_contains!(result.unwrap_err().to_string(), expected); // note that using arrays for the chrono formats is not supported diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 87344914d2f7e..3d0de69e8b24f 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -505,11 +505,17 @@ config_namespace! { /// Defaults to the number of CPU cores on the system pub target_partitions: usize, transform = ExecutionOptions::normalized_parallelism, default = get_available_parallelism() + /// The date time parser to use when parsing date time values. + /// + /// Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature is enabled. + pub date_time_parser: Option, default = Some("chrono".to_string()) + /// The default time zone /// /// Some functions, e.g. `now` return timestamps in this time zone pub time_zone: Option, default = None + /// Parquet options pub parquet: ParquetOptions, default = Default::default() diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index bd88ed3b9ca1e..4e4ef4ffe0e83 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -109,6 +109,7 @@ unicode_expressions = [ "datafusion-functions/unicode_expressions", ] extended_tests = [] +jiff = ["datafusion-functions/jiff"] [dependencies] arrow = { workspace = true } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 4ecd7a597814b..586dbc91117ef 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -52,6 +52,8 @@ default = [ ] # enable encode/decode functions encoding_expressions = ["base64", "hex"] +# enable jiff timestamp parsing +jiff = ["dep:jiff"] # enable math functions math_expressions = [] # enable regular expressions @@ -78,8 +80,11 @@ datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-macros = { workspace = true } +dyn-eq = "0.1.3" +dyn-hash = "1.0.0" hex = { workspace = true, optional = true } itertools = { workspace = true } +jiff = { version = "0.2.18", optional = true } log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } num-traits = { workspace = true } @@ -115,7 +120,7 @@ required-features = ["string_expressions"] [[bench]] harness = false name = "to_timestamp" -required-features = ["datetime_expressions"] +required-features = ["datetime_expressions", "jiff"] [[bench]] harness = false diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs index ed865fa6e8d50..4eaf4303be237 100644 --- a/datafusion/functions/benches/to_timestamp.rs +++ b/datafusion/functions/benches/to_timestamp.rs @@ -27,14 +27,18 @@ use arrow::datatypes::{DataType, Field, TimeUnit}; use criterion::{Criterion, criterion_group, criterion_main}; use datafusion_common::config::ConfigOptions; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::datetime::parser::DateTimeParser; +use datafusion_functions::datetime::parser::chrono::ChronoDateTimeParser; +use datafusion_functions::datetime::parser::jiff::JiffDateTimeParser; use datafusion_functions::datetime::to_timestamp; +use itertools::izip; fn data() -> StringArray { let data: Vec<&str> = vec![ "1997-01-31T09:26:56.123Z", 
"1997-01-31T09:26:56.123-05:00", "1997-01-31 09:26:56.123-05:00", - "2023-01-01 04:05:06.789 -08", + "2023-01-01 04:05:06.789-08", "1997-01-31T09:26:56.123", "1997-01-31 09:26:56.123", "1997-01-31 09:26:56", @@ -46,261 +50,558 @@ fn data() -> StringArray { StringArray::from(data) } -fn data_with_formats() -> (StringArray, StringArray, StringArray, StringArray) { +fn data_with_formats() -> ( + StringArray, + StringArray, + StringArray, + StringArray, + StringArray, + StringArray, + StringArray, +) { let mut inputs = StringBuilder::new(); let mut format1_builder = StringBuilder::with_capacity(2, 10); let mut format2_builder = StringBuilder::with_capacity(2, 10); let mut format3_builder = StringBuilder::with_capacity(2, 10); + let mut jiff_format1_builder = StringBuilder::with_capacity(2, 10); + let mut jiff_format2_builder = StringBuilder::with_capacity(2, 10); + let mut jiff_format3_builder = StringBuilder::with_capacity(2, 10); - inputs.append_value("1997-01-31T09:26:56.123Z"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z"); + inputs.append_value("1997-01-31T09:26:56UTC"); + + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%+"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%Q"); inputs.append_value("1997-01-31T09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%:z"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%:z"); inputs.append_value("1997-01-31 09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%:z"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%:z"); - inputs.append_value("2023-01-01 04:05:06.789 -08"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + inputs.append_value("2023-01-01 04:05:06.789 -0800"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %z"); + inputs.append_value("1997-01-31T09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); + inputs.append_value("1997-01-31 09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); + 
jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); + inputs.append_value("1997-01-31 09:26:56"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H:%M:%S"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S"); + inputs.append_value("1997-01-31 092656"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H%M%S"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H%M%S"); + inputs.append_value("1997-01-31 092656+04:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); + inputs.append_value("Sun Jul 8 00:34:60 2001"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d 00:00:00"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%c"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%a %b %e %H:%M:%S %Y"); ( inputs.finish(), format1_builder.finish(), format2_builder.finish(), format3_builder.finish(), + jiff_format1_builder.finish(), + jiff_format2_builder.finish(), + jiff_format3_builder.finish(), ) } + fn criterion_benchmark(c: &mut Criterion) { - let return_field = - Field::new("f", DataType::Timestamp(TimeUnit::Nanosecond, None), true).into(); - let arg_field = Field::new("a", DataType::Utf8, false).into(); - let arg_fields = vec![arg_field]; - let mut options = ConfigOptions::default(); - options.execution.time_zone = Some("UTC".into()); - let config_options = Arc::new(options); - - let to_timestamp_udf = to_timestamp(config_options.as_ref()); - - c.bench_function("to_timestamp_no_formats_utf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let arr_data = data(); - let batch_len = arr_data.len(); - let string_array = ColumnarValue::Array(Arc::new(arr_data) as ArrayRef); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: vec![string_array.clone()], - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_no_formats_largeutf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let data = cast(&data(), &DataType::LargeUtf8).unwrap(); - let batch_len = data.len(); - let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: vec![string_array.clone()], - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: 
Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_no_formats_utf8view", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let data = cast(&data(), &DataType::Utf8View).unwrap(); - let batch_len = data.len(); - let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: vec![string_array.clone()], - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_with_formats_utf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let (inputs, format1, format2, format3) = data_with_formats(); - let batch_len = inputs.len(); - - let args = vec![ - ColumnarValue::Array(Arc::new(inputs) as ArrayRef), - ColumnarValue::Array(Arc::new(format1) as ArrayRef), - ColumnarValue::Array(Arc::new(format2) as ArrayRef), - ColumnarValue::Array(Arc::new(format3) as ArrayRef), - ]; - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() - }) - .collect::>(); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_with_formats_largeutf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let (inputs, format1, format2, format3) = data_with_formats(); - let batch_len = inputs.len(); - - let args = vec![ - ColumnarValue::Array( - Arc::new(cast(&inputs, &DataType::LargeUtf8).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format1, &DataType::LargeUtf8).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format2, &DataType::LargeUtf8).unwrap()) as ArrayRef + for &parser in ["jiff", "chrono"].iter() { + c.bench_function( + &format!("string_to_timestamp_nanos_formatted_single_format_utf8_{parser}"), + |b| { + let datetime_parser = match parser { + "chrono" => { + Box::new(ChronoDateTimeParser::new()) as Box + } + "jiff" => { + Box::new(JiffDateTimeParser::new()) as Box + } + _ => unreachable!(), + }; + + let (inputs, _, _, format3, _, _, jiffformat3) = data_with_formats(); + + b.iter(|| { + for (input, format3, jiff_format3) in + izip!(inputs.iter(), format3.iter(), jiffformat3.iter()) + { + black_box(match parser { + "chrono" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[format3.unwrap()], + ); + if t.is_err() { + println!("Error: {:?}", t); + } + } + "jiff" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[jiff_format3.unwrap()], + ); + if t.is_err() { + println!("Error: {:?}", t); + } + } + _ => unreachable!(), + }) + } + }) + }, + ); + + c.bench_function( + &format!( + "string_to_timestamp_nanos_formatted_multiple_formats_utf8_{parser}" ), - ColumnarValue::Array( - Arc::new(cast(&format3, &DataType::LargeUtf8).unwrap()) as ArrayRef - ), - ]; - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), 
arg.data_type(), true).into() + |b| { + let datetime_parser = match parser { + "chrono" => { + Box::new(ChronoDateTimeParser::new()) as Box + } + "jiff" => { + Box::new(JiffDateTimeParser::new()) as Box + } + _ => unreachable!(), + }; + + let ( + inputs, + format1, + format2, + format3, + jiff_format1, + jiff_format2, + jiff_format3, + ) = data_with_formats(); + + b.iter(|| { + for ( + input, + format1, + format2, + format3, + jiff_format1, + jiff_format2, + jiff_format3, + ) in izip!( + inputs.iter(), + format1.iter(), + format2.iter(), + format3.iter(), + jiff_format1.iter(), + jiff_format2.iter(), + jiff_format3.iter() + ) { + black_box(match parser { + "chrono" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[ + format1.unwrap(), + format2.unwrap(), + format3.unwrap(), + ], + ); + if t.is_err() { + println!("Error: {:?}", t); + } + } + "jiff" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[ + jiff_format1.unwrap(), + jiff_format2.unwrap(), + jiff_format3.unwrap(), + ], + ); + if t.is_err() { + println!("Error: {:?}", t); + } + } + _ => unreachable!(), + }) + } + }) + }, + ); + + let return_field = + Field::new("f", DataType::Timestamp(TimeUnit::Nanosecond, None), true).into(); + let arg_field = Field::new("a", DataType::Utf8, false).into(); + let arg_fields = vec![arg_field]; + + c.bench_function(&format!("to_timestamp_no_formats_utf8_{parser}"), |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let arr_data = data(); + let batch_len = arr_data.len(); + let string_array = ColumnarValue::Array(Arc::new(arr_data) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: vec![string_array.clone()], + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) }) - .collect::>(); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_with_formats_utf8view", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let (inputs, format1, format2, format3) = data_with_formats(); - - let batch_len = inputs.len(); - - let args = vec![ - ColumnarValue::Array( - Arc::new(cast(&inputs, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format1, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format2, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format3, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ]; - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }); + + c.bench_function( + &format!("to_timestamp_no_formats_largeutf8_{parser}"), + |b| { + let mut config_options = ConfigOptions::default(); + if parser 
== "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let data = cast(&data(), &DataType::LargeUtf8).unwrap(); + let batch_len = data.len(); + let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: vec![string_array.clone()], + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }, + ); + + c.bench_function(&format!("to_timestamp_no_formats_utf8view_{parser}"), |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let data = cast(&data(), &DataType::Utf8View).unwrap(); + let batch_len = data.len(); + let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: vec![string_array.clone()], + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }); + + c.bench_function(&format!("to_timestamp_with_formats_utf8_{parser}"), |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let ( + inputs, + format1, + format2, + format3, + jiffformat1, + jiffformat2, + jiffformat3, + ) = data_with_formats(); + let batch_len = inputs.len(); + + let args = match parser { + "chrono" => vec![ + ColumnarValue::Array(Arc::new(inputs) as ArrayRef), + ColumnarValue::Array(Arc::new(format1) as ArrayRef), + ColumnarValue::Array(Arc::new(format2) as ArrayRef), + ColumnarValue::Array(Arc::new(format3) as ArrayRef), + ], + "jiff" => { + vec![ + ColumnarValue::Array(Arc::new(inputs) as ArrayRef), + ColumnarValue::Array(Arc::new(jiffformat1) as ArrayRef), + ColumnarValue::Array(Arc::new(jiffformat2) as ArrayRef), + ColumnarValue::Array(Arc::new(jiffformat3) as ArrayRef), + ] + } + _ => unreachable!(), + }; + + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) }) - .collect::>(); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), + }); + + c.bench_function( + &format!("to_timestamp_with_formats_largeutf8_{parser}"), 
+ |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let ( + inputs, + format1, + format2, + format3, + jiffformat1, + jiffformat2, + jiffformat3, + ) = data_with_formats(); + let batch_len = inputs.len(); + + let args = match parser { + "chrono" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format1, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format2, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format3, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ], + "jiff" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat1, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat2, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat3, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ], + _ => unreachable!(), + }; + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }, + ); + + c.bench_function( + &format!("to_timestamp_with_formats_utf8view_{parser}"), + |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let ( + inputs, + format1, + format2, + format3, + jiffformat1, + jiffformat2, + jiffformat3, + ) = data_with_formats(); + + let batch_len = inputs.len(); + + let args = match parser { + "chrono" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format1, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format2, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format3, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ], + "jiff" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat1, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat2, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat3, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ], + _ => unreachable!(), + }; + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() }) - .expect("to_timestamp should work on valid values"), - ) - }) 
- }); + .collect::>(); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }, + ); + } } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs index 2db64beafa9b7..f39c7236ab248 100644 --- a/datafusion/functions/src/datetime/common.rs +++ b/datafusion/functions/src/datetime/common.rs @@ -15,59 +15,21 @@ // specific language governing permissions and limitations // under the License. -use std::sync::{Arc, LazyLock}; +use std::sync::Arc; -use arrow::array::timezone::Tz; use arrow::array::{ Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray, StringArrayType, StringViewArray, }; use arrow::compute::DecimalCast; -use arrow::compute::kernels::cast_utils::string_to_datetime; use arrow::datatypes::{DataType, TimeUnit}; use arrow_buffer::ArrowNativeType; -use chrono::LocalResult::Single; -use chrono::format::{Parsed, StrftimeItems, parse}; -use chrono::{DateTime, TimeZone, Utc}; use datafusion_common::cast::as_generic_string_array; use datafusion_common::{ - DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, - internal_datafusion_err, unwrap_or_internal_err, + Result, ScalarValue, exec_err, internal_datafusion_err, unwrap_or_internal_err, }; use datafusion_expr::ColumnarValue; -/// Error message if nanosecond conversion request beyond supported interval -const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; - -static UTC: LazyLock = LazyLock::new(|| "UTC".parse().expect("UTC is always valid")); - -/// Converts a string representation of a date‑time into a timestamp expressed in -/// nanoseconds since the Unix epoch. -/// -/// This helper is a thin wrapper around the more general `string_to_datetime` -/// function. It accepts an optional `timezone` which, if `None`, defaults to -/// Coordinated Universal Time (UTC). The string `s` must contain a valid -/// date‑time format that can be parsed by the underlying chrono parser. -/// -/// # Return Value -/// -/// * `Ok(i64)` – The number of nanoseconds since `1970‑01‑01T00:00:00Z`. -/// * `Err(DataFusionError)` – If the string cannot be parsed, the parsed -/// value is out of range (between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804) -/// or the parsed value does not correspond to an unambiguous time. 
-pub(crate) fn string_to_timestamp_nanos_with_timezone( - timezone: &Option, - s: &str, -) -> Result { - let tz = timezone.as_ref().unwrap_or(&UTC); - let dt = string_to_datetime(tz, s)?; - let parsed = dt - .timestamp_nanos_opt() - .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?; - - Ok(parsed) -} - /// Checks that all the arguments from the second are of type [Utf8], [LargeUtf8] or [Utf8View] /// /// [Utf8]: DataType::Utf8 @@ -92,161 +54,6 @@ pub(crate) fn validate_data_types(args: &[ColumnarValue], name: &str) -> Result< Ok(()) } -/// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers -/// relative to the provided `timezone` -/// -/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error -/// will be returned -/// -/// Note that parsing [IANA timezones] is not supported yet in chrono - -/// and this implementation only supports named timezones at the end of the string preceded by a space. -/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -/// [IANA timezones]: https://www.iana.org/time-zones -pub(crate) fn string_to_datetime_formatted( - timezone: &T, - s: &str, - format: &str, -) -> Result, DataFusionError> { - let err = |err_ctx: &str| { - exec_datafusion_err!( - "Error parsing timestamp from '{s}' using format '{format}': {err_ctx}" - ) - }; - - let mut datetime_str = s; - let mut format = format; - - // Manually handle the most common case of a named timezone at the end of the timestamp. - // Note that %+ handles 'Z' at the end of the string without a space. This code doesn't - // handle named timezones with no preceding space since that would require writing a - // custom parser (or switching to Jiff) - let tz: Option = if format.trim_end().ends_with(" %Z") { - // grab the string after the last space as the named timezone - if let Some((dt_str, timezone_name)) = datetime_str.trim_end().rsplit_once(' ') { - datetime_str = dt_str; - - // attempt to parse the timezone name - let result: Result = - timezone_name.parse(); - let Ok(tz) = result else { - return Err(err(&result.unwrap_err().to_string())); - }; - - // successfully parsed the timezone name, remove the ' %Z' from the format - format = &format[..format.len() - 3]; - - Some(tz) - } else { - None - } - } else if format.contains("%Z") { - return Err(err( - "'%Z' is only supported at the end of the format string preceded by a space", - )); - } else { - None - }; - - let mut parsed = Parsed::new(); - parse(&mut parsed, datetime_str, StrftimeItems::new(format)) - .map_err(|e| err(&e.to_string()))?; - - let dt = match tz { - Some(tz) => { - // A timezone was manually parsed out, convert it to a fixed offset - match parsed.to_datetime_with_timezone(&tz) { - Ok(dt) => Ok(dt.fixed_offset()), - Err(e) => Err(e), - } - } - // default to parse the string assuming it has a timezone - None => parsed.to_datetime(), - }; - - if let Err(e) = &dt { - // no timezone or other failure, try without a timezone - let ndt = parsed - .to_naive_datetime_with_offset(0) - .or_else(|_| parsed.to_naive_date().map(|nd| nd.into())); - if let Err(e) = &ndt { - return Err(err(&e.to_string())); - } - - if let Single(e) = &timezone.from_local_datetime(&ndt.unwrap()) { - Ok(e.to_owned()) - } else { - Err(err(&e.to_string())) - } - } else { - Ok(dt.unwrap().with_timezone(timezone)) - } -} - -/// Accepts a string with a `chrono` format and converts it to a -/// nanosecond precision timestamp relative to the provided 
`timezone`. -/// -/// See [`chrono::format::strftime`] for the full set of supported formats. -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// Any timestamp in the formatting string is handled according to the rules -/// defined by `chrono`. -/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -#[inline] -pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone( - timezone: &Option, - s: &str, - format: &str, -) -> Result { - let dt = string_to_datetime_formatted(timezone.as_ref().unwrap_or(&UTC), s, format)?; - let parsed = dt - .timestamp_nanos_opt() - .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?; - - Ok(parsed) -} - -/// Accepts a string with a `chrono` format and converts it to a -/// millisecond precision timestamp relative to the provided `timezone`. -/// -/// See [`chrono::format::strftime`] for the full set of supported formats. -/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// Any timestamp in the formatting string is handled according to the rules -/// defined by `chrono`. -/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -#[inline] -pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Result { - Ok(string_to_datetime_formatted(&Utc, s, format)? 
- .naive_utc() - .and_utc() - .timestamp_millis()) -} - pub(crate) fn handle( args: &[ColumnarValue], op: F, @@ -306,7 +113,7 @@ pub(crate) fn handle_multiple( ) -> Result where O: ArrowPrimitiveType, - F: Fn(&str, &str) -> Result, + F: Fn(&str, &[&str]) -> Result, M: Fn(O::Native) -> O::Native, { match &args[0] { @@ -372,7 +179,7 @@ where }; if let Some(s) = x { - match op(a, s.as_str()) { + match op(a, &[s.as_str()]) { Ok(r) => { let result = op2(r).to_i64(); let s = scalar_value(dt, result)?; @@ -411,7 +218,7 @@ pub(crate) fn strings_to_primitive_function( ) -> Result> where O: ArrowPrimitiveType, - F: Fn(&str, &str) -> Result, + F: Fn(&str, &[&str]) -> Result, F2: Fn(O::Native) -> O::Native, { if args.len() < 2 { @@ -472,7 +279,7 @@ fn handle_array_op<'a, O, V, F, F2>( where V: StringArrayType<'a>, O: ArrowPrimitiveType, - F: Fn(&str, &str) -> Result, + F: Fn(&str, &[&str]) -> Result, F2: Fn(O::Native) -> O::Native, { first @@ -481,28 +288,39 @@ where .map(|(pos, x)| { let mut val = None; if let Some(x) = x { + let mut v = vec![]; + for arg in args { - let v = match arg { + match arg { ColumnarValue::Array(a) => match a.data_type() { - DataType::Utf8View => Ok(a.as_string_view().value(pos)), - DataType::LargeUtf8 => Ok(a.as_string::().value(pos)), - DataType::Utf8 => Ok(a.as_string::().value(pos)), - other => exec_err!("Unexpected type encountered '{other}'"), + DataType::Utf8View => v.push(a.as_string_view().value(pos)), + DataType::LargeUtf8 => { + v.push(a.as_string::().value(pos)) + } + DataType::Utf8 => v.push(a.as_string::().value(pos)), + other => { + return exec_err!( + "Unexpected type encountered '{other}'" + ); + } }, ColumnarValue::Scalar(s) => match s.try_as_str() { - Some(Some(v)) => Ok(v), + Some(Some(s)) => v.push(s), Some(None) => continue, // null string - None => exec_err!("Unexpected scalar type encountered '{s}'"), + None => { + return exec_err!( + "Unexpected scalar type encountered '{s}'" + ); + } }, - }?; + }; + } - let r = op(x, v); - if let Ok(inner) = r { - val = Some(Ok(op2(inner))); - break; - } else { - val = Some(r); - } + let r = op(x, &v); + if let Ok(inner) = r { + val = Some(Ok(op2(inner))); + } else { + val = Some(r); } }; diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index 39b9453295df6..5b1ec3e3f1dfe 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -31,6 +31,7 @@ pub mod from_unixtime; pub mod make_date; pub mod make_time; pub mod now; +pub mod parser; pub mod planner; pub mod to_char; pub mod to_date; @@ -49,7 +50,7 @@ make_udf_function!(make_date::MakeDateFunc, make_date); make_udf_function!(make_time::MakeTimeFunc, make_time); make_udf_function!(from_unixtime::FromUnixtimeFunc, from_unixtime); make_udf_function!(to_char::ToCharFunc, to_char); -make_udf_function!(to_date::ToDateFunc, to_date); +make_udf_function_with_config!(to_date::ToDateFunc, to_date); make_udf_function!(to_local_time::ToLocalTimeFunc, to_local_time); make_udf_function!(to_time::ToTimeFunc, to_time); make_udf_function!(to_unixtime::ToUnixtimeFunc, to_unixtime); @@ -69,6 +70,7 @@ make_udf_function_with_config!(now::NowFunc, now); // functions with varargs currently pub mod expr_fn { + use datafusion_common::config::ConfigOptions; use datafusion_expr::Expr; export_functions!(( @@ -267,7 +269,8 @@ pub mod expr_fn { /// # } /// ``` pub fn to_date(args: Vec) -> Expr { - super::to_date().call(args) + let config = ConfigOptions::default(); + super::to_date(&config).call(args) } } 
@@ -286,7 +289,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
         make_time(),
         now(&config),
         to_char(),
-        to_date(),
+        to_date(&config),
         to_local_time(),
         to_time(),
         to_unixtime(),
diff --git a/datafusion/functions/src/datetime/parser.rs b/datafusion/functions/src/datetime/parser.rs
new file mode 100644
index 0000000000000..f6e55d8b79b9b
--- /dev/null
+++ b/datafusion/functions/src/datetime/parser.rs
@@ -0,0 +1,702 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
+use dyn_eq::DynEq;
+use dyn_hash::DynHash;
+use std::fmt::Debug;
+
+/// A trait for parsing timestamps from strings. Two implementations are provided:
+/// `ChronoDateTimeParser` (the default), which uses [Chrono] to parse timestamps, and
+/// `JiffDateTimeParser` (available via the `jiff` feature flag), which uses [Jiff].
+///
+/// Both implementations can parse the same kinds of timestamps, but the
+/// `ChronoDateTimeParser` is somewhat more lenient with respect to formats at the cost
+/// of being slightly slower. Jiff supports slightly fewer format options than Chrono
+/// but is measurably faster in some situations.
+///
+/// [Chrono]: https://docs.rs/chrono/latest/chrono/
+/// [Jiff]: https://docs.rs/jiff/latest/jiff/
+pub trait DateTimeParser: Debug + DynEq + DynHash + Send + Sync {
+    fn string_to_timestamp_nanos(
+        &self,
+        tz: &str,
+        s: &str,
+    ) -> datafusion_common::Result<i64>;
+
+    fn string_to_timestamp_nanos_formatted(
+        &self,
+        tz: &str,
+        s: &str,
+        formats: &[&str],
+    ) -> datafusion_common::Result<i64>;
+
+    fn string_to_timestamp_millis_formatted(
+        &self,
+        tz: &str,
+        s: &str,
+        formats: &[&str],
+    ) -> datafusion_common::Result<i64>;
+}
+
+// Implement Eq and PartialEq for dyn DateTimeParser
+dyn_eq::eq_trait_object!(DateTimeParser);
+
+// Implement std::hash::Hash for dyn DateTimeParser
+dyn_hash::hash_trait_object!(DateTimeParser);
+
+pub fn get_date_time_parser(config_options: &ConfigOptions) -> Box<dyn DateTimeParser> {
+    let parser_cfg = config_options.execution.date_time_parser.as_ref();
+
+    match parser_cfg {
+        Some(p) => match p.as_str() {
+            "chrono" => {
+                Box::new(chrono::ChronoDateTimeParser::new()) as Box<dyn DateTimeParser>
+            }
+            #[cfg(feature = "jiff")]
+            "jiff" => {
+                Box::new(jiff::JiffDateTimeParser::new()) as Box<dyn DateTimeParser>
+            }
+            _ => panic!("Unknown/unsupported date time parser: {p}"),
+        },
+        None => Box::new(chrono::ChronoDateTimeParser::new()) as Box<dyn DateTimeParser>,
+    }
+}
+
+pub mod chrono {
+    use crate::datetime::parser::DateTimeParser;
+    use arrow::array::timezone::Tz;
+    use arrow::compute::kernels::cast_utils::string_to_datetime;
+    use chrono::LocalResult;
+    use chrono::format::{ParseErrorKind, Parsed, StrftimeItems, parse};
+    use chrono_tz::ParseError;
+    use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err};
+
+    const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804";
+
+    #[derive(Debug, Default, PartialEq, Eq, Hash)]
+    pub struct ChronoDateTimeParser {}
+
+    impl ChronoDateTimeParser {
+        pub fn new() -> Self {
+            Self {}
+        }
+
+        /// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers
+        /// relative to the provided `timezone`.
+        ///
+        /// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error
+        /// will be returned.
+        ///
+        /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
+        fn string_to_datetime_formatted<T: chrono::TimeZone>(
+            &self,
+            timezone: &T,
+            s: &str,
+            formats: &[&str],
+        ) -> Result<chrono::DateTime<T>, DataFusionError> {
+            let mut err: Option<String> = None;
+
+            for &format in formats.iter() {
+                let mut format = format;
+                let mut datetime_str = s;
+
+                // we manually handle the most common case of a named timezone at the end of the timestamp
+                // since chrono doesn't support that directly. Note, however, that %+ does handle 'Z' at the
+                // end of the string.
+                //
+                // This code doesn't handle named timezones with no preceding space since that would require
+                // writing a custom parser.
+                let tz: Option<Tz> = if format.ends_with(" %Z") {
+                    // grab the string after the last space as the named timezone
+                    let parts: Vec<&str> = datetime_str.rsplitn(2, ' ').collect();
+                    let timezone_name = parts[0];
+                    datetime_str = parts[1];
+
+                    // attempt to parse the timezone name
+                    let result: Result<Tz, ParseError> = timezone_name.parse();
+                    let Ok(tz) = result else {
+                        err = Some(result.unwrap_err().to_string());
+                        continue;
+                    };
+
+                    // successfully parsed the timezone name, remove the ' %Z' from the format
+                    format = &format[..format.len() - 3];
+
+                    Some(tz)
+                } else if format.contains("%Z") {
+                    err = Some("'%Z' is only supported at the end of the format string preceded by a space".into());
+                    continue;
+                } else {
+                    None
+                };
+
+                let mut parsed = Parsed::new();
+                let items = StrftimeItems::new(format);
+                let result = parse(&mut parsed, datetime_str, items);
+
+                if let Err(e) = &result {
+                    err = Some(e.to_string());
+                    continue;
+                }
+
+                let dt = match tz {
+                    Some(tz) => {
+                        // A timezone was manually parsed out, convert it to a fixed offset
+                        match parsed.to_datetime_with_timezone(&tz) {
+                            Ok(dt) => Ok(dt.fixed_offset()),
+                            Err(e) => Err(e),
+                        }
+                    }
+                    // default to parse the string assuming it has a timezone
+                    None => parsed.to_datetime(),
+                };
+
+                match dt {
+                    Ok(dt) => return Ok(dt.with_timezone(timezone)),
+                    Err(e) if e.kind() == ParseErrorKind::Impossible => {
+                        err = Some(format!(
+                            "Unable to parse timestamp {datetime_str} in timezone {tz:?}: datetime was impossible"
+                        ));
+                        continue;
+                    }
+                    Err(e) if e.kind() == ParseErrorKind::OutOfRange => {
+                        err = Some(format!(
+                            "Unable to parse timestamp {datetime_str} in timezone {tz:?}: datetime was out of range"
+                        ));
+                        continue;
+                    }
+                    _ => {
+                        // no timezone or other failure, try without a timezone
+                        let ndt = parsed
+                            .to_naive_datetime_with_offset(0)
+                            .or_else(|_| parsed.to_naive_date().map(|nd| nd.into()));
+                        if let Err(e) = &ndt {
+                            err = Some(e.to_string());
+                            continue;
+                        }
+
+                        let result = &timezone.from_local_datetime(&ndt.unwrap());
+
+                        match result {
+                            LocalResult::Single(e) => return Ok(e.to_owned()),
+                            LocalResult::None => {
+                                err = Some(format!(
+                                    "Unable to parse timestamp {datetime_str} in timezone {tz:?}: no valid local time found"
+                                ));
+                                continue;
+                            }
+                            LocalResult::Ambiguous(earliest, latest) => {
+                                err = Some(format!(
+                                    "Unable to parse timestamp {datetime_str} in timezone {tz:?}: ambiguous timestamps found {earliest:?} and {latest:?}"
+                                ));
+                                continue;
+                            }
+                        }
+                    }
+                }
+            }
+
+            match err {
+                Some(e) => exec_err!(
+                    "Error parsing timestamp from '{s}' using formats: {formats:?}: {e}"
+                ),
+                None => exec_err!(
+                    "Error parsing timestamp from '{s}' using formats: {formats:?}"
+                ),
+            }
+        }
+    }
+
+    impl DateTimeParser for ChronoDateTimeParser {
+        /// Accepts a string and parses it relative to the provided `timezone`
+        ///
+        /// In addition to RFC3339 / ISO8601 standard timestamps, it also
+        /// accepts strings that use a space ` ` to separate the date and time
+        /// as well as strings that have no explicit timezone offset.
+        ///
+        /// Examples of accepted inputs:
+        /// * `1997-01-31T09:26:56.123Z`        # RFC3339
+        /// * `1997-01-31T09:26:56.123-05:00`   # RFC3339
+        /// * `1997-01-31 09:26:56.123-05:00`   # close to RFC3339 but with a space rather than T
+        /// * `2023-01-01 04:05:06.789 -08`     # close to RFC3339, no fractional seconds or time separator
+        /// * `1997-01-31T09:26:56.123`         # close to RFC3339 but no timezone offset specified
+        /// * `1997-01-31 09:26:56.123`         # close to RFC3339 but uses a space and no timezone offset
+        /// * `1997-01-31 09:26:56`             # close to RFC3339, no fractional seconds
+        /// * `1997-01-31 092656`               # close to RFC3339, no fractional seconds
+        /// * `1997-01-31 092656+04:00`         # close to RFC3339, no fractional seconds or time separator
+        /// * `1997-01-31`                      # close to RFC3339, only date no time
+        ///
+        /// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled
+        ///
+        /// * `2023-01-01 040506 America/Los_Angeles`
+        ///
+        /// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error
+        /// will be returned
+        ///
+        /// Some formats supported by PostgreSQL
+        /// are not supported, like
+        ///
+        /// * "2023-01-01 04:05:06.789 +07:30:00",
+        /// * "2023-01-01 040506 +07:30:00",
+        /// * "2023-01-01 04:05:06.789 PST",
+        #[inline]
+        fn string_to_timestamp_nanos(
+            &self,
+            tz: &str,
+            s: &str,
+        ) -> Result<i64, DataFusionError> {
+            let tz: Tz = match tz.parse() {
+                Ok(tz) => tz,
+                Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"),
+            };
+
+            let dt = string_to_datetime(&tz, s)?;
+            let parsed = dt
+                .timestamp_nanos_opt()
+                .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?;
+
+            Ok(parsed)
+        }
+
+        /// Accepts a string with an array of `chrono` formats and converts it to a
+        /// nanosecond precision timestamp according to the rules
+        /// defined by `chrono`.
+        ///
+        /// See [`chrono::format::strftime`] for the full set of supported formats.
+        ///
+        /// ## Timestamp Precision
+        ///
+        /// This function uses the maximum precision timestamps supported by
+        /// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This
+        /// means the range of dates that timestamps can represent is ~1677 AD
+        /// to 2262 AD.
+        ///
+        /// ## Timezone / Offset Handling
+        ///
+        /// Numerical values of timestamps are stored compared to offset UTC.
+        ///
+        /// Note that parsing named [IANA timezones] is not supported yet in chrono
+        /// and this implementation
+        /// only supports named timezones at the end of the string preceded by a space.
+        ///
+        /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
+        /// [IANA timezones]: https://www.iana.org/time-zones
+        #[inline]
+        fn string_to_timestamp_nanos_formatted(
+            &self,
+            tz: &str,
+            s: &str,
+            formats: &[&str],
+        ) -> Result<i64, DataFusionError> {
+            let tz: Tz = match tz.parse() {
+                Ok(tz) => tz,
+                Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"),
+            };
+
+            let dt = self.string_to_datetime_formatted(&tz, s, formats)?;
+            dt.naive_utc()
+                .and_utc()
+                .timestamp_nanos_opt()
+                .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
+        }
+
+        /// Accepts a string with a `chrono` format and converts it to a
+        /// millisecond precision timestamp.
+        ///
+        /// See [`chrono::format::strftime`] for the full set of supported formats.
+        ///
+        /// Internally, this function uses the `chrono` library for the
+        /// datetime parsing
+        ///
+        /// ## Timezone / Offset Handling
+        ///
+        /// Numerical values of timestamps are stored compared to offset UTC.
+ /// + /// Any timestamp in the formatting string is handled according to the rules + /// defined by `chrono`. + /// + /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html + #[inline] + fn string_to_timestamp_millis_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> Result { + let tz: Tz = match tz.parse() { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"), + }; + + Ok(self + .string_to_datetime_formatted(&tz, s, formats)? + .naive_utc() + .and_utc() + .timestamp_millis()) + } + } +} + +#[cfg(feature = "jiff")] +pub mod jiff { + use crate::datetime::parser::DateTimeParser; + use arrow::array::timezone::Tz; + use arrow::error::ArrowError; + use chrono::format::{Parsed, StrftimeItems, parse}; + use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; + use jiff::civil::Time; + use jiff::fmt::temporal::Pieces; + use jiff::tz::{Disambiguation, TimeZone}; + use jiff::{Timestamp, Zoned}; + use num_traits::ToPrimitive; + + const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + + #[derive(Debug, PartialEq, Eq, Hash)] + pub struct JiffDateTimeParser {} + + impl JiffDateTimeParser { + pub fn new() -> Self { + Self {} + } + + /// Attempts to parse a given string representation of a timestamp into a `Timestamp` object. + /// The function also adjusts the datetime for the specified timezone. + /// + /// # Parameters + /// - `timezone`: A reference to the `TimeZone` object used to adjust the parsed datetime to the desired timezone. + /// - `s`: A string slice holding the timestamp to be parsed. + /// + /// # Returns + /// - `Ok(Timestamp)`: Contains the parsed timestamp in seconds since the Unix epoch. + /// - `Err(ArrowError)`: Returned in the event of errors in parsing + /// the timestamp string or computing the timezone offset. + /// + /// # Errors + /// This function will return an `ArrowError` if: + /// + /// - The string `s` is shorter than 10 characters. + /// - The format of the string does not match expected timestamp patterns. + /// - An invalid/unknown time zone or offset is provided during parsing. + /// - Errors occur while converting the datetime to a specific time zone. + fn string_to_datetime( + &self, + timezone: &TimeZone, + s: &str, + ) -> datafusion_common::Result { + let err = |ctx: &str| { + ArrowError::ParseError(format!( + "Error parsing timestamp from '{s}': {ctx}" + )) + }; + + let bytes = s.as_bytes(); + if bytes.len() < 10 { + return Err(err("timestamp must contain at least 10 characters")); + } + + let pieces = Pieces::parse(bytes).map_err(|e| err(&format!("{e:?}")))?; + let time = pieces.time().unwrap_or_else(Time::midnight); + let dt = pieces.date().to_datetime(time); + let tz = match pieces + .to_time_zone() + .map_err(|e| err(&format!("unknown time zone: {e:?}")))? + { + Some(tz) => tz, + None => match pieces.to_numeric_offset() { + Some(offset) => TimeZone::fixed(offset), + None => timezone.to_owned(), + }, + }; + let zdt = tz + .to_zoned(dt) + .map_err(|e| err(&format!("error computing timezone offset: {e:?}")))?; + + Ok(zdt.timestamp()) + } + + /// Attempts to parse a given string representation of a timestamp into a `Zoned` datetime object + /// using a list of provided formats. The function also adjusts the datetime for the specified timezone. 
+ /// + /// # Parameters + /// - `timezone`: A reference to the `TimeZone` object used to adjust the parsed datetime to the desired timezone. + /// - `s`: A string slice holding the timestamp to be parsed. + /// - `formats`: A slice of string slices representing the accepted datetime formats to use when parsing. + /// + /// # Returns + /// - `Ok(Zoned)` if the string is successfully parsed into a `Zoned` datetime object using one of the formats provided. + /// - `Err(DataFusionError)` if the string cannot be parsed or if there are issues related to the timezone or format handling. + /// + /// # Behavior + /// 1. Iterates through the list of provided formats. + /// 2. Handles special cases such as `%Z` (timezone) or `%c` (locale-aware datetime representation) + /// by either modifying the format string or switching to different parsing logic. + /// 3. Attempts to resolve timezone-related issues: + /// - Calculates a fixed offset for the given `timezone`. + /// - Handles attempts to parse timezones using IANA names or offsets. + /// 4. If parsing fails for the current format, the function moves to the next format and aggregates any parsing errors. + /// 5. If no formats succeed, an error is returned summarizing the failure. + /// + /// # Errors + /// - Returns a descriptive error wrapped in `DataFusionError` if: + /// - The input string is not compatible with any of the provided formats. + /// - There are issues parsing the timezone from the input string or adjusting to the provided `TimeZone`. + /// - There are issues resolving ambiguous datetime representations. + fn string_to_datetime_formatted( + &self, + timezone: &TimeZone, + s: &str, + formats: &[&str], + ) -> datafusion_common::Result { + let mut err: Option = None; + + for &format in formats.iter() { + let mut format = if format.contains("%Z") { + &format.replace("%Z", "%Q") + } else { + format + }; + + if format == "%c" { + // switch to chrono, jiff doesn't support parsing with %c + let mut parsed = Parsed::new(); + let result = parse(&mut parsed, s, StrftimeItems::new(format)); + + if result.is_err() { + err = Some(format!("error parsing timestamp: {result:?}")); + continue; + } + + let offset = timezone.to_fixed_offset(); + let tz: &str = if let Ok(offset) = offset { + &offset.to_string() + } else { + let result = timezone.iana_name(); + match result { + Some(tz) => tz, + None => { + err = + Some(format!("error parsing timezone: {timezone:?}")); + continue; + } + } + }; + let tz: Tz = tz.parse::().ok().unwrap_or("UTC".parse::()?); + + match parsed.to_datetime_with_timezone(&tz) { + Ok(dt) => { + let dt = dt.fixed_offset(); + let result = + Timestamp::from_nanosecond(dt.timestamp() as i128); + match result { + Ok(result) => { + return Ok(result.to_zoned(timezone.to_owned())); + } + Err(e) => { + err = Some(e.to_string()); + continue; + } + } + } + Err(e) => { + err = Some(e.to_string()); + continue; + } + }; + } + + let result = if format == "%+" { + // jiff doesn't support parsing with %+, but the equivalent of %+ is + // somewhat rfc3389 which is what is used by the parse method + let result: Result = s.parse(); + + if let Ok(r) = result { + return Ok(r); + } else { + // however, the above only works with timezone names, not offsets + let s = if s.ends_with("Z") { + &(s.trim_end_matches("Z").to_owned() + "+00:00") + } else { + s + }; + + // try again with fractional seconds + format = "%Y-%m-%dT%H:%M:%S%.f%:z"; + + jiff::fmt::strtime::parse(format, s) + } + } else { + jiff::fmt::strtime::parse(format, s) + }; + + match result { 
+                    Ok(bdt) => {
+                        if bdt.iana_time_zone().is_some() || bdt.offset().is_some() {
+                            if let Ok(zoned) = bdt.to_zoned() {
+                                return Ok(zoned.with_time_zone(timezone.to_owned()));
+                            }
+                        }
+
+                        let result = bdt.to_datetime();
+                        let datetime = match result {
+                            Ok(datetime) => datetime,
+                            Err(e) => {
+                                err = Some(e.to_string());
+                                continue;
+                            }
+                        };
+
+                        let zoned = timezone
+                            .to_owned()
+                            .into_ambiguous_zoned(datetime)
+                            .disambiguate(Disambiguation::Compatible);
+
+                        match zoned {
+                            Ok(z) => return Ok(z),
+                            Err(e) => {
+                                err = Some(e.to_string());
+                                continue;
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        err = Some(e.to_string());
+                        continue;
+                    }
+                }
+            }
+
+            match err {
+                Some(e) => exec_err!("{e}"),
+                None => {
+                    exec_err!("Unable to parse timestamp: {s} with formats: {formats:?}")
+                }
+            }
+        }
+    }
+
+    impl DateTimeParser for JiffDateTimeParser {
+        /// Accepts a string with a `jiff` format and converts it to a
+        /// nanosecond precision timestamp according to the rules
+        /// defined by `jiff`.
+        ///
+        /// See the `jiff::fmt::strtime` module documentation
+        /// for the full set of supported formats.
+        ///
+        /// ## Timestamp Precision
+        ///
+        /// This function uses the maximum precision timestamps supported by
+        /// Arrow (nanoseconds stored as a 64-bit integer). This
+        /// means the range of dates that timestamps can represent is ~1677 AD
+        /// to 2262 AD.
+        ///
+        /// ## Timezone / Offset Handling
+        ///
+        /// Numerical values of timestamps are stored relative to UTC.
+        #[inline]
+        fn string_to_timestamp_nanos(
+            &self,
+            tz: &str,
+            s: &str,
+        ) -> Result<i64> {
+            let result = TimeZone::get(tz.as_ref());
+            let timezone = match result {
+                Ok(tz) => tz,
+                Err(e) => return exec_err!("Invalid timezone {tz}: {e}"),
+            };
+
+            let timestamp = self.string_to_datetime(&timezone, s)?;
+            let parsed = timestamp
+                .as_nanosecond()
+                .to_i64()
+                .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?;
+
+            Ok(parsed)
+        }
+
+        /// Accepts a string with an array of `jiff` formats and converts it to a
+        /// nanosecond precision timestamp according to the rules
+        /// defined by `jiff`.
+        ///
+        /// See the `jiff::fmt::strtime` module documentation
+        /// for the full set of supported formats.
+        ///
+        /// ## Timestamp Precision
+        ///
+        /// This function uses the maximum precision timestamps supported by
+        /// Arrow (nanoseconds stored as a 64-bit integer). This
+        /// means the range of dates that timestamps can represent is ~1677 AD
+        /// to 2262 AD.
+        ///
+        /// ## Timezone / Offset Handling
+        ///
+        /// Numerical values of timestamps are stored relative to UTC.
+        #[inline]
+        fn string_to_timestamp_nanos_formatted(
+            &self,
+            tz: &str,
+            s: &str,
+            formats: &[&str],
+        ) -> Result<i64> {
+            let result = TimeZone::get(tz.as_ref());
+            let timezone = match result {
+                Ok(tz) => tz,
+                Err(e) => return exec_err!("Invalid timezone {tz}: {e}"),
+            };
+
+            let zoned = self.string_to_datetime_formatted(&timezone, s, formats)?;
+            let parsed =
+                zoned.timestamp().as_nanosecond().to_i64().ok_or_else(|| {
+                    exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}")
+                })?;
+
+            Ok(parsed)
+        }
+
+        /// Accepts a string with an array of `jiff` formats and converts it to a
+        /// millisecond precision timestamp according to the rules
+        /// defined by `jiff`.
+        ///
+        /// See the `jiff::fmt::strtime` module documentation
+        /// for the full set of supported formats.
+        ///
+        /// ## Timezone / Offset Handling
+        ///
+        /// Numerical values of timestamps are stored relative to UTC.
+ #[inline] + fn string_to_timestamp_millis_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> Result { + let result = TimeZone::get(tz.as_ref()); + let timezone = match result { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone {tz}: {e}"), + }; + + let dt = self.string_to_datetime_formatted(&timezone, s, formats)?; + let parsed = dt.timestamp().as_millisecond(); + + Ok(parsed) + } + } +} diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 60c6fdf2df975..5098e98b3ee88 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -16,11 +16,13 @@ // under the License. use crate::datetime::common::*; +use crate::datetime::parser::DateTimeParser; use arrow::compute::cast_with_options; use arrow::datatypes::DataType; use arrow::datatypes::DataType::*; use arrow::error::ArrowError::ParseError; use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser}; +use datafusion_common::config::ConfigOptions; use datafusion_common::format::DEFAULT_CAST_OPTIONS; use datafusion_common::{Result, arrow_err, exec_err, internal_datafusion_err}; use datafusion_expr::{ @@ -67,21 +69,39 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo #[derive(Debug, PartialEq, Eq, Hash)] pub struct ToDateFunc { signature: Signature, + parser: Box, } impl Default for ToDateFunc { fn default() -> Self { - Self::new() + Self::new_with_config(&ConfigOptions::default()) } } impl ToDateFunc { + #[deprecated(since = "52.0.0", note = "use `new_with_config` instead")] + /// Deprecated constructor retained for backwards compatibility. + /// + /// Prefer [`ToDateFunc::new_with_config`] which allows specifying the + /// date time parser via [`ConfigOptions`]. pub fn new() -> Self { + Self::new_with_config(&ConfigOptions::default()) + } + + pub fn new_with_config(config: &ConfigOptions) -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + parser: crate::datetime::parser::get_date_time_parser(config), } } + /// Set a custom date time parser. + pub fn with_datetime_parser(&mut self, parser: Box) -> &Self { + self.parser = parser; + + self + } + fn to_date(&self, args: &[ColumnarValue]) -> Result { match args.len() { 1 => handle::( @@ -98,8 +118,8 @@ impl ToDateFunc { ), 2.. 
=> handle_multiple::( args, - |s, format| { - string_to_timestamp_millis_formatted(s, format) + |s, formats| { + self.parser.string_to_timestamp_millis_formatted("UTC", s, formats) .map(|n| n / (24 * 60 * 60 * 1_000)) .and_then(|v| { v.try_into().map_err(|_| { @@ -223,7 +243,7 @@ mod tests { return_field: Field::new("f", DataType::Date32, true).into(), config_options: Arc::new(ConfigOptions::default()), }; - ToDateFunc::new().invoke_with_args(args) + ToDateFunc::new_with_config(&ConfigOptions::default()).invoke_with_args(args) } #[test] @@ -394,10 +414,13 @@ mod tests { tc.name, tc.formatted_date, tc.format_str ); } - _ => panic!( - "Could not convert '{}' with format string '{}'to Date", - tc.date_str, tc.format_str - ), + e => { + println!("e was {e:?}"); + panic!( + "Could not convert '{}' with format string '{}' to Date", + tc.formatted_date, tc.format_str + ) + } } } @@ -435,7 +458,7 @@ mod tests { ); } _ => panic!( - "Could not convert '{}' with format string '{}'to Date: {:?}", + "Could not convert '{}' with format string '{}' to Date: {:?}", tc.formatted_date, tc.format_str, to_date_result ), } diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index 1c5d3dbd88bcd..edb28af39d4d5 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::sync::Arc; use crate::datetime::common::*; -use arrow::array::timezone::Tz; +use crate::datetime::parser::DateTimeParser; use arrow::array::{ Array, Decimal128Array, Float16Array, Float32Array, Float64Array, TimestampNanosecondArray, @@ -89,6 +89,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -138,6 +139,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampSecondsFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -187,6 +189,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampMillisFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -236,6 +239,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampMicrosFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -284,10 +288,12 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampNanosFunc { signature: Signature, timezone: Option>, + parser: Box, } /// Macro to generate boilerplate constructors and config methods for ToTimestamp* functions. -/// Generates: Default impl, deprecated new(), new_with_config(), and extracts timezone from ConfigOptions. +/// Generates: Default impl, deprecated new(), new_with_config(), with_datetime_parser(), and +/// extracts timezone from ConfigOptions. macro_rules! impl_to_timestamp_constructors { ($func:ty) => { impl Default for $func { @@ -315,8 +321,19 @@ macro_rules! impl_to_timestamp_constructors { .time_zone .as_ref() .map(|tz| Arc::from(tz.as_str())), + parser: super::parser::get_date_time_parser(config), } } + + /// Set a custom date time parser. 
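+            /// Overrides the parser selected from `ConfigOptions` when this function was constructed.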
+ pub fn with_datetime_parser( + &mut self, + parser: Box, + ) -> &Self { + self.parser = parser; + + self + } } }; } @@ -451,9 +468,12 @@ impl ScalarUDFImpl for ToTimestampFunc { decimal128_to_timestamp_nanos(&arg, tz) } Decimal128(_, _) => decimal128_to_timestamp_nanos(&args[0], tz), - Utf8View | LargeUtf8 | Utf8 => { - to_timestamp_impl::(&args, "to_timestamp", &tz) - } + Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::( + &args, + "to_timestamp", + &tz, + &self.parser, + ), other => { exec_err!("Unsupported data type {other} for function to_timestamp") } @@ -526,6 +546,7 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc { &args, "to_timestamp_seconds", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -602,6 +623,7 @@ impl ScalarUDFImpl for ToTimestampMillisFunc { &args, "to_timestamp_millis", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -678,6 +700,7 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc { &args, "to_timestamp_micros", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -754,6 +777,7 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { &args, "to_timestamp_nanos", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -769,10 +793,12 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { } } +#[expect(clippy::borrowed_box)] fn to_timestamp_impl>( args: &[ColumnarValue], name: &str, timezone: &Option>, + parser: &Box, ) -> Result { let factor = match T::UNIT { Second => 1_000_000_000, @@ -781,23 +807,18 @@ fn to_timestamp_impl>( Nanosecond => 1, }; - let tz = match timezone.clone() { - Some(tz) => Some(tz.parse::()?), - None => None, - }; + let tz = timezone.clone().unwrap_or_else(|| "UTC".into()); match args.len() { 1 => handle::( args, - move |s| string_to_timestamp_nanos_with_timezone(&tz, s).map(|n| n / factor), + move |s| parser.string_to_timestamp_nanos(&tz, s).map(|n| n / factor), name, &Timestamp(T::UNIT, timezone.clone()), ), n if n >= 2 => handle_multiple::( args, - move |s, format| { - string_to_timestamp_nanos_formatted_with_timezone(&tz, s, format) - }, + move |s, formats| parser.string_to_timestamp_nanos_formatted(&tz, s, formats), |n| n / factor, name, &Timestamp(T::UNIT, timezone.clone()), @@ -810,6 +831,8 @@ fn to_timestamp_impl>( mod tests { use std::sync::Arc; + use super::*; + use crate::datetime::parser::chrono::ChronoDateTimeParser; use arrow::array::types::Int64Type; use arrow::array::{ Array, PrimitiveArray, TimestampMicrosecondArray, TimestampMillisecondArray, @@ -822,47 +845,67 @@ mod tests { use datafusion_common::{DataFusionError, ScalarValue, assert_contains}; use datafusion_expr::{ScalarFunctionArgs, ScalarFunctionImplementation}; - use super::*; - fn to_timestamp(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); - to_timestamp_impl::(args, "to_timestamp", &timezone) + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + + to_timestamp_impl::( + args, + "to_timestamp", + &timezone, + &parser, + ) } /// to_timestamp_millis SQL function fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( args, "to_timestamp_millis", &timezone, + &parser, ) } /// to_timestamp_micros SQL function fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( args, "to_timestamp_micros", &timezone, + &parser, ) } /// to_timestamp_nanos SQL function 
fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( args, "to_timestamp_nanos", &timezone, + &parser, ) } /// to_timestamp_seconds SQL function fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); - to_timestamp_impl::(args, "to_timestamp_seconds", &timezone) + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( + args, + "to_timestamp_seconds", + &timezone, + &parser, + ) } fn udfs_and_timeunit() -> Vec<(Box, TimeUnit)> { @@ -1474,7 +1517,7 @@ mod tests { ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), ]; - let expected_err = "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using format '%H:%M:%S': input contains invalid characters"; + let expected_err = "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using formats: [\"%s\", \"%c\", \"%H:%M:%S\"]: input contains invalid characters"; match to_timestamp(&string_array) { Ok(_) => panic!("Expected error but got success"), Err(e) => { @@ -1522,11 +1565,9 @@ mod tests { } fn parse_timestamp_formatted(s: &str, format: &str) -> Result { - let result = string_to_timestamp_nanos_formatted_with_timezone( - &Some("UTC".parse()?), - s, - format, - ); + let timezone = "UTC"; + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + let result = parser.string_to_timestamp_nanos_formatted(timezone, s, &[format]); if let Err(e) = &result { eprintln!("Error parsing timestamp '{s}' using format '{format}': {e:?}"); } @@ -1552,14 +1593,16 @@ mod tests { ), ]; + let timezone = "UTC"; + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + for (s, f, ctx) in cases { - let expected = format!( - "Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}" - ); - let actual = string_to_datetime_formatted(&Utc, s, f) + let expected = ctx.to_string(); + let actual = parser + .string_to_timestamp_nanos_formatted(timezone, s, &[f]) .unwrap_err() .strip_backtrace(); - assert_eq!(actual, expected) + assert_contains!(actual, expected); } } @@ -1582,14 +1625,16 @@ mod tests { ), ]; + let timezone = "UTC"; + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + for (s, f, ctx) in cases { - let expected = format!( - "Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}" - ); - let actual = string_to_datetime_formatted(&Utc, s, f) + let expected = ctx.to_string(); + let actual = parser + .string_to_timestamp_nanos_formatted(timezone, s, &[f]) .unwrap_err() .strip_backtrace(); - assert_eq!(actual, expected) + assert_contains!(actual, expected); } } diff --git a/datafusion/sqllogictest/test_files/datetime/dates.slt b/datafusion/sqllogictest/test_files/datetime/dates.slt index d2a7360b120c6..ad20228767012 100644 --- a/datafusion/sqllogictest/test_files/datetime/dates.slt +++ b/datafusion/sqllogictest/test_files/datetime/dates.slt @@ -295,7 +295,7 @@ query error input contains invalid characters SELECT to_date('2020-09-08 12/00/00+00:00', '%c', '%+') # to_date with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_date('2020-09-08 12/00/00+00:00', '%q') statement ok 
@@ -371,8 +371,11 @@ SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') 1999-12-31 # verify date data using tables with formatting options where at least one column cannot be parsed -query error Error parsing timestamp from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters +query error SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t +---- +DataFusion error: Execution error: Error parsing timestamp from '1926632005' using formats: ["%Y-%m-%d %H/%M/%S%#z", "%+", "%d-%m-%Y %H:%M:%S%#z"]: input contains invalid characters + # verify date data using tables with formatting options where one of the formats is invalid query D diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps.slt b/datafusion/sqllogictest/test_files/datetime/timestamps.slt index efa7a536c8ba4..3e20129589081 100644 --- a/datafusion/sqllogictest/test_files/datetime/timestamps.slt +++ b/datafusion/sqllogictest/test_files/datetime/timestamps.slt @@ -2543,23 +2543,23 @@ query error input contains invalid characters SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+') # to_timestamp with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_nanos with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_millis with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_micros with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_seconds with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q') # Create string timestamp table with different formats @@ -2663,8 +2663,11 @@ SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 # verify timestamp data using tables with formatting options where at least one column cannot be parsed -query error Error parsing timestamp 
from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters +query error SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t +---- +DataFusion error: Execution error: Error parsing timestamp from '1926632005' using formats: ["%Y-%m-%d %H/%M/%S%#z", "%+", "%d-%m-%Y %H:%M:%S%#z"]: input contains invalid characters + # verify timestamp data using tables with formatting options where one of the formats is invalid query P diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 2039ee93df837..a9de9b74e05d6 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -217,6 +217,7 @@ datafusion.catalog.newlines_in_values false datafusion.execution.batch_size 8192 datafusion.execution.coalesce_batches true datafusion.execution.collect_statistics true +datafusion.execution.date_time_parser chrono datafusion.execution.enable_ansi_mode false datafusion.execution.enable_recursive_ctes true datafusion.execution.enforce_batch_size_in_joins false @@ -354,6 +355,7 @@ datafusion.catalog.newlines_in_values false Specifies whether newlines in (quote datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. +datafusion.execution.date_time_parser chrono The date time parser to use when parsing date time values. Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature is enabled. datafusion.execution.enable_ansi_mode false Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. 
datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. diff --git a/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt index d48e41d1204de..d774237468831 100644 --- a/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt +++ b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt @@ -142,7 +142,7 @@ SELECT to_timestamp('2020-09-08 13:42:29 America/Toronto', '%Y-%m-%d %H:%M:%S %Z ---- 2020-09-08T17:42:29Z -query error Error parsing timestamp from '2020-09-08 13:42:29America/Toronto' using format '%Y-%m-%d %H:%M:%S%Z': '%Z' is only supported at the end of the format string preceded by a space +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 13:42:29America/Toronto' using formats: \["%Y\-%m\-%d %H:%M:%S%Z"\]: '%Z' is only supported at the end of the format string preceded by a space SELECT to_timestamp('2020-09-08 13:42:29America/Toronto', '%Y-%m-%d %H:%M:%S%Z'); ## Test 17: Test all precision variants respect timezone diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 99c94b2c78ec8..de93c2fc76263 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -79,6 +79,7 @@ The following configuration settings are available: | datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | | datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | | datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | +| datafusion.execution.date_time_parser | chrono | The date time parser to use when parsing date time values. Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature is enabled. | | datafusion.execution.time_zone | NULL | The default time zone Some functions, e.g. `now` return timestamps in this time zone | | datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | | datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | From b1d0376e205fa0954be58f6067e731716a9bdf9e Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 13 Jan 2026 14:24:28 -0500 Subject: [PATCH 2/6] Review updates, cargo clippy updates. 
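
For review context, a minimal usage sketch of the config-driven parser selection
introduced in patch 1. The module paths are assumed and the snippet is
illustrative only; it is not part of this change:

    use datafusion_common::config::ConfigOptions;
    use datafusion_functions::datetime::to_timestamp::ToTimestampFunc;

    fn jiff_to_timestamp_udf() -> ToTimestampFunc {
        // Select the jiff-backed parser; requires building with the `jiff` feature.
        let mut config = ConfigOptions::default();
        config.execution.date_time_parser = Some("jiff".to_string());
        // Leaving `date_time_parser` unset (or set to "chrono") keeps the
        // default chrono-based parser.
        ToTimestampFunc::new_with_config(&config)
    }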
--- datafusion/common/src/config.rs | 3 +- datafusion/functions/benches/to_timestamp.rs | 24 ++++++---- datafusion/functions/src/datetime/common.rs | 12 ++--- datafusion/functions/src/datetime/parser.rs | 45 ++++++++++++------- datafusion/functions/src/datetime/to_date.rs | 2 +- .../functions/src/datetime/to_timestamp.rs | 2 +- 6 files changed, 55 insertions(+), 33 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 3d0de69e8b24f..8fddbe7f00929 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -508,14 +508,13 @@ config_namespace! { /// The date time parser to use when parsing date time values. /// /// Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature is enabled. - pub date_time_parser: Option, default = Some("chrono".to_string()) + pub date_time_parser: Option, transform = str::to_lowercase, default = Some("chrono".to_string()) /// The default time zone /// /// Some functions, e.g. `now` return timestamps in this time zone pub time_zone: Option, default = None - /// Parquet options pub parquet: ParquetOptions, default = Default::default() diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs index 4eaf4303be237..23cd1ea304af8 100644 --- a/datafusion/functions/benches/to_timestamp.rs +++ b/datafusion/functions/benches/to_timestamp.rs @@ -190,7 +190,7 @@ fn criterion_benchmark(c: &mut Criterion) { for (input, format3, jiff_format3) in izip!(inputs.iter(), format3.iter(), jiffformat3.iter()) { - black_box(match parser { + let _ = black_box(match parser { "chrono" => { let t = datetime_parser .string_to_timestamp_nanos_formatted( @@ -199,8 +199,10 @@ fn criterion_benchmark(c: &mut Criterion) { &[format3.unwrap()], ); if t.is_err() { - println!("Error: {:?}", t); + println!("Error: {t:?}"); } + + t } "jiff" => { let t = datetime_parser @@ -210,11 +212,13 @@ fn criterion_benchmark(c: &mut Criterion) { &[jiff_format3.unwrap()], ); if t.is_err() { - println!("Error: {:?}", t); + println!("Error: {t:?}"); } + + t } _ => unreachable!(), - }) + }); } }) }, @@ -263,7 +267,7 @@ fn criterion_benchmark(c: &mut Criterion) { jiff_format2.iter(), jiff_format3.iter() ) { - black_box(match parser { + let _ = black_box(match parser { "chrono" => { let t = datetime_parser .string_to_timestamp_nanos_formatted( @@ -276,8 +280,10 @@ fn criterion_benchmark(c: &mut Criterion) { ], ); if t.is_err() { - println!("Error: {:?}", t); + println!("Error: {t:?}"); } + + t } "jiff" => { let t = datetime_parser @@ -291,11 +297,13 @@ fn criterion_benchmark(c: &mut Criterion) { ], ); if t.is_err() { - println!("Error: {:?}", t); + println!("Error: {t:?}"); } + + t } _ => unreachable!(), - }) + }); } }) }, diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs index f39c7236ab248..f9981914a8314 100644 --- a/datafusion/functions/src/datetime/common.rs +++ b/datafusion/functions/src/datetime/common.rs @@ -316,11 +316,13 @@ where }; } - let r = op(x, &v); - if let Ok(inner) = r { - val = Some(Ok(op2(inner))); - } else { - val = Some(r); + if !v.is_empty() { + let r = op(x, &v); + if let Ok(inner) = r { + val = Some(Ok(op2(inner))); + } else { + val = Some(r); + } } }; diff --git a/datafusion/functions/src/datetime/parser.rs b/datafusion/functions/src/datetime/parser.rs index f6e55d8b79b9b..3d4e739c11aa9 100644 --- a/datafusion/functions/src/datetime/parser.rs +++ b/datafusion/functions/src/datetime/parser.rs @@ -15,7 +15,6 @@ // specific 
language governing permissions and limitations // under the License. -use datafusion_common::DataFusionError; use datafusion_common::config::ConfigOptions; use dyn_eq::DynEq; use dyn_hash::DynHash; @@ -38,24 +37,24 @@ pub trait DateTimeParser: Debug + DynEq + DynHash + Send + Sync { &self, tz: &str, s: &str, - ) -> datafusion_common::Result; + ) -> datafusion_common::Result; fn string_to_timestamp_nanos_formatted( &self, tz: &str, s: &str, formats: &[&str], - ) -> datafusion_common::Result; + ) -> datafusion_common::Result; fn string_to_timestamp_millis_formatted( &self, tz: &str, s: &str, formats: &[&str], - ) -> datafusion_common::Result; + ) -> datafusion_common::Result; } -// impl Eq and PartialEaq for dyn DateTimeParser +// impl Eq and PartialEq for dyn DateTimeParser dyn_eq::eq_trait_object!(DateTimeParser); // Implement std::hash::Hash for dyn DateTimeParser @@ -73,6 +72,12 @@ pub fn get_date_time_parser(config_options: &ConfigOptions) -> Box { Box::new(jiff::JiffDateTimeParser::new()) as Box } + #[cfg(not(feature = "jiff"))] + "jiff" => { + return exec_err!( + "jiff parser requested but 'jiff' feature is not enabled. Enable with --features jiff" + ); + } _ => panic!("Unknown/unsupported date time parser: {p}"), }, None => Box::new(chrono::ChronoDateTimeParser::new()) as Box, @@ -123,7 +128,9 @@ pub mod chrono { // // This code doesn't handle named timezones with no preceding space since that would require // writing a custom parser. - let tz: Option = if format.ends_with(" %Z") { + let tz: Option = if format.ends_with(" %Z") + && datetime_str.contains(' ') + { // grab the string after the last space as the named timezone let parts: Vec<&str> = datetime_str.rsplitn(2, ' ').collect(); let timezone_name = parts[0]; @@ -369,7 +376,7 @@ pub mod jiff { const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; - #[derive(Debug, PartialEq, Eq, Hash)] + #[derive(Debug, Default, PartialEq, Eq, Hash)] pub struct JiffDateTimeParser {} impl JiffDateTimeParser { @@ -503,8 +510,14 @@ pub mod jiff { match parsed.to_datetime_with_timezone(&tz) { Ok(dt) => { let dt = dt.fixed_offset(); - let result = - Timestamp::from_nanosecond(dt.timestamp() as i128); + let nanos_opt = dt.timestamp_nanos_opt(); + + let Some(nanos) = nanos_opt else { + err = Some(ERR_NANOSECONDS_NOT_SUPPORTED.to_owned()); + continue; + }; + + let result = Timestamp::from_nanosecond(nanos as i128); match result { Ok(result) => { return Ok(result.to_zoned(timezone.to_owned())); @@ -548,10 +561,10 @@ pub mod jiff { match result { Ok(bdt) => { - if bdt.iana_time_zone().is_some() || bdt.offset().is_some() { - if let Ok(zoned) = bdt.to_zoned() { - return Ok(zoned.with_time_zone(timezone.to_owned())); - } + if (bdt.iana_time_zone().is_some() || bdt.offset().is_some()) + && let Ok(zoned) = bdt.to_zoned() + { + return Ok(zoned.with_time_zone(timezone.to_owned())); } let result = bdt.to_datetime(); @@ -616,7 +629,7 @@ pub mod jiff { tz: &str, s: &str, ) -> Result { - let result = TimeZone::get(tz.as_ref()); + let result = TimeZone::get(tz); let timezone = match result { Ok(tz) => tz, Err(e) => return exec_err!("Invalid timezone {tz}: {e}"), @@ -655,7 +668,7 @@ pub mod jiff { s: &str, formats: &[&str], ) -> Result { - let result = TimeZone::get(tz.as_ref()); + let result = TimeZone::get(tz); let timezone = match result { Ok(tz) => tz, Err(e) => return exec_err!("Invalid timezone {tz}: {e}"), @@ -687,7 +700,7 @@ pub mod jiff { s: 
&str, formats: &[&str], ) -> Result { - let result = TimeZone::get(tz.as_ref()); + let result = TimeZone::get(tz); let timezone = match result { Ok(tz) => tz, Err(e) => return exec_err!("Invalid timezone {tz}: {e}"), diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 5098e98b3ee88..5d3f359f1fceb 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -96,7 +96,7 @@ impl ToDateFunc { } /// Set a custom date time parser. - pub fn with_datetime_parser(&mut self, parser: Box) -> &Self { + pub fn with_datetime_parser(&mut self, parser: Box) -> &mut Self { self.parser = parser; self diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index edb28af39d4d5..d99156bf38ea4 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -329,7 +329,7 @@ macro_rules! impl_to_timestamp_constructors { pub fn with_datetime_parser( &mut self, parser: Box, - ) -> &Self { + ) -> &mut Self { self.parser = parser; self From 60be1ea41416f9f079b2c765be116b34256c02af Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 13 Jan 2026 16:10:47 -0500 Subject: [PATCH 3/6] Adding tests for jiff, fixed issue with %s and jiff parsing. --- datafusion/functions/src/datetime/parser.rs | 29 +- datafusion/sqllogictest/Cargo.toml | 2 +- .../test_files/datetime/timestamps_jiff.slt | 290 ++++++++++++++++++ 3 files changed, 310 insertions(+), 11 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt diff --git a/datafusion/functions/src/datetime/parser.rs b/datafusion/functions/src/datetime/parser.rs index 3d4e739c11aa9..5f5b66bcfae97 100644 --- a/datafusion/functions/src/datetime/parser.rs +++ b/datafusion/functions/src/datetime/parser.rs @@ -567,12 +567,19 @@ pub mod jiff { return Ok(zoned.with_time_zone(timezone.to_owned())); } - let result = bdt.to_datetime(); - let datetime = match result { - Ok(datetime) => datetime, - Err(e) => { - err = Some(e.to_string()); - continue; + // %s just sets the timestamp and the .to_datetime() call will error + // since the other fields like year are not set. Handle this case + // directly. 
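+                        // For example, parsing '1926632005' with '%s' yields a
+                        // BrokenDownTime whose only populated field is the timestamp;
+                        // converting it through the target time zone recovers the
+                        // full civil datetime.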
+ let datetime = if let Some(ts) = bdt.timestamp() { + timezone.to_datetime(ts) + } else { + let result = bdt.to_datetime(); + match result { + Ok(datetime) => datetime, + Err(e) => { + err = Some(e.to_string()); + continue; + } } }; @@ -597,10 +604,12 @@ pub mod jiff { } match err { - Some(e) => exec_err!("{e}"), - None => { - exec_err!("Unable to parse timestamp: {s} with formats: {formats:?}") - } + Some(e) => exec_err!( + "Error parsing timestamp from '{s}' using formats: {formats:?}: {e}" + ), + None => exec_err!( + "Error parsing timestamp from '{s}' using formats: {formats:?}" + ), } } } diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 13ae6e6a57e01..33013283038b3 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -46,7 +46,7 @@ bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } clap = { version = "4.5.53", features = ["derive", "env"] } -datafusion = { workspace = true, default-features = true, features = ["avro"] } +datafusion = { workspace = true, default-features = true, features = ["avro", "jiff"] } datafusion-spark = { workspace = true, default-features = true } datafusion-substrait = { workspace = true, default-features = true } futures = { workspace = true } diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt b/datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt new file mode 100644 index 0000000000000..8871e8d32497f --- /dev/null +++ b/datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt @@ -0,0 +1,290 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +statement ok +SET datafusion.execution.date_time_parser = 'jiff'; + +statement ok +create table ts_data(ts bigint, value int) as values + (1599572549190855123, 1), + (1599568949190855123, 2), + (1599565349190855123, 3); + +statement ok +create table ts_data_nanos as select arrow_cast(ts, 'Timestamp(Nanosecond, None)') as ts, value from ts_data; + +statement ok +create table ts_data_micros as select arrow_cast(ts / 1000, 'Timestamp(Microsecond, None)') as ts, value from ts_data; + +statement ok +create table ts_data_millis as select arrow_cast(ts / 1000000, 'Timestamp(Millisecond, None)') as ts, value from ts_data; + +statement ok +create table ts_data_secs as select arrow_cast(ts / 1000000000, 'Timestamp(Second, None)') as ts, value from ts_data; + +# verify chrono formats don't work + +query error strptime parsing failed +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp('2020-09-08T12:00:00+00:00', '%Y-%m-%dT%H:%M:%S%Z') + +# to_timestamp with formatting +query I +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp('2020-09-08T12:00:00+00:00', '2020-09-08 12/00/00+00:00', '%+', '%Y-%m-%d %H/%M/%s%#z') +---- +2 + +# to_timestamp_nanos with formatting +query I +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# to_timestamp_millis with formatting +query I +SELECT COUNT(*) FROM ts_data_millis where ts > to_timestamp_millis('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# to_timestamp_micros with formatting +query I +SELECT COUNT(*) FROM ts_data_micros where ts > to_timestamp_micros('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# to_timestamp_seconds with formatting +query I +SELECT COUNT(*) FROM ts_data_secs where ts > to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# verify timestamp data with formatting options +query PPPPPP +SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'), to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1, '%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s') +---- +NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:59:59 1969-12-31T23:59:59 + +# verify timestamp data with formatting options +query PPPPPP +SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'), to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1, '%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s') +---- +NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:59:59 1969-12-31T23:59:59 + +# verify timestamp output types with formatting options +query TTT +SELECT arrow_typeof(to_timestamp(1, '%c', '%s')), arrow_typeof(to_timestamp(null, '%+', '%s')), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000', '%Y-%m-%d %H:%M:%S%.f')) +---- +Timestamp(ns) Timestamp(ns) Timestamp(ns) + +# to_timestamp with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_nanos with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_millis with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but 
found byte ` ` in input +SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_micros with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_seconds with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_nanos with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_millis with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_micros with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_seconds with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q') + +# Create string timestamp table with different formats +# including a few very non-standard formats + +statement ok +create table ts_utf8_data(ts varchar(100), format varchar(100)) as values + ('2020-09-08 12/00/00+00:00', '%Y-%m-%d %H/%M/%S%#:z'), + ('2031-01-19T23:33:25+05:00', '%+'), + ('08-09-2020 12:00:00+00:00', '%d-%m-%Y %H:%M:%S%#:z'), + ('1926632005', '%s'), + ('2000-01-01T01:01:01+07:00', '%+'); + +statement ok +create table ts_largeutf8_data as +select arrow_cast(ts, 'LargeUtf8') as ts, arrow_cast(format, 'LargeUtf8') as format from ts_utf8_data; + +statement ok +create table ts_utf8view_data as +select arrow_cast(ts, 'Utf8View') as ts, arrow_cast(format, 'Utf8View') as format from ts_utf8_data; + +# verify timestamp data using tables with formatting options +query P +SELECT to_timestamp(t.ts, t.format) from ts_utf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, t.format), + to_timestamp_seconds(t.ts, t.format), + to_timestamp_millis(t.ts, t.format), + to_timestamp_micros(t.ts, t.format), + to_timestamp_nanos(t.ts, t.format) + from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, t.format), + 
to_timestamp_seconds(t.ts, t.format), + to_timestamp_millis(t.ts, t.format), + to_timestamp_micros(t.ts, t.format), + to_timestamp_nanos(t.ts, t.format) + from ts_utf8view_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +# verify timestamp data using tables with formatting options +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z') + from ts_utf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z') + from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z') + from ts_utf8view_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 
2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +# verify timestamp data using tables with formatting options where at least one column cannot be parsed +query error +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#:z') from ts_utf8_data as t +---- +DataFusion error: Execution error: Error parsing timestamp from '2020-09-08 12/00/00+00:00' using formats: ["%Y-%m-%d %H/%M/%S%#z", "%+", "%d-%m-%Y %H:%M:%S%#:z"]: strptime parsing failed: expected to match literal byte `-` from format string, but found byte `2` in input + + +# verify timestamp data using tables with formatting options where one of the formats is invalid +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+') from ts_utf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+') from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+') from ts_utf8view_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +# timestamp data using tables with formatting options in an array is not supported at this time +query error function unsupported data type at index 1: +SELECT to_timestamp(t.ts, make_array('%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+')) from ts_utf8_data as t + +statement ok +drop table ts_utf8_data + +statement ok +drop table ts_data + +statement ok +drop table ts_data_nanos + +statement ok +drop table ts_data_micros + +statement ok +drop table ts_data_millis + +statement ok +drop table ts_data_secs + +statement ok +SET datafusion.execution.date_time_parser = 'chrono'; From 87d3ef99afd8e005898e1d7e4234f9071d76374d Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 13 Jan 2026 17:04:00 -0500 Subject: [PATCH 4/6] Fixed issue with jiff and exec_err import. --- datafusion/functions/src/datetime/parser.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/parser.rs b/datafusion/functions/src/datetime/parser.rs index 5f5b66bcfae97..0191be09385ba 100644 --- a/datafusion/functions/src/datetime/parser.rs +++ b/datafusion/functions/src/datetime/parser.rs @@ -74,7 +74,7 @@ pub fn get_date_time_parser(config_options: &ConfigOptions) -> Box { - return exec_err!( + return datafusion_common::exec_err!( "jiff parser requested but 'jiff' feature is not enabled. Enable with --features jiff" ); } From 7a8955f9883ac69a9d639bfa65fa944f21222d1d Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 13 Jan 2026 17:14:26 -0500 Subject: [PATCH 5/6] panic instead of exec error since there isn't a good way to propagate any error back up the call chain. 
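
Callers that cannot tolerate the panic can sidestep the config lookup entirely
by injecting a parser on the UDF. A sketch, with module paths assumed and shown
only as an illustration:

    use datafusion_common::config::ConfigOptions;
    use datafusion_functions::datetime::parser::chrono::ChronoDateTimeParser;
    use datafusion_functions::datetime::to_date::ToDateFunc;

    fn to_date_with_explicit_parser() -> ToDateFunc {
        // Constructing with the default config never requests the jiff parser,
        // and with_datetime_parser replaces whatever parser was chosen there.
        let mut func = ToDateFunc::new_with_config(&ConfigOptions::default());
        func.with_datetime_parser(Box::new(ChronoDateTimeParser::new()));
        func
    }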
--- datafusion/functions/src/datetime/parser.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/parser.rs b/datafusion/functions/src/datetime/parser.rs index 0191be09385ba..b342e06098852 100644 --- a/datafusion/functions/src/datetime/parser.rs +++ b/datafusion/functions/src/datetime/parser.rs @@ -74,7 +74,7 @@ pub fn get_date_time_parser(config_options: &ConfigOptions) -> Box { - return datafusion_common::exec_err!( + panic!( "jiff parser requested but 'jiff' feature is not enabled. Enable with --features jiff" ); } From 8d8946a82781d00b286b01b99179a29988def6f0 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 15 Jan 2026 17:30:06 -0500 Subject: [PATCH 6/6] Updating argument conversion for scalar args in handle_multiple. --- datafusion/functions/src/datetime/common.rs | 29 +++++++++++---------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs index f9981914a8314..c49da7b94c193 100644 --- a/datafusion/functions/src/datetime/common.rs +++ b/datafusion/functions/src/datetime/common.rs @@ -160,11 +160,7 @@ where // if the first argument is a scalar utf8 all arguments are expected to be scalar utf8 ColumnarValue::Scalar(scalar) => match scalar.try_as_str() { Some(a) => { - let a = a.as_ref(); - // ASK: Why do we trust `a` to be non-null at this point? - let a = unwrap_or_internal_err!(a); - - let mut ret = None; + let mut vals = vec![]; for (pos, v) in args.iter().enumerate().skip(1) { let ColumnarValue::Scalar( @@ -179,18 +175,23 @@ where }; if let Some(s) = x { - match op(a, &[s.as_str()]) { - Ok(r) => { - let result = op2(r).to_i64(); - let s = scalar_value(dt, result)?; - ret = Some(Ok(ColumnarValue::Scalar(s))); - break; - } - Err(e) => ret = Some(Err(e)), - } + vals.push(s.as_str()); } } + let a = a.as_ref(); + // ASK: Why do we trust `a` to be non-null at this point? + let a = unwrap_or_internal_err!(a); + + let ret = match op(a, &vals) { + Ok(r) => { + let result = op2(r).to_i64(); + let s = scalar_value(dt, result)?; + Some(Ok(ColumnarValue::Scalar(s))) + } + Err(e) => Some(Err(e)), + }; + unwrap_or_internal_err!(ret) } other => {