diff --git a/Cargo.lock b/Cargo.lock index 1f28687f4f839..025b18159be85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2224,9 +2224,12 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", + "dyn-eq", + "dyn-hash", "env_logger", "hex", "itertools 0.14.0", + "jiff", "log", "md-5", "num-traits", @@ -2791,6 +2794,18 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "dyn-eq" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c2d035d21af5cde1a6f5c7b444a5bf963520a9f142e5d06931178433d7d5388" + +[[package]] +name = "dyn-hash" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fdab65db9274e0168143841eb8f864a0a21f8b1b8d2ba6812bbe6024346e99e" + [[package]] name = "educe" version = "0.6.0" @@ -3756,10 +3771,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e67e8da4c49d6d9909fe03361f9b620f58898859f5c7aded68351e85e71ecf50" dependencies = [ "jiff-static", + "jiff-tzdb-platform", "log", "portable-atomic", "portable-atomic-util", "serde_core", + "windows-sys 0.61.2", ] [[package]] @@ -3773,6 +3790,21 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "jiff-tzdb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jobserver" version = "0.1.34" diff --git a/datafusion-examples/examples/builtin_functions/date_time.rs b/datafusion-examples/examples/builtin_functions/date_time.rs index 08d4bc6e29978..7b430f1d2d168 100644 --- a/datafusion-examples/examples/builtin_functions/date_time.rs +++ b/datafusion-examples/examples/builtin_functions/date_time.rs @@ -417,7 +417,7 @@ async fn query_to_timestamp() -> Result<()> { .collect() .await; - let expected = "Execution error: Error parsing timestamp from '01-14-2023 01/01/30' using format '%d-%m-%Y %H:%M:%S': input is out of range"; + let expected = "Error parsing timestamp from '01-14-2023 01/01/30' using formats: [\"%d-%m-%Y %H:%M:%S\"]: input is out of range"; assert_contains!(result.unwrap_err().to_string(), expected); // note that using arrays for the chrono formats is not supported diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index a18861aa3a695..13eba814cbb4d 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -505,6 +505,11 @@ config_namespace! { /// Defaults to the number of CPU cores on the system pub target_partitions: usize, transform = ExecutionOptions::normalized_parallelism, default = get_available_parallelism() + /// The date time parser to use when parsing date time values. + /// + /// Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature is enabled. + pub date_time_parser: Option, transform = str::to_lowercase, default = Some("chrono".to_string()) + /// The default time zone /// /// Some functions, e.g. 
`now` return timestamps in this time zone diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index e55c012470b6b..f68795cab523a 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -109,6 +109,7 @@ unicode_expressions = [ "datafusion-functions/unicode_expressions", ] extended_tests = [] +jiff = ["datafusion-functions/jiff"] [dependencies] arrow = { workspace = true } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 529f6354ef691..5b39fa96a7c6e 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -52,6 +52,8 @@ default = [ ] # enable encode/decode functions encoding_expressions = ["base64", "hex"] +# enable jiff timestamp parsing +jiff = ["dep:jiff"] # enable math functions math_expressions = [] # enable regular expressions @@ -78,8 +80,11 @@ datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-macros = { workspace = true } +dyn-eq = "0.1.3" +dyn-hash = "1.0.0" hex = { workspace = true, optional = true } itertools = { workspace = true } +jiff = { version = "0.2.18", optional = true } log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } num-traits = { workspace = true } @@ -115,7 +120,7 @@ required-features = ["string_expressions"] [[bench]] harness = false name = "to_timestamp" -required-features = ["datetime_expressions"] +required-features = ["datetime_expressions", "jiff"] [[bench]] harness = false diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs index ed865fa6e8d50..23cd1ea304af8 100644 --- a/datafusion/functions/benches/to_timestamp.rs +++ b/datafusion/functions/benches/to_timestamp.rs @@ -27,14 +27,18 @@ use arrow::datatypes::{DataType, Field, TimeUnit}; use criterion::{Criterion, criterion_group, criterion_main}; use datafusion_common::config::ConfigOptions; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::datetime::parser::DateTimeParser; +use datafusion_functions::datetime::parser::chrono::ChronoDateTimeParser; +use datafusion_functions::datetime::parser::jiff::JiffDateTimeParser; use datafusion_functions::datetime::to_timestamp; +use itertools::izip; fn data() -> StringArray { let data: Vec<&str> = vec![ "1997-01-31T09:26:56.123Z", "1997-01-31T09:26:56.123-05:00", "1997-01-31 09:26:56.123-05:00", - "2023-01-01 04:05:06.789 -08", + "2023-01-01 04:05:06.789-08", "1997-01-31T09:26:56.123", "1997-01-31 09:26:56.123", "1997-01-31 09:26:56", @@ -46,261 +50,566 @@ fn data() -> StringArray { StringArray::from(data) } -fn data_with_formats() -> (StringArray, StringArray, StringArray, StringArray) { +fn data_with_formats() -> ( + StringArray, + StringArray, + StringArray, + StringArray, + StringArray, + StringArray, + StringArray, +) { let mut inputs = StringBuilder::new(); let mut format1_builder = StringBuilder::with_capacity(2, 10); let mut format2_builder = StringBuilder::with_capacity(2, 10); let mut format3_builder = StringBuilder::with_capacity(2, 10); + let mut jiff_format1_builder = StringBuilder::with_capacity(2, 10); + let mut jiff_format2_builder = StringBuilder::with_capacity(2, 10); + let mut jiff_format3_builder = StringBuilder::with_capacity(2, 10); - inputs.append_value("1997-01-31T09:26:56.123Z"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z"); + 
inputs.append_value("1997-01-31T09:26:56UTC"); + + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%+"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%Q"); inputs.append_value("1997-01-31T09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%:z"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%:z"); inputs.append_value("1997-01-31 09:26:56.123-05:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%:z"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%:z"); - inputs.append_value("2023-01-01 04:05:06.789 -08"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + inputs.append_value("2023-01-01 04:05:06.789 -0800"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %z"); + inputs.append_value("1997-01-31T09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); + inputs.append_value("1997-01-31 09:26:56.123"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); + inputs.append_value("1997-01-31 09:26:56"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H:%M:%S"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S"); + inputs.append_value("1997-01-31 092656"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); format3_builder.append_value("%Y-%m-%d %H%M%S"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H%M%S"); + inputs.append_value("1997-01-31 092656+04:00"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); 
format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); + inputs.append_value("Sun Jul 8 00:34:60 2001"); - format1_builder.append_value("%+"); - format2_builder.append_value("%c"); - format3_builder.append_value("%Y-%m-%d 00:00:00"); + format1_builder.append_value("%m"); + format2_builder.append_value("%H"); + format3_builder.append_value("%c"); + + jiff_format1_builder.append_value("%m"); + jiff_format2_builder.append_value("%H"); + jiff_format3_builder.append_value("%a %b %e %H:%M:%S %Y"); ( inputs.finish(), format1_builder.finish(), format2_builder.finish(), format3_builder.finish(), + jiff_format1_builder.finish(), + jiff_format2_builder.finish(), + jiff_format3_builder.finish(), ) } + fn criterion_benchmark(c: &mut Criterion) { - let return_field = - Field::new("f", DataType::Timestamp(TimeUnit::Nanosecond, None), true).into(); - let arg_field = Field::new("a", DataType::Utf8, false).into(); - let arg_fields = vec![arg_field]; - let mut options = ConfigOptions::default(); - options.execution.time_zone = Some("UTC".into()); - let config_options = Arc::new(options); - - let to_timestamp_udf = to_timestamp(config_options.as_ref()); - - c.bench_function("to_timestamp_no_formats_utf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let arr_data = data(); - let batch_len = arr_data.len(); - let string_array = ColumnarValue::Array(Arc::new(arr_data) as ArrayRef); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: vec![string_array.clone()], - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_no_formats_largeutf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let data = cast(&data(), &DataType::LargeUtf8).unwrap(); - let batch_len = data.len(); - let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: vec![string_array.clone()], - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_no_formats_utf8view", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let data = cast(&data(), &DataType::Utf8View).unwrap(); - let batch_len = data.len(); - let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: vec![string_array.clone()], - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_with_formats_utf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let (inputs, format1, format2, format3) = data_with_formats(); - let batch_len = inputs.len(); - - let args = vec![ - ColumnarValue::Array(Arc::new(inputs) as ArrayRef), - ColumnarValue::Array(Arc::new(format1) as ArrayRef), - ColumnarValue::Array(Arc::new(format2) as ArrayRef), - 
ColumnarValue::Array(Arc::new(format3) as ArrayRef), - ]; - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() - }) - .collect::>(); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_with_formats_largeutf8", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let (inputs, format1, format2, format3) = data_with_formats(); - let batch_len = inputs.len(); - - let args = vec![ - ColumnarValue::Array( - Arc::new(cast(&inputs, &DataType::LargeUtf8).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format1, &DataType::LargeUtf8).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format2, &DataType::LargeUtf8).unwrap()) as ArrayRef + for &parser in ["jiff", "chrono"].iter() { + c.bench_function( + &format!("string_to_timestamp_nanos_formatted_single_format_utf8_{parser}"), + |b| { + let datetime_parser = match parser { + "chrono" => { + Box::new(ChronoDateTimeParser::new()) as Box + } + "jiff" => { + Box::new(JiffDateTimeParser::new()) as Box + } + _ => unreachable!(), + }; + + let (inputs, _, _, format3, _, _, jiffformat3) = data_with_formats(); + + b.iter(|| { + for (input, format3, jiff_format3) in + izip!(inputs.iter(), format3.iter(), jiffformat3.iter()) + { + let _ = black_box(match parser { + "chrono" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[format3.unwrap()], + ); + if t.is_err() { + println!("Error: {t:?}"); + } + + t + } + "jiff" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[jiff_format3.unwrap()], + ); + if t.is_err() { + println!("Error: {t:?}"); + } + + t + } + _ => unreachable!(), + }); + } + }) + }, + ); + + c.bench_function( + &format!( + "string_to_timestamp_nanos_formatted_multiple_formats_utf8_{parser}" ), - ColumnarValue::Array( - Arc::new(cast(&format3, &DataType::LargeUtf8).unwrap()) as ArrayRef - ), - ]; - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + |b| { + let datetime_parser = match parser { + "chrono" => { + Box::new(ChronoDateTimeParser::new()) as Box + } + "jiff" => { + Box::new(JiffDateTimeParser::new()) as Box + } + _ => unreachable!(), + }; + + let ( + inputs, + format1, + format2, + format3, + jiff_format1, + jiff_format2, + jiff_format3, + ) = data_with_formats(); + + b.iter(|| { + for ( + input, + format1, + format2, + format3, + jiff_format1, + jiff_format2, + jiff_format3, + ) in izip!( + inputs.iter(), + format1.iter(), + format2.iter(), + format3.iter(), + jiff_format1.iter(), + jiff_format2.iter(), + jiff_format3.iter() + ) { + let _ = black_box(match parser { + "chrono" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[ + format1.unwrap(), + format2.unwrap(), + format3.unwrap(), + ], + ); + if t.is_err() { + println!("Error: {t:?}"); + } + + t + } + "jiff" => { + let t = datetime_parser + .string_to_timestamp_nanos_formatted( + "UTC", + input.unwrap(), + &[ + jiff_format1.unwrap(), + jiff_format2.unwrap(), + jiff_format3.unwrap(), + ], + ); + if t.is_err() { + 
println!("Error: {t:?}"); + } + + t + } + _ => unreachable!(), + }); + } + }) + }, + ); + + let return_field = + Field::new("f", DataType::Timestamp(TimeUnit::Nanosecond, None), true).into(); + let arg_field = Field::new("a", DataType::Utf8, false).into(); + let arg_fields = vec![arg_field]; + + c.bench_function(&format!("to_timestamp_no_formats_utf8_{parser}"), |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let arr_data = data(); + let batch_len = arr_data.len(); + let string_array = ColumnarValue::Array(Arc::new(arr_data) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: vec![string_array.clone()], + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) }) - .collect::>(); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), - }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); - - c.bench_function("to_timestamp_with_formats_utf8view", |b| { - let to_timestamp_udf = Arc::clone(&to_timestamp_udf); - let (inputs, format1, format2, format3) = data_with_formats(); - - let batch_len = inputs.len(); - - let args = vec![ - ColumnarValue::Array( - Arc::new(cast(&inputs, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format1, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format2, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ColumnarValue::Array( - Arc::new(cast(&format3, &DataType::Utf8View).unwrap()) as ArrayRef - ), - ]; - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }); + + c.bench_function( + &format!("to_timestamp_no_formats_largeutf8_{parser}"), + |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let data = cast(&data(), &DataType::LargeUtf8).unwrap(); + let batch_len = data.len(); + let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: vec![string_array.clone()], + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }, + ); + + c.bench_function(&format!("to_timestamp_no_formats_utf8view_{parser}"), |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let data = 
cast(&data(), &DataType::Utf8View).unwrap(); + let batch_len = data.len(); + let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: vec![string_array.clone()], + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }); + + c.bench_function(&format!("to_timestamp_with_formats_utf8_{parser}"), |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let ( + inputs, + format1, + format2, + format3, + jiffformat1, + jiffformat2, + jiffformat3, + ) = data_with_formats(); + let batch_len = inputs.len(); + + let args = match parser { + "chrono" => vec![ + ColumnarValue::Array(Arc::new(inputs) as ArrayRef), + ColumnarValue::Array(Arc::new(format1) as ArrayRef), + ColumnarValue::Array(Arc::new(format2) as ArrayRef), + ColumnarValue::Array(Arc::new(format3) as ArrayRef), + ], + "jiff" => { + vec![ + ColumnarValue::Array(Arc::new(inputs) as ArrayRef), + ColumnarValue::Array(Arc::new(jiffformat1) as ArrayRef), + ColumnarValue::Array(Arc::new(jiffformat2) as ArrayRef), + ColumnarValue::Array(Arc::new(jiffformat3) as ArrayRef), + ] + } + _ => unreachable!(), + }; + + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) }) - .collect::>(); - - b.iter(|| { - black_box( - to_timestamp_udf - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: batch_len, - return_field: Arc::clone(&return_field), - config_options: Arc::clone(&config_options), + }); + + c.bench_function( + &format!("to_timestamp_with_formats_largeutf8_{parser}"), + |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let ( + inputs, + format1, + format2, + format3, + jiffformat1, + jiffformat2, + jiffformat3, + ) = data_with_formats(); + let batch_len = inputs.len(); + + let args = match parser { + "chrono" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format1, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format2, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format3, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ], + "jiff" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat1, 
&DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat2, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat3, &DataType::LargeUtf8).unwrap(), + ) as ArrayRef), + ], + _ => unreachable!(), + }; + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }, + ); + + c.bench_function( + &format!("to_timestamp_with_formats_utf8view_{parser}"), + |b| { + let mut config_options = ConfigOptions::default(); + if parser == "jiff" { + config_options.execution.date_time_parser = Some("jiff".to_string()); + } + let config = Arc::new(config_options.clone()); + let to_timestamp_udf = to_timestamp(&config_options); + let to_timestamp_udf = Arc::clone(&to_timestamp_udf); + let ( + inputs, + format1, + format2, + format3, + jiffformat1, + jiffformat2, + jiffformat3, + ) = data_with_formats(); + + let batch_len = inputs.len(); + + let args = match parser { + "chrono" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format1, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format2, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&format3, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ], + "jiff" => vec![ + ColumnarValue::Array(Arc::new( + cast(&inputs, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat1, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat2, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ColumnarValue::Array(Arc::new( + cast(&jiffformat3, &DataType::Utf8View).unwrap(), + ) as ArrayRef), + ], + _ => unreachable!(), + }; + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() }) - .expect("to_timestamp should work on valid values"), - ) - }) - }); + .collect::>(); + + b.iter(|| { + black_box( + to_timestamp_udf + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: batch_len, + return_field: Arc::clone(&return_field), + config_options: Arc::clone(&config), + }) + .expect("to_timestamp should work on valid values"), + ) + }) + }, + ); + } } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs index 2db64beafa9b7..c49da7b94c193 100644 --- a/datafusion/functions/src/datetime/common.rs +++ b/datafusion/functions/src/datetime/common.rs @@ -15,59 +15,21 @@ // specific language governing permissions and limitations // under the License. 
-use std::sync::{Arc, LazyLock}; +use std::sync::Arc; -use arrow::array::timezone::Tz; use arrow::array::{ Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray, StringArrayType, StringViewArray, }; use arrow::compute::DecimalCast; -use arrow::compute::kernels::cast_utils::string_to_datetime; use arrow::datatypes::{DataType, TimeUnit}; use arrow_buffer::ArrowNativeType; -use chrono::LocalResult::Single; -use chrono::format::{Parsed, StrftimeItems, parse}; -use chrono::{DateTime, TimeZone, Utc}; use datafusion_common::cast::as_generic_string_array; use datafusion_common::{ - DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, - internal_datafusion_err, unwrap_or_internal_err, + Result, ScalarValue, exec_err, internal_datafusion_err, unwrap_or_internal_err, }; use datafusion_expr::ColumnarValue; -/// Error message if nanosecond conversion request beyond supported interval -const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; - -static UTC: LazyLock = LazyLock::new(|| "UTC".parse().expect("UTC is always valid")); - -/// Converts a string representation of a date‑time into a timestamp expressed in -/// nanoseconds since the Unix epoch. -/// -/// This helper is a thin wrapper around the more general `string_to_datetime` -/// function. It accepts an optional `timezone` which, if `None`, defaults to -/// Coordinated Universal Time (UTC). The string `s` must contain a valid -/// date‑time format that can be parsed by the underlying chrono parser. -/// -/// # Return Value -/// -/// * `Ok(i64)` – The number of nanoseconds since `1970‑01‑01T00:00:00Z`. -/// * `Err(DataFusionError)` – If the string cannot be parsed, the parsed -/// value is out of range (between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804) -/// or the parsed value does not correspond to an unambiguous time. -pub(crate) fn string_to_timestamp_nanos_with_timezone( - timezone: &Option, - s: &str, -) -> Result { - let tz = timezone.as_ref().unwrap_or(&UTC); - let dt = string_to_datetime(tz, s)?; - let parsed = dt - .timestamp_nanos_opt() - .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?; - - Ok(parsed) -} - /// Checks that all the arguments from the second are of type [Utf8], [LargeUtf8] or [Utf8View] /// /// [Utf8]: DataType::Utf8 @@ -92,161 +54,6 @@ pub(crate) fn validate_data_types(args: &[ColumnarValue], name: &str) -> Result< Ok(()) } -/// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers -/// relative to the provided `timezone` -/// -/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error -/// will be returned -/// -/// Note that parsing [IANA timezones] is not supported yet in chrono - -/// and this implementation only supports named timezones at the end of the string preceded by a space. -/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -/// [IANA timezones]: https://www.iana.org/time-zones -pub(crate) fn string_to_datetime_formatted( - timezone: &T, - s: &str, - format: &str, -) -> Result, DataFusionError> { - let err = |err_ctx: &str| { - exec_datafusion_err!( - "Error parsing timestamp from '{s}' using format '{format}': {err_ctx}" - ) - }; - - let mut datetime_str = s; - let mut format = format; - - // Manually handle the most common case of a named timezone at the end of the timestamp. 
- // Note that %+ handles 'Z' at the end of the string without a space. This code doesn't - // handle named timezones with no preceding space since that would require writing a - // custom parser (or switching to Jiff) - let tz: Option = if format.trim_end().ends_with(" %Z") { - // grab the string after the last space as the named timezone - if let Some((dt_str, timezone_name)) = datetime_str.trim_end().rsplit_once(' ') { - datetime_str = dt_str; - - // attempt to parse the timezone name - let result: Result = - timezone_name.parse(); - let Ok(tz) = result else { - return Err(err(&result.unwrap_err().to_string())); - }; - - // successfully parsed the timezone name, remove the ' %Z' from the format - format = &format[..format.len() - 3]; - - Some(tz) - } else { - None - } - } else if format.contains("%Z") { - return Err(err( - "'%Z' is only supported at the end of the format string preceded by a space", - )); - } else { - None - }; - - let mut parsed = Parsed::new(); - parse(&mut parsed, datetime_str, StrftimeItems::new(format)) - .map_err(|e| err(&e.to_string()))?; - - let dt = match tz { - Some(tz) => { - // A timezone was manually parsed out, convert it to a fixed offset - match parsed.to_datetime_with_timezone(&tz) { - Ok(dt) => Ok(dt.fixed_offset()), - Err(e) => Err(e), - } - } - // default to parse the string assuming it has a timezone - None => parsed.to_datetime(), - }; - - if let Err(e) = &dt { - // no timezone or other failure, try without a timezone - let ndt = parsed - .to_naive_datetime_with_offset(0) - .or_else(|_| parsed.to_naive_date().map(|nd| nd.into())); - if let Err(e) = &ndt { - return Err(err(&e.to_string())); - } - - if let Single(e) = &timezone.from_local_datetime(&ndt.unwrap()) { - Ok(e.to_owned()) - } else { - Err(err(&e.to_string())) - } - } else { - Ok(dt.unwrap().with_timezone(timezone)) - } -} - -/// Accepts a string with a `chrono` format and converts it to a -/// nanosecond precision timestamp relative to the provided `timezone`. -/// -/// See [`chrono::format::strftime`] for the full set of supported formats. -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// Any timestamp in the formatting string is handled according to the rules -/// defined by `chrono`. -/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -#[inline] -pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone( - timezone: &Option, - s: &str, - format: &str, -) -> Result { - let dt = string_to_datetime_formatted(timezone.as_ref().unwrap_or(&UTC), s, format)?; - let parsed = dt - .timestamp_nanos_opt() - .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?; - - Ok(parsed) -} - -/// Accepts a string with a `chrono` format and converts it to a -/// millisecond precision timestamp relative to the provided `timezone`. -/// -/// See [`chrono::format::strftime`] for the full set of supported formats. 
-/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// Any timestamp in the formatting string is handled according to the rules -/// defined by `chrono`. -/// -/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html -#[inline] -pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Result { - Ok(string_to_datetime_formatted(&Utc, s, format)? - .naive_utc() - .and_utc() - .timestamp_millis()) -} - pub(crate) fn handle( args: &[ColumnarValue], op: F, @@ -306,7 +113,7 @@ pub(crate) fn handle_multiple( ) -> Result where O: ArrowPrimitiveType, - F: Fn(&str, &str) -> Result, + F: Fn(&str, &[&str]) -> Result, M: Fn(O::Native) -> O::Native, { match &args[0] { @@ -353,11 +160,7 @@ where // if the first argument is a scalar utf8 all arguments are expected to be scalar utf8 ColumnarValue::Scalar(scalar) => match scalar.try_as_str() { Some(a) => { - let a = a.as_ref(); - // ASK: Why do we trust `a` to be non-null at this point? - let a = unwrap_or_internal_err!(a); - - let mut ret = None; + let mut vals = vec![]; for (pos, v) in args.iter().enumerate().skip(1) { let ColumnarValue::Scalar( @@ -372,18 +175,23 @@ where }; if let Some(s) = x { - match op(a, s.as_str()) { - Ok(r) => { - let result = op2(r).to_i64(); - let s = scalar_value(dt, result)?; - ret = Some(Ok(ColumnarValue::Scalar(s))); - break; - } - Err(e) => ret = Some(Err(e)), - } + vals.push(s.as_str()); } } + let a = a.as_ref(); + // ASK: Why do we trust `a` to be non-null at this point? + let a = unwrap_or_internal_err!(a); + + let ret = match op(a, &vals) { + Ok(r) => { + let result = op2(r).to_i64(); + let s = scalar_value(dt, result)?; + Some(Ok(ColumnarValue::Scalar(s))) + } + Err(e) => Some(Err(e)), + }; + unwrap_or_internal_err!(ret) } other => { @@ -411,7 +219,7 @@ pub(crate) fn strings_to_primitive_function( ) -> Result> where O: ArrowPrimitiveType, - F: Fn(&str, &str) -> Result, + F: Fn(&str, &[&str]) -> Result, F2: Fn(O::Native) -> O::Native, { if args.len() < 2 { @@ -472,7 +280,7 @@ fn handle_array_op<'a, O, V, F, F2>( where V: StringArrayType<'a>, O: ArrowPrimitiveType, - F: Fn(&str, &str) -> Result, + F: Fn(&str, &[&str]) -> Result, F2: Fn(O::Native) -> O::Native, { first @@ -481,25 +289,38 @@ where .map(|(pos, x)| { let mut val = None; if let Some(x) = x { + let mut v = vec![]; + for arg in args { - let v = match arg { + match arg { ColumnarValue::Array(a) => match a.data_type() { - DataType::Utf8View => Ok(a.as_string_view().value(pos)), - DataType::LargeUtf8 => Ok(a.as_string::().value(pos)), - DataType::Utf8 => Ok(a.as_string::().value(pos)), - other => exec_err!("Unexpected type encountered '{other}'"), + DataType::Utf8View => v.push(a.as_string_view().value(pos)), + DataType::LargeUtf8 => { + v.push(a.as_string::().value(pos)) + } + DataType::Utf8 => v.push(a.as_string::().value(pos)), + other => { + return exec_err!( + "Unexpected type encountered '{other}'" + ); + } }, ColumnarValue::Scalar(s) => match s.try_as_str() { - Some(Some(v)) => Ok(v), + Some(Some(s)) => v.push(s), Some(None) => continue, // null string - None => exec_err!("Unexpected scalar type encountered '{s}'"), + None => { + return exec_err!( + "Unexpected scalar type encountered '{s}'" + ); + } }, - }?; + }; + } - let r = op(x, v); + if !v.is_empty() { + let r = op(x, &v); if let Ok(inner) = r { val = Some(Ok(op2(inner))); - break; 
} else { val = Some(r); } diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index 39b9453295df6..5b1ec3e3f1dfe 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -31,6 +31,7 @@ pub mod from_unixtime; pub mod make_date; pub mod make_time; pub mod now; +pub mod parser; pub mod planner; pub mod to_char; pub mod to_date; @@ -49,7 +50,7 @@ make_udf_function!(make_date::MakeDateFunc, make_date); make_udf_function!(make_time::MakeTimeFunc, make_time); make_udf_function!(from_unixtime::FromUnixtimeFunc, from_unixtime); make_udf_function!(to_char::ToCharFunc, to_char); -make_udf_function!(to_date::ToDateFunc, to_date); +make_udf_function_with_config!(to_date::ToDateFunc, to_date); make_udf_function!(to_local_time::ToLocalTimeFunc, to_local_time); make_udf_function!(to_time::ToTimeFunc, to_time); make_udf_function!(to_unixtime::ToUnixtimeFunc, to_unixtime); @@ -69,6 +70,7 @@ make_udf_function_with_config!(now::NowFunc, now); // functions with varargs currently pub mod expr_fn { + use datafusion_common::config::ConfigOptions; use datafusion_expr::Expr; export_functions!(( @@ -267,7 +269,8 @@ pub mod expr_fn { /// # } /// ``` pub fn to_date(args: Vec) -> Expr { - super::to_date().call(args) + let config = ConfigOptions::default(); + super::to_date(&config).call(args) } } @@ -286,7 +289,7 @@ pub fn functions() -> Vec> { make_time(), now(&config), to_char(), - to_date(), + to_date(&config), to_local_time(), to_time(), to_unixtime(), diff --git a/datafusion/functions/src/datetime/parser.rs b/datafusion/functions/src/datetime/parser.rs new file mode 100644 index 0000000000000..b342e06098852 --- /dev/null +++ b/datafusion/functions/src/datetime/parser.rs @@ -0,0 +1,724 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_common::config::ConfigOptions; +use dyn_eq::DynEq; +use dyn_hash::DynHash; +use std::fmt::Debug; + +/// A trait for parsing timestamps from strings. Two implementations are provided: +/// `ChronoDateTimeParser` (the default) which uses [Chrono] to parse timestamps, and +/// `JiffDateTimeParser` (via the `jiff` feature flag) which uses [Jiff] to parse +/// timestamps. +/// +/// While both implementations are capable of parsing timestamps, the `ChronoDateTimeParser` +/// is a bit more lenient wrt formats at the cost of being slightly slower than Jiff. +/// Jiff, on the other hand, has slightly less support for some format options than Chrono +/// but is measurably faster than Chrono in some situations. 
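+///
+/// A minimal, illustrative usage sketch (not compiled as a doc test). The parser is
+/// normally selected from the session configuration via `get_date_time_parser`, but the
+/// concrete implementations can also be constructed directly:
+///
+/// ```ignore
+/// use datafusion_functions::datetime::parser::DateTimeParser;
+/// use datafusion_functions::datetime::parser::chrono::ChronoDateTimeParser;
+///
+/// let parser = ChronoDateTimeParser::new();
+/// // parse an RFC3339 timestamp into nanoseconds since the Unix epoch,
+/// // interpreting naive inputs in the given time zone ("UTC" here)
+/// let nanos = parser.string_to_timestamp_nanos("UTC", "1997-01-31T09:26:56.123Z");
+/// assert!(nanos.is_ok());
+/// ```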
+/// +/// [Chrono]: https://docs.rs/chrono/latest/chrono/ +/// [Jiff]: https://docs.rs/jiff/latest/jiff/ +pub trait DateTimeParser: Debug + DynEq + DynHash + Send + Sync { + fn string_to_timestamp_nanos( + &self, + tz: &str, + s: &str, + ) -> datafusion_common::Result; + + fn string_to_timestamp_nanos_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> datafusion_common::Result; + + fn string_to_timestamp_millis_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> datafusion_common::Result; +} + +// impl Eq and PartialEq for dyn DateTimeParser +dyn_eq::eq_trait_object!(DateTimeParser); + +// Implement std::hash::Hash for dyn DateTimeParser +dyn_hash::hash_trait_object!(DateTimeParser); + +pub fn get_date_time_parser(config_options: &ConfigOptions) -> Box { + let parser_cfg = config_options.execution.date_time_parser.as_ref(); + + match parser_cfg { + Some(p) => match p.as_str() { + "chrono" => { + Box::new(chrono::ChronoDateTimeParser::new()) as Box + } + #[cfg(feature = "jiff")] + "jiff" => { + Box::new(jiff::JiffDateTimeParser::new()) as Box + } + #[cfg(not(feature = "jiff"))] + "jiff" => { + panic!( + "jiff parser requested but 'jiff' feature is not enabled. Enable with --features jiff" + ); + } + _ => panic!("Unknown/unsupported date time parser: {p}"), + }, + None => Box::new(chrono::ChronoDateTimeParser::new()) as Box, + } +} + +pub mod chrono { + use crate::datetime::parser::DateTimeParser; + use arrow::array::timezone::Tz; + use arrow::compute::kernels::cast_utils::string_to_datetime; + use chrono::LocalResult; + use chrono::format::{ParseErrorKind, Parsed, StrftimeItems, parse}; + use chrono_tz::ParseError; + use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; + + const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + + #[derive(Debug, Default, PartialEq, Eq, Hash)] + pub struct ChronoDateTimeParser {} + + impl ChronoDateTimeParser { + pub fn new() -> Self { + Self {} + } + + /// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers + /// relative to the provided `timezone`. + /// + /// If a timestamp is ambiguous, for example, as a result of daylight-savings time, an error + /// will be returned. + /// + /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html + fn string_to_datetime_formatted( + &self, + timezone: &T, + s: &str, + formats: &[&str], + ) -> Result, DataFusionError> { + let mut err: Option = None; + + for &format in formats.iter() { + let mut format = format; + let mut datetime_str = s; + + // we manually handle the most common case of a named timezone at the end of the timestamp + // since chrono doesn't support that directly. Note, however, that %+ does handle 'Z' at the + // end of the string. + // + // This code doesn't handle named timezones with no preceding space since that would require + // writing a custom parser. 
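+            //
+            // For example (illustrative input), `2023-01-01 040506 America/Los_Angeles`
+            // parsed with a format ending in ` %Z` has its trailing IANA name split off
+            // here and resolved as a time zone; the remainder of the string is then
+            // parsed with the format minus the ` %Z` suffix.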
+ let tz: Option = if format.ends_with(" %Z") + && datetime_str.contains(' ') + { + // grab the string after the last space as the named timezone + let parts: Vec<&str> = datetime_str.rsplitn(2, ' ').collect(); + let timezone_name = parts[0]; + datetime_str = parts[1]; + + // attempt to parse the timezone name + let result: Result = timezone_name.parse(); + let Ok(tz) = result else { + err = Some(result.unwrap_err().to_string()); + continue; + }; + + // successfully parsed the timezone name, remove the ' %Z' from the format + format = &format[..format.len() - 3]; + + Some(tz) + } else if format.contains("%Z") { + err = Some("'%Z' is only supported at the end of the format string preceded by a space".into()); + continue; + } else { + None + }; + + let mut parsed = Parsed::new(); + let items = StrftimeItems::new(format); + let result = parse(&mut parsed, datetime_str, items); + + if let Err(e) = &result { + err = Some(e.to_string()); + continue; + } + + let dt = match tz { + Some(tz) => { + // A timezone was manually parsed out, convert it to a fixed offset + match parsed.to_datetime_with_timezone(&tz) { + Ok(dt) => Ok(dt.fixed_offset()), + Err(e) => Err(e), + } + } + // default to parse the string assuming it has a timezone + None => parsed.to_datetime(), + }; + + match dt { + Ok(dt) => return Ok(dt.with_timezone(timezone)), + Err(e) if e.kind() == ParseErrorKind::Impossible => { + err = Some(format!( + "Unable to parse timestamp {datetime_str} in timezone {tz:?}: datetime was impossible" + )); + continue; + } + Err(e) if e.kind() == ParseErrorKind::OutOfRange => { + err = Some(format!( + "Unable to parse timestamp {datetime_str} in timezone {tz:?}: datetime was out of range" + )); + continue; + } + _ => { + // no timezone or other failure, try without a timezone + let ndt = parsed + .to_naive_datetime_with_offset(0) + .or_else(|_| parsed.to_naive_date().map(|nd| nd.into())); + if let Err(e) = &ndt { + err = Some(e.to_string()); + continue; + } + + let result = &timezone.from_local_datetime(&ndt.unwrap()); + + match result { + LocalResult::Single(e) => return Ok(e.to_owned()), + LocalResult::None => { + err = Some(format!( + "Unable to parse timestamp {datetime_str} in timezone {tz:?}: no valid local time found" + )); + continue; + } + LocalResult::Ambiguous(earliest, latest) => { + err = Some(format!( + "Unable to parse timestamp {datetime_str} in timezone {tz:?}: ambiguous timestamps found {earliest:?} and {latest:?}" + )); + continue; + } + } + } + } + } + + match err { + Some(e) => exec_err!( + "Error parsing timestamp from '{s}' using formats: {formats:?}: {e}" + ), + None => exec_err!( + "Error parsing timestamp from '{s}' using formats: {formats:?}" + ), + } + } + } + + impl DateTimeParser for ChronoDateTimeParser { + /// Accepts a string and parses it relative to the provided `timezone` + /// + /// In addition to RFC3339 / ISO8601 standard timestamps, it also + /// accepts strings that use a space ` ` to separate the date and time + /// as well as strings that have no explicit timezone offset. 
+ /// + /// Examples of accepted inputs: + /// * `1997-01-31T09:26:56.123Z` # RCF3339 + /// * `1997-01-31T09:26:56.123-05:00` # RCF3339 + /// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T + /// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator + /// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified + /// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset + /// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds + /// * `1997-01-31 092656` # close to RCF3339, no fractional seconds + /// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator + /// * `1997-01-31` # close to RCF3339, only date no time + /// + /// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled + /// + /// * `2023-01-01 040506 America/Los_Angeles` + /// + /// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error + /// will be returned + /// + /// Some formats supported by PostgresSql + /// are not supported, like + /// + /// * "2023-01-01 04:05:06.789 +07:30:00", + /// * "2023-01-01 040506 +07:30:00", + /// * "2023-01-01 04:05:06.789 PST", + #[inline] + fn string_to_timestamp_nanos( + &self, + tz: &str, + s: &str, + ) -> Result { + let tz: Tz = match tz.parse() { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"), + }; + + let dt = string_to_datetime(&tz, s)?; + let parsed = dt + .timestamp_nanos_opt() + .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?; + + Ok(parsed) + } + + /// Accepts a string with an array of `chrono` formats and converts it to a + /// nanosecond precision timestamp according to the rules + /// defined by `chrono`. + /// + /// See [`chrono::format::strftime`] for the full set of supported formats. + /// + /// ## Timestamp Precision + /// + /// This function uses the maximum precision timestamps supported by + /// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This + /// means the range of dates that timestamps can represent is ~1677 AD + /// to 2262 AM. + /// + /// ## Timezone / Offset Handling + /// + /// Numerical values of timestamps are stored compared to offset UTC. + /// + /// Note that parsing named [IANA timezones] is not supported yet in chrono + /// and this implementation + /// only supports named timezones at the end of the string preceded by a space. + /// + /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html + /// [IANA timezones]: https://www.iana.org/time-zones + #[inline] + fn string_to_timestamp_nanos_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> Result { + let tz: Tz = match tz.parse() { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"), + }; + + let dt = self.string_to_datetime_formatted(&tz, s, formats)?; + dt.naive_utc() + .and_utc() + .timestamp_nanos_opt() + .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}")) + } + + /// Accepts a string with a `chrono` format and converts it to a + /// millisecond precision timestamp. + /// + /// See [`chrono::format::strftime`] for the full set of supported formats. + /// + /// Internally, this function uses the `chrono` library for the + /// datetime parsing + /// + /// ## Timezone / Offset Handling + /// + /// Numerical values of timestamps are stored compared to offset UTC. 
+ /// + /// Any timestamp in the formatting string is handled according to the rules + /// defined by `chrono`. + /// + /// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html + #[inline] + fn string_to_timestamp_millis_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> Result { + let tz: Tz = match tz.parse() { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"), + }; + + Ok(self + .string_to_datetime_formatted(&tz, s, formats)? + .naive_utc() + .and_utc() + .timestamp_millis()) + } + } +} + +#[cfg(feature = "jiff")] +pub mod jiff { + use crate::datetime::parser::DateTimeParser; + use arrow::array::timezone::Tz; + use arrow::error::ArrowError; + use chrono::format::{Parsed, StrftimeItems, parse}; + use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err}; + use jiff::civil::Time; + use jiff::fmt::temporal::Pieces; + use jiff::tz::{Disambiguation, TimeZone}; + use jiff::{Timestamp, Zoned}; + use num_traits::ToPrimitive; + + const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + + #[derive(Debug, Default, PartialEq, Eq, Hash)] + pub struct JiffDateTimeParser {} + + impl JiffDateTimeParser { + pub fn new() -> Self { + Self {} + } + + /// Attempts to parse a given string representation of a timestamp into a `Timestamp` object. + /// The function also adjusts the datetime for the specified timezone. + /// + /// # Parameters + /// - `timezone`: A reference to the `TimeZone` object used to adjust the parsed datetime to the desired timezone. + /// - `s`: A string slice holding the timestamp to be parsed. + /// + /// # Returns + /// - `Ok(Timestamp)`: Contains the parsed timestamp in seconds since the Unix epoch. + /// - `Err(ArrowError)`: Returned in the event of errors in parsing + /// the timestamp string or computing the timezone offset. + /// + /// # Errors + /// This function will return an `ArrowError` if: + /// + /// - The string `s` is shorter than 10 characters. + /// - The format of the string does not match expected timestamp patterns. + /// - An invalid/unknown time zone or offset is provided during parsing. + /// - Errors occur while converting the datetime to a specific time zone. + fn string_to_datetime( + &self, + timezone: &TimeZone, + s: &str, + ) -> datafusion_common::Result { + let err = |ctx: &str| { + ArrowError::ParseError(format!( + "Error parsing timestamp from '{s}': {ctx}" + )) + }; + + let bytes = s.as_bytes(); + if bytes.len() < 10 { + return Err(err("timestamp must contain at least 10 characters")); + } + + let pieces = Pieces::parse(bytes).map_err(|e| err(&format!("{e:?}")))?; + let time = pieces.time().unwrap_or_else(Time::midnight); + let dt = pieces.date().to_datetime(time); + let tz = match pieces + .to_time_zone() + .map_err(|e| err(&format!("unknown time zone: {e:?}")))? + { + Some(tz) => tz, + None => match pieces.to_numeric_offset() { + Some(offset) => TimeZone::fixed(offset), + None => timezone.to_owned(), + }, + }; + let zdt = tz + .to_zoned(dt) + .map_err(|e| err(&format!("error computing timezone offset: {e:?}")))?; + + Ok(zdt.timestamp()) + } + + /// Attempts to parse a given string representation of a timestamp into a `Zoned` datetime object + /// using a list of provided formats. The function also adjusts the datetime for the specified timezone. 
+ /// + /// # Parameters + /// - `timezone`: A reference to the `TimeZone` object used to adjust the parsed datetime to the desired timezone. + /// - `s`: A string slice holding the timestamp to be parsed. + /// - `formats`: A slice of string slices representing the accepted datetime formats to use when parsing. + /// + /// # Returns + /// - `Ok(Zoned)` if the string is successfully parsed into a `Zoned` datetime object using one of the formats provided. + /// - `Err(DataFusionError)` if the string cannot be parsed or if there are issues related to the timezone or format handling. + /// + /// # Behavior + /// 1. Iterates through the list of provided formats. + /// 2. Handles special cases such as `%Z` (timezone) or `%c` (locale-aware datetime representation) + /// by either modifying the format string or switching to different parsing logic. + /// 3. Attempts to resolve timezone-related issues: + /// - Calculates a fixed offset for the given `timezone`. + /// - Handles attempts to parse timezones using IANA names or offsets. + /// 4. If parsing fails for the current format, the function moves to the next format and aggregates any parsing errors. + /// 5. If no formats succeed, an error is returned summarizing the failure. + /// + /// # Errors + /// - Returns a descriptive error wrapped in `DataFusionError` if: + /// - The input string is not compatible with any of the provided formats. + /// - There are issues parsing the timezone from the input string or adjusting to the provided `TimeZone`. + /// - There are issues resolving ambiguous datetime representations. + fn string_to_datetime_formatted( + &self, + timezone: &TimeZone, + s: &str, + formats: &[&str], + ) -> datafusion_common::Result { + let mut err: Option = None; + + for &format in formats.iter() { + let mut format = if format.contains("%Z") { + &format.replace("%Z", "%Q") + } else { + format + }; + + if format == "%c" { + // switch to chrono, jiff doesn't support parsing with %c + let mut parsed = Parsed::new(); + let result = parse(&mut parsed, s, StrftimeItems::new(format)); + + if result.is_err() { + err = Some(format!("error parsing timestamp: {result:?}")); + continue; + } + + let offset = timezone.to_fixed_offset(); + let tz: &str = if let Ok(offset) = offset { + &offset.to_string() + } else { + let result = timezone.iana_name(); + match result { + Some(tz) => tz, + None => { + err = + Some(format!("error parsing timezone: {timezone:?}")); + continue; + } + } + }; + let tz: Tz = tz.parse::().ok().unwrap_or("UTC".parse::()?); + + match parsed.to_datetime_with_timezone(&tz) { + Ok(dt) => { + let dt = dt.fixed_offset(); + let nanos_opt = dt.timestamp_nanos_opt(); + + let Some(nanos) = nanos_opt else { + err = Some(ERR_NANOSECONDS_NOT_SUPPORTED.to_owned()); + continue; + }; + + let result = Timestamp::from_nanosecond(nanos as i128); + match result { + Ok(result) => { + return Ok(result.to_zoned(timezone.to_owned())); + } + Err(e) => { + err = Some(e.to_string()); + continue; + } + } + } + Err(e) => { + err = Some(e.to_string()); + continue; + } + }; + } + + let result = if format == "%+" { + // jiff doesn't support parsing with %+, but the equivalent of %+ is + // somewhat rfc3389 which is what is used by the parse method + let result: Result = s.parse(); + + if let Ok(r) = result { + return Ok(r); + } else { + // however, the above only works with timezone names, not offsets + let s = if s.ends_with("Z") { + &(s.trim_end_matches("Z").to_owned() + "+00:00") + } else { + s + }; + + // try again with fractional seconds + format = 
"%Y-%m-%dT%H:%M:%S%.f%:z"; + + jiff::fmt::strtime::parse(format, s) + } + } else { + jiff::fmt::strtime::parse(format, s) + }; + + match result { + Ok(bdt) => { + if (bdt.iana_time_zone().is_some() || bdt.offset().is_some()) + && let Ok(zoned) = bdt.to_zoned() + { + return Ok(zoned.with_time_zone(timezone.to_owned())); + } + + // %s just sets the timestamp and the .to_datetime() call will error + // since the other fields like year are not set. Handle this case + // directly. + let datetime = if let Some(ts) = bdt.timestamp() { + timezone.to_datetime(ts) + } else { + let result = bdt.to_datetime(); + match result { + Ok(datetime) => datetime, + Err(e) => { + err = Some(e.to_string()); + continue; + } + } + }; + + let zoned = timezone + .to_owned() + .into_ambiguous_zoned(datetime) + .disambiguate(Disambiguation::Compatible); + + match zoned { + Ok(z) => return Ok(z), + Err(e) => { + err = Some(e.to_string()); + continue; + } + } + } + Err(e) => { + err = Some(e.to_string()); + continue; + } + } + } + + match err { + Some(e) => exec_err!( + "Error parsing timestamp from '{s}' using formats: {formats:?}: {e}" + ), + None => exec_err!( + "Error parsing timestamp from '{s}' using formats: {formats:?}" + ), + } + } + } + + impl DateTimeParser for JiffDateTimeParser { + /// Accepts a string with a `jiff` format and converts it to a + /// nanosecond precision timestamp according to the rules + /// defined by `jiff`. + /// + /// See + /// for the full set of supported formats. + /// + /// ## Timestamp Precision + /// + /// This function uses the maximum precision timestamps supported by + /// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This + /// means the range of dates that timestamps can represent is ~1677 AD + /// to 2262 AM. + /// + /// ## Timezone / Offset Handling + /// + /// Numerical values of timestamps are stored compared to offset UTC. + #[inline] + fn string_to_timestamp_nanos( + &self, + tz: &str, + s: &str, + ) -> Result { + let result = TimeZone::get(tz); + let timezone = match result { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone {tz}: {e}"), + }; + + let timestamp = self.string_to_datetime(&timezone, s)?; + let parsed = timestamp + .as_nanosecond() + .to_i64() + .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?; + + Ok(parsed) + } + + /// Accepts a string with an array of `jiff` formats and converts it to a + /// nanosecond precision timestamp according to the rules + /// defined by `jiff`. + /// + /// See + /// for the full set of supported formats. + /// + /// ## Timestamp Precision + /// + /// This function uses the maximum precision timestamps supported by + /// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This + /// means the range of dates that timestamps can represent is ~1677 AD + /// to 2262 AM. + /// + /// ## Timezone / Offset Handling + /// + /// Numerical values of timestamps are stored compared to offset UTC. 
+ #[inline] + fn string_to_timestamp_nanos_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> Result { + let result = TimeZone::get(tz); + let timezone = match result { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone {tz}: {e}"), + }; + + let zoned = self.string_to_datetime_formatted(&timezone, s, formats)?; + let parsed = + zoned.timestamp().as_nanosecond().to_i64().ok_or_else(|| { + exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}") + })?; + + Ok(parsed) + } + + /// Accepts a string with an array of `jiff` formats and converts it to a + /// millisecond precision timestamp according to the rules + /// defined by `jiff`. + /// + /// See + /// for the full set of supported formats. + /// + /// ## Timezone / Offset Handling + /// + /// Numerical values of timestamps are stored compared to offset UTC. + #[inline] + fn string_to_timestamp_millis_formatted( + &self, + tz: &str, + s: &str, + formats: &[&str], + ) -> Result { + let result = TimeZone::get(tz); + let timezone = match result { + Ok(tz) => tz, + Err(e) => return exec_err!("Invalid timezone {tz}: {e}"), + }; + + let dt = self.string_to_datetime_formatted(&timezone, s, formats)?; + let parsed = dt.timestamp().as_millisecond(); + + Ok(parsed) + } + } +} diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 60c6fdf2df975..5d3f359f1fceb 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -16,11 +16,13 @@ // under the License. use crate::datetime::common::*; +use crate::datetime::parser::DateTimeParser; use arrow::compute::cast_with_options; use arrow::datatypes::DataType; use arrow::datatypes::DataType::*; use arrow::error::ArrowError::ParseError; use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser}; +use datafusion_common::config::ConfigOptions; use datafusion_common::format::DEFAULT_CAST_OPTIONS; use datafusion_common::{Result, arrow_err, exec_err, internal_datafusion_err}; use datafusion_expr::{ @@ -67,21 +69,39 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo #[derive(Debug, PartialEq, Eq, Hash)] pub struct ToDateFunc { signature: Signature, + parser: Box, } impl Default for ToDateFunc { fn default() -> Self { - Self::new() + Self::new_with_config(&ConfigOptions::default()) } } impl ToDateFunc { + #[deprecated(since = "52.0.0", note = "use `new_with_config` instead")] + /// Deprecated constructor retained for backwards compatibility. + /// + /// Prefer [`ToDateFunc::new_with_config`] which allows specifying the + /// date time parser via [`ConfigOptions`]. pub fn new() -> Self { + Self::new_with_config(&ConfigOptions::default()) + } + + pub fn new_with_config(config: &ConfigOptions) -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + parser: crate::datetime::parser::get_date_time_parser(config), } } + /// Set a custom date time parser. + pub fn with_datetime_parser(&mut self, parser: Box) -> &mut Self { + self.parser = parser; + + self + } + fn to_date(&self, args: &[ColumnarValue]) -> Result { match args.len() { 1 => handle::( @@ -98,8 +118,8 @@ impl ToDateFunc { ), 2.. 
=> handle_multiple::( args, - |s, format| { - string_to_timestamp_millis_formatted(s, format) + |s, formats| { + self.parser.string_to_timestamp_millis_formatted("UTC", s, formats) .map(|n| n / (24 * 60 * 60 * 1_000)) .and_then(|v| { v.try_into().map_err(|_| { @@ -223,7 +243,7 @@ mod tests { return_field: Field::new("f", DataType::Date32, true).into(), config_options: Arc::new(ConfigOptions::default()), }; - ToDateFunc::new().invoke_with_args(args) + ToDateFunc::new_with_config(&ConfigOptions::default()).invoke_with_args(args) } #[test] @@ -394,10 +414,13 @@ mod tests { tc.name, tc.formatted_date, tc.format_str ); } - _ => panic!( - "Could not convert '{}' with format string '{}'to Date", - tc.date_str, tc.format_str - ), + e => { + println!("e was {e:?}"); + panic!( + "Could not convert '{}' with format string '{}' to Date", + tc.formatted_date, tc.format_str + ) + } } } @@ -435,7 +458,7 @@ mod tests { ); } _ => panic!( - "Could not convert '{}' with format string '{}'to Date: {:?}", + "Could not convert '{}' with format string '{}' to Date: {:?}", tc.formatted_date, tc.format_str, to_date_result ), } diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index 738c4acee7da4..0a865ffe3039b 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::sync::Arc; use crate::datetime::common::*; -use arrow::array::timezone::Tz; +use crate::datetime::parser::DateTimeParser; use arrow::array::{ Array, Decimal128Array, Float16Array, Float32Array, Float64Array, TimestampNanosecondArray, @@ -90,6 +90,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -140,6 +141,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampSecondsFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -190,6 +192,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampMillisFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -240,6 +243,7 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampMicrosFunc { signature: Signature, timezone: Option>, + parser: Box, } #[user_doc( @@ -289,10 +293,12 @@ only supported at the end of the string preceded by a space. pub struct ToTimestampNanosFunc { signature: Signature, timezone: Option>, + parser: Box, } /// Macro to generate boilerplate constructors and config methods for ToTimestamp* functions. -/// Generates: Default impl, deprecated new(), new_with_config(), and extracts timezone from ConfigOptions. +/// Generates: Default impl, deprecated new(), new_with_config(), with_datetime_parser(), and +/// extracts timezone from ConfigOptions. macro_rules! impl_to_timestamp_constructors { ($func:ty) => { impl Default for $func { @@ -320,8 +326,19 @@ macro_rules! impl_to_timestamp_constructors { .time_zone .as_ref() .map(|tz| Arc::from(tz.as_str())), + parser: super::parser::get_date_time_parser(config), } } + + /// Set a custom date time parser. 
+ pub fn with_datetime_parser( + &mut self, + parser: Box, + ) -> &mut Self { + self.parser = parser; + + self + } } }; } @@ -456,9 +473,12 @@ impl ScalarUDFImpl for ToTimestampFunc { decimal128_to_timestamp_nanos(&arg, tz) } Decimal128(_, _) => decimal128_to_timestamp_nanos(&args[0], tz), - Utf8View | LargeUtf8 | Utf8 => { - to_timestamp_impl::(&args, "to_timestamp", &tz) - } + Utf8View | LargeUtf8 | Utf8 => to_timestamp_impl::( + &args, + "to_timestamp", + &tz, + &self.parser, + ), other => { exec_err!("Unsupported data type {other} for function to_timestamp") } @@ -531,6 +551,7 @@ impl ScalarUDFImpl for ToTimestampSecondsFunc { &args, "to_timestamp_seconds", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -607,6 +628,7 @@ impl ScalarUDFImpl for ToTimestampMillisFunc { &args, "to_timestamp_millis", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -683,6 +705,7 @@ impl ScalarUDFImpl for ToTimestampMicrosFunc { &args, "to_timestamp_micros", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -759,6 +782,7 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { &args, "to_timestamp_nanos", &self.timezone, + &self.parser, ), other => { exec_err!( @@ -774,10 +798,12 @@ impl ScalarUDFImpl for ToTimestampNanosFunc { } } +#[expect(clippy::borrowed_box)] fn to_timestamp_impl>( args: &[ColumnarValue], name: &str, timezone: &Option>, + parser: &Box, ) -> Result { let factor = match T::UNIT { Second => 1_000_000_000, @@ -786,23 +812,18 @@ fn to_timestamp_impl>( Nanosecond => 1, }; - let tz = match timezone.clone() { - Some(tz) => Some(tz.parse::()?), - None => None, - }; + let tz = timezone.clone().unwrap_or_else(|| "UTC".into()); match args.len() { 1 => handle::( args, - move |s| string_to_timestamp_nanos_with_timezone(&tz, s).map(|n| n / factor), + move |s| parser.string_to_timestamp_nanos(&tz, s).map(|n| n / factor), name, &Timestamp(T::UNIT, timezone.clone()), ), n if n >= 2 => handle_multiple::( args, - move |s, format| { - string_to_timestamp_nanos_formatted_with_timezone(&tz, s, format) - }, + move |s, formats| parser.string_to_timestamp_nanos_formatted(&tz, s, formats), |n| n / factor, name, &Timestamp(T::UNIT, timezone.clone()), @@ -815,6 +836,8 @@ fn to_timestamp_impl>( mod tests { use std::sync::Arc; + use super::*; + use crate::datetime::parser::chrono::ChronoDateTimeParser; use arrow::array::types::Int64Type; use arrow::array::{ Array, PrimitiveArray, TimestampMicrosecondArray, TimestampMillisecondArray, @@ -827,47 +850,67 @@ mod tests { use datafusion_common::{DataFusionError, ScalarValue, assert_contains}; use datafusion_expr::{ScalarFunctionArgs, ScalarFunctionImplementation}; - use super::*; - fn to_timestamp(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); - to_timestamp_impl::(args, "to_timestamp", &timezone) + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + + to_timestamp_impl::( + args, + "to_timestamp", + &timezone, + &parser, + ) } /// to_timestamp_millis SQL function fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( args, "to_timestamp_millis", &timezone, + &parser, ) } /// to_timestamp_micros SQL function fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( args, "to_timestamp_micros", &timezone, + &parser, ) } /// to_timestamp_nanos SQL 
function fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( args, "to_timestamp_nanos", &timezone, + &parser, ) } /// to_timestamp_seconds SQL function fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { let timezone: Option> = Some("UTC".into()); - to_timestamp_impl::(args, "to_timestamp_seconds", &timezone) + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + to_timestamp_impl::( + args, + "to_timestamp_seconds", + &timezone, + &parser, + ) } fn udfs_and_timeunit() -> Vec<(Box, TimeUnit)> { @@ -1479,7 +1522,7 @@ mod tests { ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), ]; - let expected_err = "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using format '%H:%M:%S': input contains invalid characters"; + let expected_err = "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using formats: [\"%s\", \"%c\", \"%H:%M:%S\"]: input contains invalid characters"; match to_timestamp(&string_array) { Ok(_) => panic!("Expected error but got success"), Err(e) => { @@ -1527,11 +1570,9 @@ mod tests { } fn parse_timestamp_formatted(s: &str, format: &str) -> Result { - let result = string_to_timestamp_nanos_formatted_with_timezone( - &Some("UTC".parse()?), - s, - format, - ); + let timezone = "UTC"; + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + let result = parser.string_to_timestamp_nanos_formatted(timezone, s, &[format]); if let Err(e) = &result { eprintln!("Error parsing timestamp '{s}' using format '{format}': {e:?}"); } @@ -1557,14 +1598,16 @@ mod tests { ), ]; + let timezone = "UTC"; + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + for (s, f, ctx) in cases { - let expected = format!( - "Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}" - ); - let actual = string_to_datetime_formatted(&Utc, s, f) + let expected = ctx.to_string(); + let actual = parser + .string_to_timestamp_nanos_formatted(timezone, s, &[f]) .unwrap_err() .strip_backtrace(); - assert_eq!(actual, expected) + assert_contains!(actual, expected); } } @@ -1587,14 +1630,16 @@ mod tests { ), ]; + let timezone = "UTC"; + let parser = Box::new(ChronoDateTimeParser::new()) as Box; + for (s, f, ctx) in cases { - let expected = format!( - "Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}" - ); - let actual = string_to_datetime_formatted(&Utc, s, f) + let expected = ctx.to_string(); + let actual = parser + .string_to_timestamp_nanos_formatted(timezone, s, &[f]) .unwrap_err() .strip_backtrace(); - assert_eq!(actual, expected) + assert_contains!(actual, expected); } } diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 182874c60ef3e..928f0d978b4b8 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -46,7 +46,7 @@ bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } clap = { version = "4.5.53", features = ["derive", "env"] } -datafusion = { workspace = true, default-features = true, features = ["avro"] } +datafusion = { workspace = true, default-features = true, features = ["avro", "jiff"] } datafusion-spark = { workspace = true, features = ["core"] } datafusion-substrait = { workspace = true, default-features = true } futures = { workspace = true } diff --git 
a/datafusion/sqllogictest/test_files/datetime/dates.slt b/datafusion/sqllogictest/test_files/datetime/dates.slt index d2a7360b120c6..ad20228767012 100644 --- a/datafusion/sqllogictest/test_files/datetime/dates.slt +++ b/datafusion/sqllogictest/test_files/datetime/dates.slt @@ -295,7 +295,7 @@ query error input contains invalid characters SELECT to_date('2020-09-08 12/00/00+00:00', '%c', '%+') # to_date with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_date('2020-09-08 12/00/00+00:00', '%q') statement ok @@ -371,8 +371,11 @@ SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') 1999-12-31 # verify date data using tables with formatting options where at least one column cannot be parsed -query error Error parsing timestamp from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters +query error SELECT to_date(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t +---- +DataFusion error: Execution error: Error parsing timestamp from '1926632005' using formats: ["%Y-%m-%d %H/%M/%S%#z", "%+", "%d-%m-%Y %H:%M:%S%#z"]: input contains invalid characters + # verify date data using tables with formatting options where one of the formats is invalid query D diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps.slt b/datafusion/sqllogictest/test_files/datetime/timestamps.slt index fa25994ed7edc..84f6ee1d983ef 100644 --- a/datafusion/sqllogictest/test_files/datetime/timestamps.slt +++ b/datafusion/sqllogictest/test_files/datetime/timestamps.slt @@ -2543,23 +2543,23 @@ query error input contains invalid characters SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+') # to_timestamp with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_nanos with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_millis with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q') # to_timestamp_micros with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q') # 
to_timestamp_seconds with broken formatting -query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using format '%q': trailing input +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: trailing input SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q') # Create string timestamp table with different formats @@ -2663,8 +2663,11 @@ SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 # verify timestamp data using tables with formatting options where at least one column cannot be parsed -query error Error parsing timestamp from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters +query error SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t +---- +DataFusion error: Execution error: Error parsing timestamp from '1926632005' using formats: ["%Y-%m-%d %H/%M/%S%#z", "%+", "%d-%m-%Y %H:%M:%S%#z"]: input contains invalid characters + # verify timestamp data using tables with formatting options where one of the formats is invalid query P diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt b/datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt new file mode 100644 index 0000000000000..8871e8d32497f --- /dev/null +++ b/datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt @@ -0,0 +1,290 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +statement ok +SET datafusion.execution.date_time_parser = 'jiff'; + +statement ok +create table ts_data(ts bigint, value int) as values + (1599572549190855123, 1), + (1599568949190855123, 2), + (1599565349190855123, 3); + +statement ok +create table ts_data_nanos as select arrow_cast(ts, 'Timestamp(Nanosecond, None)') as ts, value from ts_data; + +statement ok +create table ts_data_micros as select arrow_cast(ts / 1000, 'Timestamp(Microsecond, None)') as ts, value from ts_data; + +statement ok +create table ts_data_millis as select arrow_cast(ts / 1000000, 'Timestamp(Millisecond, None)') as ts, value from ts_data; + +statement ok +create table ts_data_secs as select arrow_cast(ts / 1000000000, 'Timestamp(Second, None)') as ts, value from ts_data; + +# verify chrono formats don't work + +query error strptime parsing failed +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp('2020-09-08T12:00:00+00:00', '%Y-%m-%dT%H:%M:%S%Z') + +# to_timestamp with formatting +query I +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp('2020-09-08T12:00:00+00:00', '2020-09-08 12/00/00+00:00', '%+', '%Y-%m-%d %H/%M/%s%#z') +---- +2 + +# to_timestamp_nanos with formatting +query I +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# to_timestamp_millis with formatting +query I +SELECT COUNT(*) FROM ts_data_millis where ts > to_timestamp_millis('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# to_timestamp_micros with formatting +query I +SELECT COUNT(*) FROM ts_data_micros where ts > to_timestamp_micros('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# to_timestamp_seconds with formatting +query I +SELECT COUNT(*) FROM ts_data_secs where ts > to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z') +---- +2 + +# verify timestamp data with formatting options +query PPPPPP +SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'), to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1, '%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s') +---- +NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:59:59 1969-12-31T23:59:59 + +# verify timestamp data with formatting options +query PPPPPP +SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'), to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1, '%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s') +---- +NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:59:59 1969-12-31T23:59:59 + +# verify timestamp output types with formatting options +query TTT +SELECT arrow_typeof(to_timestamp(1, '%c', '%s')), arrow_typeof(to_timestamp(null, '%+', '%s')), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000', '%Y-%m-%d %H:%M:%S%.f')) +---- +Timestamp(ns) Timestamp(ns) Timestamp(ns) + +# to_timestamp with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_nanos with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_millis with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but 
found byte ` ` in input +SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_micros with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_seconds with invalid formatting +query error strptime parsing failed: expected to match literal byte `T` from format string, but found byte ` ` in input +SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_nanos with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_millis with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_micros with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_seconds with broken formatting +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]: +SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q') + +# Create string timestamp table with different formats +# including a few very non-standard formats + +statement ok +create table ts_utf8_data(ts varchar(100), format varchar(100)) as values + ('2020-09-08 12/00/00+00:00', '%Y-%m-%d %H/%M/%S%#:z'), + ('2031-01-19T23:33:25+05:00', '%+'), + ('08-09-2020 12:00:00+00:00', '%d-%m-%Y %H:%M:%S%#:z'), + ('1926632005', '%s'), + ('2000-01-01T01:01:01+07:00', '%+'); + +statement ok +create table ts_largeutf8_data as +select arrow_cast(ts, 'LargeUtf8') as ts, arrow_cast(format, 'LargeUtf8') as format from ts_utf8_data; + +statement ok +create table ts_utf8view_data as +select arrow_cast(ts, 'Utf8View') as ts, arrow_cast(format, 'Utf8View') as format from ts_utf8_data; + +# verify timestamp data using tables with formatting options +query P +SELECT to_timestamp(t.ts, t.format) from ts_utf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, t.format), + to_timestamp_seconds(t.ts, t.format), + to_timestamp_millis(t.ts, t.format), + to_timestamp_micros(t.ts, t.format), + to_timestamp_nanos(t.ts, t.format) + from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, t.format), + 
to_timestamp_seconds(t.ts, t.format), + to_timestamp_millis(t.ts, t.format), + to_timestamp_micros(t.ts, t.format), + to_timestamp_nanos(t.ts, t.format) + from ts_utf8view_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +# verify timestamp data using tables with formatting options +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z') + from ts_utf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z') + from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +query PPPPP +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z'), + to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#:z') + from ts_utf8view_data as t +---- +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 +2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25 +2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00 
2020-09-08T12:00:00 +2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25 +1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01 + +# verify timestamp data using tables with formatting options where at least one column cannot be parsed +query error +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#:z') from ts_utf8_data as t +---- +DataFusion error: Execution error: Error parsing timestamp from '2020-09-08 12/00/00+00:00' using formats: ["%Y-%m-%d %H/%M/%S%#z", "%+", "%d-%m-%Y %H:%M:%S%#:z"]: strptime parsing failed: expected to match literal byte `-` from format string, but found byte `2` in input + + +# verify timestamp data using tables with formatting options where one of the formats is invalid +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+') from ts_utf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+') from ts_largeutf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+') from ts_utf8view_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +# timestamp data using tables with formatting options in an array is not supported at this time +query error function unsupported data type at index 1: +SELECT to_timestamp(t.ts, make_array('%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#:z', '%+')) from ts_utf8_data as t + +statement ok +drop table ts_utf8_data + +statement ok +drop table ts_data + +statement ok +drop table ts_data_nanos + +statement ok +drop table ts_data_micros + +statement ok +drop table ts_data_millis + +statement ok +drop table ts_data_secs + +statement ok +SET datafusion.execution.date_time_parser = 'chrono'; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index e31cdbe0aad23..8e6c7d2e633d2 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -217,6 +217,7 @@ datafusion.catalog.newlines_in_values false datafusion.execution.batch_size 8192 datafusion.execution.coalesce_batches true datafusion.execution.collect_statistics true +datafusion.execution.date_time_parser chrono datafusion.execution.enable_ansi_mode false datafusion.execution.enable_recursive_ctes true datafusion.execution.enforce_batch_size_in_joins false @@ -354,6 +355,7 @@ datafusion.catalog.newlines_in_values false Specifies whether newlines in (quote datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. 
The target batch size is determined by the configuration setting datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. +datafusion.execution.date_time_parser chrono The date time parser to use when parsing date time values. Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature is enabled. datafusion.execution.enable_ansi_mode false Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. 
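For reference, here is a minimal sketch of how the `DateTimeParser` abstraction introduced by this patch is exercised from Rust. The `ChronoDateTimeParser::new()` construction, the `string_to_timestamp_nanos_formatted(tz, s, formats)` signature, and the sample input/format pair are taken from the test and bench code above; the i64-nanosecond return type is inferred from the surrounding test code, and swapping in the jiff-backed `JiffDateTimeParser` (whose constructor is not shown in this diff) is an assumption, so treat this as illustrative rather than canonical.

use datafusion_common::Result;
use datafusion_functions::datetime::parser::DateTimeParser;
use datafusion_functions::datetime::parser::chrono::ChronoDateTimeParser;

/// Parse a formatted timestamp string into nanoseconds since the Unix epoch
/// using the parser abstraction added in this patch.
fn parse_ts_nanos(s: &str, formats: &[&str]) -> Result<i64> {
    // Construction mirrors the to_timestamp.rs tests above; when the `jiff`
    // feature is enabled, a JiffDateTimeParser would be boxed here instead
    // (its constructor is assumed to mirror ChronoDateTimeParser::new()).
    let parser: Box<dyn DateTimeParser> = Box::new(ChronoDateTimeParser::new());
    // The timezone string is applied when the input carries no offset of its own.
    parser.string_to_timestamp_nanos_formatted("UTC", s, formats)
}

fn main() -> Result<()> {
    // Input and format taken from the ts_utf8_data fixtures above.
    let nanos = parse_ts_nanos("08-09-2020 12:00:00+00:00", &["%d-%m-%Y %H:%M:%S%#z"])?;
    println!("{nanos}");
    Ok(())
}

Passing the formats as a slice matches the new error message shape ("using formats: [...]"), since all candidate formats are now tried before a single aggregated error is returned.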
diff --git a/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt index d48e41d1204de..d774237468831 100644 --- a/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt +++ b/datafusion/sqllogictest/test_files/to_timestamp_timezone.slt @@ -142,7 +142,7 @@ SELECT to_timestamp('2020-09-08 13:42:29 America/Toronto', '%Y-%m-%d %H:%M:%S %Z ---- 2020-09-08T17:42:29Z -query error Error parsing timestamp from '2020-09-08 13:42:29America/Toronto' using format '%Y-%m-%d %H:%M:%S%Z': '%Z' is only supported at the end of the format string preceded by a space +query error DataFusion error: Execution error: Error parsing timestamp from '2020\-09\-08 13:42:29America/Toronto' using formats: \["%Y\-%m\-%d %H:%M:%S%Z"\]: '%Z' is only supported at the end of the format string preceded by a space SELECT to_timestamp('2020-09-08 13:42:29America/Toronto', '%Y-%m-%d %H:%M:%S%Z'); ## Test 17: Test all precision variants respect timezone diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index aaba453b3541f..744ddee31c614 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -79,6 +79,7 @@ The following configuration settings are available: | datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting | | datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. | | datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system | +| datafusion.execution.date_time_parser | chrono | The date time parser to use when parsing date time values. Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature is enabled. | | datafusion.execution.time_zone | NULL | The default time zone Some functions, e.g. `now` return timestamps in this time zone | | datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | | datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file |
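To close, a hedged sketch of opting into the jiff parser from Rust configuration, mirroring the `ToDateFunc::new_with_config` constructor added in this patch. The `datafusion.execution.date_time_parser` key and its 'chrono'/'jiff' values come from the config diff above; the import path for `ToDateFunc` is assumed from the file layout, and the fallback behavior when 'jiff' is requested without the feature is not shown here, so this is illustrative only.

use datafusion_common::config::ConfigOptions;
use datafusion_functions::datetime::to_date::ToDateFunc;

/// Build a `to_date` implementation that resolves its parser from config.
fn to_date_with_jiff() -> ToDateFunc {
    // SQL-level equivalent, taken from timestamps_jiff.slt above:
    //   SET datafusion.execution.date_time_parser = 'jiff';
    let mut config = ConfigOptions::default();
    config.execution.date_time_parser = Some("jiff".to_string());
    // new_with_config() resolves the parser via get_date_time_parser(); the
    // 'jiff' value only takes effect when datafusion-functions is compiled
    // with `--features jiff` (behavior without the feature is not shown in
    // this patch).
    ToDateFunc::new_with_config(&config)
}

Alternatively, a caller that constructs its own parser can inject it after the fact via the `with_datetime_parser` setter added to `ToDateFunc` and, through the constructor macro, to the `ToTimestamp*` functions.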