fix: substring with negative start index (#4017)
Conversation
| let result = DictionaryArray::try_new(dict.keys().clone(), values)?; | ||
| Ok(Arc::new(result) as ArrayRef) | ||
| } | ||
| _ => Ok(Arc::clone(array)), |
There was a problem hiding this comment.
Should this be an error rather than just returning the input data?
There was a problem hiding this comment.
Thank you, updated
| Ok(Arc::new(builder.finish()) as ArrayRef) | ||
| } | ||
| DataType::Dictionary(_, _) => { | ||
| let dict = as_dictionary_array::<Int32Type>(array); |
There was a problem hiding this comment.
This would panic for a dictionary with Int64Type. Can we add a check for the type?
There was a problem hiding this comment.
Parquet dictionaries use Integer keys, so we do not handle Int64Type here, consistent with other locations. E.g. https://github.com/apache/datafusion-comet/blob/main/native/spark-expr/src/static_invoke/char_varchar_utils/read_side_padding.rs#L68
| fn spark_substr_negative(s: &str, pos: i64, len: u64) -> String { | ||
| let num_chars = s.chars().count() as i64; | ||
| let start = num_chars + pos; | ||
| let end = start.saturating_add(len as i64).min(num_chars); | ||
| let start = start.max(0); | ||
|
|
||
| if start >= end { | ||
| return String::new(); | ||
| } | ||
|
|
||
| s.chars() | ||
| .skip(start as usize) | ||
| .take((end - start) as usize) | ||
| .collect() | ||
| } |
There was a problem hiding this comment.
Claude recommended an optimized version that avoids an intermediate string allocation per row. I have not verified it.
/// Spark-compatible `substring` for a negative start position.
///
/// `pos` is the negative 1-based offset counted back from the end of `s`,
/// and `len` is the maximum number of characters to take. Returns a borrowed
/// sub-slice of `s` (no intermediate `String` allocation); the requested
/// character window is clipped to the bounds of `s`, and `""` is returned
/// when the clipped window is empty.
fn spark_substr_negative(s: &str, pos: i64, len: u64) -> &str {
    let num_chars = s.chars().count() as i64;
    // Resolve the character window [start, end). `pos` is negative, so the
    // requested start is counted back from the end of the string.
    let from = num_chars.saturating_add(pos);
    // Guard the u64 -> i64 cast: a plain `len as i64` wraps negative for
    // len > i64::MAX, which would make end < start and silently yield "".
    let len = i64::try_from(len).unwrap_or(i64::MAX);
    let end = from.saturating_add(len).min(num_chars);
    let start = from.max(0);
    if start >= end {
        return "";
    }
    // Translate char indices [start, end) to byte offsets in a single
    // forward pass over the string.
    let mut it = s.char_indices();
    let byte_start = it
        .by_ref()
        .nth(start as usize)
        .map(|(b, _)| b)
        .unwrap_or(s.len());
    // The iterator now sits just past char `start`; advancing another
    // end - start - 1 positions lands on char `end` (the exclusive bound).
    // `None` means the window runs to the end of the string.
    let byte_end = it
        .nth((end - start - 1) as usize)
        .map(|(b, _)| b)
        .unwrap_or(s.len());
    &s[byte_start..byte_end]
}
} | SELECT substring('こんにちは世界', -2) | ||
|
|
||
| query | ||
| SELECT substring('🎉🎊🎈🎁', 2, 2) |
comphead
left a comment
There was a problem hiding this comment.
Thanks @kazuyukitanimura. I think we would also need to donate this to DataFusion later.
andygrove
left a comment
There was a problem hiding this comment.
LGTM. Thanks @kazuyukitanimura.
nit: there is some overlap between the Scala and SQL tests, and the SQL tests may have been sufficient alone, but we can review later
|
Turns out DF has spark impl for |
Which issue does this PR close?
Closes #3919
Closes #3337
Rationale for this change
What changes are included in this PR?
How are these changes tested?