From 6faaa6b850a666b74f6cfe3f635ca4a83a031d65 Mon Sep 17 00:00:00 2001 From: Geoffrey Claude Date: Thu, 30 Apr 2026 15:23:08 +0200 Subject: [PATCH 1/3] Refactor generic InList static filter helpers --- .../in_list/array_static_filter.rs | 163 ++++++++++-------- 1 file changed, 90 insertions(+), 73 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs b/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs index 93bfcd49600d0..d1befdc9cba12 100644 --- a/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs +++ b/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs @@ -42,71 +42,6 @@ pub(super) struct ArrayStaticFilter { map: HashMap, } -impl StaticFilter for ArrayStaticFilter { - fn null_count(&self) -> usize { - self.in_array.null_count() - } - - /// Checks if values in `v` are contained in the `in_array` using this hash set for lookup. - fn contains(&self, v: &dyn Array, negated: bool) -> Result { - // Null type comparisons always return null (SQL three-valued logic) - if v.data_type() == &DataType::Null - || self.in_array.data_type() == &DataType::Null - { - let nulls = NullBuffer::new_null(v.len()); - return Ok(BooleanArray::new( - BooleanBuffer::new_unset(v.len()), - Some(nulls), - )); - } - - // Unwrap dictionary-encoded needles when the value type matches - // in_array, evaluating against the dictionary values and mapping - // back via keys. - downcast_dictionary_array! { - v => { - // Only unwrap when the haystack (in_array) type matches - // the dictionary value type - if v.values().data_type() == self.in_array.data_type() { - let values_contains = self.contains(v.values().as_ref(), negated)?; - let result = take(&values_contains, v.keys(), None)?; - return Ok(downcast_array(result.as_ref())); - } - } - _ => {} - } - - let needle_nulls = v.logical_nulls(); - let needle_nulls = needle_nulls.as_ref(); - let haystack_has_nulls = self.in_array.null_count() != 0; - - with_hashes([v], &self.state, |hashes| { - let cmp = make_comparator(v, &self.in_array, SortOptions::default())?; - Ok((0..v.len()) - .map(|i| { - // SQL three-valued logic: null IN (...) is always null - if needle_nulls.is_some_and(|nulls| nulls.is_null(i)) { - return None; - } - - let hash = hashes[i]; - let contains = self - .map - .raw_entry() - .from_hash(hash, |idx| cmp(i, *idx).is_eq()) - .is_some(); - - match contains { - true => Some(!negated), - false if haystack_has_nulls => None, - false => Some(negated), - } - }) - .collect()) - }) - } -} - impl ArrayStaticFilter { /// Computes a [`StaticFilter`] for the provided [`Array`] if there /// are nulls present or there are more than the configured number of @@ -125,10 +60,23 @@ impl ArrayStaticFilter { } let state = RandomState::default(); + let map = Self::build_haystack_map(&in_array, &state)?; + + Ok(Self { + in_array, + state, + map, + }) + } + + fn build_haystack_map( + haystack: &ArrayRef, + state: &RandomState, + ) -> Result> { let mut map: HashMap = HashMap::with_hasher(()); - with_hashes([&in_array], &state, |hashes| -> Result<()> { - let cmp = make_comparator(&in_array, &in_array, SortOptions::default())?; + with_hashes([haystack.as_ref()], state, |hashes| -> Result<()> { + let cmp = make_comparator(haystack, haystack, SortOptions::default())?; let insert_value = |idx| { let hash = hashes[idx]; @@ -140,21 +88,90 @@ impl ArrayStaticFilter { } }; - match in_array.nulls() { + match haystack.nulls() { Some(nulls) => { BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()) .for_each(insert_value) } - None => (0..in_array.len()).for_each(insert_value), + None => (0..haystack.len()).for_each(insert_value), } Ok(()) })?; - Ok(Self { - in_array, - state, - map, + Ok(map) + } + + fn find_needles_in_haystack( + &self, + needles: &dyn Array, + negated: bool, + ) -> Result { + let needle_nulls = needles.logical_nulls(); + let needle_nulls = needle_nulls.as_ref(); + let haystack_has_nulls = self.in_array.null_count() != 0; + + with_hashes([needles], &self.state, |hashes| { + let cmp = make_comparator(needles, &self.in_array, SortOptions::default())?; + Ok((0..needles.len()) + .map(|i| { + // SQL three-valued logic: null IN (...) is always null + if needle_nulls.is_some_and(|nulls| nulls.is_null(i)) { + return None; + } + + let hash = hashes[i]; + let contains = self + .map + .raw_entry() + .from_hash(hash, |idx| cmp(i, *idx).is_eq()) + .is_some(); + + match contains { + true => Some(!negated), + false if haystack_has_nulls => None, + false => Some(negated), + } + }) + .collect()) }) } } + +impl StaticFilter for ArrayStaticFilter { + fn null_count(&self) -> usize { + self.in_array.null_count() + } + + /// Checks if values in `v` are contained in the `in_array` using this hash set for lookup. + fn contains(&self, v: &dyn Array, negated: bool) -> Result { + // Null type comparisons always return null (SQL three-valued logic) + if v.data_type() == &DataType::Null + || self.in_array.data_type() == &DataType::Null + { + let nulls = NullBuffer::new_null(v.len()); + return Ok(BooleanArray::new( + BooleanBuffer::new_unset(v.len()), + Some(nulls), + )); + } + + // Unwrap dictionary-encoded needles when the value type matches + // in_array, evaluating against the dictionary values and mapping + // back via keys. + downcast_dictionary_array! { + v => { + // Only unwrap when the haystack (in_array) type matches + // the dictionary value type + if v.values().data_type() == self.in_array.data_type() { + let values_contains = self.contains(v.values().as_ref(), negated)?; + let result = take(&values_contains, v.keys(), None)?; + return Ok(downcast_array(result.as_ref())); + } + } + _ => {} + } + + self.find_needles_in_haystack(v, negated) + } +} From 4bcbc4b5f35347fc8fc17ffcd0f6bda4c3d0c3ca Mon Sep 17 00:00:00 2001 From: Geoffrey Claude Date: Thu, 30 Apr 2026 15:24:36 +0200 Subject: [PATCH 2/3] Build InList results from bitmaps --- .../physical-expr/src/expressions/in_list.rs | 1 + .../in_list/array_static_filter.rs | 36 +++--- .../src/expressions/in_list/result.rs | 105 ++++++++++++++++++ 3 files changed, 121 insertions(+), 21 deletions(-) create mode 100644 datafusion/physical-expr/src/expressions/in_list/result.rs diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index ede493dcf17d5..ae2ab5d5c4182 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -38,6 +38,7 @@ use datafusion_expr::{ColumnarValue, expr_vec_fmt}; mod array_static_filter; mod primitive_filter; +mod result; mod static_filter; mod strategy; diff --git a/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs b/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs index d1befdc9cba12..f6c24b15b0da7 100644 --- a/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs +++ b/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs @@ -28,6 +28,7 @@ use datafusion_common::Result; use datafusion_common::hash_utils::{RandomState, with_hashes}; use hashbrown::hash_map::RawEntryMut; +use super::result::build_in_list_result; use super::static_filter::StaticFilter; /// Static filter for InList that stores the array and hash set for O(1) lookups @@ -108,32 +109,25 @@ impl ArrayStaticFilter { negated: bool, ) -> Result { let needle_nulls = needles.logical_nulls(); - let needle_nulls = needle_nulls.as_ref(); let haystack_has_nulls = self.in_array.null_count() != 0; - with_hashes([needles], &self.state, |hashes| { + with_hashes([needles], &self.state, |needle_hashes| { let cmp = make_comparator(needles, &self.in_array, SortOptions::default())?; - Ok((0..needles.len()) - .map(|i| { - // SQL three-valued logic: null IN (...) is always null - if needle_nulls.is_some_and(|nulls| nulls.is_null(i)) { - return None; - } - - let hash = hashes[i]; - let contains = self - .map + + Ok(build_in_list_result( + needles.len(), + needle_nulls.as_ref(), + haystack_has_nulls, + negated, + #[inline(always)] + |i| { + let hash = needle_hashes[i]; + self.map .raw_entry() .from_hash(hash, |idx| cmp(i, *idx).is_eq()) - .is_some(); - - match contains { - true => Some(!negated), - false if haystack_has_nulls => None, - false => Some(negated), - } - }) - .collect()) + .is_some() + }, + )) }) } } diff --git a/datafusion/physical-expr/src/expressions/in_list/result.rs b/datafusion/physical-expr/src/expressions/in_list/result.rs new file mode 100644 index 0000000000000..3ebdbfe19f743 --- /dev/null +++ b/datafusion/physical-expr/src/expressions/in_list/result.rs @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Result building helpers for InList operations. +//! +//! This module provides unified logic for building BooleanArray results +//! from IN list membership tests, handling null propagation correctly +//! according to SQL three-valued logic. + +use arrow::array::BooleanArray; +use arrow::buffer::{BooleanBuffer, NullBuffer}; + +// Truth table for (needle_nulls, haystack_has_nulls, negated): +// (Some, true, false) => values: valid & contains, nulls: valid & contains +// (None, true, false) => values: contains, nulls: contains +// (Some, true, true) => values: valid & !contains, nulls: valid & contains +// (None, true, true) => values: !contains, nulls: contains +// (Some, false, false) => values: valid & contains, nulls: valid +// (Some, false, true) => values: valid & !contains, nulls: valid +// (None, false, false) => values: contains, nulls: none +// (None, false, true) => values: !contains, nulls: none + +/// Builds a BooleanArray result for IN list operations. +/// +/// This function handles the null propagation logic for SQL IN lists: +/// - If the needle value is null, the result is null +/// - If the needle is not in the set and the haystack has nulls, the result is null +/// - Otherwise, the result is true/false based on membership and negation +/// +/// This version computes contains for all positions, including nulls, then applies +/// null masking via bitmap operations. +#[inline] +pub(crate) fn build_in_list_result( + len: usize, + needle_nulls: Option<&NullBuffer>, + haystack_has_nulls: bool, + negated: bool, + contains: C, +) -> BooleanArray +where + C: FnMut(usize) -> bool, +{ + let contains_buf = BooleanBuffer::collect_bool(len, contains); + build_result_from_contains(needle_nulls, haystack_has_nulls, negated, contains_buf) +} + +/// Builds a BooleanArray result from a pre-computed contains buffer. +/// +/// This version does not assume contains_buf is pre-masked at null positions. +/// It handles nulls using bitmap operations. +#[inline] +pub(crate) fn build_result_from_contains( + needle_nulls: Option<&NullBuffer>, + haystack_has_nulls: bool, + negated: bool, + contains_buf: BooleanBuffer, +) -> BooleanArray { + match (needle_nulls, haystack_has_nulls, negated) { + // Haystack has nulls: result is null unless value is found. + (Some(v), true, false) => { + // values: valid & contains, nulls: valid & contains + let values = v.inner() & &contains_buf; + BooleanArray::new(values.clone(), Some(NullBuffer::new(values))) + } + (None, true, false) => { + BooleanArray::new(contains_buf.clone(), Some(NullBuffer::new(contains_buf))) + } + (Some(v), true, true) => { + // NOT IN with nulls: false if found, null if not found or needle null. + // values: valid & !contains, nulls: valid & contains + let valid = v.inner(); + let values = valid & &(!&contains_buf); + let nulls = valid & &contains_buf; + BooleanArray::new(values, Some(NullBuffer::new(nulls))) + } + (None, true, true) => { + BooleanArray::new(!&contains_buf, Some(NullBuffer::new(contains_buf))) + } + // Haystack has no nulls: result validity follows needle validity. + (Some(v), false, false) => { + // values: valid & contains, nulls: valid + BooleanArray::new(v.inner() & &contains_buf, Some(v.clone())) + } + (Some(v), false, true) => { + // values: valid & !contains, nulls: valid + BooleanArray::new(v.inner() & &(!&contains_buf), Some(v.clone())) + } + (None, false, false) => BooleanArray::new(contains_buf, None), + (None, false, true) => BooleanArray::new(!&contains_buf, None), + } +} From 5a9378f33f44afca489b02b02818116e4690b246 Mon Sep 17 00:00:00 2001 From: Geoffrey Claude Date: Thu, 30 Apr 2026 15:25:25 +0200 Subject: [PATCH 3/3] Optimize generic InList static filtering --- .../in_list/array_static_filter.rs | 37 +++++++------------ 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs b/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs index f6c24b15b0da7..75e92dbcc59b4 100644 --- a/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs +++ b/datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs @@ -23,10 +23,9 @@ use arrow::buffer::{BooleanBuffer, NullBuffer}; use arrow::compute::{SortOptions, take}; use arrow::datatypes::DataType; use arrow::util::bit_iterator::BitIndexIterator; -use datafusion_common::HashMap; use datafusion_common::Result; use datafusion_common::hash_utils::{RandomState, with_hashes}; -use hashbrown::hash_map::RawEntryMut; +use hashbrown::HashTable; use super::result::build_in_list_result; use super::static_filter::StaticFilter; @@ -36,11 +35,8 @@ use super::static_filter::StaticFilter; pub(super) struct ArrayStaticFilter { in_array: ArrayRef, state: RandomState, - /// Used to provide a lookup from value to in list index - /// - /// Note: usize::hash is not used, instead the raw entry - /// API is used to store entries w.r.t their value - map: HashMap, + /// Stores indices into `in_array` for O(1) lookups. + table: HashTable, } impl ArrayStaticFilter { @@ -56,36 +52,34 @@ impl ArrayStaticFilter { return Ok(ArrayStaticFilter { in_array, state: RandomState::default(), - map: HashMap::with_hasher(()), + table: HashTable::new(), }); } let state = RandomState::default(); - let map = Self::build_haystack_map(&in_array, &state)?; + let table = Self::build_haystack_table(&in_array, &state)?; Ok(Self { in_array, state, - map, + table, }) } - fn build_haystack_map( + fn build_haystack_table( haystack: &ArrayRef, state: &RandomState, - ) -> Result> { - let mut map: HashMap = HashMap::with_hasher(()); + ) -> Result> { + let mut table = HashTable::new(); with_hashes([haystack.as_ref()], state, |hashes| -> Result<()> { let cmp = make_comparator(haystack, haystack, SortOptions::default())?; let insert_value = |idx| { let hash = hashes[idx]; - if let RawEntryMut::Vacant(v) = map - .raw_entry_mut() - .from_hash(hash, |x| cmp(*x, idx).is_eq()) - { - v.insert_with_hasher(hash, idx, (), |x| hashes[*x]); + // Only insert if not already present (deduplication) + if table.find(hash, |&x| cmp(x, idx).is_eq()).is_none() { + table.insert_unique(hash, idx, |&x| hashes[x]); } }; @@ -100,7 +94,7 @@ impl ArrayStaticFilter { Ok(()) })?; - Ok(map) + Ok(table) } fn find_needles_in_haystack( @@ -122,10 +116,7 @@ impl ArrayStaticFilter { #[inline(always)] |i| { let hash = needle_hashes[i]; - self.map - .raw_entry() - .from_hash(hash, |idx| cmp(i, *idx).is_eq()) - .is_some() + self.table.find(hash, |&idx| cmp(i, idx).is_eq()).is_some() }, )) })