Sparse Dictionary Canonicalize #7841
Status: Open
gatesn wants to merge 14 commits into develop from ngates/sparse-dict
Changes from all commits (14 commits):
All commits authored by gatesn:

- baceda2 Sparse Dictionary Canonicalize
- 90bc59c Document sparse dictionary cardinality estimation
- 4137b92 Document sparse cardinality estimators
- 4d65cf1 Share dictionary referenced-values mask
- c3e209d Merge remote-tracking branch 'origin/develop' into ngates/sparse-dict
- 4ab2dc5 Skip sparse dict compaction for referenced dictionaries
- 13ae594 Avoid sparse dict overhead on dense paths
- 6a07393 Merge remote-tracking branch 'origin/develop' into ngates/sparse-dict
- fd59336 Gate sparse dict sampling by shape
- 58bb1d4 Keep sparse dict helpers cold
- cc38823 Merge remote-tracking branch 'origin/develop' into ngates/sparse-dict
- d6e1096 Avoid sparse dict compaction over filters
- 54b8dba Skip sparse dict gate for filter values
- 22be09b Deduplicate cardinality code
New file, +135 lines:

```rust
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Sampling-based cardinality estimation for dictionary codes.
//!
//! This module is used only as a cheap gate before the exact sparse-dictionary remap pass and as a
//! routing hint for downstream exporters. The estimate may be conservative or noisy, but
//! correctness does not depend on it: callers must still collect the exact unique code set and
//! re-check the sparse threshold before compacting.

use vortex_mask::Mask;

use crate::arrays::PrimitiveArray;
use crate::dtype::IntegerPType;

const SAMPLE_SIZE: usize = 128;
const REPEATED_CODE_PROBE_SIZE: usize = 16;

/// Return whether a small deterministic probe observes a repeated non-null code.
///
/// Sparse canonicalization always has a cheap worst-case gate before it samples. This probe is the
/// next, cheaper filter for cases that are not sparse by row count alone: dense dictionaries should
/// not pay the full estimator cost unless the code stream first shows evidence of repeated codes.
/// A `true` result only means "run the estimator"; it is not enough to compact by itself.
pub fn has_repeated_code_sample<I: IntegerPType>(
    codes: &PrimitiveArray,
    validity_mask: &Mask,
) -> bool {
    let sample_count = codes.len().min(REPEATED_CODE_PROBE_SIZE);
    let mut observed_codes = Vec::<usize>::with_capacity(sample_count);

    for sample_idx in 0..sample_count {
        let idx = sample_index(sample_idx, codes.len(), sample_count);
        if !validity_mask.value(idx) {
            continue;
        }

        let code: usize = codes.as_slice::<I>()[idx].as_();
        if observed_codes.contains(&code) {
            return true;
        }
        observed_codes.push(code);
    }

    false
}

/// Estimate the number of distinct non-null dictionary codes.
///
/// The estimator samples deterministic bucket midpoints so repeated executions make the same
/// compaction decision for the same input. Returning `None` means no valid sampled codes were seen.
/// A returned value should only be used to decide whether an exact pass is worth attempting.
pub fn estimate_code_cardinality<I: IntegerPType>(
    codes: &PrimitiveArray,
    validity_mask: &Mask,
) -> Option<usize> {
    let sample_count = codes.len().min(SAMPLE_SIZE);
    let mut observed_codes = Vec::<(usize, usize)>::new();

    // Sample deterministic bucket midpoints instead of using randomness. The estimate only gates
    // whether to run the exact pass; correctness never depends on the sample.
    for sample_idx in 0..sample_count {
        let idx = sample_index(sample_idx, codes.len(), sample_count);
        if !validity_mask.value(idx) {
            continue;
        }

        let code: usize = codes.as_slice::<I>()[idx].as_();
        if let Some((_, count)) = observed_codes
            .iter_mut()
            .find(|(observed, _)| *observed == code)
        {
            *count += 1;
        } else {
            observed_codes.push((code, 1));
        }
    }

    estimate_cardinality_from_observations(&observed_codes)
}

/// Estimate total cardinality from `(code, observed_count)` sample observations.
///
/// The correction is Chao1-style: singleton-heavy samples imply more unseen codes, while repeated
/// observations imply the code stream is likely low-cardinality.
fn estimate_cardinality_from_observations(observed_codes: &[(usize, usize)]) -> Option<usize> {
    if observed_codes.is_empty() {
        return None;
    }

    let unique_count = observed_codes.len();
    let singleton_count = observed_codes
        .iter()
        .filter(|(_, count)| *count == 1)
        .count();
    let doubleton_count = observed_codes
        .iter()
        .filter(|(_, count)| *count == 2)
        .count();

    // Chao1-style lower-bias estimate for unseen codes. Repeated samples keep the estimate small
    // for low-cardinality code streams; many singleton samples make dense streams look expensive.
    let unseen_estimate = if doubleton_count == 0 {
        singleton_count.saturating_mul(singleton_count.saturating_sub(1)) / 2
    } else {
        div_ceil(
            singleton_count.saturating_mul(singleton_count),
            2 * doubleton_count,
        )
    };

    Some(unique_count.saturating_add(unseen_estimate))
}

/// Return the midpoint index for one deterministic sampling bucket.
///
/// Splitting the full code range into buckets avoids clustering all samples near the start while
/// avoiding RNG state in a hot execution path.
fn sample_index(sample_idx: usize, len: usize, sample_count: usize) -> usize {
    debug_assert!(len > 0);
    debug_assert!(sample_count > 0);

    let sample_idx = sample_idx as u128;
    let len = len as u128;
    let sample_count = sample_count as u128;
    let bucket_start = sample_idx * len / sample_count;
    let bucket_end = (sample_idx + 1) * len / sample_count;

    ((bucket_start + bucket_end) / 2).min(len - 1) as usize
}

fn div_ceil(numerator: usize, denominator: usize) -> usize {
    debug_assert!(denominator > 0);
    numerator / denominator + usize::from(!numerator.is_multiple_of(denominator))
}
```
Review comment: can't you return `values` in this code path? Am I missing something very obvious?