Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions encodings/fsst/src/tests.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use rand::SeedableRng;
use rand::rngs::StdRng;
use rand::seq::IndexedRandom;
use vortex_array::ArrayRef;
use vortex_array::IntoArray;
use vortex_array::LEGACY_SESSION;
Expand Down Expand Up @@ -107,3 +110,49 @@ fn test_fsst_array_ops() {

assert_arrays_eq!(fsst_array, canonical_array);
}

/// Regression test for #7833: `fsst_compress` has to handle inputs whose
/// total compressed size is larger than `i32::MAX` bytes.
///
/// Before the fix, `fsst_compress_iter` always built the FSST output with a
/// `VarBinBuilder::<i32>`, no matter how large the input was, and
/// `VarBinBuilder::<i32>::append_value` panicked as soon as the cumulative
/// compressed bytes went past `i32::MAX`.
///
/// The input takes ~2.5 GiB and the FSST output another ~2.5 GiB, which is
/// why the test is `#[ignore]`-d by default. To run it explicitly:
/// `cargo test --release -p vortex-fsst -- --ignored fsst_compress_offsets`.
#[test]
#[ignore = "allocates ~5 GiB; run with --ignored"]
fn fsst_compress_offsets_overflow_i32() {
    // Length in bytes of every generated string.
    const STRING_LEN: usize = 64 * 1024;
    // Total input size: 2 GiB + 512 MiB (~2.5 GiB), i.e. past `i32::MAX`.
    const TOTAL_BYTES: usize = (1usize << 31) + (512 << 20);
    // Number of strings needed to reach `TOTAL_BYTES`.
    const STRING_COUNT: usize = TOTAL_BYTES / STRING_LEN;
    // Size of the random byte pool the strings are sliced from.
    const POOL_LEN: usize = 64 * 1024 * 1024;
    // Start offsets are reduced modulo this value, so every
    // `STRING_LEN`-byte window lies fully inside the pool.
    const OFFSET_MODULUS: usize = POOL_LEN - STRING_LEN;

    // Only printable ASCII, which keeps every string valid UTF-8.
    const ALPHABET: &[u8; 95] =
        b" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";

    // Fill the pool with high-entropy ASCII. FSST compresses through a symbol
    // table, so pseudo-random bytes without recurring sequences barely
    // shrink: the compressed output stays near the input size and therefore
    // crosses the i32 boundary. The fixed seed keeps the data reproducible.
    let mut rng = StdRng::seed_from_u64(0xC0DE_C011_B711);
    let mut pool: Vec<u8> = Vec::with_capacity(POOL_LEN);
    for _ in 0..POOL_LEN {
        pool.push(*ALPHABET.choose(&mut rng).unwrap());
    }

    // Slice `STRING_COUNT` windows out of the pool; the multiplier spreads
    // the start offsets across it. The input builder itself uses i64 offsets
    // because the input alone is already larger than `i32::MAX` bytes.
    let mut input = VarBinBuilder::<i64>::with_capacity(STRING_COUNT);
    for start in (0..STRING_COUNT).map(|i| i.wrapping_mul(31337) % OFFSET_MODULUS) {
        input.append_value(&pool[start..][..STRING_LEN]);
    }
    let array = input.finish(DType::Utf8(Nullability::NonNullable));

    let compressor = fsst_train_compressor(&array);
    // Capture length and dtype up front: `array` is moved into `fsst_compress`.
    let dtype = array.dtype().clone();
    let len = array.len();
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // Pre-fix this call panicked; now it must finish and keep every row.
    let compressed = fsst_compress(array, len, &dtype, &compressor, &mut ctx);
    assert_eq!(compressed.len(), len);
}
Loading