Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions lib/dictBuilder/cover.c
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,16 @@ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
(unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
return ERROR(srcSize_wrong);
}
/* The partial suffix array has size
* `trainingSamplesSize - MAX(d, sizeof(U64)) + 1`, so the training set must
* hold at least MAX(d, sizeof(U64)) bytes. The check above only covers the
* total corpus size: when splitPoint < 1.0 the training set is a subset of
* the corpus and can still fall below that minimum. Reject it here so the
* size_t subtraction cannot underflow into a huge allocation. */
if (trainingSamplesSize < MAX(d, sizeof(U64))) {
DISPLAYLEVEL(1, "Training samples size is too small (%u), minimum size is %u\n",
(unsigned)trainingSamplesSize, (unsigned)MAX(d, sizeof(U64)));
return ERROR(srcSize_wrong);
}
/* Check if there are at least 5 training samples */
if (nbTrainSamples < 5) {
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
Expand Down
19 changes: 19 additions & 0 deletions tests/fuzzer.c
Original file line number Diff line number Diff line change
Expand Up @@ -3812,6 +3812,25 @@ static int basicUnitTests(U32 const seed, double compressibility)
if (dictID==0) goto _output_error;
DISPLAYLEVEL(3, "OK : %u \n", (unsigned)dictID);

DISPLAYLEVEL(3, "test%3i : ZDICT_optimizeTrainFromBuffer_cover with tiny training set : ", testNb++);
{ /* With splitPoint < 1.0 the training set is a subset of the corpus.
* If that subset is smaller than MAX(d, sizeof(U64)) the partial
* suffix array size used to underflow size_t (issue #4682). The
* builder must reject it cleanly instead. The first six samples are
* 1 byte each so the 0.75 training split stays below the threshold,
* while the total corpus (206 bytes) is comfortably large. */
size_t tinySizes[8] = { 1, 1, 1, 1, 1, 1, 100, 100 };
ZDICT_cover_params_t tinyParams;
size_t tinyResult;
memset(&tinyParams, 0, sizeof(tinyParams));
tinyParams.steps = 4;
tinyParams.splitPoint = 0.75;
tinyResult = ZDICT_optimizeTrainFromBuffer_cover(
dictBuffer, optDictSize, CNBuffer, tinySizes, 8, &tinyParams);
if (!ZDICT_isError(tinyResult)) goto _output_error;
}
DISPLAYLEVEL(3, "OK \n");

ZSTD_freeCCtx(cctx);
free(dictBuffer);
free(samplesSizes);
Expand Down