apache · yaooqinn · May 26, 2026
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
@@ -80,6 +80,7 @@ license: |
 - Since Spark 4.0, The Storage-Partitioned Join feature flag `spark.sql.sources.v2.bucketing.pushPartValues.enabled` is set to `true`. To restore the previous behavior, set `spark.sql.sources.v2.bucketing.pushPartValues.enabled` to `false`.
 - Since Spark 4.0, the `sentences` function uses `Locale(language)` instead of `Locale.US` when `language` parameter is not `NULL` and `country` parameter is `NULL`.
 - Since Spark 4.0, reading from a file source table will correctly respect query options, e.g. delimiters. Previously, the first query plan was cached and subsequent option changes ignored. To restore the previous behavior, set `spark.sql.legacy.readFileSourceTableCacheIgnoreOptions` to `true`.
+- Since Spark 4.0, string columns may carry a non-binary collation (`UTF8_LCASE`, `UTF8_LCASE_RTRIM`, or any ICU collation). `DataFrameStatFunctions.bloomFilter` builds the underlying sketch using raw UTF-8 byte equality and is therefore collation-blind: two values that compare equal under the column's collation but differ in their UTF-8 bytes are hashed separately, so `mightContain` may report a value as absent even though a collation-equal value was put into the filter. For collation-consistent membership, build the filter over a column declared with the default `UTF8_BINARY` collation, or normalize the values yourself before building and querying. For ASCII data under `UTF8_LCASE`, `lower(col)` is sufficient; for `UTF8_LCASE_RTRIM`, use `lower(rtrim(col))`. Note that `lower()` does not correctly handle certain non-ASCII Unicode cases (for example Turkish dotted/dotless 'I'/'i' or German 'ß'/'SS'); see [SPARK-57055](https://issues.apache.org/jira/browse/SPARK-57055) for a discussion of adding a public canonical-form helper for ICU collations.
 
 ## Upgrading from Spark SQL 3.5.3 to 3.5.4
 

diff --git a/sql/api/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/api/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -529,6 +529,14 @@ abstract class DataFrameStatFunctions {
    *   expected number of items which will be put into the filter.
    * @param fpp
    *   expected false positive probability of the filter.
+   * @note
+   *   For string columns, this filter is built and queried using raw UTF-8 byte equality. When
+   *   the column has a non-binary collation (e.g. UTF8_LCASE or any ICU collation), two values
+   *   that compare equal under that collation (such as `'Alice'` and `'alice'` under UTF8_LCASE)
+   *   are hashed separately, so `mightContain` may report a collation-equal value as absent. If
+   *   collation-consistent membership is required, build the filter over a column declared with
+   *   the default UTF8_BINARY collation, or normalize the values yourself before building and
+   *   querying the filter (for example, `lower(col)` for ASCII data under UTF8_LCASE).
    * @since 2.0.0
    */
   def bloomFilter(colName: String, expectedNumItems: Long, fpp: Double): BloomFilter = {
@@ -544,6 +552,14 @@ abstract class DataFrameStatFunctions {
    *   expected number of items which will be put into the filter.
    * @param fpp
    *   expected false positive probability of the filter.
+   * @note
+   *   For string columns, this filter is built and queried using raw UTF-8 byte equality. When
+   *   the column has a non-binary collation (e.g. UTF8_LCASE or any ICU collation), two values
+   *   that compare equal under that collation (such as `'Alice'` and `'alice'` under UTF8_LCASE)
+   *   are hashed separately, so `mightContain` may report a collation-equal value as absent. If
+   *   collation-consistent membership is required, build the filter over a column declared with
+   *   the default UTF8_BINARY collation, or normalize the values yourself before building and
+   *   querying the filter (for example, `lower(col)` for ASCII data under UTF8_LCASE).
    * @since 2.0.0
    */
   def bloomFilter(col: Column, expectedNumItems: Long, fpp: Double): BloomFilter = {
@@ -560,6 +576,14 @@ abstract class DataFrameStatFunctions {
    *   expected number of items which will be put into the filter.
    * @param numBits
    *   expected number of bits of the filter.
+   * @note
+   *   For string columns, this filter is built and queried using raw UTF-8 byte equality. When
+   *   the column has a non-binary collation (e.g. UTF8_LCASE or any ICU collation), two values
+   *   that compare equal under that collation (such as `'Alice'` and `'alice'` under UTF8_LCASE)
+   *   are hashed separately, so `mightContain` may report a collation-equal value as absent. If
+   *   collation-consistent membership is required, build the filter over a column declared with
+   *   the default UTF8_BINARY collation, or normalize the values yourself before building and
+   *   querying the filter (for example, `lower(col)` for ASCII data under UTF8_LCASE).
    * @since 2.0.0
    */
   def bloomFilter(colName: String, expectedNumItems: Long, numBits: Long): BloomFilter = {
@@ -575,6 +599,14 @@ abstract class DataFrameStatFunctions {
    *   expected number of items which will be put into the filter.
    * @param numBits
    *   expected number of bits of the filter.
+   * @note
+   *   For string columns, this filter is built and queried using raw UTF-8 byte equality. When
+   *   the column has a non-binary collation (e.g. UTF8_LCASE or any ICU collation), two values
+   *   that compare equal under that collation (such as `'Alice'` and `'alice'` under UTF8_LCASE)
+   *   are hashed separately, so `mightContain` may report a collation-equal value as absent. If
+   *   collation-consistent membership is required, build the filter over a column declared with
+   *   the default UTF8_BINARY collation, or normalize the values yourself before building and
+   *   querying the filter (for example, `lower(col)` for ASCII data under UTF8_LCASE).
    * @since 2.0.0
    */
   def bloomFilter(col: Column, expectedNumItems: Long, numBits: Long): BloomFilter = withOrigin {