From 392005163daccd793203feff4d017135388163d6 Mon Sep 17 00:00:00 2001 From: Mryange Date: Mon, 22 Jun 2026 17:55:30 +0800 Subject: [PATCH 1/2] upd --- be/src/common/config.cpp | 2 ++ be/src/common/config.h | 2 ++ be/src/core/column/column_string.cpp | 7 ++++-- be/src/core/column/column_string.h | 8 +++++-- .../data_type_serde/data_type_array_serde.cpp | 9 ++++---- .../data_type_serde/data_type_array_serde.h | 5 ++-- .../data_type_serde/data_type_bitmap_serde.h | 5 ++-- .../data_type_date_or_datetime_serde.cpp | 15 ++++++------ .../data_type_date_or_datetime_serde.h | 10 ++++---- .../data_type_datetimev2_serde.cpp | 8 +++---- .../data_type_datetimev2_serde.h | 5 ++-- .../data_type_datev2_serde.cpp | 7 +++--- .../data_type_serde/data_type_datev2_serde.h | 5 ++-- .../data_type_decimal_serde.cpp | 8 +++---- .../data_type_serde/data_type_decimal_serde.h | 5 ++-- .../data_type_serde/data_type_hll_serde.h | 5 ++-- .../data_type_serde/data_type_ipv4_serde.cpp | 7 +++--- .../data_type_serde/data_type_ipv4_serde.h | 5 ++-- .../data_type_serde/data_type_ipv6_serde.cpp | 7 +++--- .../data_type_serde/data_type_ipv6_serde.h | 5 ++-- .../data_type_serde/data_type_jsonb_serde.cpp | 7 +++--- .../data_type_serde/data_type_jsonb_serde.h | 5 ++-- .../data_type_serde/data_type_map_serde.cpp | 11 +++++---- .../data_type_serde/data_type_map_serde.h | 5 ++-- .../data_type_serde/data_type_nothing_serde.h | 5 ++-- .../data_type_nullable_serde.cpp | 12 +++++----- .../data_type_nullable_serde.h | 5 ++-- .../data_type_number_serde.cpp | 8 +++---- .../data_type_serde/data_type_number_serde.h | 5 ++-- .../data_type_quantilestate_serde.h | 5 ++-- .../core/data_type_serde/data_type_serde.cpp | 17 ++++++++++++++ be/src/core/data_type_serde/data_type_serde.h | 9 +++++--- .../data_type_string_serde.cpp | 2 +- .../data_type_serde/data_type_string_serde.h | 5 ++-- .../data_type_struct_serde.cpp | 9 ++++---- .../data_type_serde/data_type_struct_serde.h | 5 ++-- .../data_type_varbinary_serde.h | 5 ++-- .../data_type_serde/data_type_variant_serde.h | 5 ++-- be/src/exprs/vexpr.cpp | 3 +++ be/src/format/arrow/arrow_block_convertor.cpp | 10 ++++++++ be/test/core/column/column_string_test.cpp | 9 ++++++-- .../data_type_serde_arrow_test.cpp | 23 +++++++++++++++++++ 42 files changed, 200 insertions(+), 100 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 86b7d12e0aa9df..f5cdfec22471b0 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1456,6 +1456,8 @@ DEFINE_Bool(enable_snapshot_action, "false"); DEFINE_mInt32(variant_max_merged_tablet_schema_size, "2048"); DEFINE_mBool(enable_column_type_check, "true"); +DEFINE_mBool(enable_column_sanity_check, "false"); +DEFINE_mBool(enable_arrow_validate_full, "false"); // 128 MB DEFINE_mInt64(local_exchange_buffer_mem_limit, "134217728"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 0b415ed5d2c4ae..ef0f006c43541d 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1553,6 +1553,8 @@ DECLARE_mInt32(variant_max_merged_tablet_schema_size); DECLARE_mInt64(local_exchange_buffer_mem_limit); DECLARE_mBool(enable_column_type_check); +DECLARE_mBool(enable_column_sanity_check); +DECLARE_mBool(enable_arrow_validate_full); // Tolerance for the number of partition id 0 in rowset, default 0 DECLARE_Int32(ignore_invalid_partition_id_rowset_num); diff --git a/be/src/core/column/column_string.cpp b/be/src/core/column/column_string.cpp index 399c0952b892da..c9208db7080888 100644 --- a/be/src/core/column/column_string.cpp +++ b/be/src/core/column/column_string.cpp @@ -45,7 +45,11 @@ static constexpr auto CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT = template void ColumnStr::sanity_check() const { -#ifndef NDEBUG +#ifdef NDEBUG + if (!config::enable_column_sanity_check) { + return; + } +#endif sanity_check_simple(); auto count = cast_set(offsets.size()); for (int64_t i = 0; i < count; ++i) { @@ -54,7 +58,6 @@ void ColumnStr::sanity_check() const { count, i, offsets[i], i - 1, offsets[i - 1])); } } -#endif } template diff --git a/be/src/core/column/column_string.h b/be/src/core/column/column_string.h index 7bae712f849721..498e49e78074c3 100644 --- a/be/src/core/column/column_string.h +++ b/be/src/core/column/column_string.h @@ -31,6 +31,7 @@ #include "common/cast_set.h" #include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" #include "common/exception.h" #include "common/status.h" #include "core/assert_cast.h" @@ -118,7 +119,11 @@ class ColumnStr final : public COWHelper> { void sanity_check() const override; void sanity_check_simple() const { -#ifndef NDEBUG +#ifdef NDEBUG + if (!config::enable_column_sanity_check) { + return; + } +#endif auto count = cast_set(offsets.size()); if (chars.size() != offsets[count - 1]) { throw Exception(Status::InternalError("row count: {}, chars.size(): {}, offset[{}]: {}", @@ -128,7 +133,6 @@ class ColumnStr final : public COWHelper> { if (offsets[-1] != 0) { throw Exception(Status::InternalError("wrong offsets[-1]: {}", offsets[-1])); } -#endif } std::string get_name() const override { return "String"; } diff --git a/be/src/core/data_type_serde/data_type_array_serde.cpp b/be/src/core/data_type_serde/data_type_array_serde.cpp index 42cd2e9df6bd41..c3090570ebc77a 100644 --- a/be/src/core/data_type_serde/data_type_array_serde.cpp +++ b/be/src/core/data_type_serde/data_type_array_serde.cpp @@ -317,9 +317,10 @@ Status DataTypeArraySerDe::write_column_to_arrow(const IColumn& column, const Nu return Status::OK(); } -Status DataTypeArraySerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeArraySerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto& column_array = static_cast(column); auto& offsets_data = column_array.get_offsets(); const auto* concrete_array = dynamic_cast(arrow_array); @@ -339,7 +340,7 @@ Status DataTypeArraySerDe::read_column_from_arrow(IColumn& column, const arrow:: // convert to doris offset, start from offsets.back() offsets_data.emplace_back(prev_size + current_offset - arrow_nested_start_offset); } - return nested_serde->read_column_from_arrow( + return nested_serde->read_column_from_arrow_impl( column_array.get_data(), concrete_array->values().get(), arrow_nested_start_offset, arrow_nested_end_offset, ctz); } diff --git a/be/src/core/data_type_serde/data_type_array_serde.h b/be/src/core/data_type_serde/data_type_array_serde.h index ceb97b7d57f4a0..28d1888289bac3 100644 --- a/be/src/core/data_type_serde/data_type_array_serde.h +++ b/be/src/core/data_type_serde/data_type_array_serde.h @@ -92,8 +92,9 @@ class DataTypeArraySerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_bitmap_serde.h b/be/src/core/data_type_serde/data_type_bitmap_serde.h index 023813b8f7f5da..d2db42b8b7de55 100644 --- a/be/src/core/data_type_serde/data_type_bitmap_serde.h +++ b/be/src/core/data_type_serde/data_type_bitmap_serde.h @@ -65,8 +65,9 @@ class DataTypeBitMapSerDe : public DataTypeSerDe { arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override { + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override { return Status::Error(ErrorCode::NOT_IMPLEMENTED_ERROR, "read_column_from_arrow with type " + column.get_name()); } diff --git a/be/src/core/data_type_serde/data_type_date_or_datetime_serde.cpp b/be/src/core/data_type_serde/data_type_date_or_datetime_serde.cpp index b4564e98c41b6a..b330e11158756d 100644 --- a/be/src/core/data_type_serde/data_type_date_or_datetime_serde.cpp +++ b/be/src/core/data_type_serde/data_type_date_or_datetime_serde.cpp @@ -137,10 +137,10 @@ Status DataTypeDateTimeSerDe::deserialize_one_cell_from_json(IColumn& column, Sl return Status::OK(); } -Status DataTypeDateTimeSerDe::read_column_from_arrow(IColumn& column, - const arrow::Array* arrow_array, int64_t start, - int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeDateTimeSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { return _read_column_from_arrow(column, arrow_array, start, end, ctz); } @@ -250,9 +250,10 @@ Status DataTypeDateSerDe::_read_column_from_arrow(IColumn& column, } template -Status DataTypeDateSerDe::read_column_from_arrow(IColumn& column, - const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const { +Status DataTypeDateSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { return _read_column_from_arrow(column, arrow_array, start, end, ctz); } diff --git a/be/src/core/data_type_serde/data_type_date_or_datetime_serde.h b/be/src/core/data_type_serde/data_type_date_or_datetime_serde.h index 693bfca738549d..2e2e9c00b1c45a 100644 --- a/be/src/core/data_type_serde/data_type_date_or_datetime_serde.h +++ b/be/src/core/data_type_serde/data_type_date_or_datetime_serde.h @@ -102,8 +102,9 @@ class DataTypeDateSerDe : public DataTypeNumberSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; @@ -142,7 +143,8 @@ class DataTypeDateTimeSerDe : public DataTypeDateSerDe& slices, uint64_t* num_deserialized, const FormatOptions& options) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; }; } // namespace doris diff --git a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp index 92a5106b4815a8..9c324840eb9583 100644 --- a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp +++ b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp @@ -393,10 +393,10 @@ Status DataTypeDateTimeV2SerDe::write_column_to_arrow(const IColumn& column, return Status::OK(); } -Status DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column, - const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeDateTimeV2SerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto& col_data = static_cast(column).get_data(); int64_t divisor = 1; if (arrow_array->type()->id() == arrow::Type::TIMESTAMP) { diff --git a/be/src/core/data_type_serde/data_type_datetimev2_serde.h b/be/src/core/data_type_serde/data_type_datetimev2_serde.h index 0389432a621730..749ae467596867 100644 --- a/be/src/core/data_type_serde/data_type_datetimev2_serde.h +++ b/be/src/core/data_type_serde/data_type_datetimev2_serde.h @@ -86,8 +86,9 @@ class DataTypeDateTimeV2SerDe : public DataTypeNumberSerDe(column).get_data(); const auto* concrete_array = dynamic_cast(arrow_array); const auto* base_ptr = reinterpret_cast(concrete_array->raw_values()); diff --git a/be/src/core/data_type_serde/data_type_datev2_serde.h b/be/src/core/data_type_serde/data_type_datev2_serde.h index 0375f9be4b4b23..3a45e8e1f8228b 100644 --- a/be/src/core/data_type_serde/data_type_datev2_serde.h +++ b/be/src/core/data_type_serde/data_type_datev2_serde.h @@ -84,8 +84,9 @@ class DataTypeDateV2SerDe : public DataTypeNumberSerDe::write_column_to_arrow(const IColumn& column, } template -Status DataTypeDecimalSerDe::read_column_from_arrow(IColumn& column, - const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeDecimalSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto& column_data = static_cast&>(column).get_data(); // Decimal for decimalv2 // Decimal for deicmalv3 diff --git a/be/src/core/data_type_serde/data_type_decimal_serde.h b/be/src/core/data_type_serde/data_type_decimal_serde.h index 0185672e024718..8cba545397fa42 100644 --- a/be/src/core/data_type_serde/data_type_decimal_serde.h +++ b/be/src/core/data_type_serde/data_type_decimal_serde.h @@ -105,8 +105,9 @@ class DataTypeDecimalSerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_hll_serde.h b/be/src/core/data_type_serde/data_type_hll_serde.h index 5df89f7f4d76cb..25d0a470229cb2 100644 --- a/be/src/core/data_type_serde/data_type_hll_serde.h +++ b/be/src/core/data_type_serde/data_type_hll_serde.h @@ -59,8 +59,9 @@ class DataTypeHLLSerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override { + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override { return Status::Error(ErrorCode::NOT_IMPLEMENTED_ERROR, "read_column_from_arrow with type " + column.get_name()); } diff --git a/be/src/core/data_type_serde/data_type_ipv4_serde.cpp b/be/src/core/data_type_serde/data_type_ipv4_serde.cpp index 0255504b08a1f2..32279a98be3d54 100644 --- a/be/src/core/data_type_serde/data_type_ipv4_serde.cpp +++ b/be/src/core/data_type_serde/data_type_ipv4_serde.cpp @@ -110,9 +110,10 @@ Status DataTypeIPv4SerDe::write_column_to_arrow(const IColumn& column, const Nul return Status::OK(); } -Status DataTypeIPv4SerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeIPv4SerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto& col_data = assert_cast(column).get_data(); int64_t row_count = end - start; /// buffers[0] is a null bitmap and buffers[1] are actual values diff --git a/be/src/core/data_type_serde/data_type_ipv4_serde.h b/be/src/core/data_type_serde/data_type_ipv4_serde.h index e23695dbca504a..f01dcfa085c108 100644 --- a/be/src/core/data_type_serde/data_type_ipv4_serde.h +++ b/be/src/core/data_type_serde/data_type_ipv4_serde.h @@ -53,8 +53,9 @@ class DataTypeIPv4SerDe : public DataTypeNumberSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status from_string_batch(const ColumnString& str, ColumnNullable& column, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_ipv6_serde.cpp b/be/src/core/data_type_serde/data_type_ipv6_serde.cpp index 54624521b04302..5646adf96fb8f8 100644 --- a/be/src/core/data_type_serde/data_type_ipv6_serde.cpp +++ b/be/src/core/data_type_serde/data_type_ipv6_serde.cpp @@ -140,9 +140,10 @@ Status DataTypeIPv6SerDe::write_column_to_arrow(const IColumn& column, const Nul return Status::OK(); } -Status DataTypeIPv6SerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeIPv6SerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto& col_data = assert_cast(column).get_data(); const auto* concrete_array = assert_cast(arrow_array); std::shared_ptr buffer = concrete_array->value_data(); diff --git a/be/src/core/data_type_serde/data_type_ipv6_serde.h b/be/src/core/data_type_serde/data_type_ipv6_serde.h index 226e65663f00e5..1293eaecede645 100644 --- a/be/src/core/data_type_serde/data_type_ipv6_serde.h +++ b/be/src/core/data_type_serde/data_type_ipv6_serde.h @@ -59,8 +59,9 @@ class DataTypeIPv6SerDe : public DataTypeNumberSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; void read_one_cell_from_jsonb(IColumn& column, const JsonbValue* arg) const override; void write_one_cell_to_jsonb(const IColumn& column, JsonbWriterT& result, Arena& mem_pool, int unique_id, int64_t row_num, diff --git a/be/src/core/data_type_serde/data_type_jsonb_serde.cpp b/be/src/core/data_type_serde/data_type_jsonb_serde.cpp index bd8dfdfd31211b..28eff7ba3c8e3c 100644 --- a/be/src/core/data_type_serde/data_type_jsonb_serde.cpp +++ b/be/src/core/data_type_serde/data_type_jsonb_serde.cpp @@ -114,9 +114,10 @@ Status DataTypeJsonbSerDe::write_column_to_arrow(const IColumn& column, const Nu return Status::OK(); } -Status DataTypeJsonbSerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeJsonbSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { if (arrow_array->type_id() == arrow::Type::STRING || arrow_array->type_id() == arrow::Type::BINARY) { const auto* concrete_array = dynamic_cast(arrow_array); diff --git a/be/src/core/data_type_serde/data_type_jsonb_serde.h b/be/src/core/data_type_serde/data_type_jsonb_serde.h index 8d14c6d254a868..70ab0a99151127 100644 --- a/be/src/core/data_type_serde/data_type_jsonb_serde.h +++ b/be/src/core/data_type_serde/data_type_jsonb_serde.h @@ -46,8 +46,9 @@ class DataTypeJsonbSerDe : public DataTypeStringSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status serialize_one_cell_to_json(const IColumn& column, int64_t row_num, BufferWritable& bw, FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_map_serde.cpp b/be/src/core/data_type_serde/data_type_map_serde.cpp index 89a2b2955dfa08..6fa3f4be90d2bb 100644 --- a/be/src/core/data_type_serde/data_type_map_serde.cpp +++ b/be/src/core/data_type_serde/data_type_map_serde.cpp @@ -383,9 +383,10 @@ Status DataTypeMapSerDe::write_column_to_arrow(const IColumn& column, const Null return Status::OK(); } -Status DataTypeMapSerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeMapSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, int64_t start, + int64_t end, + const cctz::time_zone& ctz) const { auto& column_map = static_cast(column); auto& offsets_data = column_map.get_offsets(); const auto* concrete_map = dynamic_cast(arrow_array); @@ -405,10 +406,10 @@ Status DataTypeMapSerDe::read_column_from_arrow(IColumn& column, const arrow::Ar // convert to doris offset, start from offsets.back() offsets_data.emplace_back(prev_size + current_offset - arrow_nested_start_offset); } - RETURN_IF_ERROR(key_serde->read_column_from_arrow( + RETURN_IF_ERROR(key_serde->read_column_from_arrow_impl( column_map.get_keys(), concrete_map->keys().get(), arrow_nested_start_offset, arrow_nested_end_offset, ctz)); - RETURN_IF_ERROR(value_serde->read_column_from_arrow( + RETURN_IF_ERROR(value_serde->read_column_from_arrow_impl( column_map.get_values(), concrete_map->items().get(), arrow_nested_start_offset, arrow_nested_end_offset, ctz)); return Status::OK(); diff --git a/be/src/core/data_type_serde/data_type_map_serde.h b/be/src/core/data_type_serde/data_type_map_serde.h index 72ac28721b926a..d383c21bca3495 100644 --- a/be/src/core/data_type_serde/data_type_map_serde.h +++ b/be/src/core/data_type_serde/data_type_map_serde.h @@ -82,8 +82,9 @@ class DataTypeMapSerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_nothing_serde.h b/be/src/core/data_type_serde/data_type_nothing_serde.h index 1e164e85a9100f..678c079d4ae7eb 100644 --- a/be/src/core/data_type_serde/data_type_nothing_serde.h +++ b/be/src/core/data_type_serde/data_type_nothing_serde.h @@ -85,8 +85,9 @@ class DataTypeNothingSerde : public DataTypeSerDe { return Status::Error(ErrorCode::NOT_IMPLEMENTED_ERROR, "write_column_to_arrow with type " + column.get_name()); } - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override { + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override { return Status::Error(ErrorCode::NOT_IMPLEMENTED_ERROR, "read_column_from_arrow with type " + column.get_name()); } diff --git a/be/src/core/data_type_serde/data_type_nullable_serde.cpp b/be/src/core/data_type_serde/data_type_nullable_serde.cpp index a93f8d6126c7d5..4a78f3d95ba530 100644 --- a/be/src/core/data_type_serde/data_type_nullable_serde.cpp +++ b/be/src/core/data_type_serde/data_type_nullable_serde.cpp @@ -336,18 +336,18 @@ Status DataTypeNullableSerDe::write_column_to_arrow(const IColumn& column, const start, end, ctz); } -Status DataTypeNullableSerDe::read_column_from_arrow(IColumn& column, - const arrow::Array* arrow_array, int64_t start, - int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeNullableSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto& col = reinterpret_cast(column); NullMap& map_data = col.get_null_map_data(); for (auto i = start; i < end; ++i) { auto is_null = arrow_array->IsNull(i); map_data.emplace_back(is_null); } - return nested_serde->read_column_from_arrow(col.get_nested_column(), arrow_array, start, end, - ctz); + return nested_serde->read_column_from_arrow_impl(col.get_nested_column(), arrow_array, start, + end, ctz); } bool DataTypeNullableSerDe::write_column_to_mysql_text(const IColumn& column, BufferWritable& bw, diff --git a/be/src/core/data_type_serde/data_type_nullable_serde.h b/be/src/core/data_type_serde/data_type_nullable_serde.h index 6e069444483b87..26f03528209d2c 100644 --- a/be/src/core/data_type_serde/data_type_nullable_serde.h +++ b/be/src/core/data_type_serde/data_type_nullable_serde.h @@ -84,8 +84,9 @@ class DataTypeNullableSerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_number_serde.cpp b/be/src/core/data_type_serde/data_type_number_serde.cpp index 5f518aa6bbdc5d..dfb1079ff2ba8c 100644 --- a/be/src/core/data_type_serde/data_type_number_serde.cpp +++ b/be/src/core/data_type_serde/data_type_number_serde.cpp @@ -230,10 +230,10 @@ Status DataTypeNumberSerDe::deserialize_column_from_json_vector( } template -Status DataTypeNumberSerDe::read_column_from_arrow(IColumn& column, - const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeNumberSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto row_count = end - start; auto& col_data = static_cast(column).get_data(); diff --git a/be/src/core/data_type_serde/data_type_number_serde.h b/be/src/core/data_type_serde/data_type_number_serde.h index b57f9f9d21298d..1aa46e959ae097 100644 --- a/be/src/core/data_type_serde/data_type_number_serde.h +++ b/be/src/core/data_type_serde/data_type_number_serde.h @@ -114,8 +114,9 @@ class DataTypeNumberSerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_quantilestate_serde.h b/be/src/core/data_type_serde/data_type_quantilestate_serde.h index 606a1c2116eff8..2b57356c1756fb 100644 --- a/be/src/core/data_type_serde/data_type_quantilestate_serde.h +++ b/be/src/core/data_type_serde/data_type_quantilestate_serde.h @@ -120,8 +120,9 @@ class DataTypeQuantileStateSerDe : public DataTypeSerDe { } return Status::OK(); } - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override { + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override { return Status::Error(ErrorCode::NOT_IMPLEMENTED_ERROR, "read_column_from_arrow with type " + column.get_name()); } diff --git a/be/src/core/data_type_serde/data_type_serde.cpp b/be/src/core/data_type_serde/data_type_serde.cpp index ac688ae6c307a3..0ce592c58fbf15 100644 --- a/be/src/core/data_type_serde/data_type_serde.cpp +++ b/be/src/core/data_type_serde/data_type_serde.cpp @@ -16,7 +16,10 @@ // under the License. #include "core/data_type_serde/data_type_serde.h" +#include + #include "common/cast_set.h" +#include "common/config.h" #include "common/exception.h" #include "common/status.h" #include "core/column/column.h" @@ -34,6 +37,20 @@ namespace doris { DataTypeSerDe::~DataTypeSerDe() = default; +Status DataTypeSerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { + if (config::enable_arrow_validate_full) { + auto status = arrow_array->ValidateFull(); + if (!status.ok()) { + return Status::InternalError( + "arrow validate full failed with arrow: {} with column : {} with error msg: {}", + arrow_array->type()->name(), column.get_name(), status.ToString()); + } + } + return read_column_from_arrow_impl(column, arrow_array, start, end, ctz); +} + DataTypeSerDeSPtrs create_data_type_serdes(const DataTypes& types) { DataTypeSerDeSPtrs serdes; serdes.reserve(types.size()); diff --git a/be/src/core/data_type_serde/data_type_serde.h b/be/src/core/data_type_serde/data_type_serde.h index 7c007c6558ddf3..aff70fca04ea76 100644 --- a/be/src/core/data_type_serde/data_type_serde.h +++ b/be/src/core/data_type_serde/data_type_serde.h @@ -481,9 +481,12 @@ class DataTypeSerDe { virtual Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const = 0; - virtual Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const = 0; + Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, + int64_t end, const cctz::time_zone& ctz) const; + + virtual Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const = 0; // ORC serializer virtual Status write_column_to_orc(const std::string& timezone, const IColumn& column, diff --git a/be/src/core/data_type_serde/data_type_string_serde.cpp b/be/src/core/data_type_serde/data_type_string_serde.cpp index 08d87cc38cac92..d2474cd3c25edc 100644 --- a/be/src/core/data_type_serde/data_type_string_serde.cpp +++ b/be/src/core/data_type_serde/data_type_string_serde.cpp @@ -254,7 +254,7 @@ Status DataTypeStringSerDeBase::write_column_to_arrow( } template -Status DataTypeStringSerDeBase::read_column_from_arrow( +Status DataTypeStringSerDeBase::read_column_from_arrow_impl( IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const cctz::time_zone& ctz) const { if (arrow_array->type_id() == arrow::Type::STRING || diff --git a/be/src/core/data_type_serde/data_type_string_serde.h b/be/src/core/data_type_serde/data_type_string_serde.h index 79c8450835d39c..7db325d2440f56 100644 --- a/be/src/core/data_type_serde/data_type_string_serde.h +++ b/be/src/core/data_type_serde/data_type_string_serde.h @@ -200,8 +200,9 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_struct_serde.cpp b/be/src/core/data_type_serde/data_type_struct_serde.cpp index 7514089ea0d3c8..6ef33343ec715b 100644 --- a/be/src/core/data_type_serde/data_type_struct_serde.cpp +++ b/be/src/core/data_type_serde/data_type_struct_serde.cpp @@ -416,14 +416,15 @@ Status DataTypeStructSerDe::write_column_to_arrow(const IColumn& column, const N return Status::OK(); } -Status DataTypeStructSerDe::read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, - int64_t start, int64_t end, - const cctz::time_zone& ctz) const { +Status DataTypeStructSerDe::read_column_from_arrow_impl(IColumn& column, + const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const { auto& struct_column = static_cast(column); const auto* concrete_struct = dynamic_cast(arrow_array); DCHECK_EQ(struct_column.tuple_size(), concrete_struct->num_fields()); for (auto i = 0; i < struct_column.tuple_size(); ++i) { - RETURN_IF_ERROR(elem_serdes_ptrs[i]->read_column_from_arrow( + RETURN_IF_ERROR(elem_serdes_ptrs[i]->read_column_from_arrow_impl( struct_column.get_column(i), concrete_struct->field(i).get(), start, end, ctz)); } return Status::OK(); diff --git a/be/src/core/data_type_serde/data_type_struct_serde.h b/be/src/core/data_type_serde/data_type_struct_serde.h index f6996265699ebe..b67e9437f3cef8 100644 --- a/be/src/core/data_type_serde/data_type_struct_serde.h +++ b/be/src/core/data_type_serde/data_type_struct_serde.h @@ -83,8 +83,9 @@ class DataTypeStructSerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_varbinary_serde.h b/be/src/core/data_type_serde/data_type_varbinary_serde.h index 41dfa3fce002f8..4f2f282baa8659 100644 --- a/be/src/core/data_type_serde/data_type_varbinary_serde.h +++ b/be/src/core/data_type_serde/data_type_varbinary_serde.h @@ -75,8 +75,9 @@ class DataTypeVarbinarySerDe : public DataTypeSerDe { arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override { + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override { return Status::Error(ErrorCode::NOT_IMPLEMENTED_ERROR, "read_column_from_arrow with type " + column.get_name()); } diff --git a/be/src/core/data_type_serde/data_type_variant_serde.h b/be/src/core/data_type_serde/data_type_variant_serde.h index 18b3809a22e9b9..2ffb8feb41ec6c 100644 --- a/be/src/core/data_type_serde/data_type_variant_serde.h +++ b/be/src/core/data_type_serde/data_type_variant_serde.h @@ -65,8 +65,9 @@ class DataTypeVariantSerDe : public DataTypeSerDe { Status write_column_to_arrow(const IColumn& column, const NullMap* null_map, arrow::ArrayBuilder* array_builder, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; - Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, - int64_t end, const cctz::time_zone& ctz) const override { + Status read_column_from_arrow_impl(IColumn& column, const arrow::Array* arrow_array, + int64_t start, int64_t end, + const cctz::time_zone& ctz) const override { return Status::Error(ErrorCode::NOT_IMPLEMENTED_ERROR, "read_column_from_arrow with type " + column.get_name()); } diff --git a/be/src/exprs/vexpr.cpp b/be/src/exprs/vexpr.cpp index 7f6b75ce6ddeab..bc9e37879fa083 100644 --- a/be/src/exprs/vexpr.cpp +++ b/be/src/exprs/vexpr.cpp @@ -1015,6 +1015,9 @@ Status VExpr::execute_column(VExprContext* context, const Block* block, const Se } } } + if (config::enable_column_sanity_check) { + result_column->sanity_check(); + } return Status::OK(); } diff --git a/be/src/format/arrow/arrow_block_convertor.cpp b/be/src/format/arrow/arrow_block_convertor.cpp index 91593898ac5e3d..671af81861bf95 100644 --- a/be/src/format/arrow/arrow_block_convertor.cpp +++ b/be/src/format/arrow/arrow_block_convertor.cpp @@ -36,12 +36,14 @@ #include #include +#include "common/config.h" #include "common/status.h" #include "core/block/column_with_type_and_name.h" #include "core/column/column.h" #include "core/data_type/data_type.h" #include "core/data_type/data_type_array.h" #include "core/data_type/data_type_nullable.h" +#include "core/data_type_serde/data_type_serde.h" #include "core/value/vdatetime_value.h" #include "format/arrow/arrow_row_batch.h" #include "format/arrow/arrow_utils.h" @@ -102,8 +104,16 @@ Status FromBlockToRecordBatchConverter::convert(std::shared_ptrValidateFull(), + _block.get_by_position(idx).name, + _arrays[_cur_field_idx]->type()->name())); + } } *out = arrow::RecordBatch::Make(_schema, actual_rows, std::move(_arrays)); + if (config::enable_arrow_validate_full) { + RETURN_IF_ERROR(to_doris_status((*out)->ValidateFull())); + } return Status::OK(); } diff --git a/be/test/core/column/column_string_test.cpp b/be/test/core/column/column_string_test.cpp index 6d29b555367b1f..d2c2fd75c5aefd 100644 --- a/be/test/core/column/column_string_test.cpp +++ b/be/test/core/column/column_string_test.cpp @@ -24,6 +24,7 @@ #include #include +#include "common/config.h" #include "core/block/block.h" #include "core/column/column_vector.h" #include "core/column/common_column_test.h" @@ -191,15 +192,19 @@ TEST_F(ColumnStringTest, is_variable_length) { EXPECT_TRUE(col64->is_variable_length()); } TEST_F(ColumnStringTest, sanity_check) { + auto old_enable_column_sanity_check = config::enable_column_sanity_check; + config::enable_column_sanity_check = true; + Defer defer {[&] { config::enable_column_sanity_check = old_enable_column_sanity_check; }}; + auto test_func = [](auto& col) { auto& chars = col->get_chars(); auto& offsets = col->get_offsets(); - col->sanity_check(); + EXPECT_NO_THROW(col->sanity_check()); std::string data = "123"; col->insert_data(data.data(), data.size()); - col->sanity_check(); + EXPECT_NO_THROW(col->sanity_check()); offsets[0] = 1; // chars.size() != offsets[count - 1] diff --git a/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp b/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp index 63a3819cd35fa3..c54e8036f50d96 100644 --- a/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp +++ b/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp @@ -42,6 +42,7 @@ #include #include +#include "common/config.h" #include "core/block/block.h" #include "core/column/column.h" #include "core/column/column_complex.h" @@ -76,6 +77,7 @@ #include "format/arrow/arrow_block_convertor.h" #include "format/arrow/arrow_row_batch.h" #include "runtime/descriptors.cpp" +#include "util/defer_op.h" #include "util/string_parser.hpp" namespace doris { @@ -529,4 +531,25 @@ TEST(DataTypeSerDeArrowTest, BlockConverterTest) { block_converter_test(cols, 7, false); } +TEST(DataTypeSerDeArrowTest, ReadColumnFromArrowValidateFullRejectsInvalidStringOffsets) { + const bool origin_enable_arrow_validate_full = config::enable_arrow_validate_full; + config::enable_arrow_validate_full = true; + Defer defer {[&]() { config::enable_arrow_validate_full = origin_enable_arrow_validate_full; }}; + + std::vector offsets = {0, 4, 2}; + std::string values = "abcd"; + auto offset_buffer = arrow::Buffer::Wrap(offsets); + auto value_buffer = + arrow::Buffer::Wrap(reinterpret_cast(values.data()), values.size()); + auto invalid_array = std::make_shared(2, offset_buffer, value_buffer); + ASSERT_FALSE(invalid_array->ValidateFull().ok()); + + DataTypePtr data_type = std::make_shared(); + auto column = data_type->create_column(); + cctz::time_zone default_timezone; + Status status = data_type->get_serde()->read_column_from_arrow( + *column, invalid_array.get(), 0, invalid_array->length(), default_timezone); + EXPECT_FALSE(status.ok()); +} + } // namespace doris From 7883ca94a092740928e0ffe3527a2bbb8f4fc3bd Mon Sep 17 00:00:00 2001 From: Mryange Date: Mon, 22 Jun 2026 22:46:34 +0800 Subject: [PATCH 2/2] upd --- be/src/format/arrow/arrow_block_convertor.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/be/src/format/arrow/arrow_block_convertor.cpp b/be/src/format/arrow/arrow_block_convertor.cpp index 671af81861bf95..65a37686db97eb 100644 --- a/be/src/format/arrow/arrow_block_convertor.cpp +++ b/be/src/format/arrow/arrow_block_convertor.cpp @@ -111,9 +111,6 @@ Status FromBlockToRecordBatchConverter::convert(std::shared_ptrValidateFull())); - } return Status::OK(); }