Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/arrow/dataset/file_parquet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
const parquet::Statistics& statistics) {
auto field_expr = compute::field_ref(field_ref);

bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0;
bool may_have_null = !statistics.HasNullCount() || statistics.NullCount().value() > 0;
// Optimize for corner case where all values are nulls
if (statistics.num_values() == 0) {
// If there are no non-null values, column `field_ref` in the fragment
Expand Down Expand Up @@ -540,7 +540,7 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
return parquet::ParquetFileReader::OpenAsync(input, properties, metadata)
.Then(
[=](const std::unique_ptr<parquet::ParquetFileReader>& reader) mutable
-> Result<std::shared_ptr<parquet::arrow::FileReader>> {
-> Result<std::shared_ptr<parquet::arrow::FileReader>> {
auto arrow_properties = MakeArrowReaderProperties(
*self, *reader->metadata(), *options, *parquet_scan_options);

Expand Down
6 changes: 2 additions & 4 deletions cpp/src/arrow/dataset/file_parquet_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -941,9 +941,8 @@ TEST(TestParquetStatistics, NoNullCount) {
::parquet::EncodedStatistics encoded_stats;
encoded_stats.set_min(int32_to_parquet_stats(1));
encoded_stats.set_max(int32_to_parquet_stats(100));
encoded_stats.has_null_count = false;
encoded_stats.all_null_value = false;
encoded_stats.null_count = 0;
encoded_stats.null_count = std::nullopt;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10);

auto stat_expression =
Expand All @@ -956,7 +955,6 @@ TEST(TestParquetStatistics, NoNullCount) {
// Special case: when num_values is 0, the statistics evaluate to
// "is_null".
::parquet::EncodedStatistics encoded_stats;
encoded_stats.has_null_count = true;
encoded_stats.null_count = 1;
encoded_stats.all_null_value = true;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
Expand All @@ -965,7 +963,7 @@ TEST(TestParquetStatistics, NoNullCount) {
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");

encoded_stats.has_null_count = false;
encoded_stats.null_count = std::nullopt;
encoded_stats.all_null_value = false;
stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4735,8 +4735,8 @@ TEST_P(TestArrowWriteDictionary, Statistics) {

auto expect_has_min_max =
expected_has_min_max_by_page[case_index][row_group_index][page_index];
EXPECT_EQ(stats.has_min, expect_has_min_max);
EXPECT_EQ(stats.has_max, expect_has_min_max);
EXPECT_EQ(stats.HasMin(), expect_has_min_max);
EXPECT_EQ(stats.HasMax(), expect_has_min_max);
if (expect_has_min_max) {
EXPECT_EQ(stats.min(),
expected_min_by_page[case_index][row_group_index][page_index]);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/arrow/arrow_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) {
auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader));
std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
std::shared_ptr<Statistics> stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
EXPECT_EQ(stats->null_count(), GetParam().expected_null_count);
EXPECT_EQ(stats->NullCount(), GetParam().expected_null_count);
EXPECT_EQ(stats->num_values(), GetParam().expected_value_count);
ASSERT_TRUE(stats->HasMinMax());
EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min);
Expand Down
22 changes: 11 additions & 11 deletions cpp/src/parquet/arrow/reader_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ template <typename CType, typename StatisticsType>
Status MakeMinMaxScalar(const StatisticsType& statistics,
std::shared_ptr<::arrow::Scalar>* min,
std::shared_ptr<::arrow::Scalar>* max) {
*min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
*max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
*min = ::arrow::MakeScalar(static_cast<CType>(statistics.Min().value()));
*max = ::arrow::MakeScalar(static_cast<CType>(statistics.Max().value()));
return Status::OK();
}

Expand All @@ -179,8 +179,8 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
std::shared_ptr<DataType> type,
std::shared_ptr<::arrow::Scalar>* min,
std::shared_ptr<::arrow::Scalar>* max) {
ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.Min().value()));
ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.Max().value()));
return Status::OK();
}

Expand Down Expand Up @@ -227,8 +227,8 @@ static Status FromInt32Statistics(const Int32Statistics& statistics,
case LogicalType::Type::NONE:
return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
case LogicalType::Type::DECIMAL:
return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(),
logical_type, min, max);
return ExtractDecimalMinMaxFromInteger(
statistics.Min().value(), statistics.Max().value(), logical_type, min, max);
default:
break;
}
Expand All @@ -252,8 +252,8 @@ static Status FromInt64Statistics(const Int64Statistics& statistics,
case LogicalType::Type::NONE:
return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
case LogicalType::Type::DECIMAL:
return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(),
logical_type, min, max);
return ExtractDecimalMinMaxFromInteger(
statistics.Min().value(), statistics.Max().value(), logical_type, min, max);
default:
break;
}
Expand Down Expand Up @@ -384,13 +384,13 @@ void AttachStatistics(::arrow::ArrayData* data,
}
if (statistics) {
if (statistics->HasDistinctCount()) {
array_statistics->distinct_count = statistics->distinct_count();
array_statistics->distinct_count = statistics->DistinctCount().value();
}
if (statistics->HasMinMax()) {
const auto* typed_statistics =
checked_cast<const ::parquet::TypedStatistics<ParquetType>*>(statistics.get());
const ArrowCType min = typed_statistics->min();
const ArrowCType max = typed_statistics->max();
const ArrowCType min = typed_statistics->Min().value();
const ArrowCType max = typed_statistics->Max().value();
if constexpr (std::is_same_v<ArrowCType, bool>) {
array_statistics->min = static_cast<bool>(min);
array_statistics->max = static_cast<bool>(max);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1084,7 +1084,7 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,

// page_stats.null_count is not set when page_statistics_ is nullptr. It is only used
// here as a safety check.
DCHECK(!page_stats.has_null_count || page_stats.null_count == null_count);
DCHECK(!page_stats.HasNullCount() || page_stats.null_count == null_count);

// Write the page to OutputStream eagerly if there is no dictionary or
// if dictionary encoding has fallen back to PLAIN
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
auto metadata_accessor = ColumnChunkMetaData::Make(
metadata_->contents(), this->descr_, default_reader_properties(), &app_version);
auto encoded_stats = metadata_accessor->statistics()->Encode();
return {encoded_stats.has_min, encoded_stats.has_max};
return {encoded_stats.HasMin(), encoded_stats.HasMax()};
}

std::vector<Encoding::type> metadata_encodings() {
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/parquet/file_deserialize_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ static inline void AddDummyStats(int stat_size, H& header, bool fill_all_stats =
template <typename H>
static inline void CheckStatistics(const H& expected, const EncodedStatistics& actual) {
if (expected.statistics.__isset.max) {
ASSERT_EQ(expected.statistics.max, actual.max());
ASSERT_EQ(expected.statistics.max, actual.Max());
}
if (expected.statistics.__isset.min) {
ASSERT_EQ(expected.statistics.min, actual.min());
ASSERT_EQ(expected.statistics.min, actual.Min());
}
if (expected.statistics.__isset.null_count) {
ASSERT_EQ(expected.statistics.null_count, actual.null_count);
Expand Down Expand Up @@ -513,8 +513,8 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) {
CheckDataPageHeader(this->data_page_headers_[i], current_page.get()));
auto data_page = static_cast<const DataPage*>(current_page.get());
const EncodedStatistics encoded_statistics = data_page->statistics();
ASSERT_EQ(read_stats[i].max(), encoded_statistics.max());
ASSERT_EQ(read_stats[i].min(), encoded_statistics.min());
ASSERT_EQ(read_stats[i].Max(), encoded_statistics.Max());
ASSERT_EQ(read_stats[i].Min(), encoded_statistics.Min());
ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count);
ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count);
ASSERT_EQ(read_num_values[i], this->data_page_headers_[i].num_values);
Expand Down
46 changes: 31 additions & 15 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,24 +101,40 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
metadata.statistics.__isset.is_max_value_exact
? std::optional<bool>(metadata.statistics.is_max_value_exact)
: std::nullopt;
std::optional<int64_t> null_count =
metadata.statistics.__isset.null_count
? std::optional<int64_t>(metadata.statistics.null_count)
: std::nullopt;
std::optional<int64_t> distinct_count =
metadata.statistics.__isset.distinct_count
? std::optional<int64_t>(metadata.statistics.distinct_count)
: std::nullopt;
std::optional<std::string_view> min_val =
metadata.statistics.__isset.min
? std::optional<std::string_view>(metadata.statistics.min)
: std::nullopt;
std::optional<std::string_view> max_val =
metadata.statistics.__isset.max
? std::optional<std::string_view>(metadata.statistics.max)
: std::nullopt;
// If ColumnOrder is defined, return max_value and min_value
if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
return MakeStatistics<DType>(
descr, metadata.statistics.min_value, metadata.statistics.max_value,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value,
metadata.statistics.__isset.null_count,
metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool);
std::optional<std::string_view> min_value =
metadata.statistics.__isset.min_value
? std::optional<std::string_view>(metadata.statistics.min_value)
: std::nullopt;
std::optional<std::string_view> max_value =
metadata.statistics.__isset.max_value
? std::optional<std::string_view>(metadata.statistics.max_value)
: std::nullopt;
return MakeStatistics<DType>(descr, min_value, max_value,
metadata.num_values - null_count.value_or(0), null_count,
distinct_count, min_exact, max_exact, pool);
}
// Default behavior
return MakeStatistics<DType>(
descr, metadata.statistics.min, metadata.statistics.max,
metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max && metadata.statistics.__isset.min,
metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count,
min_exact, max_exact, pool);
return MakeStatistics<DType>(descr, min_val, max_val,
metadata.num_values - null_count.value_or(0), null_count,
distinct_count, min_exact, max_exact, pool);
}

namespace {
Expand Down Expand Up @@ -1610,7 +1626,7 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
(application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
// Only SIGNED are valid unless max and min are the same
// (in which case the sort order does not matter)
bool max_equals_min = statistics.has_min && statistics.has_max
bool max_equals_min = statistics.HasMin() && statistics.HasMax()
? statistics.min() == statistics.max()
: false;
if (SortOrder::SIGNED != sort_order && !max_equals_min) {
Expand Down
48 changes: 24 additions & 24 deletions cpp/src/parquet/metadata_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,18 +154,18 @@ TEST(Metadata, TestBuildAccess) {
auto rg1_column2 = rg1_accessor->ColumnChunk(1);
ASSERT_EQ(true, rg1_column1->is_stats_set());
ASSERT_EQ(true, rg1_column2->is_stats_set());
ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min());
ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max());
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
ASSERT_EQ(0, rg1_column1->statistics()->null_count());
ASSERT_EQ(0, rg1_column2->statistics()->null_count());
ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count());
ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count());
ASSERT_EQ(stats_float.Min(), rg1_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.Max(), rg1_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.Min(), rg1_column2->encoded_statistics()->Min());
ASSERT_EQ(stats_float.Max(), rg1_column2->encoded_statistics()->Max());
ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min());
ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max());
ASSERT_EQ(0, rg1_column1->statistics()->NullCount());
ASSERT_EQ(0, rg1_column2->statistics()->NullCount());
ASSERT_EQ(nrows, rg1_column1->statistics()->DistinctCount());
ASSERT_EQ(nrows, rg1_column2->statistics()->DistinctCount());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression());
ASSERT_EQ(nrows / 2, rg1_column1->num_values());
Expand Down Expand Up @@ -205,18 +205,18 @@ TEST(Metadata, TestBuildAccess) {
auto rg2_column2 = rg2_accessor->ColumnChunk(1);
ASSERT_EQ(true, rg2_column1->is_stats_set());
ASSERT_EQ(true, rg2_column2->is_stats_set());
ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min());
ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max());
ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min());
ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max());
ASSERT_EQ(0, rg2_column1->statistics()->null_count());
ASSERT_EQ(0, rg2_column2->statistics()->null_count());
ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count());
ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count());
ASSERT_EQ(stats_float.Min(), rg2_column2->statistics()->EncodeMin());
ASSERT_EQ(stats_float.Max(), rg2_column2->statistics()->EncodeMax());
ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin());
ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax());
ASSERT_EQ(stats_float.Min(), rg2_column2->encoded_statistics()->Min());
ASSERT_EQ(stats_float.Max(), rg2_column2->encoded_statistics()->Max());
ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min());
ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max());
ASSERT_EQ(0, rg2_column1->statistics()->NullCount());
ASSERT_EQ(0, rg2_column2->statistics()->NullCount());
ASSERT_EQ(nrows, rg2_column1->statistics()->DistinctCount());
ASSERT_EQ(nrows, rg2_column2->statistics()->DistinctCount());
ASSERT_EQ(nrows / 2, rg2_column1->num_values());
ASSERT_EQ(nrows / 2, rg2_column2->num_values());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/parquet/page_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -511,11 +511,11 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder {
column_index_.null_pages.emplace_back(true);
column_index_.min_values.emplace_back("");
column_index_.max_values.emplace_back("");
} else if (stats.has_min && stats.has_max) {
} else if (stats.HasMin() && stats.HasMax()) {
const size_t page_ordinal = column_index_.null_pages.size();
non_null_page_indices_.emplace_back(page_ordinal);
column_index_.min_values.emplace_back(stats.min());
column_index_.max_values.emplace_back(stats.max());
column_index_.min_values.emplace_back(stats.Min().value());
column_index_.max_values.emplace_back(stats.Max().value());
column_index_.null_pages.emplace_back(false);
} else {
/// This is a non-null page but it lacks meaningful min/max values.
Expand All @@ -524,8 +524,8 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder {
return;
}

if (column_index_.__isset.null_counts && stats.has_null_count) {
column_index_.null_counts.emplace_back(stats.null_count);
if (column_index_.__isset.null_counts && stats.HasNullCount()) {
column_index_.null_counts.emplace_back(stats.null_count.value());
} else {
column_index_.__isset.null_counts = false;
column_index_.null_counts.clear();
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/parquet/page_index_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -546,8 +546,8 @@ void TestWriteTypedColumnIndex(schema::NodePtr node,

for (size_t i = 0; i < num_pages; ++i) {
ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]);
ASSERT_EQ(page_stats[i].min(), column_index->encoded_min_values()[i]);
ASSERT_EQ(page_stats[i].max(), column_index->encoded_max_values()[i]);
ASSERT_EQ(page_stats[i].Min().value_or(""), column_index->encoded_min_values()[i]);
ASSERT_EQ(page_stats[i].Max().value_or(""), column_index->encoded_max_values()[i]);
if (has_null_counts) {
ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]);
}
Expand Down Expand Up @@ -813,11 +813,11 @@ class PageIndexBuilderTest : public ::testing::Test {
ASSERT_NE(nullptr, column_index);
ASSERT_EQ(size_t{1}, column_index->null_pages().size());
ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]);
ASSERT_EQ(stats.min(), column_index->encoded_min_values()[0]);
ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]);
ASSERT_EQ(stats.has_null_count, column_index->has_null_counts());
if (stats.has_null_count) {
ASSERT_EQ(stats.null_count, column_index->null_counts()[0]);
ASSERT_EQ(stats.Min(), column_index->encoded_min_values()[0]);
ASSERT_EQ(stats.Max(), column_index->encoded_max_values()[0]);
ASSERT_EQ(stats.HasNullCount(), column_index->has_null_counts());
if (stats.HasNullCount()) {
ASSERT_EQ(stats.null_count.value(), column_index->null_counts()[0]);
}
}

Expand Down
Loading
Loading