-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-45847: [C++] Optimize Parquet column reader by fusing decoding and counting #48549
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
0292132
2280bb9
e4f35d1
6f4c7e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -308,6 +308,20 @@ class RleRunDecoder { | |
| return to_read; | ||
| } | ||
|
|
||
| /// Get a batch of values and count how many equal match_value. | ||
| /// For RLE runs, counting is O(1) because all values in the run are identical. | ||
| [[nodiscard]] rle_size_t GetBatchWithCount(value_type* out, rle_size_t batch_size, | ||
| rle_size_t value_bit_width, | ||
| value_type match_value, int64_t* out_count) { | ||
| // Save value_ before GetBatch since it doesn't change during the call. | ||
| const auto current_value = value_; | ||
| const auto to_read = GetBatch(out, batch_size, value_bit_width); | ||
| if (current_value == match_value) { | ||
| *out_count += to_read; | ||
| } | ||
| return to_read; | ||
| } | ||
|
|
||
| private: | ||
| value_type value_ = {}; | ||
| rle_size_t remaining_count_ = 0; | ||
|
|
@@ -377,6 +391,19 @@ class BitPackedRunDecoder { | |
| return steps; | ||
| } | ||
|
|
||
| /// Get a batch of values and count how many equal match_value | ||
| /// Note: For bit-packed runs, we use std::count after GetBatch since it's | ||
| /// highly optimized by the compiler. The fused approach is only beneficial | ||
| /// for RLE runs where counting is O(1). | ||
| [[nodiscard]] rle_size_t GetBatchWithCount(value_type* out, rle_size_t batch_size, | ||
| rle_size_t value_bit_width, | ||
| value_type match_value, int64_t* out_count) { | ||
| const auto steps = GetBatch(out, batch_size, value_bit_width); | ||
| // std::count is highly optimized (SIMD) by modern compilers | ||
| *out_count += std::count(out, out + steps, match_value); | ||
|
Comment on lines
+401
to
+403
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have been working on the Still this could provide better data locality when doing run by run.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The typical batch size for levels is probably small, so it would fit at least in L2 cache and perhaps L1. Not sure it's worth trying to do it while decoding. |
||
| return steps; | ||
| } | ||
|
|
||
| private: | ||
| /// The pointer to the beginning of the run | ||
| const uint8_t* data_ = nullptr; | ||
|
|
@@ -438,6 +465,10 @@ class RleBitPackedDecoder { | |
| /// left or if an error occurred. | ||
| [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size); | ||
|
|
||
| /// Get a batch of values and count how many equal match_value | ||
| [[nodiscard]] rle_size_t GetBatchWithCount(value_type* out, rle_size_t batch_size, | ||
| value_type match_value, int64_t* out_count); | ||
|
|
||
| /// Like GetBatch but add spacing for null entries. | ||
| /// | ||
| /// Null entries will be set to an arbistrary value to avoid leaking private data. | ||
|
|
@@ -483,6 +514,18 @@ class RleBitPackedDecoder { | |
| decoder_); | ||
| } | ||
|
|
||
| /// Get a batch of values from the current run and return the number elements read. | ||
| [[nodiscard]] rle_size_t RunGetBatchWithCount(value_type* out, rle_size_t batch_size, | ||
| value_type match_value, | ||
| int64_t* out_count) { | ||
| return std::visit( | ||
| [&](auto& dec) { | ||
| return dec.GetBatchWithCount(out, batch_size, value_bit_width_, match_value, | ||
| out_count); | ||
| }, | ||
| decoder_); | ||
| } | ||
|
|
||
| /// Call the parser with a single callable for all event types. | ||
| template <typename Callable> | ||
| void ParseWithCallable(Callable&& func); | ||
|
|
@@ -1474,4 +1517,49 @@ inline void RleBitPackedEncoder::Clear() { | |
| bit_writer_.Clear(); | ||
| } | ||
|
|
||
| template <typename T> | ||
| auto RleBitPackedDecoder<T>::GetBatchWithCount(value_type* out, rle_size_t batch_size, | ||
| value_type match_value, int64_t* out_count) | ||
| -> rle_size_t { | ||
| using ControlFlow = RleBitPackedParser::ControlFlow; | ||
|
|
||
| rle_size_t values_read = 0; | ||
|
|
||
| // Remaining from a previous call that would have left some unread data from a run. | ||
| if (ARROW_PREDICT_FALSE(run_remaining() > 0)) { | ||
| const auto read = RunGetBatchWithCount(out, batch_size, match_value, out_count); | ||
| values_read += read; | ||
| out += read; | ||
|
|
||
| // Either we fulfilled all the batch to be read or we finished remaining run. | ||
| if (ARROW_PREDICT_FALSE(values_read == batch_size)) { | ||
| return values_read; | ||
| } | ||
| ARROW_DCHECK(run_remaining() == 0); | ||
| } | ||
|
|
||
| ParseWithCallable([&](auto run) { | ||
| using RunDecoder = typename decltype(run)::template DecoderType<value_type>; | ||
|
|
||
| ARROW_DCHECK_LT(values_read, batch_size); | ||
| RunDecoder decoder(run, value_bit_width_); | ||
| const auto read = | ||
| decoder.GetBatchWithCount(out, batch_size - values_read, value_bit_width_, | ||
| match_value, out_count); | ||
| ARROW_DCHECK_LE(read, batch_size - values_read); | ||
| values_read += read; | ||
| out += read; | ||
|
|
||
| // Stop reading and store remaining decoder | ||
| if (ARROW_PREDICT_FALSE(values_read == batch_size || read == 0)) { | ||
| decoder_ = std::move(decoder); | ||
| return ControlFlow::Break; | ||
| } | ||
|
|
||
| return ControlFlow::Continue; | ||
| }); | ||
|
|
||
| return values_read; | ||
| } | ||
|
|
||
| } // namespace arrow::util | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could this call
RleRunDecoder::GetBatchto avoid duplicating the logic?