Conversation
3c02fb5 to
3381152
Compare
3381152 to
0e5727a
Compare
|
Depends on VectorDB-NTU/RaBitQ-Library#36 |
|
@greptile |
Greptile Summary

This PR adds comprehensive HNSW-RaBitQ support to the zvec project, implementing a vector search index that combines Hierarchical Navigable Small World (HNSW) graph structure with RaBitQ quantization for memory-efficient approximate nearest neighbor search. Major changes:
Issues found:
Confidence Score: 3/5
Important Files Changed
Class Diagram

%%{init: {'theme': 'neutral'}}%%
classDiagram
class HnswRabitqIndex {
+build()
+search()
+stream()
}
class HnswRabitqBuilder {
+init()
+train()
+build()
-rabitq_converter_
}
class HnswRabitqSearcher {
+init()
+search()
-entity_
-reformer_
}
class HnswRabitqStreamer {
+init()
+add()
+search()
-entity_
-reformer_
}
class HnswRabitqAlgorithm {
+add_node()
+search()
-entity_
}
class HnswRabitqEntity {
+get_neighbors()
+update_neighbors()
-graph_structure_
}
class RabitqConverter {
+train()
+transform()
+to_reformer()
-rotator_
-centroids_
}
class RabitqReformer {
+reform()
+get_quantized()
-rotator_
-centroids_
}
class HnswRabitqContext {
+dist_calculator()
+visit_filter()
}
HnswRabitqIndex --> HnswRabitqBuilder
HnswRabitqIndex --> HnswRabitqSearcher
HnswRabitqIndex --> HnswRabitqStreamer
HnswRabitqBuilder --> RabitqConverter
HnswRabitqBuilder --> HnswRabitqAlgorithm
HnswRabitqBuilder --> HnswRabitqEntity
HnswRabitqSearcher --> HnswRabitqAlgorithm
HnswRabitqSearcher --> HnswRabitqEntity
HnswRabitqSearcher --> RabitqReformer
HnswRabitqStreamer --> HnswRabitqAlgorithm
HnswRabitqStreamer --> HnswRabitqEntity
HnswRabitqStreamer --> RabitqReformer
HnswRabitqAlgorithm --> HnswRabitqEntity
HnswRabitqAlgorithm --> HnswRabitqContext
RabitqConverter --> RabitqReformer
Last reviewed commit: 42a30d6 |
| @@ -0,0 +1,4 @@ | |||
| add_library(rabitqlib INTERFACE) | |||
| target_include_directories( | |||
| sparsehash INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/RaBitQ-Library-0.1/include" | |||
There was a problem hiding this comment.
sparsehash -> rabitqlib?
There was a problem hiding this comment.
this class can be moved to quantizer folder with rabitq_reformer
There was a problem hiding this comment.
rabitq_converter/reformer is specialized for rabitq. It's better to put them inside hnsw_rabitq for now.
|
|
||
| void HnswRabitqQueryAlgorithm::expand_neighbors_by_group( | ||
| TopkHeap &topk, HnswRabitqContext *ctx) const { | ||
| // if (!ctx->group_by().is_valid()) { |
There was a problem hiding this comment.
Should this return early with an error message when group-by is not set up?
There was a problem hiding this comment.
Implemented group-by search; will test it after the framework supports group-by.
| continue; | ||
| } | ||
| } else { | ||
| // Candidate cand{ResultRecord(candest.est_dist, candest.low_dist), |
There was a problem hiding this comment.
no need to check ex bits here? can it be checked earlier?
There was a problem hiding this comment.
It's necessary to check ex_bits here.
- first get_bin_est use single bit to estimate score
- ex_bits > 0
- yes. check if need to do full estimate with extra bits
- no. Nothing to be done since we already used all the information.
|
|
||
| // TODO: check loop type | ||
|
|
||
| // if ((!topk.full()) || cur_dist < topk[0].second) { |
There was a problem hiding this comment.
Legacy code here and elsewhere can be cleaned up.
| query_wrapper.set_g_add(q_to_centroids[cluster_id]); | ||
| float est_dist = rabitqlib::split_distance_boosting( | ||
| ex_data, ip_func_, query_wrapper, padded_dim_, ex_bits_, res.ip_x0_qr); | ||
| float low_dist = est_dist - (res.est_dist - res.low_dist) / (1 << ex_bits_); |
There was a problem hiding this comment.
can predefine param to convert to multiply manipulation:
double inv_divisor = 1.0 / (1 << ex_bits_);
auto result = (res.est_dist - res.low_dist) * inv_divisor;
There was a problem hiding this comment.
get_ex_est is not used, deleted.
|
@greptile |
| int HnswRabitqSearcher::search_bf_by_p_keys_impl( | ||
| const void *query, const std::vector<std::vector<uint64_t>> &p_keys, | ||
| const IndexQueryMeta &qmeta, uint32_t count, | ||
| Context::Pointer &context) const { | ||
| // if (ailego_unlikely(!query || !context)) { | ||
| // LOG_ERROR("The context is not created by this searcher"); | ||
| // return IndexError_Mismatch; | ||
| // } | ||
|
|
||
| // if (ailego_unlikely(p_keys.size() != count)) { | ||
| // LOG_ERROR("The size of p_keys is not equal to count"); | ||
| // return IndexError_InvalidArgument; | ||
| // } | ||
|
|
||
| // HnswRabitqContext *ctx = dynamic_cast<HnswRabitqContext *>(context.get()); | ||
| // ailego_do_if_false(ctx) { | ||
| // LOG_ERROR("Cast context to HnswRabitqContext failed"); | ||
| // return IndexError_Cast; | ||
| // } | ||
| // if (ctx->magic() != magic_) { | ||
| // //! context is created by another searcher or streamer | ||
| // int ret = update_context(ctx); | ||
| // if (ret != 0) { | ||
| // return ret; | ||
| // } | ||
| // } | ||
|
|
||
| // ctx->clear(); | ||
| // ctx->resize_results(count); | ||
|
|
||
| // if (ctx->group_by_search()) { | ||
| // if (!ctx->group_by().is_valid()) { | ||
| // LOG_ERROR("Invalid group-by function"); | ||
| // return IndexError_InvalidArgument; | ||
| // } | ||
|
|
||
| // std::function<std::string(node_id_t)> group_by = [&](node_id_t id) { | ||
| // return ctx->group_by()(entity_.get_key(id)); | ||
| // }; | ||
|
|
||
| // for (size_t q = 0; q < count; ++q) { | ||
| // ctx->reset_query(query); | ||
| // ctx->group_topk_heaps().clear(); | ||
|
|
||
| // for (size_t idx = 0; idx < p_keys[q].size(); ++idx) { | ||
| // uint64_t pk = p_keys[q][idx]; | ||
| // if (!ctx->filter().is_valid() || !ctx->filter()(pk)) { | ||
| // node_id_t id = entity_.get_id(pk); | ||
| // if (id != kInvalidNodeId) { | ||
| // dist_t dist = ctx->dist_calculator().dist(id); | ||
| // std::string group_id = group_by(id); | ||
|
|
||
| // auto &topk_heap = ctx->group_topk_heaps()[group_id]; | ||
| // if (topk_heap.empty()) { | ||
| // topk_heap.limit(ctx->group_topk()); | ||
| // } | ||
| // topk_heap.emplace_back(id, dist); | ||
| // } | ||
| // } | ||
| // } | ||
| // ctx->topk_to_result(q); | ||
| // query = static_cast<const char *>(query) + qmeta.element_size(); | ||
| // } | ||
| // } else { | ||
| // for (size_t q = 0; q < count; ++q) { | ||
| // ctx->reset_query(query); | ||
| // ctx->topk_heap().clear(); | ||
| // for (size_t idx = 0; idx < p_keys[q].size(); ++idx) { | ||
| // uint64_t pk = p_keys[q][idx]; | ||
| // if (!ctx->filter().is_valid() || !ctx->filter()(pk)) { | ||
| // node_id_t id = entity_.get_id(pk); | ||
| // if (id != kInvalidNodeId) { | ||
| // dist_t dist = ctx->dist_calculator().dist(id); | ||
| // ctx->topk_heap().emplace(id, dist); | ||
| // } | ||
| // } | ||
| // } | ||
| // ctx->topk_to_result(q); | ||
| // query = static_cast<const char *>(query) + qmeta.element_size(); | ||
| // } | ||
| // } | ||
|
|
||
| // if (ailego_unlikely(ctx->error())) { | ||
| // return IndexError_Runtime; | ||
| // } | ||
|
|
||
| return 0; | ||
| } |
There was a problem hiding this comment.
search_bf_by_p_keys_impl silently returns success with no-op
The entire body of search_bf_by_p_keys_impl is commented out. The function returns 0 (success), meaning any caller of this brute-force p_keys search path receives a success response while no actual search is performed — results remain uninitialized/empty. This is a silent data-corruption-level bug for callers that rely on this code path.
At a minimum, this should return IndexError_NotImplemented to surface the unimplemented state, rather than 0:
int HnswRabitqSearcher::search_bf_by_p_keys_impl(
const void *query, const std::vector<std::vector<uint64_t>> &p_keys,
const IndexQueryMeta &qmeta, uint32_t count,
Context::Pointer &context) const {
return IndexError_NotImplemented;
}

| topk_heap.emplace_back(node, ResultRecord(candest)); | ||
|
|
||
| if (group_topk_heaps.size() >= ctx->group_num()) { | ||
| break; | ||
| } | ||
| } | ||
| candidates.emplace(node, ResultRecord(candest)); | ||
| } // end for |
There was a problem hiding this comment.
break only exits inner loop — outer while continues after group_num reached
When group_topk_heaps.size() >= ctx->group_num() is satisfied, the break on line 292 only exits the inner for loop over neighbor_ids. The outer while (!candidates.empty() && ...) loop continues, popping and processing more candidates and computing full estimates even after the desired group count is reached.
This wastes compute (each iteration calls get_full_est) and can add more entries into group_topk_heaps beyond the intended group_num. The outer loop should also terminate early once the target is met:
for (uint32_t i = 0; i < size; ++i) {
// ... compute candest ...
if (!filter(node)) {
std::string group_id = group_by(node);
auto &topk_heap = group_topk_heaps[group_id];
if (topk_heap.empty()) {
topk_heap.limit(ctx->group_topk());
}
topk_heap.emplace_back(node, ResultRecord(candest));
if (group_topk_heaps.size() >= ctx->group_num()) {
goto done; // or use a flag to break both loops
}
}
candidates.emplace(node, ResultRecord(candest));
} // end for
} // end while
done:;

Alternatively, add a check at the top of the while loop condition: `&& group_topk_heaps.size() < ctx->group_num()`.
| auto memory_storage = IndexFactory::CreateStorage("MemoryReadStorage"); | ||
| ret = memory_storage->open(file_id, false); | ||
| if (ret != 0) { | ||
| LOG_ERROR("Failed to open memory storage: %d", ret); | ||
| return ret; | ||
| } | ||
| ret = reformer_->load(memory_storage); | ||
| if (ret != 0) { | ||
| LOG_ERROR("Failed to load RabitqReformer: %d", ret); | ||
| return ret; | ||
| } | ||
| // TODO: release memory of memory_storage | ||
| return 0; |
There was a problem hiding this comment.
Memory leak for in-memory storage object
memory_storage is created and opened but never explicitly released. The // TODO: release memory of memory_storage comment acknowledges this, but the object will hold onto memory-mapped data from the MemoryDumper until the IndexStorage::Pointer shared_ptr goes out of scope at the end of this function — which happens naturally on function return. However, if MemoryReadStorage retains a reference to the underlying MemoryDumper allocation beyond the shared_ptr lifetime, this could leak.
The same pattern (and the same // TODO comment) exists in rabitq_converter.cc in the to_reformer method. These TODOs should be resolved or confirmed safe before merging.
resolve #42
Greptile Summary
This PR introduces a new HNSW-RaBitQ index type — a graph-based approximate nearest neighbor search index that combines HNSW's hierarchical navigable small-world graph structure with RaBitQ quantization (binary + extended-bit codes) for compressed vector storage and fast distance estimation. The implementation is substantial (~14k lines) and covers the full index lifecycle: builder (offline batch), streamer (online incremental), searcher, and Python bindings.
Key changes:
- New `hnsw_rabitq` algorithm module under `src/core/algorithm/hnsw_rabitq/` with graph construction, quantization, and query algorithms
- `RabitqConverter` for KMeans-based centroid training and vector quantization; `RabitqReformer` for query transformation
- Framework integration (`segment.cc`, `proto_converter.cc`, `schema.cc`), proto definitions, and Python parameter bindings
- `RaBitQ-Library` added via `.gitmodules`

Issues found:
- `search_bf_by_p_keys_impl` in `hnsw_rabitq_searcher.cc` is entirely commented out but returns `0` (success), silently producing empty results for any p_keys brute-force search caller
- In `expand_neighbors_by_group` (`hnsw_rabitq_query_algorithm.cc`), a `break` when `group_num` is reached only exits the inner `for` loop — the outer `while` loop continues unnecessarily, wasting compute and potentially adding extra groups beyond the target
- `// TODO: release memory of memory_storage` markers in `hnsw_rabitq_builder.cc` and `rabitq_converter.cc` flag unresolved potential memory leaks for the in-memory storage used during converter serialization
- A leftover debug comment (`// return search_bf_impl(...)`) was left in `hnsw_rabitq_searcher.cc`
- Possible incorrect parameter `PARAM_HNSW_RABITQ_STREAMER_EF` in `hnsw_rabitq_context.cc`

Confidence Score: 2/5
The `search_bf_by_p_keys_impl` function is entirely commented out but returns 0 (success), which is a correctness bug that will silently produce wrong results for any caller using the p_keys search path. The `expand_neighbors_by_group` early-exit bug causes unnecessary work. The unresolved memory TODO markers also need to be confirmed safe. Core HNSW and RaBitQ logic otherwise appears correct and well-structured.

- `src/core/algorithm/hnsw_rabitq/hnsw_rabitq_searcher.cc` requires attention for the commented-out `search_bf_by_p_keys_impl`.
- `src/core/algorithm/hnsw_rabitq/hnsw_rabitq_query_algorithm.cc` needs the `expand_neighbors_by_group` outer-loop early-exit fix.
- `search_bf_by_p_keys_impl` is entirely commented out but silently returns 0 (success), and a leftover debug comment remains on line 228.
- `expand_neighbors_by_group`: the inner `break` when `group_num` is reached does not exit the outer while loop, causing unnecessary work.
- A `transform` method returns `IndexError_NotImplemented`, which is expected but undocumented.
- Another `transform` method is stubbed returning `IndexError_NotImplemented`, and `to_reformer` has the same unresolved memory storage TODO.
- Possible incorrect parameter `PARAM_HNSW_RABITQ_STREAMER_EF` on line 174 inside the kStreamerContext update path.
sequenceDiagram
    participant Caller
    participant HnswRabitqBuilder
    participant RabitqConverter
    participant RabitqReformer
    participant HnswRabitqEntity
    participant HnswRabitqAlgorithm
    Caller->>HnswRabitqBuilder: init(meta, params)
    HnswRabitqBuilder->>RabitqConverter: init(meta, params)
    HnswRabitqBuilder->>HnswRabitqAlgorithm: init()
    Caller->>HnswRabitqBuilder: train(holder)
    HnswRabitqBuilder->>RabitqConverter: train(holder) [KMeans clustering]
    RabitqConverter-->>HnswRabitqBuilder: centroids ready
    HnswRabitqBuilder->>RabitqConverter: dump(MemoryDumper)
    HnswRabitqBuilder->>RabitqReformer: load(MemoryReadStorage)
    Caller->>HnswRabitqBuilder: build(threads, holder)
    loop For each vector
        HnswRabitqBuilder->>RabitqReformer: convert(vec) [rotate + quantize → bin+ex data]
        RabitqReformer-->>HnswRabitqBuilder: quantized_vector
        HnswRabitqBuilder->>HnswRabitqEntity: add_vector(level, key, quantized_vector)
    end
    loop Parallel threads
        HnswRabitqBuilder->>HnswRabitqAlgorithm: add_node(id, level, ctx)
        HnswRabitqAlgorithm->>HnswRabitqEntity: get_neighbors / update_neighbors
    end
    Caller->>HnswRabitqBuilder: dump(dumper)
    HnswRabitqBuilder->>RabitqConverter: dump(dumper) [save centroids + rotator]
    HnswRabitqBuilder->>HnswRabitqEntity: dump(dumper) [save graph + vectors]
    note over Caller,HnswRabitqAlgorithm: Search path
    Caller->>HnswRabitqSearcher: load(storage)
    HnswRabitqSearcher->>RabitqReformer: load(storage)
    HnswRabitqSearcher->>HnswRabitqEntity: load(storage)
    Caller->>HnswRabitqSearcher: search(query, count, ctx)
    HnswRabitqSearcher->>RabitqReformer: transform_to_entity(query) [rotate + compute q_to_centroids]
    RabitqReformer-->>HnswRabitqSearcher: HnswRabitqQueryEntity
    HnswRabitqSearcher->>HnswRabitqQueryAlgorithm: search(query_entity, ctx)
    HnswRabitqQueryAlgorithm->>HnswRabitqEntity: get_bin_data / get_ex_data [estimate distances]
    HnswRabitqQueryAlgorithm-->>HnswRabitqSearcher: topk results

Last reviewed commit: 94837be