Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ repos:
- id: clang-format
name: clang-format
language: python
entry: tools/clang-format.sh
entry: tools/clang-format.sh clang-format true
args: [clang-format]
additional_dependencies: ["clang-format>=15,<16"]

Expand Down
6 changes: 3 additions & 3 deletions bindings/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.

cmake_minimum_required(VERSION 3.21)
project(svs_runtime VERSION 0.1.0 LANGUAGES CXX)
project(svs_runtime VERSION 1.0.0 LANGUAGES CXX)
set(TARGET_NAME svs_runtime)

set(SVS_RUNTIME_HEADERS
Expand Down Expand Up @@ -98,14 +98,14 @@ if (SVS_RUNTIME_ENABLE_LVQ_LEANVEC)
else()
# Links to LTO-enabled static library, requires GCC/G++ 11.2
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "11.2" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11.3")
set(SVS_URL "https://github.com/intel/ScalableVectorSearch/releases/download/v1.0.0-dev/svs-shared-library-1.0.0-NIGHTLY-20251119-807-lto.tar.gz"
set(SVS_URL "<PLEASE_UPDATE_ME>"
CACHE STRING "URL to download SVS shared library")
else()
message(WARNING
"Pre-built LVQ/LeanVec SVS library requires GCC/G++ v.11.2 to apply LTO optimizations."
"Current compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}"
)
set(SVS_URL "https://github.com/intel/ScalableVectorSearch/releases/download/v0.0.11/svs-shared-library-0.0.11.tar.gz"
set(SVS_URL "<PLEASE_UPDATE_ME>"
CACHE STRING "URL to download SVS shared library")
endif()
include(FetchContent)
Expand Down
16 changes: 16 additions & 0 deletions bindings/cpp/include/svs/runtime/api_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,5 +208,21 @@ struct SVS_RUNTIME_API_INTERFACE ResultsAllocator {
};

} // namespace v0

// API version 1 re-exports the following v0 definitions via using-declarations,
// so each of them is also reachable as svs::runtime::v1::<name>.
namespace v1 {
using v0::ErrorCode;
using v0::IDFilter;
using v0::is_specified;
using v0::MetricType;
using v0::OptionalBool;
using v0::ResultsAllocator;
using v0::SearchResultsStorage;
using v0::set_if_specified;
using v0::Status;
using v0::Status_Ok;
using v0::StorageKind;
using v0::Unspecify;
} // namespace v1

} // namespace runtime
} // namespace svs
6 changes: 6 additions & 0 deletions bindings/cpp/include/svs/runtime/dynamic_vamana_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,11 @@ struct SVS_RUNTIME_API DynamicVamanaIndexLeanVec : public DynamicVamanaIndex {
};

} // namespace v0

// The dynamic Vamana index types are re-exported from v0 into v1 via
// using-declarations; no new v1 types are declared here.
namespace v1 {
using v0::DynamicVamanaIndex;
using v0::DynamicVamanaIndexLeanVec;
} // namespace v1

} // namespace runtime
} // namespace svs
5 changes: 5 additions & 0 deletions bindings/cpp/include/svs/runtime/flat_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,10 @@ struct SVS_RUNTIME_API FlatIndex {
};

} // namespace v0

// FlatIndex is re-exported from v0 into v1 via a using-declaration.
namespace v1 {
using v0::FlatIndex;
} // namespace v1

} // namespace runtime
} // namespace svs
41 changes: 41 additions & 0 deletions bindings/cpp/include/svs/runtime/training.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ namespace v0 {

struct SVS_RUNTIME_API LeanVecTrainingData {
virtual ~LeanVecTrainingData();

/* Build LeanVec training data (compression matrices) from the provided
* data.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we call this "LeanVec transformation matrices" instead of "training data"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LeanVecTrainingData was purposefully chosen to be generic. In hindsight, I think we could have been more specific. But changing this would require an API update and therefore conflicts with your suggestion in the other comment.

* @param training_data Output parameter to the created training data object
* @param dim Dimensionality of the input data
* @param n Number of data points
* @param x Pointer to the input data
* @param leanvec_dims Number of dimensions in the resulting LeanVec data
*/
static Status build(
LeanVecTrainingData** training_data,
size_t dim,
Expand All @@ -42,5 +51,37 @@ struct SVS_RUNTIME_API LeanVecTrainingData {
};

} // namespace v0

namespace v1 {

struct SVS_RUNTIME_API LeanVecTrainingData : public v0::LeanVecTrainingData {
using v0::LeanVecTrainingData::destroy;
using v0::LeanVecTrainingData::save;

/* Build LeanVec training data (compression matrices) from the provided
* data.
* Accepts optional training queries for out-of-distribution training.
* @param training_data Output parameter to the created training data object
* @param dim Dimensionality of the input data and queries
* @param n Number of data points
* @param x Pointer to the input data
* @param n_train Number of training queries (can be 0)
* @param q Pointer to the training queries (can be nullptr)
* @param leanvec_dims Number of dimensions in the resulting LeanVec data
*/
static Status build(
LeanVecTrainingData** training_data,
size_t dim,
size_t n,
const float* x,
size_t n_train,
const float* q,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we keep these two new arguments at the end and default-initialize them to 0 and nullptr? In that case, the older v0 calls to build will still work without any modifications. Also, "n_train" is kind of confusing, since both the data and the queries are used for training. How about we explicitly say "n_queries"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's exactly the discussion I wanted to have. Your suggestion would indeed allow us to stick with v0 (only that we'd need a copy of the function and we can't use default values, due to ABI compatibility, but that's just a detail).

In my opinion, the order (n_data, const float* data, n_query, const float* queries, size_t leanvec_dims) just makes more sense than (n_data, const float* data, size_t leanvec_dims, n_query, const float* queries). But does it justify bumping to v1?

Your preference would be to stick with v0 for as long as possible?

size_t leanvec_dims
) noexcept;

static Status load(LeanVecTrainingData** training_data, std::istream& in) noexcept;
};

} // namespace v1
} // namespace runtime
} // namespace svs
5 changes: 5 additions & 0 deletions bindings/cpp/include/svs/runtime/vamana_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,10 @@ struct SVS_RUNTIME_API VamanaIndex {
};

} // namespace v0

// VamanaIndex is re-exported from v0 into v1 via a using-declaration.
namespace v1 {
using v0::VamanaIndex;
} // namespace v1

} // namespace runtime
} // namespace svs
26 changes: 20 additions & 6 deletions bindings/cpp/include/svs/runtime/version.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@
#ifndef SVS_RUNTIME_VERSION_MAJOR
/// Major version number - incremented for breaking API changes
/// When this changes, a new version namespace (e.g., v0 -> v1) is created
#define SVS_RUNTIME_VERSION_MAJOR 0
#define SVS_RUNTIME_VERSION_MAJOR 1
#endif

#ifndef SVS_RUNTIME_VERSION_MINOR
/// Minor version number - incremented for backward-compatible feature additions
#define SVS_RUNTIME_VERSION_MINOR 1
#define SVS_RUNTIME_VERSION_MINOR 0
#endif

#ifndef SVS_RUNTIME_VERSION_PATCH
Expand All @@ -51,21 +51,35 @@

#ifndef SVS_RUNTIME_VERSION_STRING
/// Complete version string
#define SVS_RUNTIME_VERSION_STRING "0.1.0"
#define SVS_RUNTIME_VERSION_STRING "1.0.0"
#endif

#ifndef SVS_RUNTIME_API_VERSION
/// Default to current major version if not specified by client
#define SVS_RUNTIME_API_VERSION SVS_RUNTIME_VERSION_MAJOR
#endif

#if (SVS_RUNTIME_API_VERSION == 0)
#if (SVS_RUNTIME_API_VERSION == 1)
/// Use v1 API
/// API version namespace (v1)
#define SVS_RUNTIME_CURRENT_API_NAMESPACE v1
// Namespace skeleton used when the v1 API is selected: v0 is declared as a
// regular namespace and v1 as the inline one.
namespace svs {
namespace runtime {
// NOTE(review): presumably declared here (non-inline) so that v0 names remain
// addressable as svs::runtime::v0::... while v1 is the default — confirm.
namespace v0 {}
/// All public runtime APIs live here and are accessible as svs::runtime::FunctionName
/// due to inline namespace
inline namespace v1 {
// Public runtime APIs will be defined in their respective headers
// IMPORTANT: include this header before other runtime headers to ensure proper versioning
}
} // namespace runtime
} // namespace svs
#elif (SVS_RUNTIME_API_VERSION == 0)
/// Use v0 API
/// Current API version namespace
/// API version namespace (v0)
#define SVS_RUNTIME_CURRENT_API_NAMESPACE v0
namespace svs {
namespace runtime {
/// Current API version namespace (v0)
/// All public runtime APIs live here and are accessible as svs::runtime::FunctionName
/// due to inline namespace
inline namespace v0 {
Expand Down
75 changes: 75 additions & 0 deletions bindings/cpp/src/training.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
namespace svs {
namespace runtime {

namespace v0 {

LeanVecTrainingData::~LeanVecTrainingData() = default;

Status LeanVecTrainingData::build(
Expand All @@ -50,13 +52,56 @@ LeanVecTrainingData::load(LeanVecTrainingData** training_data, std::istream& in)
*training_data = new LeanVecTrainingDataManager{LeanVecTrainingDataImpl::load(in)};
});
}

} // namespace v0

namespace v1 {

/// Build LeanVec training data from `n` vectors of dimensionality `dim` stored at `x`.
///
/// Training queries are optional. When `q` is null or `n_train` is zero, the
/// data-only (in-distribution) constructor is used; otherwise the `n_train`
/// queries at `q` are passed along for out-of-distribution training.
///
/// @param training_data Output parameter receiving the newly allocated object
/// @param dim Dimensionality of the input data and queries
/// @param n Number of data points
/// @param x Pointer to the input data
/// @param n_train Number of training queries (can be 0)
/// @param q Pointer to the training queries (can be nullptr)
/// @param leanvec_dims Number of dimensions in the resulting LeanVec data
/// @return The Status produced by runtime_error_wrapper for the wrapped work.
Status LeanVecTrainingData::build(
    LeanVecTrainingData** training_data,
    size_t dim,
    size_t n,
    const float* x,
    size_t n_train,
    const float* q,
    size_t leanvec_dims
) noexcept {
    return runtime_error_wrapper([&] {
        const auto data = svs::data::ConstSimpleDataView<float>(x, n, dim);
        // An empty query set carries no information for out-of-distribution
        // training, so a non-null pointer with n_train == 0 is treated the same
        // as "no queries" instead of handing an empty view to the OOD path.
        const bool has_queries = (q != nullptr) && (n_train > 0);
        if (!has_queries) {
            // In-distribution training: data only.
            *training_data =
                new LeanVecTrainingDataManager{LeanVecTrainingDataImpl{data, leanvec_dims}};
        } else {
            // Out-of-distribution training: data plus queries.
            const auto queries = svs::data::ConstSimpleDataView<float>(q, n_train, dim);
            *training_data = new LeanVecTrainingDataManager{
                LeanVecTrainingDataImpl{data, queries, leanvec_dims}};
        }
    });
}

/// Load training data from `in` by delegating to the v0 loader.
///
/// @param training_data Output parameter receiving the loaded object
/// @param in Stream to read the serialized training data from
/// @return The Status returned by v0::LeanVecTrainingData::load.
Status
LeanVecTrainingData::load(LeanVecTrainingData** training_data, std::istream& in) noexcept {
    v0::LeanVecTrainingData* loaded = nullptr;
    Status result = v0::LeanVecTrainingData::load(&loaded, in);
    // The downcast is safe: the concrete object created by the v0 loader is the
    // Manager implementation, which derives from v1::LeanVecTrainingData.
    *training_data = static_cast<v1::LeanVecTrainingData*>(loaded);
    return result;
}

} // namespace v1

} // namespace runtime
} // namespace svs

#else // SVS_LEANVEC_HEADER
namespace svs {
namespace runtime {

namespace v0 {
LeanVecTrainingData::~LeanVecTrainingData() = default;

Status LeanVecTrainingData::build(
LeanVecTrainingData** SVS_UNUSED(training_data),
size_t SVS_UNUSED(dim),
Expand All @@ -69,13 +114,41 @@ Status LeanVecTrainingData::build(
"LeanVecTrainingData is not supported in this build configuration."
);
}

Status LeanVecTrainingData::destroy(LeanVecTrainingData* SVS_UNUSED(training_data)
) noexcept {
return Status(
ErrorCode::NOT_IMPLEMENTED,
"LeanVecTrainingData is not supported in this build configuration."
);
}

Status LeanVecTrainingData::load(
LeanVecTrainingData** SVS_UNUSED(training_data), std::istream& SVS_UNUSED(in)
) noexcept {
return Status(
ErrorCode::NOT_IMPLEMENTED,
"LeanVecTrainingData is not supported in this build configuration."
);
}
} // namespace v0

namespace v1 {
Status LeanVecTrainingData::build(
LeanVecTrainingData** SVS_UNUSED(training_data),
size_t SVS_UNUSED(dim),
size_t SVS_UNUSED(n),
const float* SVS_UNUSED(x),
size_t SVS_UNUSED(n_train),
const float* SVS_UNUSED(q),
size_t SVS_UNUSED(leanvec_dims)
) noexcept {
return Status(
ErrorCode::NOT_IMPLEMENTED,
"LeanVecTrainingData is not supported in this build configuration."
);
}

Status LeanVecTrainingData::load(
LeanVecTrainingData** SVS_UNUSED(training_data), std::istream& SVS_UNUSED(in)
) noexcept {
Expand All @@ -84,6 +157,8 @@ Status LeanVecTrainingData::load(
"LeanVecTrainingData is not supported in this build configuration."
);
}
} // namespace v1

} // namespace runtime
} // namespace svs
#endif // SVS_LEANVEC_HEADER
20 changes: 19 additions & 1 deletion bindings/cpp/src/training_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ struct LeanVecTrainingDataImpl {
: leanvec_dims_{leanvec_dims}
, leanvec_matrices_{compute_leanvec_matrices(data, leanvec_dims)} {}

/// Construct training data from a dataset together with a set of queries.
/// Stores `leanvec_dims` and initializes the matrices with the result of
/// compute_leanvec_matrices_ood(data, queries, leanvec_dims).
LeanVecTrainingDataImpl(
const svs::data::ConstSimpleDataView<float>& data,
const svs::data::ConstSimpleDataView<float>& queries,
size_t leanvec_dims
)
: leanvec_dims_{leanvec_dims}
, leanvec_matrices_{compute_leanvec_matrices_ood(data, queries, leanvec_dims)} {}

size_t get_leanvec_dims() const { return leanvec_dims_; }
const LeanVecMatricesType& get_leanvec_matrices() const { return leanvec_matrices_; }

Expand Down Expand Up @@ -83,9 +91,19 @@ struct LeanVecTrainingDataImpl {
// leading explicit creation of a copy of the matrix "to avoid double free".
return LeanVecMatricesType{std::move(matrix), std::move(query_matrix)};
}

/// Compute the LeanVec matrices from both the data and the queries.
/// Forwards to svs::leanvec::compute_leanvec_matrices_ood<svs::Dynamic>, passing
/// `leanvec_dims` wrapped in svs::lib::MaybeStatic.
static LeanVecMatricesType compute_leanvec_matrices_ood(
const svs::data::ConstSimpleDataView<float>& data,
const svs::data::ConstSimpleDataView<float>& queries,
size_t leanvec_dims
) {
return svs::leanvec::compute_leanvec_matrices_ood<svs::Dynamic>(
data, queries, svs::lib::MaybeStatic{leanvec_dims}
);
}
};

struct LeanVecTrainingDataManager : public svs::runtime::LeanVecTrainingData {
struct LeanVecTrainingDataManager : public svs::runtime::v1::LeanVecTrainingData {
LeanVecTrainingDataManager(LeanVecTrainingDataImpl impl)
: impl_{std::move(impl)} {}

Expand Down
7 changes: 6 additions & 1 deletion tools/clang-format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,14 @@

# Allow users to supply a custom path to `clang-format`
CLANGFORMAT="${1:-clang-format}"
STAGED_ONLY="${2:-false}"
DIRECTORIES=( "bindings/python/src" "bindings/python/include" "bindings/cpp" "include" "benchmark" "tests" "utils" "examples/cpp" )

for i in "${DIRECTORIES[@]}"
do
find "./$i" \( -iname "*.h" -o -iname "*.cpp" \) ! -iname "*toml_impl.h" | xargs "$CLANGFORMAT" -i
if [[ "$STAGED_ONLY" == "true" ]]; then
git diff --cached --name-only --diff-filter=ACM | grep -E "^$i/.*\.(h|cpp)$" | grep -v "toml_impl.h" | xargs -r "$CLANGFORMAT" -i
else
find "./$i" \( -iname "*.h" -o -iname "*.cpp" \) ! -iname "*toml_impl.h" -print0 | xargs -n1 -0 "$CLANGFORMAT" -i
fi
done
Loading