From f7c9cc200ad87cbeac67c614f255c08bbde4d65c Mon Sep 17 00:00:00 2001 From: Gracjan Sadowicz Date: Tue, 23 Jun 2026 13:51:50 +0200 Subject: [PATCH 1/3] RDBC-1068 Fix OverlapTokens supported-methods set to match server/C# Overlap tokens are only consumed by the two paragraph chunking methods (PlainTextSplitParagraphs, MarkDownSplitParagraphs) on the server. The Python set wrongly allowed PlainTextSplit/PlainTextSplitLines (silently ignored by the server) and omitted MarkDownSplitParagraphs (spurious client-side rejection of a valid config). Also makes the validation message derive from the set so it can't drift again. --- .../operations/ai/chunking_options.py | 12 +++++------- .../test_chunking_options.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/ravendb/documents/operations/ai/chunking_options.py b/ravendb/documents/operations/ai/chunking_options.py index 3f2ed98e..f6d70481 100644 --- a/ravendb/documents/operations/ai/chunking_options.py +++ b/ravendb/documents/operations/ai/chunking_options.py @@ -11,11 +11,11 @@ class ChunkingMethod(Enum): HTML_STRIP = "HtmlStrip" -# Methods that support overlap tokens +# Methods that support overlap tokens. Mirrors the server (TextChunker.cs) and the C#/Node clients: +# only the two *paragraph* methods consume OverlapTokens; every other method ignores it. METHODS_SUPPORTING_OVERLAP_TOKENS = { - ChunkingMethod.PLAIN_TEXT_SPLIT, - ChunkingMethod.PLAIN_TEXT_SPLIT_LINES, ChunkingMethod.PLAIN_TEXT_SPLIT_PARAGRAPHS, + ChunkingMethod.MARK_DOWN_SPLIT_PARAGRAPHS, } @@ -84,10 +84,8 @@ def validate(self, source: str, errors: List[str]) -> None: errors.append(f"{source}: OverlapTokens cannot be greater than MaxTokensPerChunk.") if self.overlap_tokens > 0 and self.chunking_method not in METHODS_SUPPORTING_OVERLAP_TOKENS: - errors.append( - f"{source}: OverlapTokens is only supported for PlainTextSplit, " - f"PlainTextSplitLines, and PlainTextSplitParagraphs chunking methods." - ) + supported = ", ".join(sorted(method.value for method in METHODS_SUPPORTING_OVERLAP_TOKENS)) + errors.append(f"{source}: OverlapTokens is only supported for the following chunking methods: {supported}.") @staticmethod def are_equal(left: Optional["ChunkingOptions"], right: Optional["ChunkingOptions"]) -> bool: diff --git a/ravendb/tests/embeddings_generation_tests/test_chunking_options.py b/ravendb/tests/embeddings_generation_tests/test_chunking_options.py index ab896680..7038903a 100644 --- a/ravendb/tests/embeddings_generation_tests/test_chunking_options.py +++ b/ravendb/tests/embeddings_generation_tests/test_chunking_options.py @@ -45,6 +45,25 @@ def test_no_chunking_marker_bypasses_budget_validation(self): options.validate("source", errors) self.assertEqual([], errors) + def test_overlap_allowed_only_on_paragraph_methods(self): + # Matches the server / C# / Node clients: overlap is consumed only by the two paragraph methods. + for method in (ChunkingMethod.PLAIN_TEXT_SPLIT_PARAGRAPHS, ChunkingMethod.MARK_DOWN_SPLIT_PARAGRAPHS): + errors = [] + ChunkingOptions(method, 100, 10).validate("source", errors) + self.assertEqual([], errors, method) + + def test_overlap_rejected_on_non_paragraph_methods(self): + for method in ( + ChunkingMethod.PLAIN_TEXT_SPLIT, + ChunkingMethod.PLAIN_TEXT_SPLIT_LINES, + ChunkingMethod.MARK_DOWN_SPLIT_LINES, + ChunkingMethod.HTML_STRIP, + ): + errors = [] + ChunkingOptions(method, 100, 10).validate("source", errors) + self.assertEqual(1, len(errors), method) + self.assertIn("OverlapTokens", errors[0]) + if __name__ == "__main__": unittest.main() From 03cbf14fb0011a4efb50e0a0ee4d55ab43e9f149 Mon Sep 17 00:00:00 2001 From: Gracjan Sadowicz Date: Tue, 23 Jun 2026 14:06:35 +0200 Subject: [PATCH 2/3] RDBC-1068 Fix C#-contradiction bugs found in post-sync audit - ChunkingOptions.from_json: default MaxTokensPerChunk/OverlapTokens to 512/0 (matching C#'s non-nullable int initializers) instead of None, which crashed validate() with a TypeError on missing keys. - EmbeddingsGenerationConfiguration.validate: drop the spurious 'cannot specify both paths and transformation' rejection (C# allows both), and add the per-path ChunkingOptions validation loop C# performs. - OrderByToken.create_distance_ascending_wkt: emit spatial.distance(field, spatial.wkt(...)) instead of the malformed spatial.distance(field), spatial.wkt(...) (stray paren). Each fix has a regression test that fails on the pre-fix code. --- .../operations/ai/chunking_options.py | 4 +- .../ai/embeddings_generation_configuration.py | 10 ++- .../tokens/query_tokens/definitions.py | 2 +- .../test_chunking_options.py | 11 +++ ...dings_generation_configuration_validate.py | 68 +++++++++++++++++++ .../test_order_by_distance_wkt.py | 34 ++++++++++ 6 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 ravendb/tests/embeddings_generation_tests/test_embeddings_generation_configuration_validate.py create mode 100644 ravendb/tests/jvm_migrated_tests/client_tests/queries_tests/test_order_by_distance_wkt.py diff --git a/ravendb/documents/operations/ai/chunking_options.py b/ravendb/documents/operations/ai/chunking_options.py index f6d70481..a1b71753 100644 --- a/ravendb/documents/operations/ai/chunking_options.py +++ b/ravendb/documents/operations/ai/chunking_options.py @@ -44,8 +44,8 @@ def __init__( def from_json(cls, json_dict: Dict[str, Any]) -> "ChunkingOptions": return cls( chunking_method=ChunkingMethod(json_dict["ChunkingMethod"]), - max_tokens_per_chunk=json_dict.get("MaxTokensPerChunk", None), - overlap_tokens=json_dict.get("OverlapTokens", None), + max_tokens_per_chunk=json_dict.get("MaxTokensPerChunk", 512), + overlap_tokens=json_dict.get("OverlapTokens", 0), context_prefix=json_dict.get("ContextPrefix", None), ) diff --git a/ravendb/documents/operations/ai/embeddings_generation_configuration.py b/ravendb/documents/operations/ai/embeddings_generation_configuration.py index 8c2f3681..b20ef25b 100644 --- a/ravendb/documents/operations/ai/embeddings_generation_configuration.py +++ b/ravendb/documents/operations/ai/embeddings_generation_configuration.py @@ -135,8 +135,14 @@ def validate( if not has_paths and not has_transformation: errors.append("Either EmbeddingsPathConfigurations or EmbeddingsTransformation must be provided") - elif has_paths and has_transformation: - errors.append("Cannot specify both EmbeddingsPathConfigurations and EmbeddingsTransformation") + + # Validate each path's chunking options (mirrors the server / C# client; both may be set). + if self.embeddings_path_configurations: + for path_configuration in self.embeddings_path_configurations: + if path_configuration.chunking_options is not None: + path_configuration.chunking_options.validate(path_configuration.path, errors) + else: + errors.append(f"Path '{path_configuration.path}': ChunkingOptions must be provided.") # Validate transformation if provided if has_transformation: diff --git a/ravendb/documents/session/tokens/query_tokens/definitions.py b/ravendb/documents/session/tokens/query_tokens/definitions.py index dad77529..23f261fb 100644 --- a/ravendb/documents/session/tokens/query_tokens/definitions.py +++ b/ravendb/documents/session/tokens/query_tokens/definitions.py @@ -299,7 +299,7 @@ def create_distance_ascending_wkt( nulls: NullsOrdering = NullsOrdering.DEFAULT, ) -> OrderByToken: return cls( - f"spatial.distance({field_name}), " + f"spatial.distance({field_name}, " f"spatial.wkt(${wkt_parameter_name})" f"{'' if round_factor_parameter_name is None else ', $' + round_factor_parameter_name})", False, diff --git a/ravendb/tests/embeddings_generation_tests/test_chunking_options.py b/ravendb/tests/embeddings_generation_tests/test_chunking_options.py index 7038903a..4c930a65 100644 --- a/ravendb/tests/embeddings_generation_tests/test_chunking_options.py +++ b/ravendb/tests/embeddings_generation_tests/test_chunking_options.py @@ -64,6 +64,17 @@ def test_overlap_rejected_on_non_paragraph_methods(self): self.assertEqual(1, len(errors), method) self.assertIn("OverlapTokens", errors[0]) + def test_from_json_missing_int_keys_use_csharp_defaults(self): + # Missing MaxTokensPerChunk/OverlapTokens must default to 512/0 (matching C#'s non-nullable + # int initializers), not None. None would crash validate() with a TypeError. + options = ChunkingOptions.from_json({"ChunkingMethod": "HtmlStrip"}) + self.assertEqual(512, options.max_tokens_per_chunk) + self.assertEqual(0, options.overlap_tokens) + + errors = [] + options.validate("source", errors) # must not raise TypeError on the defaults + self.assertEqual([], errors) + if __name__ == "__main__": unittest.main() diff --git a/ravendb/tests/embeddings_generation_tests/test_embeddings_generation_configuration_validate.py b/ravendb/tests/embeddings_generation_tests/test_embeddings_generation_configuration_validate.py new file mode 100644 index 00000000..788b9b8b --- /dev/null +++ b/ravendb/tests/embeddings_generation_tests/test_embeddings_generation_configuration_validate.py @@ -0,0 +1,68 @@ +import unittest + +from ravendb.documents.operations.ai.chunking_options import ChunkingOptions, ChunkingMethod +from ravendb.documents.operations.ai.embedding_path_configuration import EmbeddingPathConfiguration +from ravendb.documents.operations.ai.embeddings_generation_configuration import EmbeddingsGenerationConfiguration +from ravendb.documents.operations.ai.embeddings_transformation import EmbeddingsTransformation + + +def _config(**overrides) -> EmbeddingsGenerationConfiguration: + # A base config that passes every check except the paths/transformation logic under test. + base = dict( + name="emb", + identifier="emb", + collection="Docs", + connection_string_name="cs", + chunking_options_for_querying=ChunkingOptions(ChunkingMethod.HTML_STRIP, 100, 0), + ) + base.update(overrides) + return EmbeddingsGenerationConfiguration(**base) + + +def _validate(config) -> list: + return config.validate(validate_name=False, validate_identifier=False) + + +class TestEmbeddingsGenerationConfigurationValidate(unittest.TestCase): + def test_paths_and_transformation_together_are_allowed(self): + # C# has no mutual-exclusivity rule; setting both must NOT be rejected client-side. + config = _config( + embeddings_path_configurations=[ + EmbeddingPathConfiguration("Name", ChunkingOptions(ChunkingMethod.HTML_STRIP, 100, 0)) + ], + embeddings_transformation=EmbeddingsTransformation(script="embeddings.generate(this.Name)"), + ) + errors = _validate(config) + self.assertNotIn("Cannot specify both EmbeddingsPathConfigurations and EmbeddingsTransformation", errors) + self.assertEqual([], errors) + + def test_path_missing_chunking_options_is_rejected(self): + # Mirrors C#: each path must carry ChunkingOptions. + config = _config(embeddings_path_configurations=[EmbeddingPathConfiguration("Name", None)]) + errors = _validate(config) + self.assertIn("Path 'Name': ChunkingOptions must be provided.", errors) + + def test_path_invalid_chunking_options_is_rejected(self): + # Each path's ChunkingOptions is validated with the path as the error source. + config = _config( + embeddings_path_configurations=[ + EmbeddingPathConfiguration("Name", ChunkingOptions(ChunkingMethod.HTML_STRIP, 0, 0)) + ] + ) + errors = _validate(config) + self.assertTrue( + any("Name" in e and "MaxTokensPerChunk" in e for e in errors), + f"expected a per-path MaxTokensPerChunk error, got: {errors}", + ) + + def test_path_with_valid_chunking_options_passes(self): + config = _config( + embeddings_path_configurations=[ + EmbeddingPathConfiguration("Name", ChunkingOptions(ChunkingMethod.HTML_STRIP, 100, 0)) + ] + ) + self.assertEqual([], _validate(config)) + + +if __name__ == "__main__": + unittest.main() diff --git a/ravendb/tests/jvm_migrated_tests/client_tests/queries_tests/test_order_by_distance_wkt.py b/ravendb/tests/jvm_migrated_tests/client_tests/queries_tests/test_order_by_distance_wkt.py new file mode 100644 index 00000000..34d42648 --- /dev/null +++ b/ravendb/tests/jvm_migrated_tests/client_tests/queries_tests/test_order_by_distance_wkt.py @@ -0,0 +1,34 @@ +import unittest + +from ravendb.documents.session.tokens.query_tokens.definitions import OrderByToken + + +def _rql(token: OrderByToken) -> str: + writer = [] + token.write_to(writer) + return "".join(writer) + + +class TestOrderByDistanceWktRql(unittest.TestCase): + """The WKT ascending distance factory must nest spatial.wkt(...) inside spatial.distance(field, ...), + matching C# (OrderByToken.cs) and its three sibling factories. The bug emitted a stray ')' right + after the field name: 'spatial.distance(loc), spatial.wkt(...)'.""" + + def test_distance_ascending_wkt_nests_wkt_inside_distance(self): + rql = _rql(OrderByToken.create_distance_ascending_wkt("loc", "p0", None)) + self.assertIn("spatial.distance(loc, spatial.wkt($p0))", rql) + self.assertNotIn("spatial.distance(loc),", rql) + + def test_distance_ascending_wkt_matches_descending_structure(self): + asc = _rql(OrderByToken.create_distance_ascending_wkt("loc", "p0", None)) + desc = _rql(OrderByToken.create_distance_descending_wkt("loc", "p0", None)) + # Identical apart from the trailing descending marker. + self.assertEqual(asc, desc.replace(" desc", "")) + + def test_distance_ascending_wkt_with_round_factor(self): + rql = _rql(OrderByToken.create_distance_ascending_wkt("loc", "p0", "p1")) + self.assertIn("spatial.distance(loc, spatial.wkt($p0), $p1)", rql) + + +if __name__ == "__main__": + unittest.main() From 7aee9aad6d7b3a781285537a32c3199c5dcaa1e1 Mon Sep 17 00:00:00 2001 From: Gracjan Sadowicz Date: Tue, 23 Jun 2026 14:17:10 +0200 Subject: [PATCH 3/3] RDBC-1068 Fix WKT order-by-distance parameterization + DB-facing tests _order_by_distance_wkt / _order_by_distance_descending_wkt passed the raw WKT string straight in as the parameter *name*, producing spatial.wkt($POINT(...)) which the server rejects at parse time. Register it via add_query_parameter first, matching C# (AbstractDocumentQuery.Spatial). This is the calling-side half of the WKT distance bug (the token-side stray paren was fixed in the previous commit). Adds DB-facing regression tests (embedded server): - spatial: order_by_distance_wkt ascending + descending (fail pre-fix: server 500 parse error). - embeddings: a config with BOTH paths and a transformation is accepted by the server, confirming the dropped client-side mutual-exclusivity rule. --- ravendb/documents/session/query.py | 8 +++++-- .../test_tasks_management.py | 22 +++++++++++++++++++ .../test_order_by_distance_quoting.py | 22 +++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/ravendb/documents/session/query.py b/ravendb/documents/session/query.py index 3d6402cd..5d3542bd 100644 --- a/ravendb/documents/session/query.py +++ b/ravendb/documents/session/query.py @@ -1571,7 +1571,9 @@ def _order_by_distance_wkt( round_factor_parameter_name = None if round_factor == 0 else self.__add_query_parameter(round_factor) self._order_by_tokens.append( - OrderByToken.create_distance_ascending_wkt(field_name, shape_wkt, round_factor_parameter_name, nulls) + OrderByToken.create_distance_ascending_wkt( + field_name, self.__add_query_parameter(shape_wkt), round_factor_parameter_name, nulls + ) ) def _order_by_distance_descending( @@ -1627,7 +1629,9 @@ def _order_by_distance_descending_wkt( round_factor_parameter_name = None if round_factor == 0 else self.__add_query_parameter(round_factor) self._order_by_tokens.append( - OrderByToken.create_distance_descending_wkt(field_name, shape_wkt, round_factor_parameter_name, nulls) + OrderByToken.create_distance_descending_wkt( + field_name, self.__add_query_parameter(shape_wkt), round_factor_parameter_name, nulls + ) ) def _init_sync(self) -> None: diff --git a/ravendb/tests/embeddings_generation_tests/test_tasks_management.py b/ravendb/tests/embeddings_generation_tests/test_tasks_management.py index ce1be190..98233fa2 100644 --- a/ravendb/tests/embeddings_generation_tests/test_tasks_management.py +++ b/ravendb/tests/embeddings_generation_tests/test_tasks_management.py @@ -14,6 +14,7 @@ UpdateEmbeddingsGenerationOperation, ) from ravendb.documents.operations.ai.embedded_settings import EmbeddedSettings +from ravendb.documents.operations.ai.embeddings_transformation import EmbeddingsTransformation from ravendb.documents.operations.connection_string.put_connection_string_operation import PutConnectionStringOperation from ravendb.documents.operations.connection_string.remove_connection_string_operation import ( RemoveConnectionStringOperation, @@ -214,6 +215,27 @@ def test_embeddings_generation_configuration_without_chunking_options_should_pro self.assertIn("PostContent", error_message) self.assertIn("ChunkingOptions", error_message) + def test_can_add_task_with_both_paths_and_transformation(self): + """The server imposes no mutual-exclusivity between paths and transformation, so a config + that sets BOTH must be accepted (the client must not reject it pre-send either).""" + config = EmbeddingsGenerationConfiguration( + name="ai-task-both", + connection_string_name=self.CONNECTION_STRING_NAME, + embeddings_path_configurations=[ + EmbeddingPathConfiguration(path="PostContent", chunking_options=self.DEFAULT_CHUNKING_OPTIONS), + ], + embeddings_transformation=EmbeddingsTransformation( + script="embeddings.generate(this.Comments)", + chunking_options=self.DEFAULT_CHUNKING_OPTIONS, + ), + collection="Posts", + chunking_options_for_querying=self.DEFAULT_CHUNKING_OPTIONS, + ) + + add_result = self.store.maintenance.send(AddEmbeddingsGenerationOperation(config)) + self.assertIsNotNone(add_result.task_id) + self._created_task_ids.append(add_result.task_id) + def test_database_record_contains_embeddings_generations(self): config = self._create_valid_config() diff --git a/ravendb/tests/jvm_migrated_tests/spatial_tests/test_order_by_distance_quoting.py b/ravendb/tests/jvm_migrated_tests/spatial_tests/test_order_by_distance_quoting.py index 9958a443..46ee8043 100644 --- a/ravendb/tests/jvm_migrated_tests/spatial_tests/test_order_by_distance_quoting.py +++ b/ravendb/tests/jvm_migrated_tests/spatial_tests/test_order_by_distance_quoting.py @@ -48,6 +48,28 @@ def test_order_by_distance_descending_with_dynamic_point_field(self): self.assertGreaterEqual(len(results), 2) self.assertEqual("geo/2", results[0].Id) + def test_order_by_distance_wkt_ascending_with_dynamic_point_field(self): + # Regression for the create_distance_ascending_wkt RQL bug: a stray ')' produced + # 'spatial.distance(), spatial.wkt(...)', which the server rejects at parse time. + # WKT is "POINT(longitude latitude)"; the point below is geo/1 (Greenwich). + with self.store.open_session() as s: + results = list( + s.query(object_type=_Geo).order_by_distance_wkt(PointField("lat", "lng"), "POINT(0.0015 51.4779)") + ) + self.assertGreaterEqual(len(results), 2) + self.assertEqual("geo/1", results[0].Id) + + def test_order_by_distance_wkt_descending_with_dynamic_point_field(self): + # Same WKT path, descending: the farthest document (New York) comes first. + with self.store.open_session() as s: + results = list( + s.query(object_type=_Geo).order_by_distance_descending_wkt( + PointField("lat", "lng"), "POINT(0.0015 51.4779)" + ) + ) + self.assertGreaterEqual(len(results), 2) + self.assertEqual("geo/2", results[0].Id) + if __name__ == "__main__": unittest.main()