Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5a4b74d
Cosmos Java: share PartitionKeyRangeCache across CosmosClients target…
xinlian12 Jun 18, 2026
f3fa638
Cosmos Java: use URI (not String) as registry key for case-insensitiv…
xinlian12 Jun 19, 2026
75f93d5
Cosmos Java: keep registry key as serviceEndpoint URI (not _rid) for …
xinlian12 Jun 19, 2026
05f6780
Cosmos Java: add PhantomReference-based leak safety net for unclosed …
xinlian12 Jun 19, 2026
cbd47a8
Cosmos Java: use azure-core ReferenceManager for leaked-client safety…
xinlian12 Jun 19, 2026
9b43616
remove kafka test output
xinlian12 Jun 19, 2026
892a7d7
Cosmos Java: trim comments to core logic; drop cross-SDK references
xinlian12 Jun 19, 2026
1a5e92d
Cosmos Java: address PR review feedback
xinlian12 Jun 19, 2026
9356fbc
Retrigger CI
xinlian12 Jun 19, 2026
3afed67
Retrigger CI
xinlian12 Jun 19, 2026
39913c6
Cosmos Java: clarify 2-arg RxPartitionKeyRangeCache ctor behavior
xinlian12 Jun 19, 2026
022836d
Retrigger CI
xinlian12 Jun 19, 2026
c3eaba8
Retrigger CI
xinlian12 Jun 19, 2026
d585e36
Retrigger CI
xinlian12 Jun 20, 2026
7285d1c
Remove 2-arg RxPartitionKeyRangeCache ctor; require explicit endpoint
xinlian12 Jun 22, 2026
d6494d8
Key shared PartitionKeyRangeCache registry by database account id
xinlian12 Jun 22, 2026
7a693fa
Add e2e tests for shared PartitionKeyRangeCache + refresh Configs com…
xinlian12 Jun 22, 2026
4fad3a9
Address PR review on e2e test: data provider, TestObject, public acce…
xinlian12 Jun 22, 2026
a806ee6
Fix two regressions caught by CI live tests
xinlian12 Jun 23, 2026
5ffb1c0
Revert registry key to service endpoint URI (drop fragile account-id …
xinlian12 Jun 23, 2026
9162bb2
Fix CI: remove unworkable e2e negative test; revert PKR_LOOK_UP diagn…
xinlian12 Jun 23, 2026
99dfd30
Cosmos: restore outer-site PKR_LOOK_UP diagnostics; fix FaultInjectio…
xinlian12 Jun 23, 2026
3559d7f
Cosmos: make CosmosContainerOpenConnectionsAndInitCachesTest robust t…
xinlian12 Jun 23, 2026
1db1321
Cosmos: drop racy exact refcount-delta assertions in SharedPartitionK…
xinlian12 Jun 23, 2026
abb1681
Cosmos: fix NPE in CosmosDiagnosticsContext.getRequestInfo() for stor…
xinlian12 Jun 24, 2026
0f35e7b
Merge remote-tracking branch 'upstream/main' into feature/shared-part…
xinlian12 Jun 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ public void openConnectionsAndInitCachesForDirectMode(boolean useAsync) throws I

assertThat(provider.count()).isEqualTo(0);
assertThat(collectionInfoByNameMap.size()).isEqualTo(0);
assertThat(routingMap.size()).isEqualTo(0);
// The partition-key-range (routing map) cache is shared across clients targeting the same
// service endpoint, so its size reflects every container routed to in this JVM. Assert on this
// container's own entry (populated by openConnectionsAndInitCaches) instead of the total size.
assertThat(ReflectionUtils.isInitialized(asyncContainer).get()).isFalse();

// Calling it twice to make sure no side effect of second time no-op call
Expand All @@ -143,7 +145,8 @@ public void openConnectionsAndInitCachesForDirectMode(boolean useAsync) throws I
}

assertThat(collectionInfoByNameMap.size()).isEqualTo(1);
assertThat(routingMap.size()).isEqualTo(1);
String collectionRid = asyncContainer.read().block().getProperties().getResourceId();
assertThat(routingMap).containsKey(collectionRid);
assertThat(ReflectionUtils.isInitialized(asyncContainer).get()).isTrue();

GlobalAddressResolver globalAddressResolver = ReflectionUtils.getGlobalAddressResolver(rxDocumentClient);
Expand Down Expand Up @@ -214,11 +217,12 @@ public void openConnectionsAndInitCachesForGatewayMode(boolean useAsync) {

RxDocumentClientImpl rxDocumentClient = (RxDocumentClientImpl) asyncClient.getDocClientWrapper();

ConcurrentHashMap<String, ?> routingMap = getRoutingMap(rxDocumentClient);
ConcurrentHashMap<String, ?> collectionInfoByNameMap = getCollectionInfoByNameMap(rxDocumentClient);

assertThat(collectionInfoByNameMap.size()).isEqualTo(0);
assertThat(routingMap.size()).isEqualTo(0);
// The routing-map (partition-key-range) cache is shared per service endpoint; gateway-mode
// openConnectionsAndInitCaches does not populate it, but a sibling direct-mode test may have,
// so the shared routing map's size is not asserted here.
assertThat(ReflectionUtils.isInitialized(asyncContainer).get()).isFalse();

// Verifying no error when initializeContainer called on gateway mode
Expand All @@ -232,7 +236,6 @@ public void openConnectionsAndInitCachesForGatewayMode(boolean useAsync) {
}

assertThat(collectionInfoByNameMap.size()).isEqualTo(0);
assertThat(routingMap.size()).isEqualTo(0);
assertThat(ReflectionUtils.isInitialized(asyncContainer).get()).isTrue();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
package com.azure.cosmos;

import com.azure.cosmos.implementation.RxDocumentClientImpl;
import com.azure.cosmos.implementation.caches.AsyncCacheNonBlocking;
import com.azure.cosmos.implementation.caches.RxPartitionKeyRangeCache;
import com.azure.cosmos.implementation.caches.SharedPartitionKeyRangeCacheRegistry;
import com.azure.cosmos.implementation.directconnectivity.ReflectionUtils;
import com.azure.cosmos.implementation.routing.CollectionRoutingMap;
import com.azure.cosmos.models.CosmosContainerProperties;
import com.azure.cosmos.models.CosmosItemRequestOptions;
import com.azure.cosmos.models.CosmosItemResponse;
import com.azure.cosmos.models.PartitionKey;
import com.azure.cosmos.models.ThroughputProperties;
import com.azure.cosmos.rx.TestSuiteBase;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Factory;
import org.testng.annotations.Test;

import java.lang.reflect.Method;
import java.net.URI;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

import static org.assertj.core.api.Assertions.assertThat;

/**
* End-to-end tests for {@link SharedPartitionKeyRangeCacheRegistry}: spin up real
* {@link CosmosAsyncClient} instances, perform partition-key-routed operations to
* populate the routing-map cache, and verify the registry's sharing semantics.
*
* <p>Sharing is keyed by the service endpoint {@link URI} configured on
* {@link CosmosClientBuilder}. Two clients configured with the same endpoint
* URI share the cache; clients configured with different endpoint URIs (e.g.
* the global endpoint vs a regional endpoint of the same logical account) do
* <b>not</b> share — see {@link SharedPartitionKeyRangeCacheRegistry} javadoc
* for the rationale.</p>
*/
public class SharedPartitionKeyRangeCacheE2ETest extends TestSuiteBase {
private static final Logger logger = LoggerFactory.getLogger(SharedPartitionKeyRangeCacheE2ETest.class);

private static final int TIMEOUT = 90_000;
private static final int SETUP_TIMEOUT = 60_000;
private static final int SHUTDOWN_TIMEOUT = 30_000;

private CosmosAsyncClient setupClient;
private CosmosAsyncDatabase database;
private CosmosAsyncContainer container;
private URI serviceEndpoint;

@Factory(dataProvider = "simpleGatewayClient")
public SharedPartitionKeyRangeCacheE2ETest(CosmosClientBuilder clientBuilder) {
super(clientBuilder);
}

@BeforeClass(groups = {"emulator", "fast"}, timeOut = SETUP_TIMEOUT)
public void before() {
this.setupClient = getClientBuilder().buildAsyncClient();
this.database = getSharedCosmosDatabase(this.setupClient);

String containerId = "pkr-share-e2e-" + UUID.randomUUID();
CosmosContainerProperties properties =
new CosmosContainerProperties(containerId, "/mypk");
this.database
.createContainer(properties, ThroughputProperties.createManualThroughput(400))
.block();
this.container = this.database.getContainer(containerId);

this.serviceEndpoint = serviceEndpointOf(this.setupClient);
assertThat(this.serviceEndpoint)
.as("service endpoint must be available after client init")
.isNotNull();
}

@AfterClass(groups = {"emulator", "fast"}, timeOut = SHUTDOWN_TIMEOUT, alwaysRun = true)
public void after() {
if (this.container != null) {
try {
this.container.delete().block();
} catch (Exception e) {
logger.warn("Failed to delete e2e container", e);
}
}
safeClose(this.setupClient);
}

/**
* Two {@link CosmosAsyncClient} instances configured with the same service
* endpoint must share the underlying {@link AsyncCacheNonBlocking} routing-map
* storage, and the registry refcount must reflect both holders.
*/
@Test(groups = {"emulator", "fast"}, timeOut = TIMEOUT)
public void twoClientsOnSameEndpointShareRoutingMapStorage() {
CosmosAsyncClient clientA = null;
CosmosAsyncClient clientB = null;
try {
clientA = getClientBuilder().buildAsyncClient();
clientB = getClientBuilder().buildAsyncClient();

// Trigger PK-routed operations on both clients so the routing-map cache populates.
TestObject seed = TestObject.create();
createItem(clientA, seed);
readItemSilently(clientA, seed.getMypk());
readItemSilently(clientB, seed.getMypk());

AsyncCacheNonBlocking<String, CollectionRoutingMap> storageA = routingMapStorageOf(clientA);
AsyncCacheNonBlocking<String, CollectionRoutingMap> storageB = routingMapStorageOf(clientB);

assertThat(storageA)
.as("Two CosmosAsyncClients on the same endpoint must share the routing-map AsyncCacheNonBlocking instance")
.isSameAs(storageB);

int refCount = registryReferenceCount(this.serviceEndpoint);
assertThat(refCount)
.as("Registry refcount for endpoint [%s] must include both clients", this.serviceEndpoint)
.isGreaterThanOrEqualTo(2);

ConcurrentHashMap<String, ?> values =
ReflectionUtils.getValueMapNonBlockingCache(storageA);
assertThat(values)
.as("Routing-map cache must contain at least one entry after PK-routed reads")
.isNotEmpty();
} finally {
// The registry refcount for this endpoint is shared with every other client/test that
// targets it, so in the parallel "fast" suite the absolute count changes under us. The
// exact close-delta is therefore not asserted here; that wiring is covered
// deterministically by SharedPartitionKeyRangeCacheRegistryTest. This e2e test's value is
// proving that two real clients share the same storage instance (asserted above).
safeClose(clientA);
safeClose(clientB);
}
}

// --- helpers ----------------------------------------------------------------

private void createItem(CosmosAsyncClient client, TestObject item) {
CosmosAsyncContainer c = client
.getDatabase(this.database.getId())
.getContainer(this.container.getId());
c.createItem(item, new PartitionKey(item.getMypk()), new CosmosItemRequestOptions()).block();
}

private void readItemSilently(CosmosAsyncClient client, String pk) {
// The cache is populated by the resolve step regardless of whether the doc exists;
// we issue a random-id read and tolerate 404.
CosmosAsyncContainer c = client
.getDatabase(this.database.getId())
.getContainer(this.container.getId());
try {
CosmosItemResponse<TestObject> resp = c.readItem(
UUID.randomUUID().toString(),
new PartitionKey(pk),
new CosmosItemRequestOptions(),
TestObject.class).block();
assertThat(resp).isNotNull();
} catch (CosmosException ex) {
if (ex.getStatusCode() != 404) {
throw ex;
}
}
}

private static AsyncCacheNonBlocking<String, CollectionRoutingMap> routingMapStorageOf(CosmosAsyncClient client) {
RxDocumentClientImpl rxDocumentClient =
(RxDocumentClientImpl) CosmosBridgeInternal.getAsyncDocumentClient(client);
RxPartitionKeyRangeCache partitionKeyRangeCache = rxDocumentClient.getPartitionKeyRangeCache();
return ReflectionUtils.getRoutingMapAsyncCacheNonBlocking(partitionKeyRangeCache);
}

private static URI serviceEndpointOf(CosmosAsyncClient client) {
RxDocumentClientImpl rxDocumentClient =
(RxDocumentClientImpl) CosmosBridgeInternal.getAsyncDocumentClient(client);
return rxDocumentClient.getServiceEndpoint();
}

/**
* The registry's {@code referenceCount} accessor is package-private (test-only).
* Reflect into it from this package; widening visibility for a test-only check
* would pollute the implementation class's surface.
*/
private static int registryReferenceCount(URI endpoint) {
try {
SharedPartitionKeyRangeCacheRegistry registry = SharedPartitionKeyRangeCacheRegistry.getInstance();
Method m = SharedPartitionKeyRangeCacheRegistry.class.getDeclaredMethod("referenceCount", URI.class);
m.setAccessible(true);
return (Integer) m.invoke(registry, endpoint);
} catch (ReflectiveOperationException e) {
throw new RuntimeException("Failed to reflect SharedPartitionKeyRangeCacheRegistry.referenceCount", e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -610,25 +610,32 @@ public void faultInjectionServerErrorRuleTests_PartitionKeyRanges_DelayError(

assertThat(metadataDiagnosticList.size()).isGreaterThan(0);

JsonNode pkRangesLookup = null;
// PARTITION_KEY_RANGE_LOOK_UP is recorded once per cache lookup (cache hits and the
// actual /pkranges fetch alike), so several entries can be present. The injected delay
// only applies to the real network fetch, so pick the entry with the largest duration.
boolean pkRangesLookupFound = false;
long maxPkRangesLookupDurationInMs = -1;
for (int i = 0; i < metadataDiagnosticList.size(); i++) {
if (metadataDiagnosticList
.get(i)
JsonNode metadataDiagnostic = metadataDiagnosticList.get(i);
if (metadataDiagnostic
.get("metaDataName")
.asText()
.equalsIgnoreCase(MetadataDiagnosticsContext.MetadataType.PARTITION_KEY_RANGE_LOOK_UP.name())) {
pkRangesLookup = metadataDiagnosticList.get(i);
break;
pkRangesLookupFound = true;
maxPkRangesLookupDurationInMs =
Math.max(maxPkRangesLookupDurationInMs, metadataDiagnostic.get("durationinMS").asLong());
}
}

assertThat(pkRangesLookup).isNotNull();
assertThat(pkRangesLookupFound)
.as("At least one PARTITION_KEY_RANGE_LOOK_UP diagnostic must be present")
.isTrue();
if (faultInjectionServerErrorType == FaultInjectionServerErrorType.CONNECTION_DELAY) {
assertThat(pkRangesLookup.get("durationinMS").asLong()).isGreaterThanOrEqualTo(45 * 1000 * Math.min(applyLimit, 3)); // the duration will be at least one connection timeout
assertThat(maxPkRangesLookupDurationInMs).isGreaterThanOrEqualTo(45 * 1000 * Math.min(applyLimit, 3)); // the duration will be at least one connection timeout
}

if (faultInjectionServerErrorType == FaultInjectionServerErrorType.RESPONSE_DELAY) {
assertThat(pkRangesLookup.get("durationinMS").asLong()).isGreaterThanOrEqualTo(500 * Math.min(applyLimit, 3)); // the duration will be at least one response timeout
assertThat(maxPkRangesLookupDurationInMs).isGreaterThanOrEqualTo(500 * Math.min(applyLimit, 3)); // the duration will be at least one response timeout
}

} catch (CosmosException cosmosException) {
Expand Down
Loading
Loading