From ad924ad03044fb1e0fdb726c20fc776a4147b766 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Fri, 8 May 2026 17:10:49 +0900
Subject: [PATCH 1/3] fix(kv): bound verifyLeaderEngine ReadIndex with 5s deadline

verifyLeaderEngine() called engine.VerifyLeader with context.Background(),
so any caller without an upstream context blocked indefinitely on a
ReadIndex round-trip. A single transient stall accumulated callers
permanently because they never timed out and never returned.

Production hit this on 2026-05-08: follower 192.168.0.214 lost its network
route (no route to host, ARP INCOMPLETE), the leader's ReadIndex completion
stalled intermittently, and verify-callers piled up at roughly 9/sec
without bound. After ~37 minutes the leader (192.168.0.212) held 20,560
goroutines (20,478 in submitRead select, oldest 39 minutes), CPU pinned at
1870% (Engine.run Ready loop walks pendingReads O(N) per tick, so the queue
feeds back on itself), and host MemAvailable trended toward 0 until OOM.
Each new leader after failover re-entered the same death spiral.

Affected callers (all use the no-context variant):

- LeaderProxy.Commit / .Abort -- every Redis write hits this
- Coordinate.VerifyLeader / ShardedCoordinator.VerifyLeader[ForKey]
- adapter S3/SQS /healthz/leader handlers (Caddy probes)
- main_admin.go LeaderProbe (admin dashboard /admin/healthz/leader)
- adapter/sqs.go isVerifiedSQSLeader, adapter/s3.go isVerifiedS3Leader

Fix: cap the no-context path at 5s (matching leaderForwardTimeout). On
timeout, callers see context.DeadlineExceeded -- LeaderProxy falls back to
forwardWithRetry as it already does for any verify failure, healthz
handlers report not-leader, and the lock resolver skips this tick.

Self-review (5 lenses):

1. Data loss -- none. The fix only shortens a never-returning wait.
   verifyLeaderEngine is a freshness check, not a write path.
2. Concurrency -- the new ctx is local to each call (defer cancel), no
   shared state, no lock changes. Engine-side blocking semantics
   unchanged; we just stop waiting forever.
3. Performance -- positive. Removes the unbounded goroutine pile-up and
   the O(N) pendingReads walk it caused. No new allocations on the
   success path beyond the WithTimeout context.
4. Data consistency -- ReadIndex still completes when quorum heartbeats
   land within 5s. A timeout means the caller could not confirm
   leadership freshness, which the existing "fall through to forward"
   path already treats as a soft failure.
5. Test coverage -- kv/raft_engine_test.go pins the regression: a
   blockingLeaderView that holds VerifyLeader on its ctx must surface
   DeadlineExceeded within 2x verifyLeaderTimeout.

Test: go test -race -count=1 ./kv -- 9.3s, all green.

Future work (separate PRs): plumb real request contexts through
LeaderProxy.Commit/Abort and the healthz handlers so a client-side
deadline cascades naturally instead of relying on this fixed bound.
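
A rough sketch of the healthz half of that follow-up (illustrative only:
the handler name and wiring below are assumptions, and the real adapter
handlers would go through an exported verify entry point rather than
calling the package-private verifyLeaderEngineCtx directly):

    // Hypothetical handler, assuming net/http and access to the group's
    // raftengine.LeaderView. The point is that r.Context(), which carries
    // the probe's own deadline, reaches verifyLeaderEngineCtx instead of
    // the fixed 5s no-context path.
    func leaderHealthz(view raftengine.LeaderView) http.HandlerFunc {
        return func(w http.ResponseWriter, r *http.Request) {
            if err := verifyLeaderEngineCtx(r.Context(), view); err != nil {
                http.Error(w, "not leader", http.StatusServiceUnavailable)
                return
            }
            w.WriteHeader(http.StatusOK)
        }
    }

LeaderProxy.Commit/Abort would take the same shape once they accept a
caller-supplied context.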
---
 kv/raft_engine.go      | 30 ++++++++++++++++++++++-
 kv/raft_engine_test.go | 54 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 kv/raft_engine_test.go

diff --git a/kv/raft_engine.go b/kv/raft_engine.go
index b5f68b8c6..c57b890e5 100644
--- a/kv/raft_engine.go
+++ b/kv/raft_engine.go
@@ -2,12 +2,38 @@ package kv
 
 import (
 	"context"
+	"time"
 
 	"github.com/bootjp/elastickv/internal/monoclock"
 	"github.com/bootjp/elastickv/internal/raftengine"
 	"github.com/cockroachdb/errors"
 )
 
+// verifyLeaderTimeout caps how long the no-context verifyLeaderEngine path
+// is willing to wait for a ReadIndex round-trip.
+//
+// A previous version called engine.VerifyLeader with context.Background(),
+// so callers without an upstream deadline (LeaderProxy.Commit/Abort,
+// Coordinate.VerifyLeader, ShardedCoordinator.VerifyLeader[ForKey], and
+// the S3/SQS/admin /healthz/leader handlers) blocked indefinitely whenever
+// ReadIndex completion stalled — a single transient stall accumulated
+// callers permanently.
+//
+// Production hit this on 2026-05-08: a follower (192.168.0.214) lost its
+// network route mid-flight and the leader's ReadIndex completion stalled
+// intermittently. verifyLeaderEngine callers piled up at ~9/sec without
+// bound; after ~37 minutes the leader was holding 20,560 goroutines
+// (20,478 in submitRead select, oldest 39 minutes), CPU pinned at 1870%
+// (the Engine.run Ready loop walks pendingReads O(N) per tick, so the
+// queue feeds back on itself), and host MemAvailable trended toward 0
+// until OOM. The same pattern repeated on each new leader after failover.
+//
+// 5s matches leaderForwardTimeout: a verify that takes longer than a
+// single forward RPC is, by definition, useless as a freshness check,
+// and the proxy's verify-then-forward path stays within its 5s retry
+// budget.
+const verifyLeaderTimeout = 5 * time.Second
+
 func engineForGroup(g *ShardGroup) raftengine.Engine {
 	if g == nil {
 		return nil
@@ -41,7 +67,9 @@ func verifyLeaderEngineCtx(ctx context.Context, engine raftengine.LeaderView) er
 }
 
 func verifyLeaderEngine(engine raftengine.LeaderView) error {
-	return verifyLeaderEngineCtx(context.Background(), engine)
+	ctx, cancel := context.WithTimeout(context.Background(), verifyLeaderTimeout)
+	defer cancel()
+	return verifyLeaderEngineCtx(ctx, engine)
 }
 
 func linearizableReadEngineCtx(ctx context.Context, engine raftengine.LeaderView) (uint64, error) {
diff --git a/kv/raft_engine_test.go b/kv/raft_engine_test.go
new file mode 100644
index 000000000..5fa501c13
--- /dev/null
+++ b/kv/raft_engine_test.go
@@ -0,0 +1,54 @@
+package kv
+
+import (
+	"context"
+	stderrors "errors"
+	"testing"
+	"time"
+
+	"github.com/bootjp/elastickv/internal/raftengine"
+)
+
+// blockingLeaderView is a LeaderView whose VerifyLeader blocks until ctx is
+// cancelled, modelling the production pathology where ReadIndex stalls
+// because heartbeat acks fail to land. LinearizableRead is similarly
+// well-behaved on cancel; State / Leader are stamped enough to satisfy the
+// callers under test.
+type blockingLeaderView struct{}
+
+func (blockingLeaderView) State() raftengine.State { return raftengine.StateLeader }
+func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
+func (blockingLeaderView) VerifyLeader(ctx context.Context) error {
+	<-ctx.Done()
+	return ctx.Err()
+}
+func (blockingLeaderView) LinearizableRead(ctx context.Context) (uint64, error) {
+	<-ctx.Done()
+	return 0, ctx.Err()
+}
+
+// TestVerifyLeaderEngine_BoundsBlockingReadIndex pins the regression: if a
+// stalled ReadIndex used to return only when the underlying ctx fired, but
+// callers passed context.Background(), the goroutine pinned forever. After
+// 2026-05-08-style stalls in production this must complete within roughly
+// verifyLeaderTimeout, surfacing context.DeadlineExceeded.
+func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
+	t.Parallel()
+
+	start := time.Now()
+	err := verifyLeaderEngine(blockingLeaderView{})
+	elapsed := time.Since(start)
+
+	if err == nil {
+		t.Fatalf("verifyLeaderEngine(blocking) returned nil; expected DeadlineExceeded")
+	}
+	if !stderrors.Is(err, context.DeadlineExceeded) {
+		t.Fatalf("verifyLeaderEngine(blocking) err = %v; want DeadlineExceeded", err)
+	}
+	// Allow generous slack so a slow CI host does not flake; the point is
+	// not to assert a tight bound but to prove the call returns at all.
+	if elapsed > 2*verifyLeaderTimeout {
+		t.Fatalf("verifyLeaderEngine(blocking) returned after %s; want <= 2x verifyLeaderTimeout (%s)", elapsed, verifyLeaderTimeout)
+	}
+}
+

From 0be9fc76970cdb789771243cd3d951620465625e Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Fri, 8 May 2026 17:36:42 +0900
Subject: [PATCH 2/3] test(kv): address PR #745 round-1 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-1 review on commit ad924ad0:

- gemini medium: assert minimum elapsed time, not just upper bound.
  Without a lower bound a future regression returning DeadlineExceeded
  before actually waiting would silently pass. Added a lower-bound check
  with 200ms slack so a slow CI scheduler that trips ctx.Done() a hair
  before the wall clock catches up does not flake.
- claude bot (style): trim incident narrative from the verifyLeaderTimeout
  doc comment. Specific IPs, goroutine counts, and timestamps belong in
  the commit message and PR body, not the source — they go stale and add
  no design value at the call site. Kept the rationale (why 5s, the O(N)
  pendingReads feedback loop, match leaderForwardTimeout) and added a
  "see PR #745" reference for readers who want the full incident detail.
- claude bot: switch the regression test from stdlib `stderrors "errors"`
  to `github.com/cockroachdb/errors` for consistency with the project
  convention (CLAUDE.md mandates cockroachdb/errors at boundaries; the
  test is a boundary). Also drops the import alias which made gci
  complain about ordering.
- claude bot: add `if testing.Short() { t.Skip(...) }` guard so
  `go test -short ./kv/...` does not pay the verifyLeaderTimeout (5s)
  wait. Default `make test` still runs it.
- reviewdog/golangci (gci): import order fixed by reordering third-party
  (cockroachdb) above local (bootjp) per the project's default gci
  sections (standard, default, prefix(...)). No behavior change.

Test still passes:
  go test -race -count=1 -run TestVerifyLeaderEngine ./kv -- 6.06s
  go test -race -count=1 -short ./kv -- 5.55s (test skipped under -short)

No semantic change to the production fix; this commit is test + comment
polish only, so no caller audit needed.
---
 kv/raft_engine.go      | 28 ++++++++++++------
 kv/raft_engine_test.go | 37 +++++++++++++++++++++++++++++--------
 2 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/kv/raft_engine.go b/kv/raft_engine.go
index c57b890e5..f896dc400 100644
--- a/kv/raft_engine.go
+++ b/kv/raft_engine.go
@@ -10,28 +10,20 @@ import (
 )
 
 // verifyLeaderTimeout caps how long the no-context verifyLeaderEngine path
-// is willing to wait for a ReadIndex round-trip.
-//
-// A previous version called engine.VerifyLeader with context.Background(),
-// so callers without an upstream deadline (LeaderProxy.Commit/Abort,
+// is willing to wait for a ReadIndex round-trip. Without this bound,
+// callers that hold context.Background() (LeaderProxy.Commit/Abort,
 // Coordinate.VerifyLeader, ShardedCoordinator.VerifyLeader[ForKey], and
 // the S3/SQS/admin /healthz/leader handlers) blocked indefinitely whenever
-// ReadIndex completion stalled — a single transient stall accumulated
-// callers permanently.
-//
-// Production hit this on 2026-05-08: a follower (192.168.0.214) lost its
-// network route mid-flight and the leader's ReadIndex completion stalled
-// intermittently. verifyLeaderEngine callers piled up at ~9/sec without
-// bound; after ~37 minutes the leader was holding 20,560 goroutines
-// (20,478 in submitRead select, oldest 39 minutes), CPU pinned at 1870%
-// (the Engine.run Ready loop walks pendingReads O(N) per tick, so the
-// queue feeds back on itself), and host MemAvailable trended toward 0
-// until OOM. The same pattern repeated on each new leader after failover.
+// ReadIndex completion stalled, and a single transient stall accumulated
+// callers permanently — Engine.run's Ready loop walks pendingReads O(N)
+// per tick, so the queue feeds back on itself once it grows.
 //
 // 5s matches leaderForwardTimeout: a verify that takes longer than a
-// single forward RPC is, by definition, useless as a freshness check,
-// and the proxy's verify-then-forward path stays within its 5s retry
-// budget.
+// single forward RPC is useless as a freshness check, and the proxy's
+// verify-then-forward path stays within its 5s retry budget.
+//
+// See PR #745 / incident 2026-05-08 for the goroutine-pile production
+// failure this prevents.
 const verifyLeaderTimeout = 5 * time.Second
 
 func engineForGroup(g *ShardGroup) raftengine.Engine {
diff --git a/kv/raft_engine_test.go b/kv/raft_engine_test.go
index 5fa501c13..3291bdb26 100644
--- a/kv/raft_engine_test.go
+++ b/kv/raft_engine_test.go
@@ -2,10 +2,11 @@ package kv
 
 import (
 	"context"
-	stderrors "errors"
 	"testing"
 	"time"
 
+	"github.com/cockroachdb/errors"
+
 	"github.com/bootjp/elastickv/internal/raftengine"
 )
 
@@ -16,8 +17,8 @@ import (
 // callers under test.
 type blockingLeaderView struct{}
 
-func (blockingLeaderView) State() raftengine.State { return raftengine.StateLeader }
-func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
+func (blockingLeaderView) State() raftengine.State       { return raftengine.StateLeader }
+func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
 func (blockingLeaderView) VerifyLeader(ctx context.Context) error {
 	<-ctx.Done()
 	return ctx.Err()
@@ -30,10 +31,17 @@ func (blockingLeaderView) LinearizableRead(ctx context.Context) (uint64, error)
 // TestVerifyLeaderEngine_BoundsBlockingReadIndex pins the regression: if a
 // stalled ReadIndex used to return only when the underlying ctx fired, but
 // callers passed context.Background(), the goroutine pinned forever. After
-// 2026-05-08-style stalls in production this must complete within roughly
+// the 2026-05-08 incident this must complete within roughly
 // verifyLeaderTimeout, surfacing context.DeadlineExceeded.
+//
+// Skipped under -short because the whole point is to wait for the deadline
+// to fire; the no-skip path adds verifyLeaderTimeout (5s) to every default
+// `make test` run.
 func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
 	t.Parallel()
+	if testing.Short() {
+		t.Skip("skipping: blocks for verifyLeaderTimeout (5s)")
+	}
 
 	start := time.Now()
 	err := verifyLeaderEngine(blockingLeaderView{})
@@ -42,13 +50,26 @@ func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
 	if err == nil {
 		t.Fatalf("verifyLeaderEngine(blocking) returned nil; expected DeadlineExceeded")
 	}
-	if !stderrors.Is(err, context.DeadlineExceeded) {
+	if !errors.Is(err, context.DeadlineExceeded) {
 		t.Fatalf("verifyLeaderEngine(blocking) err = %v; want DeadlineExceeded", err)
 	}
-	// Allow generous slack so a slow CI host does not flake; the point is
-	// not to assert a tight bound but to prove the call returns at all.
+	// Lower bound: confirm the engine actually held the call until the
+	// deadline fired. Without this, a future regression that returned
+	// DeadlineExceeded immediately (e.g. a misplaced ctx check before
+	// the engine call) would silently pass. Pulled in from gemini's
+	// PR #745 round-1 review — the upper bound alone proved bounded
+	// completion but not "the timeout actually fired."
+	//
+	// Tolerate a 200ms early-return slack so a slow CI scheduler that
+	// trips ctx.Done() a hair before the wall clock catches up does
+	// not flake.
+	const slack = 200 * time.Millisecond
+	if elapsed+slack < verifyLeaderTimeout {
+		t.Fatalf("verifyLeaderEngine(blocking) returned too early after %s; want >= %s (-%s slack)", elapsed, verifyLeaderTimeout, slack)
+	}
+	// Upper bound: prove the call returned at all. Generous so a slow
+	// CI host does not flake.
 	if elapsed > 2*verifyLeaderTimeout {
 		t.Fatalf("verifyLeaderEngine(blocking) returned after %s; want <= 2x verifyLeaderTimeout (%s)", elapsed, verifyLeaderTimeout)
 	}
 }
-

From a2ab2450c3c57962f8fa807ed336cb71b6898b9a Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Fri, 8 May 2026 17:44:28 +0900
Subject: [PATCH 3/3] test(kv): merge import groups + drop tooling reference
 (PR #745 r2)

Round-2 review on commit 0be9fc76:

- reviewdog/golangci (gci): test file still failed gci because the project
  convention puts ALL non-stdlib imports in a single group (no blank line
  between third-party and local).
Round-2's "cockroachdb above bootjp with a blank line between" guess was wrong; merging into one alphabetical group (bootjp before cockroachdb) matches kv/coordinator.go and resolves the warning. - claude bot (nit): drop "Pulled in from gemini's PR #745 round-1 review" from the lower-bound rationale comment. Process metadata rots once the PR is merged; the design reason is what the source needs. No behavior change. Test still passes 6.05s. --- kv/raft_engine_test.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kv/raft_engine_test.go b/kv/raft_engine_test.go index 3291bdb26..e8b8da3fb 100644 --- a/kv/raft_engine_test.go +++ b/kv/raft_engine_test.go @@ -5,9 +5,8 @@ import ( "testing" "time" - "github.com/cockroachdb/errors" - "github.com/bootjp/elastickv/internal/raftengine" + "github.com/cockroachdb/errors" ) // blockingLeaderView is a LeaderView whose VerifyLeader blocks until ctx is @@ -54,11 +53,10 @@ func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) { t.Fatalf("verifyLeaderEngine(blocking) err = %v; want DeadlineExceeded", err) } // Lower bound: confirm the engine actually held the call until the - // deadline fired. Without this, a future regression that returned - // DeadlineExceeded immediately (e.g. a misplaced ctx check before - // the engine call) would silently pass. Pulled in from gemini's - // PR #745 round-1 review — the upper bound alone proved bounded - // completion but not "the timeout actually fired." + // deadline fired, not that some other error path returned + // immediately. Without this, a future regression that returned + // DeadlineExceeded before doing any work (e.g. a misplaced ctx + // check before the engine call) would silently pass. // // Tolerate a 200ms early-return slack so a slow CI scheduler that // trips ctx.Done() a hair before the wall clock catches up does