From ad924ad03044fb1e0fdb726c20fc776a4147b766 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Fri, 8 May 2026 17:10:49 +0900
Subject: [PATCH 1/3] fix(kv): bound verifyLeaderEngine ReadIndex with 5s deadline

verifyLeaderEngine() called engine.VerifyLeader with context.Background(),
so any caller without an upstream context blocked indefinitely on a
ReadIndex round-trip. A single transient stall accumulated callers
permanently because they never timed out and never returned.

Production hit this on 2026-05-08: follower 192.168.0.214 lost its network
route (no route to host, ARP INCOMPLETE), the leader's ReadIndex completion
stalled intermittently, and verify-callers piled up at roughly 9/sec
without bound. After ~37 minutes the leader (192.168.0.212) held 20,560
goroutines (20,478 in submitRead select, oldest 39 minutes), CPU pinned at
1870% (Engine.run Ready loop walks pendingReads O(N) per tick, so the queue
feeds back on itself), and host MemAvailable trended toward 0 until OOM.
Each new leader after failover re-entered the same death spiral.

Affected callers (all use the no-context variant):

- LeaderProxy.Commit / .Abort -- every Redis write hits this
- Coordinate.VerifyLeader / ShardedCoordinator.VerifyLeader[ForKey]
- adapter S3/SQS /healthz/leader handlers (Caddy probes)
- main_admin.go LeaderProbe (admin dashboard /admin/healthz/leader)
- adapter/sqs.go isVerifiedSQSLeader, adapter/s3.go isVerifiedS3Leader

Fix: cap the no-context path at 5s (matching leaderForwardTimeout). On
timeout, callers see context.DeadlineExceeded -- LeaderProxy falls back to
forwardWithRetry as it already does for any verify failure, healthz
handlers report not-leader, and the lock resolver skips this tick.

Self-review (5 lenses):

1. Data loss -- none. The fix only shortens a never-returning wait.
   verifyLeaderEngine is a freshness check, not a write path.
2. Concurrency -- the new ctx is local to each call (defer cancel), no
   shared state, no lock changes. Engine-side blocking semantics
   unchanged; we just stop waiting forever.
3. Performance -- positive. Removes the unbounded goroutine pile-up and
   the O(N) pendingReads walk it caused. No new allocations on the
   success path beyond the WithTimeout context.
4. Data consistency -- ReadIndex still completes when quorum heartbeats
   land within 5s. A timeout means the caller could not confirm
   leadership freshness, which the existing "fall through to forward"
   path already treats as a soft failure.
5. Test coverage -- kv/raft_engine_test.go pins the regression: a
   blockingLeaderView that holds VerifyLeader on its ctx must surface
   DeadlineExceeded within 2x verifyLeaderTimeout.

Test: go test -race -count=1 ./kv -- 9.3s, all green.

Future work (separate PRs): plumb real request contexts through
LeaderProxy.Commit/Abort and the healthz handlers so a client-side
deadline cascades naturally instead of relying on this fixed bound.
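
A rough sketch of the healthz half of that follow-up (illustrative only:
the handler name and wiring below are assumptions, and the real adapter
handlers would go through an exported verify entry point rather than
calling the package-private verifyLeaderEngineCtx directly):

    // Hypothetical handler, assuming net/http and access to the group's
    // raftengine.LeaderView. The point is that r.Context(), which carries
    // the probe's own deadline, reaches verifyLeaderEngineCtx instead of
    // the fixed 5s no-context path.
    func leaderHealthz(view raftengine.LeaderView) http.HandlerFunc {
        return func(w http.ResponseWriter, r *http.Request) {
            if err := verifyLeaderEngineCtx(r.Context(), view); err != nil {
                http.Error(w, "not leader", http.StatusServiceUnavailable)
                return
            }
            w.WriteHeader(http.StatusOK)
        }
    }

LeaderProxy.Commit/Abort would take the same shape once they accept a
caller-supplied context.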
---
 kv/raft_engine.go      | 30 ++++++++++++++++++++++-
 kv/raft_engine_test.go | 54 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 kv/raft_engine_test.go

diff --git a/kv/raft_engine.go b/kv/raft_engine.go
index b5f68b8c6..c57b890e5 100644
--- a/kv/raft_engine.go
+++ b/kv/raft_engine.go
@@ -2,12 +2,38 @@ package kv
 
 import (
 	"context"
+	"time"
 
 	"github.com/bootjp/elastickv/internal/monoclock"
 	"github.com/bootjp/elastickv/internal/raftengine"
 	"github.com/cockroachdb/errors"
 )
 
+// verifyLeaderTimeout caps how long the no-context verifyLeaderEngine path
+// is willing to wait for a ReadIndex round-trip.
+//
+// A previous version called engine.VerifyLeader with context.Background(),
+// so callers without an upstream deadline (LeaderProxy.Commit/Abort,
+// Coordinate.VerifyLeader, ShardedCoordinator.VerifyLeader[ForKey], and
+// the S3/SQS/admin /healthz/leader handlers) blocked indefinitely whenever
+// ReadIndex completion stalled — a single transient stall accumulated
+// callers permanently.
+//
+// Production hit this on 2026-05-08: a follower (192.168.0.214) lost its
+// network route mid-flight and the leader's ReadIndex completion stalled
+// intermittently. verifyLeaderEngine callers piled up at ~9/sec without
+// bound; after ~37 minutes the leader was holding 20,560 goroutines
+// (20,478 in submitRead select, oldest 39 minutes), CPU pinned at 1870%
+// (the Engine.run Ready loop walks pendingReads O(N) per tick, so the
+// queue feeds back on itself), and host MemAvailable trended toward 0
+// until OOM. The same pattern repeated on each new leader after failover.
+//
+// 5s matches leaderForwardTimeout: a verify that takes longer than a
+// single forward RPC is, by definition, useless as a freshness check,
+// and the proxy's verify-then-forward path stays within its 5s retry
+// budget.
+const verifyLeaderTimeout = 5 * time.Second
+
 func engineForGroup(g *ShardGroup) raftengine.Engine {
 	if g == nil {
 		return nil
@@ -41,7 +67,9 @@ func verifyLeaderEngineCtx(ctx context.Context, engine raftengine.LeaderView) er
 }
 
 func verifyLeaderEngine(engine raftengine.LeaderView) error {
-	return verifyLeaderEngineCtx(context.Background(), engine)
+	ctx, cancel := context.WithTimeout(context.Background(), verifyLeaderTimeout)
+	defer cancel()
+	return verifyLeaderEngineCtx(ctx, engine)
 }
 
 func linearizableReadEngineCtx(ctx context.Context, engine raftengine.LeaderView) (uint64, error) {
diff --git a/kv/raft_engine_test.go b/kv/raft_engine_test.go
new file mode 100644
index 000000000..5fa501c13
--- /dev/null
+++ b/kv/raft_engine_test.go
@@ -0,0 +1,54 @@
+package kv
+
+import (
+	"context"
+	stderrors "errors"
+	"testing"
+	"time"
+
+	"github.com/bootjp/elastickv/internal/raftengine"
+)
+
+// blockingLeaderView is a LeaderView whose VerifyLeader blocks until ctx is
+// cancelled, modelling the production pathology where ReadIndex stalls
+// because heartbeat acks fail to land. LinearizableRead is similarly
+// well-behaved on cancel; State / Leader are stamped enough to satisfy the
+// callers under test.
+type blockingLeaderView struct{}
+
+func (blockingLeaderView) State() raftengine.State { return raftengine.StateLeader }
+func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
+func (blockingLeaderView) VerifyLeader(ctx context.Context) error {
+	<-ctx.Done()
+	return ctx.Err()
+}
+func (blockingLeaderView) LinearizableRead(ctx context.Context) (uint64, error) {
+	<-ctx.Done()
+	return 0, ctx.Err()
+}
+
+// TestVerifyLeaderEngine_BoundsBlockingReadIndex pins the regression: if a
+// stalled ReadIndex used to return only when the underlying ctx fired, but
+// callers passed context.Background(), the goroutine pinned forever. After
+// 2026-05-08-style stalls in production this must complete within roughly
+// verifyLeaderTimeout, surfacing context.DeadlineExceeded.
+func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
+	t.Parallel()
+
+	start := time.Now()
+	err := verifyLeaderEngine(blockingLeaderView{})
+	elapsed := time.Since(start)
+
+	if err == nil {
+		t.Fatalf("verifyLeaderEngine(blocking) returned nil; expected DeadlineExceeded")
+	}
+	if !stderrors.Is(err, context.DeadlineExceeded) {
+		t.Fatalf("verifyLeaderEngine(blocking) err = %v; want DeadlineExceeded", err)
+	}
+	// Allow generous slack so a slow CI host does not flake; the point is
+	// not to assert a tight bound but to prove the call returns at all.
+	if elapsed > 2*verifyLeaderTimeout {
+		t.Fatalf("verifyLeaderEngine(blocking) returned after %s; want <= 2x verifyLeaderTimeout (%s)", elapsed, verifyLeaderTimeout)
+	}
+}
+

From 0be9fc76970cdb789771243cd3d951620465625e Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Fri, 8 May 2026 17:36:42 +0900
Subject: [PATCH 2/3] test(kv): address PR #745 round-1 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-1 review on commit ad924ad0:

- gemini medium: assert minimum elapsed time, not just upper bound.
  Without a lower bound a future regression returning DeadlineExceeded
  before actually waiting would silently pass. Added a lower-bound check
  with 200ms slack so a slow CI scheduler that trips ctx.Done() a hair
  before the wall clock catches up does not flake.
- claude bot (style): trim incident narrative from the verifyLeaderTimeout
  doc comment. Specific IPs, goroutine counts, and timestamps belong in
  the commit message and PR body, not the source — they go stale and add
  no design value at the call site. Kept the rationale (why 5s, the O(N)
  pendingReads feedback loop, match leaderForwardTimeout) and added a
  "see PR #745" reference for readers who want the full incident detail.
- claude bot: switch the regression test from stdlib `stderrors "errors"`
  to `github.com/cockroachdb/errors` for consistency with the project
  convention (CLAUDE.md mandates cockroachdb/errors at boundaries; the
  test is a boundary). Also drops the import alias which made gci
  complain about ordering.
- claude bot: add `if testing.Short() { t.Skip(...) }` guard so
  `go test -short ./kv/...` does not pay the verifyLeaderTimeout (5s)
  wait. Default `make test` still runs it.
- reviewdog/golangci (gci): import order fixed by reordering third-party
  (cockroachdb) above local (bootjp) per the project's default gci
  sections (standard, default, prefix(...)). No behavior change.

Test still passes:
  go test -race -count=1 -run TestVerifyLeaderEngine ./kv -- 6.06s
  go test -race -count=1 -short ./kv -- 5.55s (test skipped under -short)

No semantic change to the production fix; this commit is test + comment
polish only, so no caller audit needed.
---
 kv/raft_engine.go      | 28 ++++++++++++------
 kv/raft_engine_test.go | 37 +++++++++++++++++++++++++++++--------
 2 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/kv/raft_engine.go b/kv/raft_engine.go
index c57b890e5..f896dc400 100644
--- a/kv/raft_engine.go
+++ b/kv/raft_engine.go
@@ -10,28 +10,20 @@ import (
 )
 
 // verifyLeaderTimeout caps how long the no-context verifyLeaderEngine path
-// is willing to wait for a ReadIndex round-trip.
-//
-// A previous version called engine.VerifyLeader with context.Background(),
-// so callers without an upstream deadline (LeaderProxy.Commit/Abort,
+// is willing to wait for a ReadIndex round-trip. Without this bound,
+// callers that hold context.Background() (LeaderProxy.Commit/Abort,
 // Coordinate.VerifyLeader, ShardedCoordinator.VerifyLeader[ForKey], and
 // the S3/SQS/admin /healthz/leader handlers) blocked indefinitely whenever
-// ReadIndex completion stalled — a single transient stall accumulated
-// callers permanently.
-//
-// Production hit this on 2026-05-08: a follower (192.168.0.214) lost its
-// network route mid-flight and the leader's ReadIndex completion stalled
-// intermittently. verifyLeaderEngine callers piled up at ~9/sec without
-// bound; after ~37 minutes the leader was holding 20,560 goroutines
-// (20,478 in submitRead select, oldest 39 minutes), CPU pinned at 1870%
-// (the Engine.run Ready loop walks pendingReads O(N) per tick, so the
-// queue feeds back on itself), and host MemAvailable trended toward 0
-// until OOM. The same pattern repeated on each new leader after failover.
+// ReadIndex completion stalled, and a single transient stall accumulated
+// callers permanently — Engine.run's Ready loop walks pendingReads O(N)
+// per tick, so the queue feeds back on itself once it grows.
 //
 // 5s matches leaderForwardTimeout: a verify that takes longer than a
-// single forward RPC is, by definition, useless as a freshness check,
-// and the proxy's verify-then-forward path stays within its 5s retry
-// budget.
+// single forward RPC is useless as a freshness check, and the proxy's
+// verify-then-forward path stays within its 5s retry budget.
+//
+// See PR #745 / incident 2026-05-08 for the goroutine-pile production
+// failure this prevents.
 const verifyLeaderTimeout = 5 * time.Second
 
 func engineForGroup(g *ShardGroup) raftengine.Engine {
diff --git a/kv/raft_engine_test.go b/kv/raft_engine_test.go
index 5fa501c13..3291bdb26 100644
--- a/kv/raft_engine_test.go
+++ b/kv/raft_engine_test.go
@@ -2,10 +2,11 @@ package kv
 
 import (
 	"context"
-	stderrors "errors"
 	"testing"
 	"time"
 
+	"github.com/cockroachdb/errors"
+
 	"github.com/bootjp/elastickv/internal/raftengine"
 )
 
@@ -16,8 +17,8 @@ import (
 // callers under test.
 type blockingLeaderView struct{}
 
-func (blockingLeaderView) State() raftengine.State { return raftengine.StateLeader }
-func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
+func (blockingLeaderView) State() raftengine.State       { return raftengine.StateLeader }
+func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
 func (blockingLeaderView) VerifyLeader(ctx context.Context) error {
 	<-ctx.Done()
 	return ctx.Err()
@@ -30,10 +31,17 @@ func (blockingLeaderView) LinearizableRead(ctx context.Context) (uint64, error)
 // TestVerifyLeaderEngine_BoundsBlockingReadIndex pins the regression: if a
 // stalled ReadIndex used to return only when the underlying ctx fired, but
 // callers passed context.Background(), the goroutine pinned forever. After
-// 2026-05-08-style stalls in production this must complete within roughly
+// the 2026-05-08 incident this must complete within roughly
 // verifyLeaderTimeout, surfacing context.DeadlineExceeded.
+//
+// Skipped under -short because the whole point is to wait for the deadline
+// to fire; the no-skip path adds verifyLeaderTimeout (5s) to every default
+// `make test` run.
 func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
 	t.Parallel()
+	if testing.Short() {
+		t.Skip("skipping: blocks for verifyLeaderTimeout (5s)")
+	}
 
 	start := time.Now()
 	err := verifyLeaderEngine(blockingLeaderView{})
@@ -42,13 +50,26 @@ func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
 	if err == nil {
 		t.Fatalf("verifyLeaderEngine(blocking) returned nil; expected DeadlineExceeded")
 	}
-	if !stderrors.Is(err, context.DeadlineExceeded) {
+	if !errors.Is(err, context.DeadlineExceeded) {
 		t.Fatalf("verifyLeaderEngine(blocking) err = %v; want DeadlineExceeded", err)
 	}
-	// Allow generous slack so a slow CI host does not flake; the point is
-	// not to assert a tight bound but to prove the call returns at all.
+	// Lower bound: confirm the engine actually held the call until the
+	// deadline fired. Without this, a future regression that returned
+	// DeadlineExceeded immediately (e.g. a misplaced ctx check before
+	// the engine call) would silently pass. Pulled in from gemini's
+	// PR #745 round-1 review — the upper bound alone proved bounded
+	// completion but not "the timeout actually fired."
+	//
+	// Tolerate a 200ms early-return slack so a slow CI scheduler that
+	// trips ctx.Done() a hair before the wall clock catches up does
+	// not flake.
+	const slack = 200 * time.Millisecond
+	if elapsed+slack < verifyLeaderTimeout {
+		t.Fatalf("verifyLeaderEngine(blocking) returned too early after %s; want >= %s (-%s slack)", elapsed, verifyLeaderTimeout, slack)
+	}
+	// Upper bound: prove the call returned at all. Generous so a slow
+	// CI host does not flake.
 	if elapsed > 2*verifyLeaderTimeout {
 		t.Fatalf("verifyLeaderEngine(blocking) returned after %s; want <= 2x verifyLeaderTimeout (%s)", elapsed, verifyLeaderTimeout)
 	}
 }
-

From a2ab2450c3c57962f8fa807ed336cb71b6898b9a Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Fri, 8 May 2026 17:44:28 +0900
Subject: [PATCH 3/3] test(kv): merge import groups + drop tooling reference
 (PR #745 r2)

Round-2 review on commit 0be9fc76:

- reviewdog/golangci (gci): test file still failed gci because the project
  convention puts ALL non-stdlib imports in a single group (no blank line
  between third-party and local).
Round-2's "cockroachdb above bootjp with a blank line between" guess was wrong; merging into one alphabetical group (bootjp before cockroachdb) matches kv/coordinator.go and resolves the warning. - claude bot (nit): drop "Pulled in from gemini's PR #745 round-1 review" from the lower-bound rationale comment. Process metadata rots once the PR is merged; the design reason is what the source needs. No behavior change. Test still passes 6.05s. --- kv/raft_engine_test.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kv/raft_engine_test.go b/kv/raft_engine_test.go index 3291bdb26..e8b8da3fb 100644 --- a/kv/raft_engine_test.go +++ b/kv/raft_engine_test.go @@ -5,9 +5,8 @@ import ( "testing" "time" - "github.com/cockroachdb/errors" - "github.com/bootjp/elastickv/internal/raftengine" + "github.com/cockroachdb/errors" ) // blockingLeaderView is a LeaderView whose VerifyLeader blocks until ctx is @@ -54,11 +53,10 @@ func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) { t.Fatalf("verifyLeaderEngine(blocking) err = %v; want DeadlineExceeded", err) } // Lower bound: confirm the engine actually held the call until the - // deadline fired. Without this, a future regression that returned - // DeadlineExceeded immediately (e.g. a misplaced ctx check before - // the engine call) would silently pass. Pulled in from gemini's - // PR #745 round-1 review — the upper bound alone proved bounded - // completion but not "the timeout actually fired." + // deadline fired, not that some other error path returned + // immediately. Without this, a future regression that returned + // DeadlineExceeded before doing any work (e.g. a misplaced ctx + // check before the engine call) would silently pass. // // Tolerate a 200ms early-return slack so a slow CI scheduler that // trips ctx.Done() a hair before the wall clock catches up does