codeprakhar25 · codeprakhar25 · Apr 23, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 21, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,25 @@
+# agentdiff — Project Context for Claude
+
+## Role
+Work on this as a senior engineer. The bar is production quality: correct attribution logic, no edge-case misattribution, clean diffs. Argue when you think a direction is wrong, but ship it working.
+
+## Project summary
+`agentdiff` is a Rust + Python CLI that tracks which AI agent (claude-code, cursor, opencode, copilot, etc.) wrote which lines of code in a git repo. It hooks into agent tool callbacks, captures to `session.jsonl`, then on commit runs `prepare-ledger.py` → `finalize-ledger.py` to produce signed `AgentTrace` records in `.git/agentdiff/traces/{branch}.jsonl`.
+
+## Architecture
+- **Capture**: per-agent Python scripts (`capture-claude.py`, etc.) write raw events to `.git/agentdiff/session.jsonl`
+- **Prepare** (`scripts/prepare-ledger.py`): runs pre-commit, reads session.jsonl, computes per-file attribution using line overlap, writes `pending_ledger.json`
+- **Finalize** (`scripts/finalize-ledger.py`): runs post-commit, converts pending payload to signed `AgentTrace`, appends to `traces/{branch}.jsonl`
+- **Store** (`src/store.rs`): reads traces into `Entry` structs for `list` / `report` commands
+- **Binary install path**: `~/.local/bin/agentdiff` (NOT `~/.cargo/bin/`) — always `cp target/release/agentdiff ~/.local/bin/agentdiff` after build
+
+## Attribution invariants
+- `copilot` is in `_EXCLUDED_AGENTS` — captured in session.jsonl for stats, never wins file attribution
+- Files with no session evidence → `agent = "human"`, must be explicit in attribution dict
+- `agent = "human"` in payload is the semantic token; `git_author` holds the display name
+- `contributor.type = "human"` iff `file_agent == "human"` — never infer from tool name
+
+## Key gotchas learned the hard way
+- Scripts installed to `~/.agentdiff/scripts/` must be manually synced after edits: `cp scripts/*.py ~/.agentdiff/scripts/`
+- `load_entries()` in store.rs must NOT load session.jsonl — only traces; uncommitted path uses `load_uncommitted_entries()`
+- Configure steps must check directory existence (e.g. `~/.cursor/`), not config file existence — create the file if absent
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "agentdiff"
-version = "0.1.23"
+version = "0.1.25"
 edition = "2024"
 rust-version = "1.85"
 description = "Audit and trace autonomous AI code contributions in git repositories"

diff --git a/README.md b/README.md
@@ -79,34 +79,31 @@ agentdiff stats
 
 That's it. From here every commit is attributed to whichever agent (or human) wrote it.
 
-> **Note:** `agentdiff configure` installs capture hooks globally — all repos you work on with AI agents will be tracked. To track only specific repos, you can skip the global configure and run `agentdiff init` per-repo only (you will need to configure hooks manually).
+> **Note:** `agentdiff configure` installs capture scripts globally, but capture only fires in repos where `agentdiff init` has been run (the `.git/agentdiff/` directory must exist). Running `configure` on its own does not track any repo — you must also run `agentdiff init` inside each repo you want to track.
 
 ---
 
 ## Commands
 
 | Command | Description |
 |---------|-------------|
-| `agentdiff configure` | Install global agent hooks — run once per machine |
-| `agentdiff init` | Initialize tracking in current repository |
+| `agentdiff configure` | Install global agent capture hooks — run once per machine |
+| `agentdiff init` | Initialize tracking in current repository (required per repo) |
+| `agentdiff install-ci` | Write CI workflow YAMLs to `.github/workflows/` — run once per repo |
 | `agentdiff list` | List attribution entries |
 | `agentdiff blame <file>` | Line-level attribution, like `git blame` |
-| `agentdiff stats` | Aggregate stats by agent, model, file |
-| `agentdiff log` | Chronological AI contribution history |
 | `agentdiff diff [<sha>]` | Attribution diff for a commit or range |
 | `agentdiff show <sha>` | Full details for one trace entry |
-| `agentdiff report` | CI report in Markdown or GitHub annotations |
+| `agentdiff report` | Aggregate report (text, markdown, annotations, JSONL) |
 | `agentdiff status` | Health check — hooks, keys, traces |
+| `agentdiff status --remote` | Show remote trace ref state (`refs/agentdiff/*` on origin) |
 | `agentdiff push` | Push local traces to per-branch ref on origin |
 | `agentdiff consolidate` | Merge per-branch traces into permanent store (CI) |
 | `agentdiff verify` | Verify ed25519 signatures on trace entries |
 | `agentdiff keys init` | Generate a local signing keypair |
 | `agentdiff keys register` | Register your public key in the git key registry |
 | `agentdiff keys rotate` | Rotate your keypair and register the new key |
 | `agentdiff policy check` | Enforce AI attribution policy rules |
-| `agentdiff export` | Export traces in Agent Trace JSONL format |
-| `agentdiff remote-status` | Show remote trace ref state (`refs/agentdiff/*` on origin) |
-| `agentdiff migrate` | Import legacy ledger.jsonl into new storage |
 | `agentdiff config` | Manage global configuration |
 
 <details>
@@ -120,15 +117,19 @@ agentdiff list --limit 50
 # Blame for a specific agent only
 agentdiff blame src/api.rs --agent claude-code
 
-# Stats broken down by file and model
-agentdiff stats --by-file --by-model
+# Report broken down by file and model
+agentdiff report --by-file --by-model
 
-# Stats from a specific date
-agentdiff stats --since 2026-01-01T00:00:00Z
+# Report from a specific date
+agentdiff report --since 2026-01-01T00:00:00Z
 
-# CI report to file
-agentdiff report --format markdown --out-md report.md
-agentdiff report --format annotations --out-annotations annotations.json
+# Report to file
+agentdiff report --format markdown --out report.md
+agentdiff report --format annotations --out annotations.json
+
+# Post report as a PR comment (auto-detects PR from current branch)
+agentdiff report --format markdown --post-pr-comment
+agentdiff report --format markdown --post-pr-comment 42   # explicit PR number
 
 # Attribution diff for last 3 commits
 agentdiff diff HEAD~3
@@ -147,15 +148,18 @@ agentdiff push
 # Consolidate a branch's traces into permanent store (CI step)
 agentdiff consolidate --branch feature/my-branch --push
 
+# Write CI workflows to .github/workflows/ (run once per repo)
+agentdiff install-ci
+
 # Skip specific agents during configure
 agentdiff configure --no-copilot --no-antigravity
 
 # Skip git hook install during init
 agentdiff init --no-git-hook
 
 # Check remote trace ref state after pushing
-agentdiff remote-status
-agentdiff remote-status --no-fetch   # fast: show refs + SHAs only, skip trace counts
+agentdiff status --remote
+agentdiff status --remote --no-fetch   # fast: show refs + SHAs only, skip trace counts
 ```
 
 </details>
@@ -174,7 +178,7 @@ agentdiff remote-status --no-fetch   # fast: show refs + SHAs only, skip trace c
 | **Codex CLI** | `notify` hook (`~/.codex/config.toml`) | Task-level file changes |
 | **Gemini / Antigravity** | `BeforeTool`/`AfterTool` hooks (`~/.gemini/settings.json`) | `write_file`, `replace` |
 
-Agent hooks for Claude, Cursor, Codex, Windsurf, OpenCode, and Gemini are all installed **globally once** via `agentdiff configure` — no per-repo setup needed for those.
+Agent hooks for Claude, Cursor, Codex, Windsurf, OpenCode, and Gemini are all installed **globally once** via `agentdiff configure`. However, capture only fires in repos where `agentdiff init` has been run — the `.git/agentdiff/` directory must exist for any data to be written.
 
 ---
 
@@ -456,11 +460,27 @@ Exits 0 on pass, 1 on violation. Use `--since <sha>` to scope to a specific rang
 
 ## CI Integration
 
-**Full pipeline** — report, verify, and enforce policy on every PR:
+Run once to write both workflow files into your repo:
+
+```bash
+agentdiff install-ci
+git add .github/workflows/agentdiff-*.yml
+git commit -m "ci: add agentdiff consolidation and policy workflows"
+```
+
+This writes two workflows:
+
+- **`agentdiff-consolidate.yml`** — triggers on PR merge: consolidates per-branch traces into the permanent store and posts an attribution comment to the PR.
+- **`agentdiff-policy.yml`** — triggers on every PR: runs `agentdiff policy check` and posts GitHub check annotations if rules are violated.
+
+For repos that need a custom pipeline, here is the manual equivalent of the policy workflow:
 
 ```yaml
-# .github/workflows/agentdiff.yml
+# .github/workflows/agentdiff-policy.yml
 on: [pull_request]
+permissions:
+  contents: read
+  checks: write
 
 jobs:
   agentdiff:
@@ -472,31 +492,17 @@ jobs:
 
       - name: Install agentdiff
         run: |
-          curl -fsSL https://raw.githubusercontent.com/codeprakhar25/agentdiff/master/install.sh | bash
+          curl -fsSL https://raw.githubusercontent.com/codeprakhar25/agentdiff/main/install.sh | bash
           echo "$HOME/.local/bin" >> $GITHUB_PATH
 
-      - name: Init repo
-        run: agentdiff init --no-git-hook
-
       - name: Fetch agentdiff refs
-        run: git fetch origin 'refs/agentdiff/*:refs/agentdiff/*'
-
-      - name: Consolidate traces
-        run: agentdiff consolidate --branch ${{ github.head_ref }} --push
+        run: git fetch origin '+refs/agentdiff/*:refs/agentdiff/*' || true
 
       - name: Verify signatures
         run: agentdiff verify
 
       - name: Policy check
         run: agentdiff policy check --format github-annotations
-
-      - name: Generate report
-        run: agentdiff report --format markdown --out-md ai-report.md
-
-      - name: Post as PR comment
-        uses: marocchino/sticky-pull-request-comment@v2
-        with:
-          path: ai-report.md
 ```
 
 ---
@@ -508,17 +514,16 @@ Config lives at `~/.agentdiff/config.toml`:
 ```toml
 schema_version = "1.0"
 scripts_dir = "~/.agentdiff/scripts"
-auto_amend_ledger = true        # include ledger in same commit automatically
-data_dir = "~/.agentdiff/spillover"
+capture_prompts = true   # set false to omit prompt excerpts from traces
 
 [[repos]]
 path = "/home/user/my-project"
 slug = "-home-user-my-project"
 ```
 
 ```bash
-# Disable auto-amend
-agentdiff config set auto_amend_ledger false
+# Disable prompt capture
+agentdiff config set capture_prompts false
 
 # View current config
 agentdiff config show

diff --git a/scripts/capture-claude.py b/scripts/capture-claude.py
@@ -85,50 +85,103 @@ def get_session_log(cwd: str):
     return None
 
 
+def _tail_read_jsonl(path: str, chunk_size: int = 32768) -> list:
+    """Return the JSON objects in the tail of a JSONL file, most-recent first.
+
+    Reads only about the last chunk_size bytes, in a single pass.  Lines that
+    are not valid JSON objects are skipped; an unreadable file yields [].
+    """
+    results = []
+    try:
+        # One byte early, so a cut landing exactly on a line boundary keeps that line.
+        offset = max(0, os.path.getsize(path) - chunk_size - 1)
+        with open(path, "rb") as fh:
+            fh.seek(offset)
+            raw = fh.read()
+    except OSError:
+        return results
+    if offset > 0:
+        # Drop the line we cut into; with no newline the whole chunk is partial.
+        nl = raw.find(b"\n")
+        raw = raw[nl + 1:] if nl >= 0 else b""
+    # Split on "\n" only: splitlines() also breaks on U+2028 etc. inside JSON strings.
+    for line in reversed(raw.decode("utf-8", errors="replace").split("\n")):
+        try:
+            entry = json.loads(line)
+        except (ValueError, RecursionError):
+            continue  # blank, truncated or malformed line
+        if isinstance(entry, dict):
+            results.append(entry)
+    return results
+
+
+def get_prompt_from_history(session_id: str) -> str:
+    """Return the latest user prompt for session_id from ~/.claude/history.jsonl.
+
+    history.jsonl format (one JSON object per line):
+      {"display":"...", "pastedContents":{...}, "sessionId":"...", "project":"...", "timestamp":...}
+
+    Picks the newest matching entry whose display is not a slash command, appends
+    inline pasted text, caps at 500 chars.  Malformed entries are skipped; else "unknown".
+    """
+    path = os.path.expanduser("~/.claude/history.jsonl")
+    for entry in _tail_read_jsonl(path):
+        if not isinstance(entry, dict) or entry.get("sessionId") != session_id:
+            continue
+        display = entry.get("display")
+        display = display.strip() if isinstance(display, str) else ""
+        if not display or display.startswith("/"):
+            continue
+        # Append pasted content that has actual text (not just a hash).
+        pasted_map = entry.get("pastedContents")
+        extra_parts = []
+        for pasted in (pasted_map.values() if isinstance(pasted_map, dict) else ()):
+            content = pasted.get("content") if isinstance(pasted, dict) else None
+            if isinstance(content, str) and content and pasted.get("type") == "text":
+                extra_parts.append(content[:200])
+        if extra_parts:
+            display = display + " [pasted: " + " | ".join(extra_parts) + "]"
+        return display[:500]
+    return "unknown"
+
+
 def get_model_and_prompt(cwd: str, session_id: str) -> tuple:
-    """Read model and prompt from Claude session JSONL."""
+    """Return (model, prompt) for a Claude Code session; "unknown" when not found.
+
+    Model: newest assistant entry in ~/.claude/projects/**/{session_id}.jsonl,
+      skipping <synthetic> values (injected during context compression).
+    Prompt: ~/.claude/history.jsonl, else the AGENTDIFF_PROMPT env var.
+    """
+    import glob as _glob
+    model = "unknown"
     try:
-        # Try to find the session file
         home = os.path.expanduser("~")
-        parts = session_id.split("-")
-        # Construct likely path
-        possible_paths = [
-            os.path.join(home, ".claude", "projects", parts[-1] if parts else "", f"{session_id}.jsonl"),
-            os.path.join(home, ".claude", "projects", session_id[:8], f"{session_id}.jsonl"),
-        ]
-
-        for session_path in possible_paths:
-            if os.path.exists(session_path):
-                with open(session_path) as f:
-                    lines = f.readlines()
-
-                # Find last assistant message for model
-                model = "unknown"
-                for line in reversed(lines):
-                    try:
-                        entry = json.loads(line)
-                        if entry.get("type") == "assistant" and entry.get("message", {}).get("model"):
-                            model = entry["message"]["model"]
-                            break
-                    except:
-                        continue
-
-                # Find last-prompt for the actual user request
-                prompt = "unknown"
-                for line in reversed(lines):
-                    try:
-                        entry = json.loads(line)
-                        if entry.get("type") == "last-prompt":
-                            prompt = entry.get("lastPrompt", "unknown")
-                            break
-                    except:
-                        continue
-
-                return model, prompt
-
-        return "unknown", "unknown"
-    except Exception:
-        return "unknown", "unknown"
+        name = _glob.escape(str(session_id)) + ".jsonl"  # id comes from the hook payload
+        pattern = os.path.join(home, ".claude", "projects", "**", name)
+        debug_log(f"glob pattern: {pattern}")
+        matches = _glob.glob(pattern, recursive=True)
+        debug_log(f"glob matches: {matches}")
+        if matches:
+            debug_log(f"session_path: {matches[0]}")
+            for entry in _tail_read_jsonl(matches[0]):
+                if isinstance(entry, dict) and entry.get("type") == "assistant":
+                    m = (entry.get("message") or {}).get("model", "")
+                    if m and m != "<synthetic>":
+                        model = m
+                        debug_log(f"model found: {model}")
+                        break
+    except Exception as exc:
+        debug_log(f"model lookup error: {exc}")
+
+    prompt = get_prompt_from_history(session_id)
+    # Allow test/CI injection via env var when history lookup can't find the session.
+    if prompt == "unknown":
+        env_prompt = os.environ.get("AGENTDIFF_PROMPT", "")
+        if env_prompt:
+            prompt = env_prompt
+            debug_log("prompt from AGENTDIFF_PROMPT env var")
+    debug_log(f"prompt: {prompt[:80]!r}")
+    return model, prompt
 
 
 def is_in_repo(abs_file: str, repo_root: str) -> bool:
@@ -196,6 +249,7 @@ def main():
         sys.exit(0)
 
     session_id = first(payload, "session_id", "sessionId", default="unknown")
+    debug_log(f"before get_model_and_prompt session_id={session_id}")
     model, prompt = get_model_and_prompt(cwd, session_id)
 
     timestamp = datetime.now(timezone.utc).isoformat()