51 changes: 51 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,56 @@
# Changelog

## [1.4.2] - 2026-02-17

### Overview

This patch release is a maintenance-only update covering performance cleanup and internal refactoring. Determinism
remains guaranteed: report content and ordering are unchanged.

### Performance & Implementation Cleanup

- `process_file()` now makes a single `os.stat()` call that serves both the size guard (`st_size`) and the file stat
signature (`st_mtime_ns`, `st_size`), removing the redundant `os.path.getsize()` call.
- Discovery logic was deduplicated into a new `_discover_files()` helper; the quiet and non-quiet paths now differ only
in the UI status wrapper, not in semantics or filtering.
- Cache path wiring now precomputes `wire_map` so `_wire_filepath_from_runtime()` is evaluated once per key.
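
A compressed, self-contained sketch of the single-stat pattern from the first bullet above. The limit value and the
`stat_and_guard` helper are invented for illustration; the real wiring is in the `codeclone/cli.py` diff further down.

```python
# One os.stat() call feeds both the size guard and the change-detection signature.
import os

MAX_FILE_SIZE = 2_000_000  # illustrative limit only; not codeclone's actual value


def stat_and_guard(filepath: str) -> dict[str, int] | None:
    st = os.stat(filepath)  # single syscall serves both checks below
    if st.st_size > MAX_FILE_SIZE:  # size guard
        return None
    return {"mtime_ns": st.st_mtime_ns, "size": st.st_size}  # cache stat signature


print(stat_and_guard(__file__))
```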

### Hash Reuse for Block/Segment Analysis

- `extract_blocks()` and `extract_segments()` accept an optional `precomputed_hashes` argument. When provided, the
supplied hashes are reused instead of being recomputed.
- The extractor computes function body hashes once and passes them to both block and segment extraction when both
analyses run for the same function.
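
Below is a stdlib-only sketch of the compute-once, reuse-everywhere pattern. It does not use codeclone's `stmt_hash` or
`NormalizationConfig`; `naive_stmt_hash` and the two window shapes are stand-ins invented for illustration, and the real
wiring appears in the `codeclone/extractor.py` diff further down.

```python
# Hash each statement of a function body once, then feed the same hash list to
# two downstream consumers (mirroring the block vs. segment extractors).
import ast
import hashlib


def naive_stmt_hash(stmt: ast.stmt) -> str:
    # Hypothetical stand-in for codeclone's stmt_hash: hash the dumped AST node.
    return hashlib.sha256(ast.dump(stmt).encode("utf-8")).hexdigest()


source = "def f():\n    a = 1\n    b = 2\n    c = a + b\n    return c\n"
func = ast.parse(source).body[0]
assert isinstance(func, ast.FunctionDef)

# Computed once per function body...
hashes = [naive_stmt_hash(stmt) for stmt in func.body]

# ...then reused by both a "block"-style and a "segment"-style consumer.
block_windows = [hashes[i : i + 2] for i in range(len(hashes) - 1)]
segment_windows = [tuple(hashes[i : i + 3]) for i in range(len(hashes) - 2)]
print(len(block_windows), len(segment_windows))
```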

### Scanner Efficiency (No Semantic Change)

- `iter_py_files()` now filters candidates before sorting, so only valid candidates are sorted. The final order remains
deterministic and equivalent to previous behavior.
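
A small, runnable illustration of why filtering before sorting preserves the emitted order (the paths and exclusion
rule here are made up for the example):

```python
# Both pipelines yield the same ordered result for the surviving paths, which is
# why filtering before sorting is a pure efficiency change.
paths = ["pkg/b.py", "build/gen.py", "pkg/a.py", ".venv/lib.py"]
excludes = {"build", ".venv"}


def keep(path: str) -> bool:
    return not (set(path.split("/")) & excludes)


old_order = [p for p in sorted(paths) if keep(p)]  # sort everything, then filter
new_order = sorted(p for p in paths if keep(p))    # filter first, then sort
assert old_order == new_order == ["pkg/a.py", "pkg/b.py"]
```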

### Contract Tightening

- `precomputed_hashes` type strengthened: `list[str] | None` → `Sequence[str] | None` (read-only intent in the type
contract).
- Added `assert len(precomputed_hashes) == len(body)` in both `extract_blocks()` and `extract_segments()` to catch
mismatched inputs early (development-time invariant).
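
To illustrate the intent of `Sequence[str]` plus the length assert, here is a small hypothetical helper (not codeclone
code): read-only sequences such as tuples are accepted, and the length check is a development-time invariant that is
stripped under `python -O`.

```python
from collections.abc import Sequence


def pick_hashes(body: list[str], precomputed: Sequence[str] | None = None) -> Sequence[str]:
    """Hypothetical helper mirroring the tightened contract; not codeclone code."""
    if precomputed is not None:
        # Development-time invariant: removed under `python -O`, so it documents
        # and checks the contract in tests rather than validating user input.
        assert len(precomputed) == len(body), (
            f"precomputed length {len(precomputed)} != body length {len(body)}"
        )
        return precomputed
    return [f"h:{stmt}" for stmt in body]


print(pick_hashes(["a = 1", "return a"], precomputed=("x", "y")))  # a tuple satisfies Sequence[str]
```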

### Testing & Determinism

- Report stability was verified across repeated runs: the semantic payload is byte-identical, and any differences are
confined to volatile/provenance meta fields (e.g., cache status/path, timestamps). A spot-check sketch follows this
list.
- Unit tests updated to mock `os.stat` instead of `os.path.getsize` where applicable (`test_process_file_stat_error`,
`test_process_file_size_limit`).
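
A minimal spot-check sketch for the stability claim above, using only the standard library. The report layout and the
name of the volatile `meta` section are assumptions; adjust them to the actual report schema.

```python
# Compare two report files produced by separate runs, ignoring volatile meta
# fields. The top-level "meta" key is an assumed location for cache status/path
# and timestamps; the real schema may differ.
import hashlib
import json


def semantic_digest(report_path: str) -> str:
    with open(report_path, encoding="utf-8") as f:
        report = json.load(f)
    report.pop("meta", None)  # drop volatile/provenance fields (assumed key)
    canonical = json.dumps(report, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


# Usage: generate report_run1.json and report_run2.json from two identical runs,
# then confirm the semantic digests match.
# assert semantic_digest("report_run1.json") == semantic_digest("report_run2.json")
```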
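
And a hedged sketch of what the updated `test_process_file_size_limit` could look like. `process_file`,
`MAX_FILE_SIZE`, and the `ProcessingResult` fields appear in the `codeclone/cli.py` diff below; the exact
`process_file` signature, the patch target, and the test module layout are assumptions.

```python
# Sketch only: the size guard should trip based on the mocked os.stat result,
# without the file ever being read. Parameter names for process_file are assumed.
from unittest import mock

from codeclone.cli import MAX_FILE_SIZE, process_file


def test_process_file_size_limit(tmp_path):
    target = tmp_path / "big.py"
    target.write_text("x = 1\n", encoding="utf-8")

    fake_stat = mock.Mock(st_size=MAX_FILE_SIZE + 1, st_mtime_ns=123)
    with mock.patch("codeclone.cli.os.stat", return_value=fake_stat):
        result = process_file(root=str(tmp_path), filepath=str(target))

    assert result.success is False
    assert result.error_kind == "file_too_large"
```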

### Notes

- No changes to:
- detection semantics / fingerprints
- baseline hash inputs (`payload_sha256` semantic payload)
- exit code contract and precedence
- schema versions (baseline v1.0, cache v1.2, report v1.1)

---

## [1.4.1] - 2026-02-15

### CLI
21 changes: 19 additions & 2 deletions codeclone/blocks.py
@@ -9,6 +9,7 @@
from __future__ import annotations

import ast
from collections.abc import Sequence
from dataclasses import dataclass

from .blockhash import stmt_hash
@@ -45,12 +46,20 @@ def extract_blocks(
cfg: NormalizationConfig,
block_size: int,
max_blocks: int,
precomputed_hashes: Sequence[str] | None = None,
) -> list[BlockUnit]:
body = getattr(func_node, "body", None)
if not isinstance(body, list) or len(body) < block_size:
return []

stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
if precomputed_hashes is not None:
assert len(precomputed_hashes) == len(body), (
f"precomputed_hashes length {len(precomputed_hashes)} "
f"!= body length {len(body)}"
)
stmt_hashes = precomputed_hashes
else:
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]

blocks: list[BlockUnit] = []
last_start: int | None = None
@@ -94,12 +103,20 @@ def extract_segments(
cfg: NormalizationConfig,
window_size: int,
max_segments: int,
precomputed_hashes: Sequence[str] | None = None,
) -> list[SegmentUnit]:
body = getattr(func_node, "body", None)
if not isinstance(body, list) or len(body) < window_size:
return []

stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]
if precomputed_hashes is not None:
assert len(precomputed_hashes) == len(body), (
f"precomputed_hashes length {len(precomputed_hashes)} "
f"!= body length {len(body)}"
)
stmt_hashes = precomputed_hashes
else:
stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body]

segments: list[SegmentUnit] = []

10 changes: 5 additions & 5 deletions codeclone/cache.py
@@ -344,14 +344,14 @@ def save(self) -> None:
try:
self.path.parent.mkdir(parents=True, exist_ok=True)
wire_files: dict[str, object] = {}
for runtime_path in sorted(
self.data["files"], key=self._wire_filepath_from_runtime
):
wire_map = {
rp: self._wire_filepath_from_runtime(rp) for rp in self.data["files"]
}
for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
entry = self.get_file_entry(runtime_path)
if entry is None:
continue
wire_path = self._wire_filepath_from_runtime(runtime_path)
wire_files[wire_path] = _encode_wire_file_entry(entry)
wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry)

payload: dict[str, object] = {
"py": current_python_tag(),
101 changes: 39 additions & 62 deletions codeclone/cli.py
@@ -122,14 +122,14 @@ def process_file(
"""

try:
# Check file size
# Single os.stat() for both size check and cache signature
try:
st_size = os.path.getsize(filepath)
if st_size > MAX_FILE_SIZE:
st = os.stat(filepath)
if st.st_size > MAX_FILE_SIZE:
return ProcessingResult(
filepath=filepath,
success=False,
error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})",
error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})",
error_kind="file_too_large",
)
except OSError as e:
@@ -140,6 +140,8 @@
error_kind="stat_error",
)

stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size}

try:
source = Path(filepath).read_text("utf-8")
except UnicodeDecodeError as e:
@@ -157,7 +159,6 @@
error_kind="source_read_error",
)

stat = file_stat_signature(filepath)
module_name = module_name_from_path(root, filepath)

units, blocks, segments = extract_units_from_source(
@@ -355,68 +356,44 @@ def _safe_future_result(
return None, str(e)

# Discovery phase
try:
if args.quiet:
for fp in iter_py_files(str(root_path)):
files_found += 1
stat, cached, warn = _get_cached_entry(fp)
if warn:
console.print(warn)
files_skipped += 1
continue
if cached and cached.get("stat") == stat:
cache_hits += 1
all_units.extend(
cast(
list[GroupItem],
cast(object, cached.get("units", [])),
)
def _discover_files() -> None:
nonlocal files_found, cache_hits, files_skipped
for fp in iter_py_files(str(root_path)):
files_found += 1
stat, cached, warn = _get_cached_entry(fp)
if warn:
console.print(warn)
files_skipped += 1
continue
if cached and cached.get("stat") == stat:
cache_hits += 1
all_units.extend(
cast(
list[GroupItem],
cast(object, cached.get("units", [])),
)
all_blocks.extend(
cast(
list[GroupItem],
cast(object, cached.get("blocks", [])),
)
)
all_blocks.extend(
cast(
list[GroupItem],
cast(object, cached.get("blocks", [])),
)
all_segments.extend(
cast(
list[GroupItem],
cast(object, cached.get("segments", [])),
)
)
all_segments.extend(
cast(
list[GroupItem],
cast(object, cached.get("segments", [])),
)
else:
files_to_process.append(fp)
)
else:
files_to_process.append(fp)

try:
if args.quiet:
_discover_files()
else:
with console.status(ui.STATUS_DISCOVERING, spinner="dots"):
for fp in iter_py_files(str(root_path)):
files_found += 1
stat, cached, warn = _get_cached_entry(fp)
if warn:
console.print(warn)
files_skipped += 1
continue
if cached and cached.get("stat") == stat:
cache_hits += 1
all_units.extend(
cast(
list[GroupItem],
cast(object, cached.get("units", [])),
)
)
all_blocks.extend(
cast(
list[GroupItem],
cast(object, cached.get("blocks", [])),
)
)
all_segments.extend(
cast(
list[GroupItem],
cast(object, cached.get("segments", [])),
)
)
else:
files_to_process.append(fp)
_discover_files()
except OSError as e:
console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e)))
sys.exit(ExitCode.CONTRACT_ERROR)
61 changes: 38 additions & 23 deletions codeclone/extractor.py
@@ -16,6 +16,7 @@
from contextlib import contextmanager
from dataclasses import dataclass

from .blockhash import stmt_hash
from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments
from .cfg import CFGBuilder
from .errors import ParseError
@@ -250,28 +251,42 @@ def extract_units_from_source(
)
)

# Block-level units (exclude __init__)
if not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10:
blocks = extract_blocks(
node,
filepath=filepath,
qualname=qualname,
cfg=cfg,
block_size=4,
max_blocks=15,
)
block_units.extend(blocks)

# Segment-level units (windows within functions, for internal clones)
if loc >= 30 and stmt_count >= 12:
segments = extract_segments(
node,
filepath=filepath,
qualname=qualname,
cfg=cfg,
window_size=6,
max_segments=60,
)
segment_units.extend(segments)
# Block-level and segment-level units share statement hashes
needs_blocks = (
not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10
)
needs_segments = loc >= 30 and stmt_count >= 12

if needs_blocks or needs_segments:
body = getattr(node, "body", None)
hashes: list[str] | None = None
if isinstance(body, list):
hashes = [stmt_hash(stmt, cfg) for stmt in body]

if needs_blocks:
block_units.extend(
extract_blocks(
node,
filepath=filepath,
qualname=qualname,
cfg=cfg,
block_size=4,
max_blocks=15,
precomputed_hashes=hashes,
)
)

if needs_segments:
segment_units.extend(
extract_segments(
node,
filepath=filepath,
qualname=qualname,
cfg=cfg,
window_size=6,
max_segments=60,
precomputed_hashes=hashes,
)
)

return units, block_units, segment_units
20 changes: 12 additions & 8 deletions codeclone/scanner.py
@@ -77,8 +77,9 @@ def iter_py_files(
if root_str.startswith(sensitive + "/"):
raise ValidationError(f"Cannot scan under sensitive directory: {root}")

file_count = 0
for p in sorted(rootp.rglob("*.py"), key=lambda path: str(path)):
# Collect and filter first, then sort — avoids sorting excluded paths
candidates: list[Path] = []
for p in rootp.rglob("*.py"):
# Verify path is actually under root (prevent symlink attacks)
try:
p.resolve().relative_to(rootp)
@@ -90,12 +91,15 @@
if any(ex in parts for ex in excludes):
continue

file_count += 1
if file_count > max_files:
raise ValidationError(
f"File count exceeds limit of {max_files}. "
"Use more specific root or increase limit."
)
candidates.append(p)

if len(candidates) > max_files:
raise ValidationError(
f"File count exceeds limit of {max_files}. "
"Use more specific root or increase limit."
)

for p in sorted(candidates, key=lambda path: str(path)):
yield str(p)


2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "codeclone"
version = "1.4.1"
version = "1.4.2"
description = "AST and CFG-based code clone detector for Python focused on architectural duplication"
readme = { file = "README.md", content-type = "text/markdown" }
license = { text = "MIT" }