diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md index bde1b5a1..ad05587f 100644 --- a/.claude/commands/ai-review-local.md +++ b/.claude/commands/ai-review-local.md @@ -1,6 +1,6 @@ --- description: Run AI code review locally using OpenAI API before opening a PR -argument-hint: "[--full-registry] [--model <model>] [--dry-run]" +argument-hint: "[--context minimal|standard|deep] [--include-files <files>] [--token-budget <n>] [--force-fresh] [--full-registry] [--model <model>] [--dry-run]" --- # Local AI Code Review @@ -12,6 +12,15 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR ## Arguments `$ARGUMENTS` may contain optional flags: +- `--context {minimal,standard,deep}`: Context depth (default: `standard`) + - `minimal`: Diff only (original behavior) + - `standard`: Diff + full contents of changed `diff_diff/` source files + - `deep`: Standard + import-graph expansion (files imported by changed files) +- `--include-files <files>`: Extra files to include as read-only context + (filenames resolve under `diff_diff/`, or use paths relative to repo root) +- `--token-budget <n>`: Max estimated input tokens before dropping import-context + files (default: 200000). Changed source files are always included regardless of budget. +- `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists - `--full-registry`: Include the entire REGISTRY.md instead of selective sections - `--model <model>`: Override the OpenAI model (default: `gpt-5.4`) - `--dry-run`: Print the compiled prompt without calling the API @@ -31,7 +40,8 @@ before any data is sent externally. ### Step 1: Parse Arguments Parse `$ARGUMENTS` for the optional flags listed above. All flags are optional — -the default behavior (selective registry, gpt-5.4, live API call) requires no arguments. +the default behavior (standard context, selective registry, gpt-5.4, live API call) +requires no arguments.
### Step 2: Validate Prerequisites @@ -146,20 +156,149 @@ Options: If user selects abort, clean up temp files and stop. If continue, proceed. +### Step 3c: Full-File Secret Scan (for standard/deep context) + +When `--context` is not `minimal`, full source files will be uploaded to OpenAI. The diff-based scan in Step 3b only covers changed lines, so scan the full content of every file that will be transmitted: + +```bash +# Category 1: Changed diff_diff/ source files (standard/deep) +upload_scan_files="" +if [ "$context_level" != "minimal" ]; then + upload_scan_files=$(git diff --name-only "${comparison_ref}...HEAD" | grep "^diff_diff/.*\.py$" || true) +fi + +# Category 2: All diff_diff/*.py for deep mode (conservative superset of import expansion) +if [ "$context_level" = "deep" ]; then + upload_scan_files=$(find diff_diff -name "*.py" -not -path "*/__pycache__/*" 2>/dev/null | sort -u) +fi + +# Category 3: --include-files (mirror script's path confinement) +if [ -n "$include_files" ]; then + repo_root_real=$(pwd -P) + for f in $(echo "$include_files" | tr ',' ' '); do + # Reject absolute paths (mirrors script's os.path.isabs check) + case "$f" in /*) continue ;; esac + # Mirror script's branching: only bare filenames (no /) resolve under diff_diff/ + # Slash-containing inputs resolve relative to repo root only + case "$f" in + */*) if [ -f "$f" ]; then candidate="$f"; else continue; fi ;; + *) if [ -f "diff_diff/$f" ]; then candidate="diff_diff/$f"; else continue; fi ;; + esac + # Verify within repo root (mirrors script's realpath containment) + real_candidate=$(cd "$(dirname "$candidate")" && pwd -P)/$(basename "$candidate") + case "$real_candidate" in "$repo_root_real"/*) upload_scan_files="$upload_scan_files $candidate" ;; esac + done +fi + +# Scan using same canonical content patterns from Step 3b (never echo secret values) +if [ -n "$upload_scan_files" ]; then + secret_hits=$(grep -rlE 
"(AKIA[A-Z0-9]{16}|ghp_[a-zA-Z0-9]{36}|sk-[a-zA-Z0-9]{48}|gho_[a-zA-Z0-9]{36}|[Aa][Pp][Ii][_-]?[Kk][Ee][Yy][[:space:]]*[=:]|[Ss][Ee][Cc][Rr][Ee][Tt][_-]?[Kk][Ee][Yy][[:space:]]*[=:]|[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd][[:space:]]*[=:]|[Pp][Rr][Ii][Vv][Aa][Tt][Ee][_-]?[Kk][Ee][Yy]|[Bb][Ee][Aa][Rr][Ee][Rr][[:space:]]+[a-zA-Z0-9_-]+|[Tt][Oo][Kk][Ee][Nn][[:space:]]*[=:])" $upload_scan_files 2>/dev/null || true) + sensitive_hits=$(echo "$upload_scan_files" | tr ' ' '\n' | grep -iE "(\.env|credentials|secret|\.pem|\.key|\.p12|\.pfx|id_rsa|id_ed25519)$" || true) +fi +``` + +If either `secret_hits` or `sensitive_hits` is non-empty, use AskUserQuestion: +``` +Warning: Potential secrets detected in source files that would be uploaded to OpenAI +(full-file scan, not just diff hunks): +- + +Options: +1. Abort — review and remove secrets before retrying +2. Continue — I confirm these are not real secrets +``` + +If user selects abort, clean up temp files and stop. If continue, proceed. + ### Step 4: Handle Re-Review State -If `.claude/reviews/local-review-latest.md` exists, preserve it for re-review context: +Check for existing review state and generate delta diff if applicable. **Validate that the +stored state matches the current branch and base** before reusing — stale state from a +different branch can contaminate re-review context. + +**If `--force-fresh` is set**, delete prior state but still seed a new baseline: ```bash -if [ -f .claude/reviews/local-review-latest.md ]; then - cp .claude/reviews/local-review-latest.md .claude/reviews/local-review-previous.md - echo "Previous review preserved for re-review context." +rm -f .claude/reviews/review-state.json +rm -f .claude/reviews/local-review-latest.md +rm -f .claude/reviews/local-review-previous.md +echo "Force-fresh: deleted all prior review state. Fresh review will seed new baseline." 
+# Do NOT pass --previous-review or --delta-diff/--delta-changed-files +# DO still pass --review-state, --commit-sha, --base-ref so the fresh run seeds a new baseline +``` + +**Otherwise**, validate existing review state using the Python validator (single-point +validation that checks schema version, branch/base match, and required finding fields): +```bash +if [ -f .claude/reviews/review-state.json ]; then + # Use the script's validate_review_state() for comprehensive validation + # Returns: last_reviewed_commit and is_valid flag + validation_result=$(python3 -c " +import importlib.util, sys +spec = importlib.util.spec_from_file_location('openai_review', '.claude/scripts/openai_review.py') +mod = importlib.util.module_from_spec(spec) +spec.loader.exec_module(mod) +_, _, commit, valid = mod.validate_review_state( + '.claude/reviews/review-state.json', '$branch_name', '$comparison_ref' +) +print(f'{commit}|{valid}') +" 2>/dev/null || echo "|False") + last_reviewed_commit=$(echo "$validation_result" | cut -d'|' -f1) + state_valid=$(echo "$validation_result" | cut -d'|' -f2) + + if [ "$state_valid" != "True" ]; then + echo "Warning: review-state.json validation failed. Running fresh review." + rm -f .claude/reviews/review-state.json + rm -f .claude/reviews/local-review-previous.md + last_reviewed_commit="" + fi + + # Only generate delta when state is valid AND commit is an ancestor of HEAD + if [ -n "$last_reviewed_commit" ] && git merge-base --is-ancestor "$last_reviewed_commit" HEAD 2>/dev/null; then + # SHA is a valid ancestor — generate delta diff + git diff --unified=5 "${last_reviewed_commit}...HEAD" > /tmp/ai-review-delta-diff.patch + git diff --name-status "${last_reviewed_commit}...HEAD" > /tmp/ai-review-delta-files.txt + + # Check if delta is empty + if [ ! -s /tmp/ai-review-delta-diff.patch ]; then + echo "No changes since last review (commit ${last_reviewed_commit:0:7}). Use --force-fresh to re-review." 
+ rm -f /tmp/ai-review-delta-diff.patch /tmp/ai-review-delta-files.txt + rm -f /tmp/ai-review-diff.patch /tmp/ai-review-files.txt + # Stop here + fi + # State validated and delta generated — preserve previous review for re-review context + if [ -f .claude/reviews/local-review-latest.md ]; then + cp .claude/reviews/local-review-latest.md .claude/reviews/local-review-previous.md + echo "Previous review preserved for re-review context." + fi + else + echo "Warning: Previous review commit is not an ancestor of HEAD (likely rebase). Running fresh review." + rm -f .claude/reviews/review-state.json + rm -f .claude/reviews/local-review-previous.md + fi +fi + +# Final cleanup: if delta mode was NOT activated (no delta files generated), +# ensure no stale previous-review file can leak into the fresh review. +# This covers the case where review-state.json is missing entirely but +# local-review-previous.md lingers from a prior run. +if [ ! -f /tmp/ai-review-delta-diff.patch ]; then + rm -f .claude/reviews/local-review-previous.md fi ``` +**Important**: Previous review text is ONLY preserved when delta mode is active (delta files +generated and state validated). When delta mode is NOT active — whether because +`review-state.json` is missing, branch/base mismatch, non-ancestor, or `--force-fresh` — +the previous review file is deleted to prevent stale findings from leaking into a fresh +review via `--previous-review`. + ### Step 5: Run the Review Script -Build and run the command. Include `--previous-review` only if the previous review file -exists from Step 4. +Build and run the command. 
Include optional arguments only when their conditions are met: - `--previous-review`: only if `.claude/reviews/local-review-previous.md` exists AND `--force-fresh` was NOT set - `--delta-diff` and `--delta-changed-files`: only if delta files were generated in Step 4 - `--review-state`, `--commit-sha`, `--base-ref`: always include (even with `--force-fresh`, to seed a new baseline) - `--context`, `--include-files`, `--token-budget`: pass through from parsed arguments ```bash python3 .claude/scripts/openai_review.py \ @@ -169,14 +308,26 @@ python3 .claude/scripts/openai_review.py \ --changed-files /tmp/ai-review-files.txt \ --output .claude/reviews/local-review-latest.md \ --branch-info "$branch_name" \ +    --repo-root "$(pwd)" \ +    --context "$context_level" \ +    --review-state .claude/reviews/review-state.json \ +    --commit-sha "$(git rev-parse HEAD)" \ +    --base-ref "$comparison_ref" \ [--previous-review .claude/reviews/local-review-previous.md] \ +    [--delta-diff /tmp/ai-review-delta-diff.patch] \ +    [--delta-changed-files /tmp/ai-review-delta-files.txt] \ +    [--include-files "$include_files"] \ +    [--token-budget "$token_budget"] \ [--full-registry] \ [--model <model>] \ [--dry-run] ``` -If `--dry-run`: display the prompt output and stop. Report the estimated token count -and model that would be used. +Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are +generated in Step 4 and is NOT passed to the script. + +If `--dry-run`: display the prompt output and stop. Report the estimated token count, +cost estimate, and model that would be used. If the script exits non-zero, display the error output and stop. @@ -191,6 +342,10 @@ Parse the review output to extract ALL findings. For each finding, capture: - Section (Methodology, Code Quality, etc.) - One-line summary of the issue +Note: The script handles writing `review-state.json` automatically (finding tracking +across rounds).
The skill does NOT need to write JSON — just pass `--review-state` +and `--commit-sha` to the script. + Present a **findings summary** showing every finding, grouped by severity: ``` @@ -243,13 +398,21 @@ directly — for P0/P1 issues use `EnterPlanMode` for a structured approach; for issues, fix them directly since they are minor. After fixes are committed, the user re-runs `/ai-review-local` for a follow-up review. +On re-review, the script automatically activates delta-diff mode (comparing only +changes since the last reviewed commit) and shows a structured findings table +tracking which previous findings have been addressed. ### Step 8: Cleanup ```bash -rm -f /tmp/ai-review-diff.patch /tmp/ai-review-files.txt +rm -f /tmp/ai-review-diff.patch /tmp/ai-review-files.txt \ + /tmp/ai-review-delta-diff.patch /tmp/ai-review-delta-files.txt ``` +Note: `.claude/reviews/review-state.json` is preserved across review rounds (it +tracks the last reviewed commit and findings). It is cleaned up when the user +runs `--force-fresh` or when a rebase invalidates the tracked commit. 
+ ## Error Handling | Scenario | Response | @@ -260,21 +423,36 @@ rm -f /tmp/ai-review-diff.patch /tmp/ai-review-files.txt | Script exits non-zero | Display stderr output from script | | Previous review file missing on re-run | Script warns and continues as fresh review | | User aborts due to uncommitted changes | Clean exit | +| No changes since last review (empty delta) | Report and stop, suggest `--force-fresh` | +| Rebase invalidates last reviewed commit | Warn, delete stale state, run fresh review | +| `review-state.json` schema mismatch | Script warns and starts fresh | ## Examples ```bash -# Standard review of current branch vs main +# Standard review of current branch vs main (default: full source file context) /ai-review-local -# Review with full methodology registry -/ai-review-local --full-registry +# Review with minimal context (diff only, original behavior) +/ai-review-local --context minimal + +# Review with deep context (changed files + imported files) +/ai-review-local --context deep + +# Include specific files as extra context +/ai-review-local --include-files linalg.py,utils.py # Preview the compiled prompt without calling the API /ai-review-local --dry-run -# Use a different model -/ai-review-local --model gpt-4.1 +# Force a fresh review (ignore previous review state) +/ai-review-local --force-fresh + +# Use a different model with full registry +/ai-review-local --model gpt-4.1 --full-registry + +# Limit token budget for faster/cheaper reviews +/ai-review-local --token-budget 100000 ``` ## Notes @@ -282,6 +460,20 @@ rm -f /tmp/ai-review-diff.patch /tmp/ai-review-files.txt - This skill does NOT modify source files — it only generates temp files and review artifacts in `.claude/reviews/` (which is gitignored). It may also create a commit if there are uncommitted changes (Step 3). +- **Context levels**: By default (`standard`), the full contents of changed + `diff_diff/` source files are sent alongside the diff. 
This catches "sins of + omission" — code that should have changed but wasn't (e.g., a wrapper missing + a new parameter). Use `--context deep` to also include files imported by + changed files as read-only reference. +- **Delta-diff re-review**: When `review-state.json` exists from a previous run, + the script automatically generates a delta diff (changes since the last reviewed + commit) and focuses the reviewer on those changes. The full branch diff is + included as reference context. Use `--force-fresh` to bypass this. +- **Finding tracking**: The script writes structured findings to `review-state.json` + after each review. On re-review, previous findings are shown in a table with + their status (open/addressed), enabling the reviewer to focus on what changed. +- **Cost visibility**: The script shows estimated cost before the API call and + actual cost (from the API response) after completion. - Re-review mode activates automatically when a previous review exists in `.claude/reviews/local-review-latest.md` - The review criteria are adapted from `.github/codex/prompts/pr_review.md` (same @@ -290,8 +482,9 @@ rm -f /tmp/ai-review-diff.patch /tmp/ai-review-files.txt - The CI review (Codex action with full repo access) remains the authoritative final check — local review is a fast first pass to catch most issues early - **Data transmission**: In non-dry-run mode, this skill transmits the unified diff, - changed-file metadata, selected methodology registry text, and prior review context - (if present) to OpenAI via the Chat Completions API. Use `--dry-run` to preview - exactly what would be sent. + changed-file metadata, full source file contents (in standard/deep mode), + import-context files (in deep mode), selected methodology registry text, and + prior review context (if present) to OpenAI via the Chat Completions API. + Use `--dry-run` to preview exactly what would be sent. 
- This skill pairs naturally with the iterative workflow: `/ai-review-local` -> address findings -> `/ai-review-local` -> `/submit-pr` diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index 8cb3bbeb..b305b2c0 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -22,13 +22,18 @@ - Registry section extraction: mapping changed files to REGISTRY.md sections - OpenAI API call: authentication, request, error handling, timeout - Output: writing review markdown to --output path + - Review state: reading/writing review-state.json (finding tracking across rounds) + - Cost estimation: token counting and pricing lookup The script does NOT perform secret scanning. The skill must scan before calling. """ import argparse +import ast +import datetime import json import os +import re import sys import urllib.error import urllib.request @@ -139,6 +144,750 @@ def extract_registry_sections(registry_text: str, section_names: "set[str]") -> return "\n".join(extracted) +# --------------------------------------------------------------------------- +# File context — reading full source files for context +# --------------------------------------------------------------------------- + + +def resolve_changed_source_files( + changed_files_text: str, repo_root: str +) -> "list[str]": + """Return absolute paths of changed diff_diff/ .py files that exist on disk. + + Filters to diff_diff/**/*.py only (not tests, docs, configs). + Skips deleted files (status D in name-status output). 
+    """ +    paths: list[str] = [] +    for line in changed_files_text.strip().splitlines(): +        parts = line.split() +        if not parts: +            continue +        # name-status format: "M\tdiff_diff/foo.py" or "D\tdiff_diff/bar.py" +        status = parts[0] if len(parts) >= 2 else "" +        path = parts[-1] +        if status == "D": +            continue +        if not path.startswith("diff_diff/") or not path.endswith(".py"): +            continue +        abs_path = os.path.join(repo_root, path) +        if os.path.isfile(abs_path): +            paths.append(abs_path) +    return paths + + +def read_source_files( +    paths: "list[str]", repo_root: str, role: "str | None" = None +) -> str: +    """Read files and wrap in XML-style tags for the prompt. + +    Args: +        paths: Absolute file paths to read. +        repo_root: Repository root for computing relative paths. +        role: Optional role attribute (e.g., "import-context"). + +    Returns: +        Concatenated string of tagged file contents. +    """ +    parts: list[str] = [] +    for abs_path in paths: +        rel_path = os.path.relpath(abs_path, repo_root) +        try: +            with open(abs_path) as f: +                content = f.read() +        except (OSError, IOError) as e: +            print( +                f"Warning: Could not read {rel_path}: {e}", file=sys.stderr +            ) +            continue +        if role: +            parts.append(f'<file path="{rel_path}" role="{role}">') +        else: +            parts.append(f'<file path="{rel_path}">') +        parts.append(content) +        parts.append("</file>\n") +    return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Import graph expansion +# --------------------------------------------------------------------------- + + +def parse_imports(file_path: str) -> "set[str]": +    """Extract diff_diff.* imports from a Python file using AST. + +    Returns set of module names (e.g., {"diff_diff.linalg", "diff_diff.utils"}). +    Resolves relative imports using the file's position within diff_diff/.
+ """ + try: + with open(file_path) as f: + source = f.read() + except (OSError, IOError): + return set() + + try: + tree = ast.parse(source, filename=file_path) + except SyntaxError: + print( + f"Warning: SyntaxError parsing {file_path} for imports.", + file=sys.stderr, + ) + return set() + + # Determine the package path for resolving relative imports + # e.g., diff_diff/staggered.py -> package = "diff_diff" + # e.g., diff_diff/visualization/_event_study.py -> package = "diff_diff.visualization" + package = _package_for_file(file_path) + + imports: set[str] = set() + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + if alias.name.startswith("diff_diff."): + imports.add(alias.name) + elif isinstance(node, ast.ImportFrom): + if node.module and node.level == 0: + # Absolute import: from diff_diff.linalg import ... + if node.module.startswith("diff_diff."): + imports.add(node.module) + elif node.level > 0 and package: + # Relative import: from .foo import bar, or from . import foo + resolved = _resolve_relative_import( + package, node.module, node.level + ) + if resolved and resolved.startswith("diff_diff"): + if node.module: + # from .linalg import solve_ols → resolved = "diff_diff.linalg" + imports.add(resolved) + else: + # from . import _event_study, _common → append each alias + for alias in node.names: + imports.add(f"{resolved}.{alias.name}") + return imports + + +def _package_for_file(file_path: str) -> "str | None": + """Determine the package name for a file within the repo. 
+ + E.g., /repo/diff_diff/staggered.py -> "diff_diff" + /repo/diff_diff/visualization/_event_study.py -> "diff_diff.visualization" + """ + # Normalize and find diff_diff in the path + norm = os.path.normpath(file_path) + parts = norm.split(os.sep) + try: + idx = parts.index("diff_diff") + except ValueError: + return None + # Package is everything from diff_diff up to (but not including) the filename + pkg_parts = parts[idx:-1] + return ".".join(pkg_parts) if pkg_parts else None + + +def _resolve_relative_import( + package: str, module: "str | None", level: int +) -> "str | None": + """Resolve a relative import to an absolute module name. + + E.g., package="diff_diff", module="utils", level=1 -> "diff_diff.utils" + package="diff_diff.visualization", module=None, level=2 -> "diff_diff" + """ + parts = package.split(".") + # Go up `level` levels (level=1 means current package, level=2 means parent) + up = level - 1 + if up > 0: + parts = parts[:-up] if up < len(parts) else [] + if not parts: + return None + base = ".".join(parts) + if module: + return f"{base}.{module}" + return base + + +def resolve_module_to_path(module_name: str, repo_root: str) -> "str | None": + """Convert a module name to a file path if it exists. + + E.g., "diff_diff.linalg" -> "/diff_diff/linalg.py" + Also tries __init__.py for packages. + """ + rel = module_name.replace(".", os.sep) + # Try as a .py file first + candidate = os.path.join(repo_root, rel + ".py") + if os.path.isfile(candidate): + return candidate + # Try as a package (__init__.py) + candidate = os.path.join(repo_root, rel, "__init__.py") + if os.path.isfile(candidate): + return candidate + return None + + +def expand_import_graph( + changed_paths: "list[str]", repo_root: str +) -> "list[str]": + """Expand first-level imports from changed files. + + Returns additional file paths (not in changed_paths) that are imported + by the changed files. Only diff_diff.* imports are considered. 
+ """ + changed_set = set(os.path.normpath(p) for p in changed_paths) + import_paths: set[str] = set() + + for file_path in changed_paths: + for module_name in parse_imports(file_path): + resolved = resolve_module_to_path(module_name, repo_root) + if resolved and os.path.normpath(resolved) not in changed_set: + import_paths.add(resolved) + + return sorted(import_paths) + + +# --------------------------------------------------------------------------- +# Review state — tracking findings across review rounds +# --------------------------------------------------------------------------- + +_REVIEW_STATE_SCHEMA_VERSION = 1 + + +def parse_review_state(path: str) -> "tuple[list[dict], int]": + """Read review-state.json and return (findings, review_round). + + Returns ([], 0) on missing file or schema mismatch. + """ + try: + with open(path) as f: + data = json.load(f) + except FileNotFoundError: + return ([], 0) + except (json.JSONDecodeError, OSError) as e: + print( + f"Warning: Could not parse review state {path}: {e}", + file=sys.stderr, + ) + return ([], 0) + + # Validate structure: must be a dict with expected fields + if not isinstance(data, dict): + print( + "Warning: review-state.json is not a JSON object. Starting fresh.", + file=sys.stderr, + ) + return ([], 0) + + if data.get("schema_version") != _REVIEW_STATE_SCHEMA_VERSION: + print( + f"Warning: review-state.json schema version mismatch " + f"(expected {_REVIEW_STATE_SCHEMA_VERSION}, " + f"got {data.get('schema_version')}). Starting fresh.", + file=sys.stderr, + ) + return ([], 0) + + findings = data.get("findings", []) + if not isinstance(findings, list): + print( + "Warning: review-state.json findings is not a list. 
Starting fresh.", + file=sys.stderr, + ) + return ([], 0) + # Filter to well-formed finding dicts only — require id, severity, summary, + # status keys to prevent crashes in merge_findings() and compile_prompt() + _REQUIRED_FINDING_KEYS = {"id", "severity", "summary", "status"} + findings = [ + f for f in findings + if isinstance(f, dict) and _REQUIRED_FINDING_KEYS.issubset(f.keys()) + ] + + review_round = data.get("review_round", 0) + if not isinstance(review_round, int): + review_round = 0 + + return (findings, review_round) + + +def validate_review_state( + path: str, expected_branch: str, expected_base: str +) -> "tuple[list[dict], int, str, bool]": + """Comprehensive review-state.json validation. + + Returns (findings, review_round, last_commit, is_valid) where is_valid + means delta mode is safe to use. Checks: file exists, valid JSON, + schema version, branch/base match, and required finding fields. + + The skill should call this once and use is_valid to gate ALL delta behavior. + """ + # Read raw data for validation (separate from parse_review_state which filters) + try: + with open(path) as f: + data = json.load(f) + except (FileNotFoundError, json.JSONDecodeError, OSError): + return ([], 0, "", False) + + if not isinstance(data, dict): + return ([], 0, "", False) + + if data.get("schema_version") != _REVIEW_STATE_SCHEMA_VERSION: + return ([], 0, "", False) + + # Fail closed on ANY malformed finding — if raw findings contain non-dict + # or missing-key entries, the entire state is invalid for delta mode + _REQUIRED_FINDING_KEYS = {"id", "severity", "summary", "status"} + raw_findings = data.get("findings", []) + if not isinstance(raw_findings, list): + return ([], 0, "", False) + for f in raw_findings: + if not isinstance(f, dict) or not _REQUIRED_FINDING_KEYS.issubset(f.keys()): + print( + "Warning: review-state.json contains malformed finding. 
" + "Delta mode disabled.", + file=sys.stderr, + ) + return ([], 0, "", False) + + last_commit = data.get("last_reviewed_commit", "") + stored_branch = data.get("branch", "") + stored_base = data.get("base_ref", "") + + if stored_branch != expected_branch or stored_base != expected_base: + print( + f"Warning: review-state.json is from branch '{stored_branch}' " + f"(base: '{stored_base}'), but current is '{expected_branch}' " + f"(base: '{expected_base}'). Delta mode disabled.", + file=sys.stderr, + ) + return ([], 0, last_commit, False) + + if not last_commit: + return ([], 0, "", False) + + # All checks passed — use parse_review_state for the filtered findings + findings, review_round = parse_review_state(path) + return (findings, review_round, last_commit, True) + + +def write_review_state( + path: str, + commit_sha: str, + base_ref: str, + branch: str, + review_round: int, + findings: "list[dict]", +) -> None: + """Write review-state.json with the current review state.""" + state = { + "schema_version": _REVIEW_STATE_SCHEMA_VERSION, + "last_reviewed_commit": commit_sha, + "base_ref": base_ref, + "review_round": review_round, + "branch": branch, + "reviewed_at": datetime.datetime.now(datetime.timezone.utc).isoformat(), + "findings": findings, + } + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + with open(path, "w") as f: + json.dump(state, f, indent=2) + + +# Match optional Markdown list prefix: -, *, +, or numbered (1., 2., etc.) +_LP = r"^(?:[-*+]|\d+\.?)?\s*" + +_BLOCK_START = re.compile( + _LP + r"\*\*(P[0-3])\*\*" # - **P1**, 1. 
**P1**, * **P1** + r"|" + _LP + r"\*\*(P[0-3]):\*\*" # - **P1:** summary (bold severity with colon) + r"|" + _LP + r"\*\*Severity:\*\*\s*(P[0-3])" # - **Severity:** P1 + r"|" + _LP + r"\*\*Severity:\s*(P[0-3])\*\*" # - **Severity: P1** + r"|" + _LP + r"Severity:\s*\*{0,2}`?(P[0-3])`?\*{0,2}" # Severity: P1, Severity: **P1**, Severity: `P1` + r"|" + _LP + r"(P[0-3]):\s" # - P1: summary (bare severity with colon) +) + +_IMPACT_PATTERN = re.compile(r"(?:\*\*)?Impact:(?:\*\*)?\s*(.+)") +_LOCATION_LABEL_PATTERN = re.compile(r"(?:\*\*)?Location:(?:\*\*)?\s*(.+)") + +_LOCATION_PATTERN = re.compile( + r"(?:`?)([\w/._-]+\.\w+(?::L?\d+(?:-L?\d+)?)?)(?:`?)" +) + +# Lines to skip when checking if a severity line is a real finding +_SKIP_PHRASES = [ + "findings are resolved", "findings have been addressed", + "should be marked", "assessment should be", + "does NOT need", "do NOT need", + "P1+ findings", "P0/P1 findings", +] +_SKIP_MARKERS = ["\u26d4", "\u26a0\ufe0f", "\u2705", "Blocker", "Needs changes", + "Looks good", "Path to Approval"] + + +def _should_skip_line(line: str) -> bool: + """Return True if the line is not a real finding.""" + stripped = line.strip() + if stripped.startswith("|") and stripped.endswith("|"): + return True + if re.search(r"P\d[/+]P\d", line): + return True + if any(m in line for m in _SKIP_MARKERS): + return True + if any(p in line for p in _SKIP_PHRASES): + return True + return False + + +def parse_review_findings( + review_text: str, review_round: int +) -> "tuple[list[dict], bool]": + """Parse AI review output for structured findings using block-based parsing. + + Supports both single-line findings (**P1** summary) and multi-line blocks + (Severity/Impact/Concrete fix on separate lines). + + Returns (findings, parse_uncertain) where parse_uncertain is True when + severity markers exist in the text but no findings could be parsed. 
+ """ + # Pass 1: collect blocks + blocks: list[tuple[str, str, list[str]]] = [] # (severity, section, lines) + current_section = "General" + current_block: "list[str] | None" = None + current_severity = "" + current_block_section = "" + + for line in review_text.splitlines(): + # Detect section headings + if line.startswith("## ") or line.startswith("### "): + # Flush current block + if current_block is not None: + blocks.append((current_severity, current_block_section, current_block)) + current_block = None + heading = line.lstrip("#").strip() + if heading and "summary" not in heading.lower(): + current_section = heading + continue + + # Check for block start — trust the severity pattern; don't apply + # skip heuristics here (they're for the uncertainty scanner only) + sev_match = _BLOCK_START.search(line) + if sev_match: + # Flush previous block + if current_block is not None: + blocks.append((current_severity, current_block_section, current_block)) + severity = ( + sev_match.group(1) or sev_match.group(2) + or sev_match.group(3) or sev_match.group(4) + or sev_match.group(5) or sev_match.group(6) + ) + current_severity = severity + current_block_section = current_section + current_block = [line] + elif current_block is not None: + # Continuation line — append to current block + # End block on blank line followed by non-indented content + if not line.strip(): + current_block.append(line) + else: + current_block.append(line) + + # Flush final block + if current_block is not None: + blocks.append((current_severity, current_block_section, current_block)) + + # Pass 2: extract findings from blocks + findings: list[dict] = [] + counters: dict[str, int] = {} + + for severity, section, lines in blocks: + # Extract summary: prefer **Impact:** line, fall back to first line text + summary = "" + for bline in lines: + impact_match = _IMPACT_PATTERN.search(bline) + if impact_match: + summary = re.sub(r"\*\*", "", impact_match.group(1)).strip() + break + if not summary: + # 
Fall back to text after severity on the first line + first_line = lines[0] if lines else "" + sev_match = _BLOCK_START.search(first_line) + if sev_match: + text_after = first_line[sev_match.end():].strip().lstrip(":—- ").strip() + summary = re.sub(r"\*\*", "", text_after).strip() + + if not summary or len(summary) < 5: + continue + + if len(summary) > 120: + summary = summary[:117] + "..." + + # Extract location from all block lines — check labeled Location: first, + # then fall back to inline file:line pattern + location = "" + for bline in lines: + label_match = _LOCATION_LABEL_PATTERN.search(bline) + if label_match: + # Extract file:line from the label value + loc_in_label = _LOCATION_PATTERN.search(label_match.group(1)) + if loc_in_label: + location = loc_in_label.group(1) + break + loc_match = _LOCATION_PATTERN.search(bline) + if loc_match: + location = loc_match.group(1) + break + + counters[severity] = counters.get(severity, 0) + 1 + finding_id = f"R{review_round}-{severity}-{counters[severity]}" + + findings.append({ + "id": finding_id, + "severity": severity, + "section": section, + "summary": summary, + "location": location, + "status": "open", + }) + + # Fail-safe: detect unparsed severity lines. Count severity-like lines in + # the text and compare with findings parsed. If any remain unparsed, set + # parse_uncertain=True. This handles both total failures (0 findings with + # severity markers) and partial failures (some findings but others missed). + parse_uncertain = False + severity_line_count = 0 + for line in review_text.splitlines(): + if _should_skip_line(line): + continue + if _BLOCK_START.search(line): + severity_line_count += 1 + if severity_line_count > len(findings): + parse_uncertain = True + + return (findings, parse_uncertain) + + +def _finding_keys(f: dict) -> "tuple[tuple[str, str, str], tuple[str, str]]": + """Return (primary_key, fallback_key) for finding matching. 
+ + Primary: (severity, file_path, normalized_summary) — uses normalized full relative + path (not basename) to avoid collisions like __init__.py in different dirs. + Fallback: (severity, normalized_summary) — used when either side lacks a file path, + with unique-candidate constraint to avoid ambiguous matching. + """ + summary = f.get("summary", "").lower().strip() + # Strip inline file:line references that cause churn on line number shifts + # e.g., "missing nan guard in `foo.py:l10`" → "missing nan guard in" + # (summary is already lowercased at this point) + summary = re.sub(r"`?[\w/.]+\.\w+(?::l?\d+(?:-l?\d+)?)?`?", "", summary) + summary = summary.strip() + severity = f.get("severity", "") + location = f.get("location", "") + # Use full relative path (strip line numbers only, keep directory structure) + file_path = location.split(":")[0] if location else "" + primary = (severity, file_path, summary) + fallback = (severity, summary) + return (primary, fallback) + + +def merge_findings( + previous: "list[dict]", current: "list[dict]" +) -> "list[dict]": + """Merge findings across review rounds using tiered matching. + + Pass 1: Match by primary key (severity + file_basename + normalized_summary). + Pass 2: For remaining unmatched findings, try fallback key (severity + + normalized_summary) but ONLY when there's exactly one candidate (unique match). + After both passes: mark unconsumed previous findings as addressed. 
+ """ + # Build lookups — list per key to handle duplicates + prev_by_primary: dict[tuple, list[dict]] = {} + prev_by_fallback: dict[tuple, list[dict]] = {} + for f in previous: + primary, fallback = _finding_keys(f) + prev_by_primary.setdefault(primary, []).append(f) + prev_by_fallback.setdefault(fallback, []).append(f) + + # Track consumed previous findings by id + consumed_ids: set[str] = set() + merged: list[dict] = [] + + # Pass 1: primary key matching + for f in current: + primary, _ = _finding_keys(f) + if primary[1] and primary in prev_by_primary: + # Current finding has a file path — try exact match + candidates = [ + p for p in prev_by_primary[primary] + if p.get("id", "") not in consumed_ids + ] + if candidates: + consumed_ids.add(candidates[0].get("id", "")) + merged.append(f) + continue + merged.append(f) + + # Pass 2: fallback matching — when EITHER the current or previous finding + # lacks a file path. This is symmetric: handles both "current has location, + # previous doesn't" and "previous has location, current doesn't." + + # 2a: Current findings without file paths → try to match unconsumed previous + merged_pass2: list[dict] = [] + for f in merged: + primary, fallback = _finding_keys(f) + has_file = bool(primary[1]) + + if has_file: + merged_pass2.append(f) + continue + + # No file path on current — try fallback with unique unconsumed candidate + unconsumed = [ + p for p in prev_by_fallback.get(fallback, []) + if p.get("id", "") not in consumed_ids + ] + if len(unconsumed) == 1: + consumed_ids.add(unconsumed[0].get("id", "")) + merged_pass2.append(f) + + # 2b: Unconsumed previous findings without file paths → try to match + # current findings that DO have file paths (reverse direction). + # Track consumed current candidates to ensure one-to-one matching. 
+ current_by_fallback: dict[tuple, list[dict]] = {} + for f in merged_pass2: + _, fallback = _finding_keys(f) + current_by_fallback.setdefault(fallback, []).append(f) + + consumed_current_ids: set[str] = set() + for f in previous: + fid = f.get("id", "") + if fid in consumed_ids: + continue + primary, fallback = _finding_keys(f) + has_file = bool(primary[1]) + if has_file: + continue # Has file path — should have matched in pass 1 if possible + # Previous finding without file path — try fallback against current + # Exclude already-consumed current candidates for one-to-one matching + candidates = [ + c for c in current_by_fallback.get(fallback, []) + if c.get("id", "") not in consumed_current_ids + ] + if len(candidates) == 1: + consumed_ids.add(fid) + consumed_current_ids.add(candidates[0].get("id", "")) + + # Mark unconsumed previous findings as addressed + for f in previous: + if f.get("id", "") not in consumed_ids: + addressed = dict(f) + addressed["status"] = "addressed" + merged_pass2.append(addressed) + + return merged_pass2 + + +# --------------------------------------------------------------------------- +# Token budget management +# --------------------------------------------------------------------------- + +DEFAULT_TOKEN_BUDGET = 200_000 + + +def apply_token_budget( + mandatory_tokens: int, + source_files_text: "str | None", + import_context_text: "str | None", + budget: int, +) -> "tuple[str | None, str | None, list[str]]": + """Apply token budget, dropping lowest-priority context first. + + Changed source files are always included (they are the highest-value + context for catching sins of omission). Only import-context files are + subject to the budget — they are included smallest-first until the + budget is exhausted. + + Returns (source_files_text, import_context_text, dropped_file_names). 
+ """ + remaining = budget - mandatory_tokens + dropped: list[str] = [] + + # Source files are always included (sticky — not budget-governed) + if source_files_text: + remaining -= estimate_tokens(source_files_text) + + # Include import files individually, smallest first + final_import_text: "str | None" = None + if import_context_text and import_context_text.strip(): + # Split into individual file blocks + blocks = re.split(r"(?== block_tokens: + included_blocks.append(block) + remaining -= block_tokens + else: + # Extract filename from the block for the warning + name_match = re.search(r'path="([^"]+)"', block) + name = name_match.group(1) if name_match else "" + dropped.append(name) + + if included_blocks: + final_import_text = "\n".join(included_blocks) + + if mandatory_tokens > budget: + print( + f"Warning: Mandatory prompt sections alone are ~{mandatory_tokens:,} " + f"tokens, exceeding --token-budget of {budget:,}. Proceeding anyway.", + file=sys.stderr, + ) + + return (source_files_text, final_import_text, dropped) + + +# --------------------------------------------------------------------------- +# Cost estimation +# --------------------------------------------------------------------------- + +# Pricing per 1M tokens: (input, output) in USD. +# Source: https://platform.openai.com/docs/pricing +# MAINTENANCE: Update when OpenAI changes pricing. +PRICING = { + "gpt-5.4": (2.50, 15.00), + "gpt-4.1": (2.00, 8.00), + "gpt-4.1-mini": (0.40, 1.60), + "o3": (2.00, 8.00), + "o3-mini": (1.10, 4.40), +} + + +def estimate_cost( + input_tokens: int, output_tokens: int, model: str +) -> "str | None": + """Estimate cost for a given token count and model. + + Returns a formatted string like "$0.09 input + $0.13 max output = $0.22 max", + or None if model pricing is unknown. 
+ """ + # Try exact match first, then longest prefix match + pricing = PRICING.get(model) + if not pricing: + for name in sorted(PRICING.keys(), key=len, reverse=True): + if model.startswith(name): + pricing = PRICING[name] + break + if not pricing: + return None + + input_cost = input_tokens * pricing[0] / 1_000_000 + output_cost = output_tokens * pricing[1] / 1_000_000 + total = input_cost + output_cost + return ( + f"${input_cost:.2f} input + ${output_cost:.2f} max output " + f"= ${total:.2f} max" + ) + + # --------------------------------------------------------------------------- # Prompt compilation # --------------------------------------------------------------------------- @@ -211,6 +960,12 @@ def compile_prompt( changed_files_text: str, branch_info: str, previous_review: "str | None", + # New parameters for enhanced context + source_files_text: "str | None" = None, + import_context_text: "str | None" = None, + delta_diff_text: "str | None" = None, + delta_changed_files_text: "str | None" = None, + structured_findings: "list[dict] | None" = None, ) -> str: """Assemble the full review prompt.""" sections: list[str] = [] @@ -227,30 +982,109 @@ def compile_prompt( ) sections.append(registry_content) - # Re-review block (before changes, so the model sees it in context) - if previous_review: + # Re-review block with structured findings and/or previous review text + if previous_review or structured_findings: + sections.append("\n---\n") + + if structured_findings and delta_diff_text: + # Enhanced re-review with structured findings table + round_num = max( + (f.get("id", "R0").split("-")[0].lstrip("R") for f in structured_findings), + default="0", + ) + sections.append(f"## Previous Review (Round {round_num})\n") + sections.append("### Previous Findings\n") + sections.append( + "| ID | Severity | Section | Summary | Location | Status |\n" + "|-----|----------|---------|---------|----------|--------|\n" + ) + for f in structured_findings: + # Escape pipe and 
newline chars to prevent table corruption + esc = lambda s: str(s).replace("|", "\\|").replace("\n", " ") + sections.append( + f"| {esc(f.get('id', ''))} | {esc(f.get('severity', ''))} " + f"| {esc(f.get('section', ''))} | {esc(f.get('summary', ''))} " + f"| {esc(f.get('location', ''))} | {esc(f.get('status', ''))} |\n" + ) + sections.append("") + elif previous_review: + sections.append("## Previous Review\n") + + if previous_review: + sections.append( + "This is a follow-up review. The previous review's findings are included " + "below. Focus on whether previous P0/P1 findings have been addressed. " + "New findings on unchanged code should be marked \"[Newly identified]\". " + "If all previous P1+ findings are resolved, the assessment should be " + "\u2705 even if new P2/P3 items are noticed.\n" + ) + if structured_findings: + sections.append("### Full Previous Review\n") + sections.append("") + sections.append(previous_review) + sections.append("\n") + + # Delta diff section (re-review with changes since last review) + if delta_diff_text: + sections.append("\n---\n") + sections.append("## Changes Since Last Review\n") + sections.append( + "These are the changes made since the last review. " + "Focus your review on these changes. Check whether previous " + "P0/P1 findings have been addressed.\n" + ) + if delta_changed_files_text: + sections.append("\nChanged files (since last review):\n") + sections.append(delta_changed_files_text) + sections.append("\nDelta diff:\n") + sections.append(delta_diff_text) + + # Full branch diff as reference + sections.append("\n---\n") + sections.append("## Full Branch Diff (Reference Only)\n") + sections.append( + "The complete diff from the base branch is included below for " + "reference context. Do NOT re-review unchanged code. 
Only reference " + "this section to understand the broader context of the delta changes " + "above.\n" + ) + sections.append("") + sections.append(diff_text) + sections.append("\n") + else: + # Fresh review — changes under review sections.append("\n---\n") - sections.append("## Previous Review\n") + sections.append("## Changes Under Review\n") + if branch_info: + sections.append(f"Branch: {branch_info}\n") + sections.append("\nChanged files:\n") + sections.append(changed_files_text) + sections.append("\nUnified diff (context=5):\n") + sections.append(diff_text) + + # Full source files section + if source_files_text: + sections.append("\n---\n") + sections.append("## Full Source Files (Changed)\n") sections.append( - "This is a follow-up review. The previous review's findings are included " - "below. Focus on whether previous P0/P1 findings have been addressed. " - "New findings on unchanged code should be marked \"[Newly identified]\". " - "If all previous P1+ findings are resolved, the assessment should be " - "\u2705 even if new P2/P3 items are noticed.\n" + "The complete contents of source files modified in this change are " + "provided below. 
Use these to identify \"sins of omission\" — code " + "that should have been changed but wasn't (e.g., a new parameter " + "added to one function but missing from its wrapper).\n" ) - sections.append("") - sections.append(previous_review) - sections.append("\n") - - # Section 3: Changes under review - sections.append("\n---\n") - sections.append("## Changes Under Review\n") - if branch_info: - sections.append(f"Branch: {branch_info}\n") - sections.append("\nChanged files:\n") - sections.append(changed_files_text) - sections.append("\nUnified diff (context=5):\n") - sections.append(diff_text) + sections.append(source_files_text) + + # Import context section + if import_context_text: + sections.append("\n---\n") + sections.append("## Import Context (Read-Only Reference)\n") + sections.append( + "These files are imported by the changed files but were not modified. " + "They are provided for cross-referencing function signatures, class " + "hierarchies, and constants. Do NOT flag issues in these files unless " + "directly related to changes in the diff.\n" + ) + sections.append(import_context_text) return "\n".join(sections) @@ -270,8 +1104,14 @@ def estimate_tokens(text: str) -> int: return len(text) // 4 -def call_openai(prompt: str, model: str, api_key: str) -> str: - """Call the OpenAI Chat Completions API and return the response content.""" +def call_openai( + prompt: str, model: str, api_key: str +) -> "tuple[str, dict]": + """Call the OpenAI Chat Completions API. + + Returns (content, usage) where usage is the API response's usage dict + containing prompt_tokens and completion_tokens. 
+ """ payload = { "model": model, "messages": [{"role": "user", "content": prompt}], @@ -344,7 +1184,8 @@ def call_openai(prompt: str, model: str, api_key: str) -> str: print("Error: Empty review content from OpenAI API.", file=sys.stderr) sys.exit(1) - return content + usage = result.get("usage", {}) + return (content, usage) # --------------------------------------------------------------------------- @@ -415,9 +1256,68 @@ def main() -> None: action="store_true", help="Print compiled prompt to stdout without calling the API", ) + # New arguments + parser.add_argument( + "--context", + choices=["minimal", "standard", "deep"], + default="standard", + help="Context depth: minimal (diff only), standard (full changed files), " + "deep (changed files + imports). Default: standard", + ) + parser.add_argument( + "--repo-root", + default=None, + help="Repository root directory (required unless --context minimal)", + ) + parser.add_argument( + "--include-files", + default=None, + help="Comma-separated list of extra files to include as read-only context " + "(paths relative to repo root, or filenames to resolve under diff_diff/)", + ) + parser.add_argument( + "--token-budget", + type=int, + default=DEFAULT_TOKEN_BUDGET, + help=f"Max estimated input tokens before dropping context " + f"(default: {DEFAULT_TOKEN_BUDGET:,})", + ) + parser.add_argument( + "--delta-diff", + default=None, + help="Path to delta diff file (changes since last review)", + ) + parser.add_argument( + "--delta-changed-files", + default=None, + help="Path to delta changed-files list (since last review)", + ) + parser.add_argument( + "--review-state", + default=None, + help="Path to review-state.json for finding tracking across rounds", + ) + parser.add_argument( + "--commit-sha", + default=None, + help="HEAD commit SHA (required when --review-state is set)", + ) + parser.add_argument( + "--base-ref", + default="main", + help="Base branch name for review-state.json (default: main)", + ) args = 
parser.parse_args() + # Post-parse validation + if args.context != "minimal" and not args.repo_root: + parser.error( + "--repo-root is required when --context is 'standard' or 'deep'" + ) + if args.review_state and not args.commit_sha: + parser.error("--commit-sha is required when --review-state is set") + # Validate API key (unless dry-run) api_key = os.environ.get("OPENAI_API_KEY", "") if not args.dry_run and not api_key: @@ -468,7 +1368,146 @@ def main() -> None: file=sys.stderr, ) - # Compile prompt + # --- Read delta diff (before context expansion so we can scope context) --- + delta_diff_text = None + delta_changed_files_text = None + if args.delta_diff: + try: + with open(args.delta_diff) as f: + delta_diff_text = f.read() + if not delta_diff_text.strip(): + delta_diff_text = None + except FileNotFoundError: + print( + f"Warning: Delta diff not found: {args.delta_diff}.", + file=sys.stderr, + ) + if args.delta_changed_files: + try: + with open(args.delta_changed_files) as f: + delta_changed_files_text = f.read() + except FileNotFoundError: + pass + + # --- Context expansion --- + source_files_text = None + import_context_text = None + + if args.context in ("standard", "deep") and args.repo_root: + # In delta mode, scope source/import context to delta files only + context_files_text = ( + delta_changed_files_text + if delta_changed_files_text + else changed_files_text + ) + changed_paths = resolve_changed_source_files( + context_files_text, args.repo_root + ) + if changed_paths: + source_files_text = read_source_files(changed_paths, args.repo_root) + + if args.context == "deep": + import_paths = expand_import_graph(changed_paths, args.repo_root) + if import_paths: + import_context_text = read_source_files( + import_paths, args.repo_root, role="import-context" + ) + + # Handle --include-files (confined to repo root for security) + if args.include_files and args.repo_root: + repo_root_real = os.path.realpath(args.repo_root) + extra_paths: list[str] = [] + for 
name in args.include_files.split(","): + name = name.strip() + if not name: + continue + # Reject absolute paths + if os.path.isabs(name): + print( + f"Warning: --include-files: absolute paths not allowed " + f"({name}), skipping.", + file=sys.stderr, + ) + continue + if os.sep in name or "/" in name: + # Path relative to repo root + candidate = os.path.join(args.repo_root, name) + else: + # Filename to resolve under diff_diff/ + candidate = os.path.join(args.repo_root, "diff_diff", name) + # Normalize and verify within repo root (prevent ../ traversal) + candidate = os.path.realpath(candidate) + if not candidate.startswith(repo_root_real + os.sep): + print( + f"Warning: --include-files: {name} resolves outside repo " + f"root, skipping.", + file=sys.stderr, + ) + continue + if os.path.isfile(candidate): + extra_paths.append(candidate) + else: + print( + f"Warning: --include-files: {name} not found, skipping.", + file=sys.stderr, + ) + if extra_paths: + extra_text = read_source_files( + extra_paths, args.repo_root, role="import-context" + ) + if import_context_text: + import_context_text += "\n" + extra_text + else: + import_context_text = extra_text + + # --- Read review state for re-review --- + structured_findings = None + previous_round = 0 + if args.review_state: + structured_findings, previous_round = parse_review_state( + args.review_state + ) + if not structured_findings: + structured_findings = None # Normalize empty to None + + # --- Token budget --- + # Estimate mandatory content size (always included, not budget-governed) + mandatory_est = ( + estimate_tokens(criteria_text) + + estimate_tokens(registry_content) + + estimate_tokens(diff_text) + ) + # In delta mode, changed_files_text is NOT in the prompt (replaced by delta sections); + # only count it for fresh reviews where it appears in "Changes Under Review" + if not delta_diff_text: + mandatory_est += estimate_tokens(changed_files_text) + if previous_review: + mandatory_est += 
estimate_tokens(previous_review) + if delta_diff_text: + mandatory_est += estimate_tokens(delta_diff_text) + if delta_changed_files_text: + mandatory_est += estimate_tokens(delta_changed_files_text) + if structured_findings: + # Rough estimate for the findings table rendered in compile_prompt + findings_text = "\n".join(str(f) for f in structured_findings) + mandatory_est += estimate_tokens(findings_text) + + # Apply budget: source files are always included (sticky); + # only import-context files are dropped when over budget. + source_files_text, import_context_text, dropped = apply_token_budget( + mandatory_tokens=mandatory_est, + source_files_text=source_files_text, + import_context_text=import_context_text, + budget=args.token_budget, + ) + if dropped: + print( + f"Warning: Token budget exceeded. Dropped import context files: " + f"{', '.join(dropped)}", + file=sys.stderr, + ) + + # --- Compile prompt --- prompt = compile_prompt( criteria_text=criteria_text, registry_content=registry_content, @@ -476,6 +1515,11 @@ def main() -> None: changed_files_text=changed_files_text, branch_info=args.branch_info, previous_review=previous_review, + source_files_text=source_files_text, + import_context_text=import_context_text, + delta_diff_text=delta_diff_text, + delta_changed_files_text=delta_changed_files_text, + structured_findings=structured_findings, ) est_tokens = estimate_tokens(prompt) @@ -486,32 +1530,95 @@ def main() -> None: file=sys.stderr, ) + # Cost estimate + cost_str = estimate_cost(est_tokens, DEFAULT_MAX_TOKENS, args.model) + # Dry-run: print prompt and exit if args.dry_run: print(prompt) print(f"\n--- Dry run ---", file=sys.stderr) print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr) + if cost_str: + print(f"Estimated cost: {cost_str}", file=sys.stderr) print(f"Model: {args.model}", file=sys.stderr) + print(f"Context: {args.context}", file=sys.stderr) if previous_review: print("Mode: Re-review (previous review included)", file=sys.stderr) + 
if delta_diff_text: + print("Mode: Delta-diff (changes since last review)", file=sys.stderr) sys.exit(0) # Call OpenAI API print(f"Sending review to {args.model}...", file=sys.stderr) print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr) + if cost_str: + print(f"Estimated cost: {cost_str}", file=sys.stderr) + print(f"Context: {args.context}", file=sys.stderr) if previous_review: print("Mode: Re-review (previous review included)", file=sys.stderr) + if delta_diff_text: + print("Mode: Delta-diff (changes since last review)", file=sys.stderr) - review_content = call_openai(prompt, args.model, api_key) + review_content, usage = call_openai(prompt, args.model, api_key) - # Write output + # Write review output os.makedirs(os.path.dirname(args.output), exist_ok=True) with open(args.output, "w") as f: f.write(review_content) + # Write review state if requested + if args.review_state and args.commit_sha: + current_round = previous_round + 1 + current_findings, parse_uncertain = parse_review_findings( + review_content, current_round + ) + if parse_uncertain: + print( + "Warning: Could not parse findings from review output. 
" + "Preserving prior review state baseline (not advancing " + "last_reviewed_commit).", + file=sys.stderr, + ) + # Do NOT write review state at all — keep prior baseline intact + # regardless of whether prior findings exist, so the next delta + # review doesn't skip unparsed code + elif structured_findings: + final_findings = merge_findings(structured_findings, current_findings) + write_review_state( + path=args.review_state, + commit_sha=args.commit_sha, + base_ref=args.base_ref, + branch=args.branch_info, + review_round=current_round, + findings=final_findings, + ) + else: + write_review_state( + path=args.review_state, + commit_sha=args.commit_sha, + base_ref=args.base_ref, + branch=args.branch_info, + review_round=current_round, + findings=current_findings, + ) + + # Print completion summary with actual usage + actual_input = usage.get("prompt_tokens", 0) + actual_output = usage.get("completion_tokens", 0) + actual_cost = estimate_cost(actual_input, actual_output, args.model) + print(f"\nAI Review complete.", file=sys.stderr) print(f"Model: {args.model}", file=sys.stderr) - print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr) + if actual_input: + print( + f"Actual tokens: {actual_input:,} input, " + f"{actual_output:,} output", + file=sys.stderr, + ) + if actual_cost: + print(f"Actual cost: {actual_cost}", file=sys.stderr) + else: + print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr) print(f"Output saved to: {args.output}", file=sys.stderr) diff --git a/.github/codex/prompts/pr_review.md b/.github/codex/prompts/pr_review.md index 4cc9ea45..a64124de 100644 --- a/.github/codex/prompts/pr_review.md +++ b/.github/codex/prompts/pr_review.md @@ -158,3 +158,22 @@ ci = compute_confidence_interval(effect, se) # produces point estimate for se=0 t_stat, p_value, conf_int = safe_inference(effect, se) ``` Flag partial NaN guards as P0 — they produce misleading statistical output. + +### 4. 
Incomplete parameter propagation (P1) +For each changed public method signature (new parameter, renamed parameter, +changed default), verify that ALL callers and wrappers in the changed files +also received the same parameter. Check: +- Direct callers within the same file +- Cross-file callers visible in the diff or provided source files +- Wrapper methods that delegate to the changed method +- `get_params()` / `set_params()` return dicts +Flag each missing propagation as P1. + +### 5. Semantic contract violation in composed values (P1) +When code composes, transforms, or normalizes values from different sources +(e.g., weights from different estimators, variance components, time indices), +verify the semantic contract of each source is preserved through the operation: +- Units and scales must be compatible before arithmetic +- Normalization denominators must use the correct population +- Index alignment must match the data contract (inner vs outer join semantics) +Flag as P1 if semantic contracts are silently violated with no warning or check. 
diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 8b4c2698..97675176 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -6,6 +6,8 @@ """ import importlib.util +import json +import os import pathlib import subprocess @@ -63,6 +65,13 @@ def review_mod(): return mod +@pytest.fixture +def repo_root(): + """Return the repo root directory.""" + assert _SCRIPT_PATH is not None + return str(_SCRIPT_PATH.parent.parent.parent) + + # --------------------------------------------------------------------------- # _sections_for_file # --------------------------------------------------------------------------- @@ -196,7 +205,7 @@ def test_replaces_pr_language(self, review_mod): def test_warns_on_missing_substitution(self, review_mod, capsys): # A text that doesn't contain any of the expected patterns - result = review_mod._adapt_review_criteria("Totally different text") + review_mod._adapt_review_criteria("Totally different text") captured = capsys.readouterr() assert "Warning: prompt substitution did not match" in captured.err @@ -259,6 +268,136 @@ def test_no_previous_review_block_when_none(self, review_mod): assert "" not in result +# --------------------------------------------------------------------------- +# compile_prompt — enhanced context modes +# --------------------------------------------------------------------------- + + +class TestCompilePromptWithContext: + """Test compile_prompt with the new context parameters.""" + + def test_backward_compatibility(self, review_mod): + """Original args produce same structure — no source/import sections.""" + result = review_mod.compile_prompt( + criteria_text="Criteria.", + registry_content="Registry.", + diff_text="diff content", + changed_files_text="M\tfoo.py", + branch_info="main", + previous_review=None, + ) + assert "Full Source Files" not in result + assert "Import Context" not in result + assert "Changes Under Review" in result + + def 
test_standard_mode_includes_source_files(self, review_mod): + result = review_mod.compile_prompt( + criteria_text="C.", + registry_content="R.", + diff_text="D.", + changed_files_text="M\tf.py", + branch_info="b", + previous_review=None, + source_files_text='content', + ) + assert "Full Source Files (Changed)" in result + assert "sins of omission" in result + assert '' in result + assert "Import Context" not in result + + def test_deep_mode_includes_import_context(self, review_mod): + result = review_mod.compile_prompt( + criteria_text="C.", + registry_content="R.", + diff_text="D.", + changed_files_text="M\tf.py", + branch_info="b", + previous_review=None, + source_files_text="src", + import_context_text='utils', + ) + assert "Full Source Files (Changed)" in result + assert "Import Context (Read-Only Reference)" in result + assert "Do NOT flag issues in these files" in result + + def test_delta_diff_structure(self, review_mod): + result = review_mod.compile_prompt( + criteria_text="C.", + registry_content="R.", + diff_text="full diff content", + changed_files_text="M\tf.py", + branch_info="b", + previous_review="Previous findings.", + delta_diff_text="delta diff content", + delta_changed_files_text="M\tf.py", + ) + assert "Changes Since Last Review" in result + assert "delta diff content" in result + assert "Full Branch Diff (Reference Only)" in result + assert "" in result + assert "full diff content" in result + + def test_delta_diff_with_structured_findings(self, review_mod): + findings = [ + { + "id": "R1-P1-1", + "severity": "P1", + "section": "Methodology", + "summary": "Missing NaN guard", + "location": "diff_diff/foo.py:L42", + "status": "open", + } + ] + result = review_mod.compile_prompt( + criteria_text="C.", + registry_content="R.", + diff_text="full diff", + changed_files_text="M\tf.py", + branch_info="b", + previous_review="Prev.", + delta_diff_text="delta", + structured_findings=findings, + ) + assert "Previous Findings" in result + assert "R1-P1-1" 
in result + assert "Missing NaN guard" in result + assert "diff_diff/foo.py:L42" in result + + def test_fresh_review_no_delta_sections(self, review_mod): + """Without delta_diff_text, no delta-specific sections appear.""" + result = review_mod.compile_prompt( + criteria_text="C.", + registry_content="R.", + diff_text="D.", + changed_files_text="M\tf.py", + branch_info="b", + previous_review=None, + source_files_text="src", + ) + assert "Changes Since Last Review" not in result + assert "Full Branch Diff (Reference Only)" not in result + assert "Changes Under Review" in result + + + def test_findings_table_escapes_pipe_chars(self, review_mod): + """Summary containing | should be escaped in the findings table.""" + findings = [ + { + "id": "R1-P1-1", "severity": "P1", "section": "Code Quality", + "summary": "Return type str | None is wrong", + "location": "foo.py:L10", "status": "open", + } + ] + result = review_mod.compile_prompt( + criteria_text="C.", registry_content="R.", diff_text="D.", + changed_files_text="M\tf.py", branch_info="b", + previous_review="Prev.", delta_diff_text="delta", + structured_findings=findings, + ) + # The pipe in "str | None" should be escaped as "str \| None" + assert "str \\| None" in result + + # --------------------------------------------------------------------------- # PREFIX_TO_SECTIONS mapping coverage # --------------------------------------------------------------------------- @@ -338,3 +477,1043 @@ def test_rough_estimate(self, review_mod): def test_empty_string(self, review_mod): assert review_mod.estimate_tokens("") == 0 + + +# --------------------------------------------------------------------------- +# resolve_changed_source_files +# --------------------------------------------------------------------------- + + +class TestResolveChangedSourceFiles: + def test_filters_to_diff_diff_py_files(self, review_mod, repo_root): + text = "M\tdiff_diff/bacon.py\nM\ttests/test_bacon.py\nM\tCLAUDE.md" + paths = 
review_mod.resolve_changed_source_files(text, repo_root) + assert any("bacon.py" in p for p in paths) + assert not any("test_bacon" in p for p in paths) + assert not any("CLAUDE" in p for p in paths) + + def test_skips_deleted_files(self, review_mod, repo_root): + text = "D\tdiff_diff/deleted_file.py\nM\tdiff_diff/bacon.py" + paths = review_mod.resolve_changed_source_files(text, repo_root) + assert not any("deleted_file" in p for p in paths) + assert any("bacon.py" in p for p in paths) + + def test_empty_input(self, review_mod, repo_root): + assert review_mod.resolve_changed_source_files("", repo_root) == [] + + def test_skips_nonexistent_files(self, review_mod, repo_root): + text = "M\tdiff_diff/nonexistent_xyz.py" + assert review_mod.resolve_changed_source_files(text, repo_root) == [] + + +# --------------------------------------------------------------------------- +# read_source_files +# --------------------------------------------------------------------------- + + +class TestReadSourceFiles: + def test_produces_xml_tagged_output(self, review_mod, repo_root): + # Use a real file that exists + path = os.path.join(repo_root, "diff_diff", "__init__.py") + if not os.path.isfile(path): + pytest.skip("diff_diff/__init__.py not found") + result = review_mod.read_source_files([path], repo_root) + assert '' in result + assert "" in result + + def test_role_attribute(self, review_mod, repo_root): + path = os.path.join(repo_root, "diff_diff", "__init__.py") + if not os.path.isfile(path): + pytest.skip("diff_diff/__init__.py not found") + result = review_mod.read_source_files([path], repo_root, role="import-context") + assert 'role="import-context"' in result + + def test_handles_missing_file(self, review_mod, repo_root, capsys): + result = review_mod.read_source_files( + ["/nonexistent/path.py"], repo_root + ) + assert result == "" + captured = capsys.readouterr() + assert "Warning" in captured.err + + def test_empty_paths(self, review_mod, repo_root): + assert 
review_mod.read_source_files([], repo_root) == "" + + +# --------------------------------------------------------------------------- +# parse_imports +# --------------------------------------------------------------------------- + + +class TestParseImports: + def test_extracts_absolute_import(self, review_mod, repo_root): + """Test with a real source file that imports diff_diff modules.""" + path = os.path.join(repo_root, "diff_diff", "bacon.py") + if not os.path.isfile(path): + pytest.skip("diff_diff/bacon.py not found") + imports = review_mod.parse_imports(path) + # bacon.py should import from diff_diff (e.g., diff_diff.linalg or diff_diff.utils) + assert all(m.startswith("diff_diff.") for m in imports) + + def test_ignores_non_diff_diff_imports(self, review_mod, tmp_path): + test_file = tmp_path / "test.py" + test_file.write_text("import numpy\nimport pandas\nfrom os import path\n") + imports = review_mod.parse_imports(str(test_file)) + assert imports == set() + + def test_submodule_imports_not_truncated(self, review_mod, repo_root): + """Submodule imports should keep full path, not truncate to 2 components.""" + path = os.path.join(repo_root, "diff_diff", "visualization", "_staggered.py") + if not os.path.isfile(path): + pytest.skip("diff_diff/visualization/_staggered.py not found") + imports = review_mod.parse_imports(path) + # Should include full submodule paths like diff_diff.visualization._common + has_submodule = any( + m.count(".") >= 2 for m in imports # at least 3 components + ) + assert has_submodule, ( + f"Expected submodule imports (3+ components) but got: {imports}" + ) + + def test_relative_import_aliases_expanded(self, review_mod, repo_root): + """from . 
import _event_study should resolve to diff_diff.visualization._event_study.""" + path = os.path.join(repo_root, "diff_diff", "visualization", "__init__.py") + if not os.path.isfile(path): + pytest.skip("diff_diff/visualization/__init__.py not found") + imports = review_mod.parse_imports(path) + # Should include individual submodule names, not just the package + submodules = [m for m in imports if m.startswith("diff_diff.visualization._")] + assert len(submodules) > 0, ( + f"Expected visualization submodule imports but got: {imports}" + ) + + def test_handles_syntax_error(self, review_mod, tmp_path, capsys): + test_file = tmp_path / "bad.py" + test_file.write_text("def foo(:\n pass\n") + imports = review_mod.parse_imports(str(test_file)) + assert imports == set() + captured = capsys.readouterr() + assert "SyntaxError" in captured.err + + def test_handles_missing_file(self, review_mod): + imports = review_mod.parse_imports("/nonexistent/file.py") + assert imports == set() + + +# --------------------------------------------------------------------------- +# expand_import_graph +# --------------------------------------------------------------------------- + + +class TestExpandImportGraph: + def test_expands_imports(self, review_mod, repo_root): + """Expanding imports for a real file produces additional paths.""" + path = os.path.join(repo_root, "diff_diff", "bacon.py") + if not os.path.isfile(path): + pytest.skip("diff_diff/bacon.py not found") + result = review_mod.expand_import_graph([path], repo_root) + # Should find at least some imports (linalg, utils, etc.) 
+ assert isinstance(result, list) + # All paths should be absolute and exist + for p in result: + assert os.path.isabs(p) + assert os.path.isfile(p) + + def test_deduplicates_against_changed_set(self, review_mod, repo_root): + """Files already in changed_paths should not appear in expansion.""" + bacon = os.path.join(repo_root, "diff_diff", "bacon.py") + linalg = os.path.join(repo_root, "diff_diff", "linalg.py") + if not (os.path.isfile(bacon) and os.path.isfile(linalg)): + pytest.skip("required files not found") + result = review_mod.expand_import_graph([bacon, linalg], repo_root) + assert linalg not in [os.path.normpath(p) for p in result] + + def test_visualization_init_includes_submodules(self, review_mod, repo_root): + """expand_import_graph on visualization/__init__.py should include submodules.""" + path = os.path.join(repo_root, "diff_diff", "visualization", "__init__.py") + if not os.path.isfile(path): + pytest.skip("diff_diff/visualization/__init__.py not found") + result = review_mod.expand_import_graph([path], repo_root) + filenames = [os.path.basename(p) for p in result] + # Should include visualization submodules like _event_study.py, _staggered.py + assert any(f.startswith("_") and f.endswith(".py") for f in filenames), ( + f"Expected visualization submodule files but got: {filenames}" + ) + + def test_empty_input(self, review_mod, repo_root): + assert review_mod.expand_import_graph([], repo_root) == [] + + +# --------------------------------------------------------------------------- +# estimate_cost +# --------------------------------------------------------------------------- + + +class TestEstimateCost: + def test_known_model(self, review_mod): + result = review_mod.estimate_cost(100_000, 16_384, "gpt-5.4") + assert result is not None + assert "$" in result + assert "input" in result + assert "output" in result + + def test_unknown_model(self, review_mod): + result = review_mod.estimate_cost(100_000, 16_384, "unknown-model") + assert result is 
None + + def test_prefix_match(self, review_mod): + # gpt-5.4-turbo should match gpt-5.4 prefix + result = review_mod.estimate_cost(100_000, 16_384, "gpt-5.4-turbo") + assert result is not None + + +# --------------------------------------------------------------------------- +# Token budget — apply_token_budget +# --------------------------------------------------------------------------- + + +class TestTokenBudget: + def test_under_budget_all_included(self, review_mod): + src = "y" * 400 + imp = 'small' + result_src, result_imp, dropped = review_mod.apply_token_budget( + mandatory_tokens=100, + source_files_text=src, + import_context_text=imp, + budget=200_000, + ) + assert result_src == src + assert result_imp is not None + assert dropped == [] + + def test_over_budget_drops_imports_not_source(self, review_mod): + src = "y" * 400 + imp = ( + '' + "z" * 40_000 + "\n" + '' + "z" * 400 + "" + ) + result_src, result_imp, dropped = review_mod.apply_token_budget( + mandatory_tokens=200_000, # fills budget + source_files_text=src, + import_context_text=imp, + budget=200_000, + ) + # Source files always included (sticky) + assert result_src == src + # At least one import file should be dropped + assert len(dropped) > 0 + + def test_source_files_always_included(self, review_mod): + """Source files are sticky — never dropped even when over budget.""" + src = "y" * 800_000 # large source files + result_src, _, dropped = review_mod.apply_token_budget( + mandatory_tokens=100_000, + source_files_text=src, + import_context_text=None, + budget=50_000, # budget smaller than mandatory alone + ) + assert result_src == src + + def test_mandatory_exceeds_budget_warns(self, review_mod, capsys): + review_mod.apply_token_budget( + mandatory_tokens=300_000, + source_files_text=None, + import_context_text=None, + budget=200_000, + ) + captured = capsys.readouterr() + assert "exceeding --token-budget" in captured.err + + +# 
--------------------------------------------------------------------------- +# Review state — parse and write +# --------------------------------------------------------------------------- + + +class TestParseReviewState: + def test_reads_valid_json(self, review_mod, tmp_path): + state_file = tmp_path / "review-state.json" + state = { + "schema_version": 1, + "last_reviewed_commit": "abc123", + "review_round": 2, + "findings": [{"id": "R1-P1-1", "severity": "P1", "summary": "Test", "status": "open"}], + } + state_file.write_text(json.dumps(state)) + findings, round_num = review_mod.parse_review_state(str(state_file)) + assert len(findings) == 1 + assert round_num == 2 + + def test_missing_file_returns_empty(self, review_mod): + findings, round_num = review_mod.parse_review_state("/nonexistent.json") + assert findings == [] + assert round_num == 0 + + def test_schema_version_mismatch(self, review_mod, tmp_path, capsys): + state_file = tmp_path / "review-state.json" + state = {"schema_version": 999, "findings": []} + state_file.write_text(json.dumps(state)) + findings, round_num = review_mod.parse_review_state(str(state_file)) + assert findings == [] + assert round_num == 0 + captured = capsys.readouterr() + assert "schema version mismatch" in captured.err + + def test_non_dict_root_returns_empty(self, review_mod, tmp_path, capsys): + state_file = tmp_path / "review-state.json" + state_file.write_text("[1, 2, 3]") # list, not dict + findings, round_num = review_mod.parse_review_state(str(state_file)) + assert findings == [] + assert round_num == 0 + captured = capsys.readouterr() + assert "not a JSON object" in captured.err + + def test_non_list_findings_returns_empty(self, review_mod, tmp_path, capsys): + state_file = tmp_path / "review-state.json" + state = {"schema_version": 1, "findings": "not a list", "review_round": 1} + state_file.write_text(json.dumps(state)) + findings, round_num = review_mod.parse_review_state(str(state_file)) + assert findings == [] + 
assert round_num == 0 + captured = capsys.readouterr() + assert "not a list" in captured.err + + def test_non_int_round_defaults_to_zero(self, review_mod, tmp_path): + state_file = tmp_path / "review-state.json" + state = {"schema_version": 1, "findings": [], "review_round": "not_int"} + state_file.write_text(json.dumps(state)) + findings, round_num = review_mod.parse_review_state(str(state_file)) + assert findings == [] + assert round_num == 0 + + def test_non_dict_findings_filtered(self, review_mod, tmp_path): + """Non-dict elements in findings list are filtered out, not crash.""" + state_file = tmp_path / "review-state.json" + good_finding = { + "id": "R1-P1-1", "severity": "P1", + "summary": "Test finding", "status": "open", + } + state = { + "schema_version": 1, + "findings": ["oops", good_finding, 42], + "review_round": 1, + } + state_file.write_text(json.dumps(state)) + findings, round_num = review_mod.parse_review_state(str(state_file)) + assert len(findings) == 1 + assert findings[0]["id"] == "R1-P1-1" + assert round_num == 1 + + def test_findings_missing_required_keys_filtered(self, review_mod, tmp_path): + """Dict findings missing required keys (id, severity, summary, status) filtered.""" + state_file = tmp_path / "review-state.json" + state = { + "schema_version": 1, + "findings": [ + {"id": "R1-P1-1", "severity": "P1"}, # missing summary, status + {"id": "R1-P1-2", "severity": "P1", "summary": "Good", "status": "open"}, + {"severity": "P2", "summary": "No id", "status": "open"}, # missing id + ], + "review_round": 1, + } + state_file.write_text(json.dumps(state)) + findings, round_num = review_mod.parse_review_state(str(state_file)) + assert len(findings) == 1 + assert findings[0]["id"] == "R1-P1-2" + + +class TestWriteReviewState: + def test_writes_valid_json(self, review_mod, tmp_path): + path = str(tmp_path / "review-state.json") + review_mod.write_review_state( + path=path, + commit_sha="abc123", + base_ref="main", + branch="feature/test", + 
review_round=1, + findings=[{"id": "R1-P0-1", "severity": "P0"}], + ) + with open(path) as f: + data = json.load(f) + assert data["schema_version"] == 1 + assert data["last_reviewed_commit"] == "abc123" + assert data["review_round"] == 1 + assert len(data["findings"]) == 1 + + def test_round_trips_with_parse(self, review_mod, tmp_path): + path = str(tmp_path / "review-state.json") + original_findings = [ + {"id": "R1-P1-1", "severity": "P1", "summary": "Test finding", "status": "open"} + ] + review_mod.write_review_state( + path=path, + commit_sha="def456", + base_ref="main", + branch="fix/bug", + review_round=3, + findings=original_findings, + ) + findings, round_num = review_mod.parse_review_state(path) + assert round_num == 3 + assert findings[0]["id"] == "R1-P1-1" + + +# --------------------------------------------------------------------------- +# Review findings parsing +# --------------------------------------------------------------------------- + + +class TestParseReviewFindings: + def test_extracts_findings(self, review_mod): + review_text = ( + "## Methodology\n\n" + "**P1** Missing NaN guard in `diff_diff/staggered.py:L145`\n\n" + "## Code Quality\n\n" + "**P2** Unused import in `diff_diff/utils.py:L12`\n\n" + "## Summary\n" + "Overall assessment: Looks good\n" + ) + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert len(findings) >= 2 + assert not uncertain + severities = {f["severity"] for f in findings} + assert "P1" in severities + assert "P2" in severities + + def test_empty_review(self, review_mod): + findings, uncertain = review_mod.parse_review_findings("No issues found.", 1) + assert findings == [] + assert not uncertain + + def test_finding_ids_follow_format(self, review_mod): + review_text = ( + "**P0** Critical bug in `foo.py:L1`\n" + "**P1** Minor issue in the code\n" + ) + findings, _ = review_mod.parse_review_findings(review_text, 2) + for f in findings: + assert f["id"].startswith("R2-") + assert f["status"] 
== "open" + + def test_parses_bold_severity_format(self, review_mod): + """**P1** format should be parsed.""" + review_text = "**P1** Missing NaN guard in `foo.py:L10`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + + def test_parses_bold_colon_severity(self, review_mod): + """- **P1:** format (bold severity with colon) should be parsed.""" + review_text = "- **P1:** Missing NaN guard in `foo.py:L10`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + + def test_parses_bare_colon_severity(self, review_mod): + """- P1: format (bare severity with colon) should be parsed.""" + review_text = "- P1: Missing NaN guard in `foo.py:L10`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + + def test_mixed_format_both_parsed(self, review_mod): + """Review with supported + previously-unsupported format should parse both.""" + review_text = ( + "**P2** Code quality issue in `bar.py:L5`\n" + "- **P1:** Missing NaN guard in `foo.py:L10`\n" + ) + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 2 + severities = {f["severity"] for f in findings} + assert "P1" in severities + assert "P2" in severities + assert not uncertain + + def test_parses_severity_bold_value(self, review_mod): + """Severity: **P1** format (bold value after plain label) should be parsed.""" + review_text = "- Severity: **P1** — Missing NaN guard in `foo.py:L10`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + + def test_parses_numbered_list_severity(self, review_mod): + """1. Severity: P1 format should be parsed.""" + review_text = "1. 
Severity: P1 — Missing NaN guard in `foo.py:L10`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + + def test_parses_starred_bold_severity(self, review_mod): + """* **Severity:** P1 format should be parsed.""" + review_text = "* **Severity:** P1 — Missing NaN guard in `bar.py:L5`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + + def test_numbered_bold_severity_triggers_uncertainty(self, review_mod): + """1. **Severity:** P1 with no parseable summary → uncertain=True.""" + review_text = "1. **Severity:** P1\n" + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert findings == [] + assert uncertain + + def test_parses_bold_label_format(self, review_mod): + """**Severity:** P1 format should be parsed.""" + review_text = "- **Severity:** P1 — Missing NaN guard in `foo.py:L10`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + + def test_parses_plain_label_format(self, review_mod): + """Severity: P2 format should be parsed.""" + review_text = "Severity: P2 — Unused import in `bar.py:L5`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P2" + + def test_finding_with_skip_marker_in_summary_still_parsed(self, review_mod): + """Findings whose summaries contain skip markers like 'Path to Approval' should parse.""" + review_text = "**P2** The prompt omits the Path to Approval section in `foo.py:L10`\n" + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P2" + assert not uncertain + + def test_finding_with_looks_good_in_summary(self, review_mod): + """Finding mentioning 'Looks good' in summary should not be skipped.""" + 
review_text = "**P1** Assessment says Looks good but edge case is unhandled in `bar.py:L5`\n" + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + + def test_parses_multiline_finding_block(self, review_mod): + """Multi-line finding blocks (Severity/Impact on separate lines).""" + review_text = ( + "## Code Quality\n\n" + "- **Severity:** P1\n" + " **Impact:** Missing NaN guard causes silent incorrect output\n" + " **Location:** `diff_diff/staggered.py:L145`\n" + " **Concrete fix:** Use safe_inference()\n" + ) + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + assert "NaN guard" in findings[0]["summary"] + assert not uncertain + + def test_parses_plain_multiline_block(self, review_mod): + """Plain Severity: / Impact: labels (no bold) should be parsed.""" + review_text = ( + "## Code Quality\n\n" + "Severity: P1\n" + "Impact: Missing NaN guard causes silent incorrect output\n" + "Location: `diff_diff/staggered.py:L145`\n" + "Concrete fix: Use safe_inference()\n" + ) + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert len(findings) == 1 + assert findings[0]["severity"] == "P1" + assert "NaN guard" in findings[0]["summary"] + assert not uncertain + + def test_midline_severity_not_detected(self, review_mod): + """Severity markers embedded mid-line are not block starts — no uncertainty.""" + review_text = ( + "There is a Severity: P1 issue but the rest of the text\n" + "doesn't follow any recognized block structure at all\n" + ) + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + # Mid-line markers are not valid block starts — correctly returns ([], False) + assert findings == [] + assert not uncertain + + def test_midline_bold_severity_not_detected(self, review_mod): + """Bold severity mid-line (not at line start) is not a block start.""" + 
review_text = ( + "The review found **P1** issues but in a format\n" + "that the block parser cannot delimit properly.\n" + ) + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + # Mid-line bold is not a valid block start — correctly returns ([], False) + assert findings == [] + assert not uncertain + + def test_bold_label_severity_triggers_uncertainty(self, review_mod): + """**Severity:** P1 format with no parseable summary → uncertain=True.""" + review_text = "- **Severity:** P1\n" + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert findings == [] + assert uncertain + + def test_bold_inline_severity_triggers_uncertainty(self, review_mod): + """**Severity: P1** format with no parseable summary → uncertain=True.""" + review_text = "- **Severity: P1**\n" + findings, uncertain = review_mod.parse_review_findings(review_text, 1) + assert findings == [] + assert uncertain + + def test_ignores_multi_severity_prose(self, review_mod): + """Lines like 'P2/P3 items may exist' should not be parsed as findings.""" + review_text = ( + "P2/P3 items may exist. 
A PR does NOT need to be perfect.\n" + "If all previous P1+ findings are resolved, assessment should be good.\n" + ) + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert findings == [] + + def test_ignores_assessment_lines(self, review_mod): + """Assessment criteria lines with severity labels should be skipped.""" + review_text = ( + "⛔ Blocker — One or more P0: silent correctness bugs\n" + "⚠️ Needs changes — One or more P1 (no P0s)\n" + "✅ Looks good — No unmitigated P0 or P1 findings.\n" + ) + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert findings == [] + + def test_ignores_table_rows(self, review_mod): + """Findings tables from previous reviews should not be re-parsed.""" + review_text = ( + "| R1-P1-1 | P1 | Methodology | Missing NaN guard | foo.py:L10 | open |\n" + "| R1-P2-1 | P2 | Code Quality | Unused import | bar.py:L5 | addressed |\n" + ) + findings, _ = review_mod.parse_review_findings(review_text, 2) + assert findings == [] + + def test_ignores_instructional_text(self, review_mod): + """Instructional text referencing severities should be skipped.""" + review_text = ( + "Focus on whether previous P0/P1 findings have been addressed.\n" + "If all previous P1+ findings are resolved, the assessment should be good.\n" + ) + findings, _ = review_mod.parse_review_findings(review_text, 1) + assert findings == [] + + +# --------------------------------------------------------------------------- +# Merge findings +# --------------------------------------------------------------------------- + + +class TestMergeFindings: + def test_matching_finding_stays_open(self, review_mod): + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard", "status": "open"} + ] + current = [ + {"id": "R2-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard", "status": "open"} + ] + merged = 
review_mod.merge_findings(previous, current) + open_at_loc = [ + f for f in merged + if f["location"] == "foo.py:L10" and f["status"] == "open" + ] + assert len(open_at_loc) >= 1 + + def test_absent_finding_marked_addressed(self, review_mod): + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard", "status": "open"} + ] + current = [] # Finding was addressed + merged = review_mod.merge_findings(previous, current) + addressed = [f for f in merged if f["status"] == "addressed"] + assert len(addressed) == 1 + assert addressed[0]["location"] == "foo.py:L10" + + def test_new_finding_added_as_open(self, review_mod): + previous = [] + current = [ + {"id": "R2-P0-1", "severity": "P0", "location": "bar.py:L5", + "section": "Methodology", "summary": "Missing check", "status": "open"} + ] + merged = review_mod.merge_findings(previous, current) + assert len(merged) == 1 + assert merged[0]["status"] == "open" + assert merged[0]["location"] == "bar.py:L5" + + def test_matching_with_shifted_line_numbers(self, review_mod): + """Same finding at different line ranges should still match via summary.""" + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"} + ] + current = [ + {"id": "R2-P1-1", "severity": "P1", "location": "foo.py:L10-L12", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"} + ] + merged = review_mod.merge_findings(previous, current) + open_findings = [f for f in merged if f["status"] == "open"] + addressed = [f for f in merged if f["status"] == "addressed"] + # Should match (same severity, file, summary) — not create a false "addressed" + assert len(open_findings) == 1 + assert len(addressed) == 0 + + def test_matching_with_missing_location(self, review_mod): + """Finding with no location should still match on summary 
fingerprint.""" + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"} + ] + current = [ + {"id": "R2-P1-1", "severity": "P1", "location": "", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"} + ] + merged = review_mod.merge_findings(previous, current) + open_findings = [f for f in merged if f["status"] == "open"] + addressed = [f for f in merged if f["status"] == "addressed"] + # Same severity + same summary = match. No false "addressed" record. + assert len(open_findings) == 1 + assert len(addressed) == 0 + + def test_multiple_findings_same_key(self, review_mod): + """Multiple previous findings with same key should not overwrite each other.""" + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"}, + {"id": "R1-P1-2", "severity": "P1", "location": "foo.py:L20", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"}, + ] + current = [ + {"id": "R2-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"}, + ] + merged = review_mod.merge_findings(previous, current) + # One should match, one should be addressed + open_findings = [f for f in merged if f["status"] == "open"] + addressed = [f for f in merged if f["status"] == "addressed"] + assert len(open_findings) == 1 + assert len(addressed) == 1 + + def test_duplicate_no_location_findings_one_to_one(self, review_mod): + """Two prior no-location findings should not both match one current finding.""" + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "", + "section": "Code Quality", "summary": "Missing NaN guard", + "status": "open"}, + {"id": "R1-P1-2", "severity": "P1", "location": "", + "section": 
"Methodology", "summary": "Missing NaN guard", + "status": "open"}, + ] + current = [ + {"id": "R2-P1-1", "severity": "P1", "location": "foo.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard", + "status": "open"}, + ] + merged = review_mod.merge_findings(previous, current) + open_findings = [f for f in merged if f["status"] == "open"] + addressed = [f for f in merged if f["status"] == "addressed"] + # One current + one prior matched = 1 open; one prior unmatched = 1 addressed + assert len(open_findings) == 1 + assert len(addressed) == 1 + + def test_previous_missing_location_current_has_location(self, review_mod): + """Previous finding with no location, current has one → should match.""" + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"} + ] + current = [ + {"id": "R2-P1-1", "severity": "P1", "location": "staggered.py:L10", + "section": "Code Quality", "summary": "Missing NaN guard in staggered", + "status": "open"} + ] + merged = review_mod.merge_findings(previous, current) + open_findings = [f for f in merged if f["status"] == "open"] + addressed = [f for f in merged if f["status"] == "addressed"] + # Should match via symmetric fallback — no false "addressed" + assert len(open_findings) == 1 + assert len(addressed) == 0 + + def test_same_basename_different_dirs_no_cross_match(self, review_mod): + """__init__.py in different dirs with same summary should NOT cross-match.""" + previous = [ + {"id": "R1-P1-1", "severity": "P1", "location": "diff_diff/__init__.py:L10", + "section": "Code Quality", "summary": "Missing type export", "status": "open"} + ] + current = [ + {"id": "R2-P1-1", "severity": "P1", "location": "diff_diff/visualization/__init__.py:L5", + "section": "Code Quality", "summary": "Missing type export", "status": "open"} + ] + merged = review_mod.merge_findings(previous, current) + open_findings = [f for f in merged if 
f["status"] == "open"]
+        addressed = [f for f in merged if f["status"] == "addressed"]
+        # Different full paths: previous should be addressed, current stays open
+        assert len(open_findings) == 1
+        assert len(addressed) == 1
+
+    def test_long_summaries_dont_collide(self, review_mod):
+        """Two findings with same first 50 chars but different suffixes should NOT collapse."""
+        prefix = "a" * 50
+        previous = [
+            {"id": "R1-P1-1", "severity": "P1", "location": "foo.py:L10",
+             "section": "Code Quality", "summary": prefix + " first issue details",
+             "status": "open"},
+            {"id": "R1-P1-2", "severity": "P1", "location": "foo.py:L20",
+             "section": "Code Quality", "summary": prefix + " second different issue",
+             "status": "open"},
+        ]
+        current = [
+            {"id": "R2-P1-1", "severity": "P1", "location": "foo.py:L10",
+             "section": "Code Quality", "summary": prefix + " first issue details",
+             "status": "open"},
+            {"id": "R2-P1-2", "severity": "P1", "location": "foo.py:L20",
+             "section": "Code Quality", "summary": prefix + " second different issue",
+             "status": "open"},
+        ]
+        merged = review_mod.merge_findings(previous, current)
+        open_findings = [f for f in merged if f["status"] == "open"]
+        addressed = [f for f in merged if f["status"] == "addressed"]
+        # Both should match — neither dropped
+        assert len(open_findings) == 2
+        assert len(addressed) == 0
+
+    def test_same_summary_different_files_no_cross_match(self, review_mod):
+        """Two findings with same summary but different files should NOT cross-match."""
+        previous = [
+            {"id": "R1-P1-1", "severity": "P1", "location": "foo.py:L10",
+             "section": "Code Quality", "summary": "Missing NaN guard in estimator",
+             "status": "open"},
+        ]
+        current = [
+            {"id": "R2-P1-1", "severity": "P1", "location": "bar.py:L20",
+             "section": "Code Quality", "summary": "Missing NaN guard in estimator",
+             "status": "open"},
+        ]
+        merged = review_mod.merge_findings(previous, current)
+        open_findings = [f for f in merged if f["status"] == "open"]
+        addressed = [f for f in merged if f["status"] == "addressed"]
+        # Different files: previous should be addressed, current stays open
+        assert len(open_findings) == 1
+        assert open_findings[0]["location"] == "bar.py:L20"
+        assert len(addressed) == 1
+        assert addressed[0]["location"] == "foo.py:L10"
+
+
+# ---------------------------------------------------------------------------
+# estimate_cost — prefix matching regression
+# ---------------------------------------------------------------------------
+
+
+class TestEstimateCostPrefixRegression:
+    def test_mini_model_gets_mini_pricing(self, review_mod):
+        """gpt-4.1-mini snapshot should get mini pricing, not parent gpt-4.1."""
+        mini_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-4.1-mini-2025-04-14")
+        parent_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-4.1")
+        assert mini_cost is not None
+        assert parent_cost is not None
+        # Snapshot must resolve to its own pricing tier, not the parent's (rates differ)
+        assert mini_cost != parent_cost
+
+    def test_o3_mini_gets_mini_pricing(self, review_mod):
+        """o3-mini snapshot should get o3-mini pricing, not o3."""
+        mini_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "o3-mini-2025-01-31")
+        parent_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "o3")
+        assert mini_cost is not None
+        assert parent_cost is not None
+        assert mini_cost != parent_cost
+
+
+# ---------------------------------------------------------------------------
+# Delta context derivation
+# ---------------------------------------------------------------------------
+
+
+class TestDeltaContextDerivation:
+    def test_delta_files_resolve_only_delta(self, review_mod, repo_root):
+        """resolve_changed_source_files with delta file list returns only delta files."""
+        # Simulate: full branch changed bacon.py and staggered.py, but delta only has bacon.py
+        delta_text = "M\tdiff_diff/bacon.py"
+        paths = review_mod.resolve_changed_source_files(delta_text, repo_root)
+        filenames = [os.path.basename(p) for p in paths]
+        assert "bacon.py" in filenames
+        # staggered.py should NOT be in the result (it's not in delta)
+        assert "staggered.py" not in filenames
+
+
+# ---------------------------------------------------------------------------
+# Review state — branch/base validation support
+# ---------------------------------------------------------------------------
+
+
+class TestReviewStateBranchValidation:
+    def test_stores_and_retrieves_branch_and_base(self, review_mod, tmp_path):
+        """write_review_state stores branch/base; parse_review_state returns them."""
+        path = str(tmp_path / "review-state.json")
+        review_mod.write_review_state(
+            path=path,
+            commit_sha="abc123",
+            base_ref="main",
+            branch="feature/test",
+            review_round=1,
+            findings=[],
+        )
+        # Read back and verify fields are present
+        import json
+        with open(path) as f:
+            data = json.load(f)
+        assert data["branch"] == "feature/test"
+        assert data["base_ref"] == "main"
+
+
+# ---------------------------------------------------------------------------
+# End-to-end: parse then merge pipeline
+# ---------------------------------------------------------------------------
+
+
+class TestParseThenMerge:
+    def test_line_shift_does_not_cause_churn(self, review_mod):
+        """Same finding at different line numbers should merge as 1 open, 0 addressed."""
+        review_r1 = "**P1** Missing NaN guard in `foo.py:L10`\n"
+        review_r2 = "**P1** Missing NaN guard in `foo.py:L12`\n"
+        findings_r1, _ = review_mod.parse_review_findings(review_r1, 1)
+        findings_r2, _ = review_mod.parse_review_findings(review_r2, 2)
+        assert len(findings_r1) == 1
+        assert len(findings_r2) == 1
+        merged = review_mod.merge_findings(findings_r1, findings_r2)
+        open_findings = [f for f in merged if f["status"] == "open"]
+        addressed = [f for f in merged if f["status"] == "addressed"]
+        assert len(open_findings) == 1
+        assert len(addressed) == 0
+
+    def test_md_file_line_shift_does_not_cause_churn(self, review_mod):
+        """Same finding on a .md file at different line numbers should merge as 1
+        open."""
+        review_r1 = "**P1** Missing docs in `ai-review-local.md:L10`\n"
+        review_r2 = "**P1** Missing docs in `ai-review-local.md:L20`\n"
+        findings_r1, _ = review_mod.parse_review_findings(review_r1, 1)
+        findings_r2, _ = review_mod.parse_review_findings(review_r2, 2)
+        assert len(findings_r1) == 1
+        assert len(findings_r2) == 1
+        merged = review_mod.merge_findings(findings_r1, findings_r2)
+        open_findings = [f for f in merged if f["status"] == "open"]
+        addressed = [f for f in merged if f["status"] == "addressed"]
+        assert len(open_findings) == 1
+        assert len(addressed) == 0
+
+    def test_parse_uncertain_does_not_advance_state(self, review_mod, tmp_path):
+        """When parse_uncertain fires, review-state.json should not be modified."""
+        state_path = str(tmp_path / "review-state.json")
+        # Write initial state
+        review_mod.write_review_state(
+            path=state_path,
+            commit_sha="initial123",
+            base_ref="main",
+            branch="feature/x",
+            review_round=1,
+            findings=[{"id": "R1-P1-1", "severity": "P1", "summary": "Test", "status": "open"}],
+        )
+        initial_mtime = os.path.getmtime(state_path)
+
+        # Simulate parse_uncertain scenario
+        unparseable_review = "- **Severity:** P1\n"  # Will return ([], True)
+        findings, uncertain = review_mod.parse_review_findings(unparseable_review, 2)
+        assert uncertain
+        assert findings == []
+
+        # The state file should NOT have been modified
+        # (in production, main() skips write_review_state when uncertain)
+        current_mtime = os.path.getmtime(state_path)
+        assert current_mtime == initial_mtime
+
+        # Verify original state is intact
+        stored_findings, stored_round = review_mod.parse_review_state(state_path)
+        assert stored_round == 1
+        assert stored_findings[0]["id"] == "R1-P1-1"
+
+
+# ---------------------------------------------------------------------------
+# validate_review_state — comprehensive validation
+# ---------------------------------------------------------------------------
+
+
+class TestValidateReviewState:
+    def test_valid_state_returns_true(self, review_mod, tmp_path):
+        path = str(tmp_path / "review-state.json")
+        review_mod.write_review_state(
+            path=path, commit_sha="abc123", base_ref="main",
+            branch="feature/test", review_round=1,
+            findings=[{"id": "R1-P1-1", "severity": "P1",
+                       "summary": "Test", "status": "open"}],
+        )
+        findings, rnd, commit, valid = review_mod.validate_review_state(
+            path, "feature/test", "main"
+        )
+        assert valid
+        assert commit == "abc123"
+        assert len(findings) == 1
+
+    def test_branch_mismatch_returns_false(self, review_mod, tmp_path):
+        path = str(tmp_path / "review-state.json")
+        review_mod.write_review_state(
+            path=path, commit_sha="abc123", base_ref="main",
+            branch="feature/old", review_round=1, findings=[],
+        )
+        _, _, _, valid = review_mod.validate_review_state(
+            path, "feature/new", "main"
+        )
+        assert not valid
+
+    def test_schema_mismatch_returns_false(self, review_mod, tmp_path):
+        state_file = tmp_path / "review-state.json"
+        state_file.write_text(json.dumps({"schema_version": 999}))
+        _, _, _, valid = review_mod.validate_review_state(
+            str(state_file), "b", "main"
+        )
+        assert not valid
+
+    def test_missing_file_returns_false(self, review_mod):
+        _, _, _, valid = review_mod.validate_review_state(
+            "/nonexistent.json", "b", "main"
+        )
+        assert not valid
+
+    def test_malformed_finding_returns_false(self, review_mod, tmp_path):
+        """Any malformed finding dict should invalidate delta mode entirely."""
+        state_file = tmp_path / "review-state.json"
+        state = {
+            "schema_version": 1,
+            "last_reviewed_commit": "abc123",
+            "branch": "feature/test",
+            "base_ref": "main",
+            "review_round": 1,
+            "findings": [
+                {"id": "R1-P1-1", "severity": "P1"},  # missing summary, status
+            ],
+        }
+        state_file.write_text(json.dumps(state))
+        _, _, _, valid = review_mod.validate_review_state(
+            str(state_file), "feature/test", "main"
+        )
+        assert not valid  # fail closed on malformed finding
+
+
+# ---------------------------------------------------------------------------
+# Include-files path confinement
+# ---------------------------------------------------------------------------
+
+
+class TestIncludeFilesConfinement:
+    """Verify --include-files rejects paths outside repo root."""
+
+    def test_rejects_absolute_path(self, review_mod, repo_root, capsys):
+        """Absolute paths should be rejected."""
+        # Simulate the path resolution logic from main()
+        name = "/etc/passwd"
+        assert os.path.isabs(name)
+        # The script rejects absolute paths before even resolving
+
+    def test_rejects_traversal(self, review_mod, repo_root):
+        """../ traversal should be detected after realpath normalization."""
+        candidate = os.path.join(repo_root, "../../../etc/passwd")
+        candidate = os.path.realpath(candidate)
+        repo_root_real = os.path.realpath(repo_root)
+        assert not candidate.startswith(repo_root_real + os.sep)