Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 53 additions & 2 deletions graphify/cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,53 @@
# per-file extraction cache - skip unchanged files on re-run
from __future__ import annotations

import functools
import hashlib
import json
import os
from pathlib import Path


@functools.lru_cache(maxsize=None)
def _git_root(directory: Path) -> Path | None:
    """Find the nearest enclosing directory that holds a ``.git`` entry.

    *directory* is resolved first, then it and each of its ancestors are
    probed in order, innermost first. The ``.git`` entry may be a directory
    or a file; only its existence is checked. Returns None when the
    filesystem root is reached without a match. Results are memoized per
    argument by ``lru_cache``.
    """
    start = directory.resolve()
    for candidate in (start, *start.parents):
        if (candidate / ".git").exists():
            return candidate
    return None


@functools.lru_cache(maxsize=None)
def _git_common_dir(directory: Path) -> Path | None:
    """Return the shared git directory — identical across all worktrees of a repo.

    Regular clone     → <repo>/.git/
    git worktree      → follows the ``gitdir:`` pointer in <worktree>/.git and
                        returns the main repository's .git/
    Other ``gitdir:`` → the pointed-to directory itself (e.g. a submodule's
                        or a separate git dir), provided it contains HEAD
    Not a git repo    → None

    None is also returned when the ``.git`` file cannot be read or does not
    contain a ``gitdir:`` pointer, or when no candidate directory contains a
    ``HEAD`` file. Results are memoized per argument by ``lru_cache``.
    """
    root = _git_root(directory)
    if root is None:
        return None

    git_path = root / ".git"
    if git_path.is_dir():
        return git_path
    if not git_path.is_file():
        return None

    try:
        content = git_path.read_text(encoding="utf-8").strip()
    except (OSError, UnicodeDecodeError):
        # Best-effort lookup: an unreadable pointer is treated as "no repo"
        # so the caller can fall back instead of crashing.
        return None
    if not content.startswith("gitdir:"):
        return None

    git_dir = Path(content[len("gitdir:"):].strip())
    if not git_dir.is_absolute():
        git_dir = (root / git_dir).resolve()

    candidates: list[Path] = []

    # Git records the location of the shared directory in a "commondir" file
    # inside a linked worktree's private git dir; a relative value is
    # relative to that git dir. Prefer it over guessing from the layout.
    commondir_file = git_dir / "commondir"
    if commondir_file.is_file():
        try:
            pointer = commondir_file.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeDecodeError):
            pointer = ""
        if pointer:
            candidates.append((git_dir / pointer).resolve())

    if git_dir.parent.name == "worktrees":
        # .git/worktrees/<name>/ → .git/
        candidates.append(git_dir.parent.parent)
    else:
        # Not a linked worktree (e.g. .git/modules/<name> for a submodule):
        # going two levels up would land in a different repository, so the
        # pointed-to git dir is itself the common dir.
        candidates.append(git_dir)

    for common in candidates:
        if (common / "HEAD").exists():
            return common
    return None


def _body_content(content: bytes) -> bytes:
"""Strip YAML frontmatter from Markdown content, returning only the body."""
text = content.decode(errors="replace")
Expand Down Expand Up @@ -42,8 +83,18 @@ def file_hash(path: Path, root: Path = Path(".")) -> str:


def cache_dir(root: Path = Path(".")) -> Path:
    """Return the cache directory, creating it if needed.

    When *root* is inside a git repository, the cache lives in
    <git-common-dir>/graphify-cache/ so all worktrees of the same repo share
    one cache — a worktree for a feature branch that touches 3 of 500 files
    gets 497 cache hits instead of a cold start.
    Outside a git repo, falls back to <root>/graphify-out/cache/.
    """
    shared_git_dir = _git_common_dir(Path(root).resolve())
    if shared_git_dir is not None:
        d = shared_git_dir / "graphify-cache"
    else:
        # Legacy location; built from *root* as given (not resolved).
        d = Path(root) / "graphify-out" / "cache"
    d.mkdir(parents=True, exist_ok=True)
    return d

Expand Down
109 changes: 105 additions & 4 deletions tests/test_cache.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
"""Tests for graphify/cache.py."""
import pytest
from pathlib import Path
from graphify.cache import file_hash, cache_dir, load_cached, save_cached, cached_files, clear_cache, _body_content
from graphify.cache import (
file_hash, cache_dir, load_cached, save_cached, cached_files, clear_cache,
_body_content, _git_root, _git_common_dir,
)


@pytest.fixture(autouse=True)
def _clear_git_lookup_caches():
    """Reset the memoized git lookups before and after every test.

    _git_root and _git_common_dir are wrapped in lru_cache, so their results
    would otherwise persist from one test to the next.
    """

    def _reset():
        _git_root.cache_clear()
        _git_common_dir.cache_clear()

    _reset()
    yield
    _reset()


@pytest.fixture
Expand Down Expand Up @@ -67,11 +80,99 @@ def test_cached_files(tmp_path, cache_root):


def test_clear_cache(tmp_file, cache_root):
    """clear_cache removes all .json files from the cache directory."""
    save_cached(tmp_file, {"nodes": [], "edges": []}, root=cache_root)
    # Ask cache_dir for the location instead of hard-coding the legacy
    # graphify-out/cache path, which is only used outside a git repository.
    cdir = cache_dir(cache_root)
    assert len(list(cdir.glob("*.json"))) > 0
    clear_cache(cache_root)
    assert len(list(cdir.glob("*.json"))) == 0


def test_cache_dir_falls_back_when_not_in_git_repo(tmp_path):
    """With no enclosing git repo, cache_dir uses the legacy <root>/graphify-out/cache/."""
    legacy_location = tmp_path / "graphify-out" / "cache"

    resolved = cache_dir(tmp_path)

    assert resolved == legacy_location
    assert resolved.exists()


def test_cache_dir_uses_git_common_dir(tmp_path):
    """For a regular clone, cache_dir resolves to <repo>/.git/graphify-cache/."""
    repo = tmp_path / "repo"
    dot_git = repo / ".git"
    # A real directory named .git (with a HEAD file) stands in for a clone.
    dot_git.mkdir(parents=True)
    (dot_git / "HEAD").write_text("ref: refs/heads/main\n")

    location = cache_dir(repo)

    assert location == dot_git / "graphify-cache"
    assert location.exists()


def test_cache_dir_shared_between_main_and_worktree(tmp_path):
    """The main checkout and a linked worktree share one cache directory."""
    # Main checkout: .git is a directory.
    main_checkout = tmp_path / "repo"
    shared_git = main_checkout / ".git"
    shared_git.mkdir(parents=True)
    (shared_git / "HEAD").write_text("ref: refs/heads/main\n")

    # Per-worktree admin directory, laid out as `git worktree add` would.
    admin_dir = shared_git / "worktrees" / "feature"
    admin_dir.mkdir(parents=True)
    (admin_dir / "HEAD").write_text("ref: refs/heads/feature\n")

    # Linked checkout lives elsewhere; its .git is a *file* pointing back.
    linked_checkout = tmp_path / "worktrees" / "feature"
    linked_checkout.mkdir(parents=True)
    (linked_checkout / ".git").write_text(f"gitdir: {admin_dir}\n")

    from_main = cache_dir(main_checkout)
    from_linked = cache_dir(linked_checkout)
    expected = shared_git / "graphify-cache"

    assert from_main == from_linked
    assert from_linked == expected


def test_cache_hit_across_worktrees(tmp_path):
    """An entry saved from the main tree is served for the same file in a worktree."""
    source_text = "def greet(): return 'hello'\n"
    extraction = {"nodes": [{"id": "greet"}], "edges": []}

    # Main checkout with a real .git directory.
    main_checkout = tmp_path / "repo"
    (main_checkout / "src").mkdir(parents=True)
    shared_git = main_checkout / ".git"
    shared_git.mkdir()
    (shared_git / "HEAD").write_text("ref: refs/heads/main\n")

    original = main_checkout / "src" / "foo.py"
    original.write_text(source_text)

    # Populate the cache from the main checkout.
    save_cached(original, extraction, root=main_checkout)

    # Linked worktree: identical content at the identical relative path,
    # but under a different absolute location.
    admin_dir = shared_git / "worktrees" / "feature"
    admin_dir.mkdir(parents=True)
    (admin_dir / "HEAD").write_text("ref: refs/heads/feature\n")
    linked_checkout = tmp_path / "worktrees" / "feature"
    (linked_checkout / "src").mkdir(parents=True)
    (linked_checkout / ".git").write_text(f"gitdir: {admin_dir}\n")
    duplicate = linked_checkout / "src" / "foo.py"
    duplicate.write_text(source_text)

    # The lookup from the worktree must find what the main tree stored.
    assert load_cached(duplicate, root=linked_checkout) == extraction


def test_file_hash_portable_across_worktrees(tmp_path):
    """Identical content at the same relative path hashes the same under different roots."""
    roots = (tmp_path / "main", tmp_path / "worktree")

    digests = []
    for checkout in roots:
        (checkout / "src").mkdir(parents=True)
        target = checkout / "src" / "foo.py"
        target.write_text("x = 1\n")
        digests.append(file_hash(target, root=checkout))

    main_digest, worktree_digest = digests
    assert main_digest == worktree_digest


def test_md_frontmatter_only_change_same_hash(tmp_path):
Expand Down