Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions src/knowledge/openalex_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ class CitingPaper:
url: str


@dataclass
class ResolvedWork:
"""An OpenAlex work id paired with its publication year."""

work_id: str
publication_year: int | None


def _strip_id(value: str | None) -> str:
"""Reduce an OpenAlex IRI (https://openalex.org/W123) to its bare id."""
if not value:
Expand Down Expand Up @@ -91,18 +99,29 @@ def _params(self, **extra: object) -> dict[str, object]:
params["api_key"] = self._api_key
return params

def resolve_work(self, doi: str) -> ResolvedWork | None:
    """Resolve a DOI to its OpenAlex work id and publication year.

    The year lets callers floor a citation histogram at the paper's own
    publication, dropping impossible pre-publication citation buckets.

    Args:
        doi: The DOI to look up; it is interpolated into the
            ``/works/doi:<doi>`` path as given.

    Returns:
        A ``ResolvedWork``, or ``None`` when OpenAlex answers 404 or the
        response carries no usable ``id``. ``publication_year`` is ``None``
        unless the response holds a real integer year.

    Raises:
        Whatever ``raise_for_status()`` raises for non-404 error responses.
    """
    resp = self._client.get(
        f"{OPENALEX_BASE}/works/doi:{doi}",
        params=self._params(select="id,publication_year"),
    )
    if resp.status_code == 404:
        logger.warning("OpenAlex has no work for DOI %s", doi)
        return None
    resp.raise_for_status()

    data = resp.json()
    work_id = _strip_id(data.get("id"))
    if not work_id:
        return None

    year = data.get("publication_year")
    # ``bool`` is a subclass of ``int``; a JSON ``true``/``false`` must not be
    # mistaken for a year, so it is rejected explicitly.
    if isinstance(year, bool) or not isinstance(year, int):
        year = None
    return ResolvedWork(work_id=work_id, publication_year=year)

def resolve_work_id(self, doi: str) -> str | None:
    """Resolve a DOI to its OpenAlex work id (e.g. ``W2128495200``).

    Kept as a thin wrapper over ``resolve_work`` so callers that only need
    the id keep working after the rename.

    Returns:
        The bare work id, or ``None`` when the DOI cannot be resolved.
    """
    resolved = self.resolve_work(doi)
    return resolved.work_id if resolved is not None else None

@staticmethod
def _cites_filter(work_ids: str | Sequence[str]) -> str:
Expand Down
16 changes: 13 additions & 3 deletions src/knowledge/papers_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,15 +450,25 @@ def sync_citing_papers(
for doi in dois:
try:
# Resolve the primary DOI plus any version aliases to a group of
# OpenAlex work ids; citations across the group are merged.
# OpenAlex works; citations across the group are merged.
group_dois = [doi, *aliases.get(doi, [])]
work_ids = [wid for d in group_dois if (wid := client.resolve_work_id(d))]
if not work_ids:
resolved = [w for d in group_dois if (w := client.resolve_work(d))]
if not resolved:
logger.warning("Skipping citations: cannot resolve DOI %s", doi)
continue
work_ids = [w.work_id for w in resolved]

# The earliest publication year across the version group (the
# preprint, if any) is the floor: a paper cannot be cited before
# it exists, so drop impossible pre-publication buckets that come
# from citing works with bad dates.
pub_years = [w.publication_year for w in resolved if w.publication_year is not None]
floor_year = min(pub_years) if pub_years else None

# 1. Complete per-year counts (source of truth for the chart).
counts = client.counts_by_year(work_ids)
if floor_year is not None:
counts = {y: c for y, c in counts.items() if y >= floor_year}
if not counts:
# A canonical paper with zero citations is implausible; an
# empty histogram almost always means a transient OpenAlex
Expand Down
25 changes: 19 additions & 6 deletions tests/test_knowledge/test_openalex_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,34 @@ def test_strip_doi(self):
assert _strip_doi(None) is None


class TestResolveWorkId:
def test_resolves_doi_to_work_id(self):
class TestResolveWork:
def test_resolves_doi_to_work_and_year(self):
def handler(request: httpx.Request) -> httpx.Response:
assert "/works/doi:10.1/x" in str(request.url)
return httpx.Response(200, json={"id": "https://openalex.org/W999"})
return httpx.Response(
200, json={"id": "https://openalex.org/W999", "publication_year": 2019}
)

with _client(handler) as c:
resolved = c.resolve_work("10.1/x")
assert resolved.work_id == "W999"
assert resolved.publication_year == 2019

def test_missing_year_is_none(self):
def handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(200, json={"id": "https://openalex.org/W1"})

with _client(handler) as c:
assert c.resolve_work_id("10.1/x") == "W999"
resolved = c.resolve_work("10.1/x")
assert resolved.work_id == "W1"
assert resolved.publication_year is None

def test_unresolved_doi_returns_none(self):
def handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(404, json={"error": "not found"})

with _client(handler) as c:
assert c.resolve_work_id("10.1/missing") is None
assert c.resolve_work("10.1/missing") is None

def test_includes_mailto_param(self):
seen = {}
Expand All @@ -72,7 +85,7 @@ def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(200, json={"id": "https://openalex.org/W1"})

with _client(handler) as c:
c.resolve_work_id("10.1/x")
c.resolve_work("10.1/x")
assert seen["mailto"] == "t@example.org"


Expand Down
112 changes: 112 additions & 0 deletions tests/test_knowledge/test_papers_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,118 @@ def factory(**_kwargs):
assert stored == 0
assert stats.total == 0

def test_drops_prepublication_citations(self, tmp_path: Path, monkeypatch) -> None:
    """Buckets dated before the work's publication year are left out.

    The mocked work reports publication year 2016 while the grouped counts
    include a 2013 bucket; only the 2016 and 2020 buckets may be stored.
    """
    work_payload = {"id": "https://openalex.org/W1", "publication_year": 2016}
    histogram_payload = {
        "group_by": [
            {"key": "2013", "count": 2},  # before publication
            {"key": "2016", "count": 5},
            {"key": "2020", "count": 9},
        ]
    }
    empty_page = {"meta": {"next_cursor": None}, "results": []}

    def handler(request: httpx.Request) -> httpx.Response:
        # Route on request shape: DOI lookup, grouped counts, anything else.
        if "/works/doi:" in str(request.url):
            body = work_payload
        elif request.url.params.get("group_by"):
            body = histogram_payload
        else:
            body = empty_page
        return httpx.Response(200, json=body)

    def factory(**_kwargs):
        transport = httpx.MockTransport(handler)
        return OpenAlexCitationClient(client=httpx.Client(transport=transport))

    monkeypatch.setattr(ps, "OpenAlexCitationClient", factory)

    db_path = tmp_path / "knowledge" / "test.db"
    with patch("src.knowledge.db.get_db_path", return_value=db_path):
        init_db("test")
        sync_citing_papers(["10.1/canon"], project="test")
        stats = get_citation_stats("test")

    expected = {"10.1/canon": {"2016": 5, "2020": 9}}
    assert stats.by_paper == expected
    assert "2013" not in stats.per_year
assert "2013" not in stats.per_year

def test_over_aggressive_floor_preserves_existing(self, tmp_path: Path, monkeypatch) -> None:
    """A floor above every returned bucket must not wipe stored counts.

    The mocked work reports publication year 2099, so the only returned
    bucket (2024) falls below the floor. The counts seeded before the sync
    are expected to still be there afterwards.
    """

    def handler(request: httpx.Request) -> httpx.Response:
        if "/works/doi:" in str(request.url):
            body = {"id": "https://openalex.org/W1", "publication_year": 2099}
        elif request.url.params.get("group_by"):
            body = {"group_by": [{"key": "2024", "count": 10}]}
        else:
            body = {"meta": {"next_cursor": None}, "results": []}
        return httpx.Response(200, json=body)

    def factory(**_kwargs):
        transport = httpx.MockTransport(handler)
        return OpenAlexCitationClient(client=httpx.Client(transport=transport))

    monkeypatch.setattr(ps, "OpenAlexCitationClient", factory)

    db_path = tmp_path / "knowledge" / "test.db"
    with patch("src.knowledge.db.get_db_path", return_value=db_path):
        init_db("test")
        # Seed a histogram first so there is existing data to preserve.
        replace_citation_counts("10.1/canon", {2024: 50}, project="test")
        sync_citing_papers(["10.1/canon"], project="test")
        stats = get_citation_stats("test")

    # Existing data preserved, not wiped to empty by the over-high floor.
    seeded = {"10.1/canon": {"2024": 50}}
    assert stats.by_paper == seeded

def test_floor_is_earliest_version_year(self, tmp_path: Path, monkeypatch) -> None:
    """The floor is the earliest publication year in the version group.

    The primary DOI resolves to a 2025 work and its alias to a 2024 work, so
    the 2023 bucket is expected to be dropped and 2024 onwards kept.
    """
    primary_doi = "10.1234/published"
    preprint_doi = "10.1101/preprint"

    # URL marker -> mocked work record, checked in insertion order.
    works_by_marker = {
        "/works/doi:10.1234/published": {
            "id": "https://openalex.org/W1",
            "publication_year": 2025,
        },
        "/works/doi:10.1101/preprint": {
            "id": "https://openalex.org/W2",
            "publication_year": 2024,
        },
    }
    histogram_payload = {
        "group_by": [
            {"key": "2023", "count": 3},  # before the preprint
            {"key": "2024", "count": 7},
            {"key": "2025", "count": 11},
        ]
    }

    def handler(request: httpx.Request) -> httpx.Response:
        url = str(request.url)
        for marker, work in works_by_marker.items():
            if marker in url:
                return httpx.Response(200, json=work)
        if request.url.params.get("group_by"):
            return httpx.Response(200, json=histogram_payload)
        return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []})

    def factory(**_kwargs):
        transport = httpx.MockTransport(handler)
        return OpenAlexCitationClient(client=httpx.Client(transport=transport))

    monkeypatch.setattr(ps, "OpenAlexCitationClient", factory)

    db_path = tmp_path / "knowledge" / "test.db"
    with patch("src.knowledge.db.get_db_path", return_value=db_path):
        init_db("test")
        sync_citing_papers(
            [primary_doi],
            project="test",
            aliases={primary_doi: [preprint_doi]},
        )
        stats = get_citation_stats("test")

    # 2023 (before the 2024 preprint) dropped; 2024+ kept.
    assert stats.by_paper == {primary_doi: {"2024": 7, "2025": 11}}

def test_version_aliases_merge_into_primary(self, tmp_path: Path, monkeypatch) -> None:
# Primary + preprint resolve to W1/W2; counts are queried as a group and
# attributed to the primary DOI.
Expand Down
Loading