OpenScience-Collective · neuromechanist · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/src/knowledge/openalex_citations.py b/src/knowledge/openalex_citations.py
@@ -38,6 +38,14 @@ class CitingPaper:
     url: str
 
 
+@dataclass
+class ResolvedWork:
+    """An OpenAlex work id paired with its publication year, as returned by ``resolve_work``."""
+
+    work_id: str  # bare id (e.g. ``W123``), not the full OpenAlex IRI
+    publication_year: int | None  # ``None`` when the response carries no integer year
+
+
 def _strip_id(value: str | None) -> str:
     """Reduce an OpenAlex IRI (https://openalex.org/W123) to its bare id."""
     if not value:
@@ -91,18 +99,29 @@ def _params(self, **extra: object) -> dict[str, object]:
             params["api_key"] = self._api_key
         return params
 
-    def resolve_work_id(self, doi: str) -> str | None:
-        """Resolve a DOI to its OpenAlex work id (e.g. ``W2128495200``)."""
+    def resolve_work(self, doi: str) -> ResolvedWork | None:
+        """Resolve a DOI to its OpenAlex work id and publication year.
+
+        The year lets callers floor a citation histogram at the paper's own
+        publication, dropping impossible pre-publication citation buckets.
+        Returns ``None`` on a 404 or a response without an id; other HTTP errors
+        raise. A year that is not a genuine integer is stored as ``None``.
+        """
         resp = self._client.get(
             f"{OPENALEX_BASE}/works/doi:{doi}",
-            params=self._params(select="id"),
+            params=self._params(select="id,publication_year"),
         )
         if resp.status_code == 404:
             logger.warning("OpenAlex has no work for DOI %s", doi)
             return None
         resp.raise_for_status()
-        work_id = _strip_id(resp.json().get("id"))
-        return work_id or None
+        data = resp.json()
+        work_id = _strip_id(data.get("id"))
+        if not work_id:
+            return None
+        year = data.get("publication_year")
+        is_year = isinstance(year, int) and not isinstance(year, bool)  # bool subclasses int
+        return ResolvedWork(work_id=work_id, publication_year=year if is_year else None)
 
     @staticmethod
     def _cites_filter(work_ids: str | Sequence[str]) -> str:

diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py
@@ -450,15 +450,25 @@ def sync_citing_papers(
         for doi in dois:
             try:
                 # Resolve the primary DOI plus any version aliases to a group of
-                # OpenAlex work ids; citations across the group are merged.
+                # OpenAlex works; citations across the group are merged.
                 group_dois = [doi, *aliases.get(doi, [])]
-                work_ids = [wid for d in group_dois if (wid := client.resolve_work_id(d))]
-                if not work_ids:
+                resolved = [w for d in group_dois if (w := client.resolve_work(d))]
+                if not resolved:
                     logger.warning("Skipping citations: cannot resolve DOI %s", doi)
                     continue
+                work_ids = [w.work_id for w in resolved]
+
+                # The earliest publication year across the version group (the
+                # preprint, if any) is the floor: a paper cannot be cited before
+                # it exists, so drop impossible pre-publication buckets that come
+                # from citing works with bad dates.
+                pub_years = [w.publication_year for w in resolved if w.publication_year is not None]
+                floor_year = min(pub_years) if pub_years else None
 
                 # 1. Complete per-year counts (source of truth for the chart).
                 counts = client.counts_by_year(work_ids)
+                if floor_year is not None:
+                    counts = {y: c for y, c in counts.items() if y >= floor_year}
                 if not counts:
                     # A canonical paper with zero citations is implausible; an
                     # empty histogram almost always means a transient OpenAlex

diff --git a/tests/test_knowledge/test_openalex_citations.py b/tests/test_knowledge/test_openalex_citations.py
@@ -48,21 +48,34 @@ def test_strip_doi(self):
         assert _strip_doi(None) is None
 
 
-class TestResolveWorkId:
-    def test_resolves_doi_to_work_id(self):
+class TestResolveWork:
+    def test_resolves_doi_to_work_and_year(self):
         def handler(request: httpx.Request) -> httpx.Response:
             assert "/works/doi:10.1/x" in str(request.url)
-            return httpx.Response(200, json={"id": "https://openalex.org/W999"})
+            return httpx.Response(
+                200, json={"id": "https://openalex.org/W999", "publication_year": 2019}
+            )
+
+        with _client(handler) as c:
+            resolved = c.resolve_work("10.1/x")
+        assert resolved is not None  # fail as an assertion, not an AttributeError
+        assert (resolved.work_id, resolved.publication_year) == ("W999", 2019)
+
+    def test_missing_year_is_none(self):
+        def handler(_request: httpx.Request) -> httpx.Response:
+            return httpx.Response(200, json={"id": "https://openalex.org/W1"})
 
         with _client(handler) as c:
-            assert c.resolve_work_id("10.1/x") == "W999"
+            resolved = c.resolve_work("10.1/x")
+        assert resolved is not None  # the work resolves even though the year is absent
+        assert (resolved.work_id, resolved.publication_year) == ("W1", None)
 
     def test_unresolved_doi_returns_none(self):
         def handler(_request: httpx.Request) -> httpx.Response:
             return httpx.Response(404, json={"error": "not found"})
 
         with _client(handler) as c:
-            assert c.resolve_work_id("10.1/missing") is None
+            assert c.resolve_work("10.1/missing") is None  # a 404 yields None, not an exception
 
     def test_includes_mailto_param(self):
         seen = {}
@@ -72,7 +85,7 @@ def handler(request: httpx.Request) -> httpx.Response:
             return httpx.Response(200, json={"id": "https://openalex.org/W1"})
 
         with _client(handler) as c:
-            c.resolve_work_id("10.1/x")
+            c.resolve_work("10.1/x")
         assert seen["mailto"] == "t@example.org"
 
 

diff --git a/tests/test_knowledge/test_papers_sync.py b/tests/test_knowledge/test_papers_sync.py
@@ -413,6 +413,118 @@ def factory(**_kwargs):
         assert stored == 0
         assert stats.total == 0
 
+    def test_drops_prepublication_citations(self, tmp_path: Path, monkeypatch) -> None:
+        # The work was published in 2016; a citing bucket dated 2013 is
+        # impossible (bad OpenAlex date) and must be dropped from the histogram.
+        def handler(request: httpx.Request) -> httpx.Response:
+            if "/works/doi:" in str(request.url):  # DOI resolution request
+                return httpx.Response(
+                    200, json={"id": "https://openalex.org/W1", "publication_year": 2016}
+                )
+            if request.url.params.get("group_by"):  # grouped (per-year) counts request
+                return httpx.Response(
+                    200,
+                    json={
+                        "group_by": [
+                            {"key": "2013", "count": 2},  # before publication
+                            {"key": "2016", "count": 5},  # the publication year itself is kept
+                            {"key": "2020", "count": 9},
+                        ]
+                    },
+                )
+            return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []})
+
+        def factory(**_kwargs):  # replaces the client class that papers_sync instantiates
+            return OpenAlexCitationClient(
+                client=httpx.Client(transport=httpx.MockTransport(handler))
+            )
+
+        monkeypatch.setattr(ps, "OpenAlexCitationClient", factory)
+
+        db_path = tmp_path / "knowledge" / "test.db"
+        with patch("src.knowledge.db.get_db_path", return_value=db_path):  # DB under tmp_path
+            init_db("test")
+            sync_citing_papers(["10.1/canon"], project="test")
+            stats = get_citation_stats("test")
+
+        assert stats.by_paper == {"10.1/canon": {"2016": 5, "2020": 9}}  # 2013 bucket removed
+        assert "2013" not in stats.per_year
+
+    def test_over_aggressive_floor_preserves_existing(self, tmp_path: Path, monkeypatch) -> None:
+        # If OpenAlex reports a bogus future year for the canonical work, every
+        # current bucket is floored out; the empty-counts guard must then keep
+        # the existing histogram rather than wiping it.
+        def handler(request: httpx.Request) -> httpx.Response:
+            if "/works/doi:" in str(request.url):  # DOI resolution request
+                return httpx.Response(
+                    200, json={"id": "https://openalex.org/W1", "publication_year": 2099}
+                )
+            if request.url.params.get("group_by"):  # only bucket (2024) is below the 2099 floor
+                return httpx.Response(200, json={"group_by": [{"key": "2024", "count": 10}]})
+            return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []})
+
+        def factory(**_kwargs):  # replaces the client class that papers_sync instantiates
+            return OpenAlexCitationClient(
+                client=httpx.Client(transport=httpx.MockTransport(handler))
+            )
+
+        monkeypatch.setattr(ps, "OpenAlexCitationClient", factory)
+
+        db_path = tmp_path / "knowledge" / "test.db"
+        with patch("src.knowledge.db.get_db_path", return_value=db_path):  # DB under tmp_path
+            init_db("test")
+            replace_citation_counts("10.1/canon", {2024: 50}, project="test")  # pre-existing data
+            sync_citing_papers(["10.1/canon"], project="test")
+            stats = get_citation_stats("test")
+
+        # Existing data preserved (still 50, not the fresh 10), not wiped by the over-high floor.
+        assert stats.by_paper == {"10.1/canon": {"2024": 50}}
+
+    def test_floor_is_earliest_version_year(self, tmp_path: Path, monkeypatch) -> None:
+        # Primary published 2025, preprint 2024 -> floor is 2024 (the preprint).
+        def handler(request: httpx.Request) -> httpx.Response:
+            url = str(request.url)
+            if "/works/doi:10.1234/published" in url:  # primary DOI
+                return httpx.Response(
+                    200, json={"id": "https://openalex.org/W1", "publication_year": 2025}
+                )
+            if "/works/doi:10.1101/preprint" in url:  # version alias (preprint)
+                return httpx.Response(
+                    200, json={"id": "https://openalex.org/W2", "publication_year": 2024}
+                )
+            if request.url.params.get("group_by"):  # grouped (per-year) counts request
+                return httpx.Response(
+                    200,
+                    json={
+                        "group_by": [
+                            {"key": "2023", "count": 3},  # before the preprint
+                            {"key": "2024", "count": 7},  # kept: at the floor
+                            {"key": "2025", "count": 11},
+                        ]
+                    },
+                )
+            return httpx.Response(200, json={"meta": {"next_cursor": None}, "results": []})
+
+        def factory(**_kwargs):  # replaces the client class that papers_sync instantiates
+            return OpenAlexCitationClient(
+                client=httpx.Client(transport=httpx.MockTransport(handler))
+            )
+
+        monkeypatch.setattr(ps, "OpenAlexCitationClient", factory)
+
+        db_path = tmp_path / "knowledge" / "test.db"
+        with patch("src.knowledge.db.get_db_path", return_value=db_path):  # DB under tmp_path
+            init_db("test")
+            sync_citing_papers(
+                ["10.1234/published"],
+                project="test",
+                aliases={"10.1234/published": ["10.1101/preprint"]},  # version group
+            )
+            stats = get_citation_stats("test")
+
+        # 2023 (before the 2024 preprint) dropped; 2024+ kept, attributed to the primary DOI.
+        assert stats.by_paper == {"10.1234/published": {"2024": 7, "2025": 11}}
+
     def test_version_aliases_merge_into_primary(self, tmp_path: Path, monkeypatch) -> None:
         # Primary + preprint resolve to W1/W2; counts are queried as a group and
         # attributed to the primary DOI.