Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion src/api/routers/community.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from src.assistants.registry import AssistantInfo
from src.core.config.community import WidgetConfig
from src.core.services.litellm_llm import create_openrouter_llm
from src.knowledge.search import FAQResult, list_faq_entries
from src.knowledge.search import FAQResult, get_citation_stats, list_faq_entries
from src.metrics.cost import COST_BLOCK_THRESHOLD, COST_WARN_THRESHOLD, MODEL_PRICING, estimate_cost
from src.metrics.db import (
RequestLogEntry,
Expand Down Expand Up @@ -229,6 +229,23 @@ class FAQFeedResponse(BaseModel):
entries: list[FAQEntryResponse] = Field(default_factory=list, description="FAQ entries")


class CitationsFeedResponse(BaseModel):
"""Public citation dashboard data for a community's canonical papers."""

community_id: str = Field(..., description="Community identifier")
total: int = Field(..., description="Total citing papers with a recorded canonical link")
per_year: dict[str, int] = Field(
default_factory=dict, description="Citing-paper count per year across all papers"
)
by_paper: dict[str, dict[str, int]] = Field(
default_factory=dict,
description="Stacked breakdown: canonical DOI -> year -> citing-paper count",
)
canonical_dois: list[str] = Field(
default_factory=list, description="Canonical DOIs tracked for this community"
)


# Matches bare email addresses so they can be stripped from the public feed.
_EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

Expand Down Expand Up @@ -1621,6 +1638,50 @@ async def community_faq(
entries=[_faq_result_to_response(e) for e in entries],
)

@router.get("/citations", response_model=CitationsFeedResponse)
async def community_citations(response: Response) -> CitationsFeedResponse:
"""Public, read-only citation dashboard for this community.

Returns per-year counts of papers citing the community's canonical
works, plus a stacked breakdown keyed by the cited DOI (the shape
behind a citations-per-year chart). Disabled by default; a community
opts in via ``public_feeds.citations: true`` in its config.
"""
config = info.community_config
if config is None or config.public_feeds is None or not config.public_feeds.citations:
raise HTTPException(
status_code=404,
detail="Public citations feed is not enabled for this community.",
)

try:
stats = get_citation_stats(project=community_id)
except sqlite3.Error:
logger.exception("Failed to query citations for community %s", community_id)
raise HTTPException(
status_code=503,
detail="Knowledge database is temporarily unavailable.",
)
except Exception:
logger.exception(
"Unexpected error serving citations feed for community %s", community_id
)
raise HTTPException(
status_code=500,
detail="An unexpected error occurred while building the citations feed.",
)

canonical_dois = list(config.citations.dois) if config.citations else []

response.headers["Cache-Control"] = "public, max-age=3600"
return CitationsFeedResponse(
community_id=community_id,
total=stats.total,
per_year=stats.per_year,
by_paper=stats.by_paper,
canonical_dois=canonical_dois,
)

return router


Expand Down
45 changes: 41 additions & 4 deletions src/knowledge/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ def active_mirror_context(mirror_id: str) -> Iterator[None]:
url TEXT NOT NULL,
created_at TEXT,
synced_at TEXT NOT NULL,
-- Canonical DOI this paper cites, when discovered via citation sync.
-- NULL for papers found through keyword search rather than a citation link.
cites_doi TEXT,
UNIQUE(source, external_id)
);

Expand Down Expand Up @@ -409,6 +412,8 @@ def active_mirror_context(mirror_id: str) -> Iterator[None]:
CREATE INDEX IF NOT EXISTS idx_github_items_status ON github_items(status);
CREATE INDEX IF NOT EXISTS idx_github_items_type ON github_items(item_type);
CREATE INDEX IF NOT EXISTS idx_papers_source ON papers(source);
-- idx_papers_cites_doi is created in _migrate_db, after the cites_doi column
-- is ensured, so init_db stays safe on databases predating that column.
CREATE INDEX IF NOT EXISTS idx_docstrings_repo ON docstrings(repo);
CREATE INDEX IF NOT EXISTS idx_docstrings_language ON docstrings(language);
CREATE INDEX IF NOT EXISTS idx_messages_list ON mailing_list_messages(list_name);
Expand Down Expand Up @@ -507,6 +512,28 @@ def _migrate_db(conn: sqlite3.Connection) -> None:
# Table doesn't exist yet - this is fine, schema will create it
logger.debug("Docstrings table not found during migration (will be created): %s", e)

# Migration: Add cites_doi column to papers table (added 2026-06-09).
# The index lives here (not in SCHEMA_SQL) so executescript never references
# cites_doi on a database created before the column existed.
try:
cursor = conn.execute("PRAGMA table_info(papers)")
columns = [row[1] for row in cursor.fetchall()]
except sqlite3.OperationalError as e:
# Only the PRAGMA is guarded here: a missing papers table is fine since
# SCHEMA_SQL creates it. DDL errors below (locked DB, I/O fault) must
# propagate rather than be swallowed and leave the table un-indexed.
logger.debug("Papers table not found during migration (will be created): %s", e)
columns = []

if columns: # papers table exists; migrate it in place
if "cites_doi" not in columns:
logger.info("Migrating papers table: adding cites_doi column")
conn.execute("ALTER TABLE papers ADD COLUMN cites_doi TEXT")
logger.info("Migration complete: cites_doi column added to papers")
# Ensure the index exists for both new and migrated databases.
conn.execute("CREATE INDEX IF NOT EXISTS idx_papers_cites_doi ON papers(cites_doi)")
conn.commit()


def init_db(project: str = "hed") -> None:
"""Initialize database schema for a project.
Expand Down Expand Up @@ -586,6 +613,7 @@ def upsert_paper(
first_message: str | None,
url: str,
created_at: str | None,
cites_doi: str | None = None,
) -> None:
"""Insert or update a paper.

Expand All @@ -597,6 +625,14 @@ def upsert_paper(
first_message: Abstract (limited to ~2000 chars)
url: URL to the paper (DOI or source URL)
created_at: Publication date (ISO 8601 or year string)
cites_doi: Canonical DOI this paper cites, when known from a citation
sync. ``None`` for keyword-search results. On conflict the first
recorded link is kept (COALESCE), so a later keyword sync passing
``None`` never erases an existing citation link, and a re-sync
backfills the link onto rows stored before this column existed.
A single column holds one link: a paper citing two tracked DOIs is
attributed to whichever was synced first (it is still counted once
in the per-year total, only its by-paper bucket is approximate).
"""
# Limit first_message size
if first_message and len(first_message) > 2000:
Expand All @@ -605,14 +641,15 @@ def upsert_paper(
conn.execute(
"""
INSERT INTO papers (source, external_id, title, first_message,
status, url, created_at, synced_at)
VALUES (?, ?, ?, ?, 'published', ?, ?, ?)
status, url, created_at, synced_at, cites_doi)
VALUES (?, ?, ?, ?, 'published', ?, ?, ?, ?)
ON CONFLICT(source, external_id) DO UPDATE SET
title=excluded.title,
first_message=excluded.first_message,
synced_at=excluded.synced_at
synced_at=excluded.synced_at,
cites_doi=COALESCE(papers.cites_doi, excluded.cites_doi)
""",
(source, external_id, title, first_message, url, created_at, _now_iso()),
(source, external_id, title, first_message, url, created_at, _now_iso(), cites_doi),
)


Expand Down
6 changes: 5 additions & 1 deletion src/knowledge/papers_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def _store_papers(
project: str,
*,
force_source: str | None = None,
cites_doi: str | None = None,
) -> dict[str, int]:
"""Upsert opencite papers into the knowledge DB, returning counts by source.

Expand All @@ -167,6 +168,8 @@ def _store_papers(
force_source: When set (a single-source sync), record this OSA source
label using its native identifier; falls back to the priority
mapping if that identifier is missing.
cites_doi: Canonical DOI these papers cite, recorded on each row when
storing the results of a citation sync. ``None`` for keyword search.
"""
counts: dict[str, int] = {}
with get_connection(project) as conn:
Expand All @@ -193,6 +196,7 @@ def _store_papers(
first_message=paper.abstract or None,
url=_paper_url(paper),
created_at=paper.publication_date or (str(paper.year) if paper.year else None),
cites_doi=cites_doi,
)
counts[source] = counts.get(source, 0) + 1
conn.commit()
Expand Down Expand Up @@ -420,7 +424,7 @@ def sync_citing_papers(
total = 0
for doi, papers in cited:
try:
counts = _store_papers(papers, project)
counts = _store_papers(papers, project, cites_doi=doi)
count = sum(counts.values())
update_sync_metadata("papers", f"citing_{doi}", count, project)
logger.info("Synced %d papers citing %s", count, doi)
Expand Down
71 changes: 71 additions & 0 deletions src/knowledge/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,77 @@ def search_github_items(
return results


@dataclass
class CitationStats:
"""Aggregated citation counts for a community's canonical papers."""

total: int
"""Total citing papers with a recorded canonical link and a valid year."""

per_year: dict[str, int]
"""Citing-paper count per publication year, summed across canonical DOIs."""

by_paper: dict[str, dict[str, int]]
"""Per canonical DOI: a mapping of publication year to citing-paper count."""


def get_citation_stats(project: str = "eeglab") -> CitationStats:
"""Aggregate citation counts for the public citations dashboard.

Counts papers that cite a community's canonical DOIs (``papers.cites_doi``
is set), grouped by the citing paper's publication year. The year is the
leading four digits of ``created_at`` (ISO date or bare year); rows whose
``created_at`` is missing or not a four-digit year are skipped so a bad
date never lands in a bogus year bucket.

Args:
project: Community ID for database isolation. Defaults to 'eeglab'.

Returns:
CitationStats with the overall ``total``, ``per_year`` totals, and the
stacked ``by_paper`` breakdown (canonical DOI -> year -> count). Years
are sorted ascending in every mapping.
"""
sql = """
SELECT cites_doi, substr(created_at, 1, 4) AS yr, COUNT(*) AS cnt
FROM papers
WHERE cites_doi IS NOT NULL
AND created_at IS NOT NULL
AND substr(created_at, 1, 4) GLOB '[0-9][0-9][0-9][0-9]'
GROUP BY cites_doi, yr
"""

per_year: dict[str, int] = {}
by_paper: dict[str, dict[str, int]] = {}
total = 0
try:
with get_connection(project) as conn:
for row in conn.execute(sql):
doi = row["cites_doi"]
year = row["yr"]
count = row["cnt"]
per_year[year] = per_year.get(year, 0) + count
by_paper.setdefault(doi, {})[year] = count
total += count
except sqlite3.OperationalError as e:
logger.error(
"Database operational error computing citation stats: %s",
e,
exc_info=True,
extra={"project": project},
)
raise
except sqlite3.Error as e:
logger.warning("Database error computing citation stats (project=%s): %s", project, e)
raise

return CitationStats(
total=total,
per_year=dict(sorted(per_year.items())),
by_paper={doi: dict(sorted(years.items())) for doi, years in by_paper.items()},
)


def search_papers(
query: str,
project: str = "hed",
Expand Down
Loading
Loading