OpenScience-Collective · neuromechanist · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/src/api/routers/community.py b/src/api/routers/community.py
@@ -18,7 +18,7 @@
 from pathlib import Path
 from typing import Annotated, Any, Literal
 
-from fastapi import APIRouter, Header, HTTPException, Query, Request
+from fastapi import APIRouter, Header, HTTPException, Query, Request, Response
 from fastapi.responses import FileResponse, StreamingResponse
 from langchain_core.messages import AIMessage, HumanMessage
 from langchain_core.messages.utils import count_tokens_approximately
@@ -34,6 +34,7 @@
 from src.assistants.registry import AssistantInfo
 from src.core.config.community import WidgetConfig
 from src.core.services.litellm_llm import create_openrouter_llm
+from src.knowledge.search import FAQResult, get_citation_stats, list_faq_entries
 from src.metrics.cost import COST_BLOCK_THRESHOLD, COST_WARN_THRESHOLD, MODEL_PRICING, estimate_cost
 from src.metrics.db import (
     RequestLogEntry,
@@ -205,6 +206,75 @@ class CommunityConfigResponse(BaseModel):
     status: str = Field(..., description="Health status: healthy, degraded, or error")
 
 
+class FAQEntryResponse(BaseModel):
+    """A single FAQ entry exposed via the public feed."""
+
+    question: str = Field(..., description="Synthesized question")
+    answer: str = Field(..., description="Synthesized answer")
+    tags: list[str] = Field(default_factory=list, description="Keyword tags")
+    category: str = Field(..., description="Entry category (how-to, troubleshooting, etc.)")
+    quality_score: float = Field(..., description="LLM quality score (0.0-1.0)")
+    message_count: int = Field(..., description="Number of source messages in the thread")
+    first_message_date: str = Field(..., description="Date of the first message in the thread")
+    thread_url: str = Field(..., description="URL of the source discussion thread")
+
+
+class FAQFeedResponse(BaseModel):
+    """Paginated public FAQ feed for a community."""
+
+    community_id: str = Field(..., description="Community identifier")
+    total: int = Field(..., description="Total entries matching the filters")
+    limit: int = Field(..., description="Page size used for this response")
+    offset: int = Field(..., description="Offset used for this response")
+    entries: list[FAQEntryResponse] = Field(default_factory=list, description="FAQ entries")
+
+
+class CitationsFeedResponse(BaseModel):
+    """Public citation dashboard data for a community's canonical papers."""
+
+    community_id: str = Field(..., description="Community identifier")
+    total: int = Field(..., description="Total citing papers with a recorded canonical link")
+    per_year: dict[str, int] = Field(
+        default_factory=dict, description="Citing-paper count per year across all papers"
+    )
+    by_paper: dict[str, dict[str, int]] = Field(
+        default_factory=dict,
+        description="Stacked breakdown: canonical DOI -> year -> citing-paper count",
+    )
+    canonical_dois: list[str] = Field(
+        default_factory=list, description="Canonical DOIs tracked for this community"
+    )
+
+
+# Matches bare email addresses so they can be stripped from the public feed.
+_EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
+
+
+def _redact_emails(text: str) -> str:
+    """Replace any email address in ``text`` with a redaction marker.
+
+    The FAQ feed is derived from public mailing-list content. The summarizer
+    strips most personal data, but a handful of entries still embed addresses
+    (mostly vendor support lines). A public JSON feed should not emit raw
+    addresses, so they are redacted at serialization time.
+    """
+    return _EMAIL_PATTERN.sub("[email redacted]", text)
+
+
+def _faq_result_to_response(entry: FAQResult) -> FAQEntryResponse:
+    """Convert a knowledge-layer FAQResult into a public response model."""
+    return FAQEntryResponse(
+        question=_redact_emails(entry.question),
+        answer=_redact_emails(entry.answer),
+        tags=[_redact_emails(tag) for tag in entry.tags],
+        category=entry.category,
+        quality_score=entry.quality_score,
+        message_count=entry.message_count,
+        first_message_date=entry.first_message_date,
+        thread_url=entry.thread_url,
+    )
+
+
 # ---------------------------------------------------------------------------
 # Session Management (In-Memory, per-community isolation)
 # ---------------------------------------------------------------------------
@@ -1502,6 +1572,116 @@ async def community_usage_public(
                 detail="Metrics database is temporarily unavailable.",
             )
 
+    @router.get("/faq", response_model=FAQFeedResponse)
+    async def community_faq(
+        response: Response,
+        q: str | None = Query(
+            default=None,
+            description="Optional full-text search phrase. If omitted, browses all entries.",
+            max_length=200,
+        ),
+        category: str | None = Query(
+            default=None,
+            description="Filter by category (how-to, troubleshooting, reference, etc.)",
+            max_length=50,
+        ),
+        min_quality: float = Query(
+            default=0.0, ge=0.0, le=1.0, description="Minimum quality score"
+        ),
+        limit: int = Query(default=50, ge=1, le=200, description="Page size"),
+        offset: int = Query(default=0, ge=0, description="Pagination offset"),
+    ) -> FAQFeedResponse:
+        """Public, read-only FAQ feed for this community.
+
+        Returns synthesized question/answer entries generated from the
+        community's mailing-list and forum archives. Disabled by default;
+        a community opts in via ``public_feeds.faq: true`` in its config.
+        Email addresses are redacted from the output. ``total`` is the full
+        match count before pagination, in both browse and search modes.
+        """
+        config = info.community_config
+        if config is None or config.public_feeds is None or not config.public_feeds.faq:
+            raise HTTPException(
+                status_code=404,
+                detail="Public FAQ feed is not enabled for this community.",
+            )
+
+        try:
+            entries, total = list_faq_entries(
+                project=community_id,
+                limit=limit,
+                offset=offset,
+                query=q,
+                category=category,
+                min_quality=min_quality,
+            )
+        except sqlite3.Error:
+            logger.exception("Failed to query FAQ feed for community %s", community_id)
+            raise HTTPException(
+                status_code=503,
+                detail="Knowledge database is temporarily unavailable.",
+            )
+        except Exception:
+            logger.exception("Unexpected error serving FAQ feed for community %s", community_id)
+            raise HTTPException(
+                status_code=500,
+                detail="An unexpected error occurred while building the FAQ feed.",
+            )
+
+        # Public, read-only data; cacheable like the other /…/public endpoints.
+        response.headers["Cache-Control"] = "public, max-age=3600"
+        return FAQFeedResponse(
+            community_id=community_id,
+            total=total,
+            limit=limit,
+            offset=offset,
+            entries=[_faq_result_to_response(e) for e in entries],
+        )
+
+    @router.get("/citations", response_model=CitationsFeedResponse)
+    async def community_citations(response: Response) -> CitationsFeedResponse:
+        """Public, read-only citation dashboard for this community.
+
+        Returns per-year counts of papers citing the community's canonical
+        works, plus a stacked breakdown keyed by the cited DOI (the shape
+        behind a citations-per-year chart). Disabled by default; a community
+        opts in via ``public_feeds.citations: true`` in its config.
+        """
+        config = info.community_config
+        if config is None or config.public_feeds is None or not config.public_feeds.citations:
+            raise HTTPException(
+                status_code=404,
+                detail="Public citations feed is not enabled for this community.",
+            )
+
+        try:
+            stats = get_citation_stats(project=community_id)
+        except sqlite3.Error:
+            logger.exception("Failed to query citations for community %s", community_id)
+            raise HTTPException(
+                status_code=503,
+                detail="Knowledge database is temporarily unavailable.",
+            )
+        except Exception:
+            logger.exception(
+                "Unexpected error serving citations feed for community %s", community_id
+            )
+            raise HTTPException(
+                status_code=500,
+                detail="An unexpected error occurred while building the citations feed.",
+            )
+
+        canonical_dois = list(config.citations.dois) if config.citations else []
+
+        response.headers["Cache-Control"] = "public, max-age=3600"
+        return CitationsFeedResponse(
+            community_id=community_id,
+            total=stats.total,
+            per_year=stats.per_year,
+            by_paper=stats.by_paper,
+            canonical_dois=canonical_dois,
+        )
+
     return router
 
 

diff --git a/src/core/config/community.py b/src/core/config/community.py
@@ -637,6 +637,23 @@ def validate_agent_roles(self) -> "FAQGenerationConfig":
         return self
 
 
+class PublicFeedsConfig(BaseModel):
+    """Opt-in flags for exposing community data as public, read-only JSON feeds.
+
+    Both feeds are off by default. Enabling a feed publishes already-synced
+    data (FAQ entries, citation counts) at unauthenticated endpoints so
+    communities can build their own frontends on top of it.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    faq: bool = False
+    """Expose generated FAQ entries at GET /{community_id}/faq."""
+
+    citations: bool = False
+    """Expose canonical-paper citation counts at GET /{community_id}/citations."""
+
+
 class BudgetConfig(BaseModel):
     """Budget limits and alert thresholds for a community.
 
@@ -918,6 +935,9 @@ def validate_id(cls, v: str) -> str:
     faq_generation: FAQGenerationConfig | None = None
     """FAQ generation configuration from threaded discussions (mailman, discourse, etc.)."""
 
+    public_feeds: PublicFeedsConfig | None = None
+    """Opt-in flags for exposing FAQ/citation data as public JSON feeds."""
+
     sync: SyncConfig | None = None
     """Per-community sync schedule configuration.
 

diff --git a/src/knowledge/db.py b/src/knowledge/db.py
@@ -132,6 +132,9 @@ def active_mirror_context(mirror_id: str) -> Iterator[None]:
     url TEXT NOT NULL,
     created_at TEXT,
     synced_at TEXT NOT NULL,
+    -- Canonical DOI this paper cites, when discovered via citation sync.
+    -- NULL for papers found through keyword search rather than a citation link.
+    cites_doi TEXT,
     UNIQUE(source, external_id)
 );
 
@@ -409,6 +412,8 @@ def active_mirror_context(mirror_id: str) -> Iterator[None]:
 CREATE INDEX IF NOT EXISTS idx_github_items_status ON github_items(status);
 CREATE INDEX IF NOT EXISTS idx_github_items_type ON github_items(item_type);
 CREATE INDEX IF NOT EXISTS idx_papers_source ON papers(source);
+-- idx_papers_cites_doi is created in _migrate_db, after the cites_doi column
+-- is ensured, so init_db stays safe on databases predating that column.
 CREATE INDEX IF NOT EXISTS idx_docstrings_repo ON docstrings(repo);
 CREATE INDEX IF NOT EXISTS idx_docstrings_language ON docstrings(language);
 CREATE INDEX IF NOT EXISTS idx_messages_list ON mailing_list_messages(list_name);
@@ -507,6 +512,28 @@ def _migrate_db(conn: sqlite3.Connection) -> None:
         # Table doesn't exist yet - this is fine, schema will create it
         logger.debug("Docstrings table not found during migration (will be created): %s", e)
 
+    # Migration: Add cites_doi column to papers table (added 2026-06-09).
+    # The index lives here (not in SCHEMA_SQL) so executescript never references
+    # cites_doi on a database created before the column existed.
+    try:
+        cursor = conn.execute("PRAGMA table_info(papers)")
+        columns = [row[1] for row in cursor.fetchall()]
+    except sqlite3.OperationalError as e:
+        # Only the PRAGMA is guarded here: a missing papers table is fine since
+        # SCHEMA_SQL creates it. DDL errors below (locked DB, I/O fault) must
+        # propagate rather than be swallowed and leave the table un-indexed.
+        logger.debug("Papers table not found during migration (will be created): %s", e)
+        columns = []
+
+    if columns:  # papers table exists; migrate it in place
+        if "cites_doi" not in columns:
+            logger.info("Migrating papers table: adding cites_doi column")
+            conn.execute("ALTER TABLE papers ADD COLUMN cites_doi TEXT")
+            logger.info("Migration complete: cites_doi column added to papers")
+        # Ensure the index exists for both new and migrated databases.
+        conn.execute("CREATE INDEX IF NOT EXISTS idx_papers_cites_doi ON papers(cites_doi)")
+        conn.commit()
+
 
 def init_db(project: str = "hed") -> None:
     """Initialize database schema for a project.
@@ -586,6 +613,7 @@ def upsert_paper(
     first_message: str | None,
     url: str,
     created_at: str | None,
+    cites_doi: str | None = None,
 ) -> None:
     """Insert or update a paper.
 
@@ -597,6 +625,14 @@ def upsert_paper(
         first_message: Abstract (limited to ~2000 chars)
         url: URL to the paper (DOI or source URL)
         created_at: Publication date (ISO 8601 or year string)
+        cites_doi: Canonical DOI this paper cites, when known from a citation
+            sync. ``None`` for keyword-search results. On conflict the first
+            recorded link is kept (COALESCE), so a later keyword sync passing
+            ``None`` never erases an existing citation link, and a re-sync
+            backfills the link onto rows stored before this column existed.
+            A single column holds one link: a paper citing two tracked DOIs is
+            attributed to whichever was synced first (it is still counted once
+            in the per-year total, only its by-paper bucket is approximate).
     """
     # Limit first_message size
     if first_message and len(first_message) > 2000:
@@ -605,14 +641,15 @@ def upsert_paper(
     conn.execute(
         """
         INSERT INTO papers (source, external_id, title, first_message,
-                            status, url, created_at, synced_at)
-        VALUES (?, ?, ?, ?, 'published', ?, ?, ?)
+                            status, url, created_at, synced_at, cites_doi)
+        VALUES (?, ?, ?, ?, 'published', ?, ?, ?, ?)
         ON CONFLICT(source, external_id) DO UPDATE SET
             title=excluded.title,
             first_message=excluded.first_message,
-            synced_at=excluded.synced_at
+            synced_at=excluded.synced_at,
+            cites_doi=COALESCE(papers.cites_doi, excluded.cites_doi)
         """,
-        (source, external_id, title, first_message, url, created_at, _now_iso()),
+        (source, external_id, title, first_message, url, created_at, _now_iso(), cites_doi),
     )
 
 

diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py
@@ -158,6 +158,7 @@ def _store_papers(
     project: str,
     *,
     force_source: str | None = None,
+    cites_doi: str | None = None,
 ) -> dict[str, int]:
     """Upsert opencite papers into the knowledge DB, returning counts by source.
 
@@ -167,6 +168,8 @@ def _store_papers(
         force_source: When set (a single-source sync), record this OSA source
             label using its native identifier; falls back to the priority
             mapping if that identifier is missing.
+        cites_doi: Canonical DOI these papers cite, recorded on each row when
+            storing the results of a citation sync. ``None`` for keyword search.
     """
     counts: dict[str, int] = {}
     with get_connection(project) as conn:
@@ -193,6 +196,7 @@ def _store_papers(
                 first_message=paper.abstract or None,
                 url=_paper_url(paper),
                 created_at=paper.publication_date or (str(paper.year) if paper.year else None),
+                cites_doi=cites_doi,
             )
             counts[source] = counts.get(source, 0) + 1
         conn.commit()
@@ -420,7 +424,7 @@ def sync_citing_papers(
     total = 0
     for doi, papers in cited:
         try:
-            counts = _store_papers(papers, project)
+            counts = _store_papers(papers, project, cites_doi=doi)
             count = sum(counts.values())
             update_sync_metadata("papers", f"citing_{doi}", count, project)
             logger.info("Synced %d papers citing %s", count, doi)