Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 181 additions & 1 deletion src/api/routers/community.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pathlib import Path
from typing import Annotated, Any, Literal

from fastapi import APIRouter, Header, HTTPException, Query, Request
from fastapi import APIRouter, Header, HTTPException, Query, Request, Response
from fastapi.responses import FileResponse, StreamingResponse
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.messages.utils import count_tokens_approximately
Expand All @@ -34,6 +34,7 @@
from src.assistants.registry import AssistantInfo
from src.core.config.community import WidgetConfig
from src.core.services.litellm_llm import create_openrouter_llm
from src.knowledge.search import FAQResult, get_citation_stats, list_faq_entries
from src.metrics.cost import COST_BLOCK_THRESHOLD, COST_WARN_THRESHOLD, MODEL_PRICING, estimate_cost
from src.metrics.db import (
RequestLogEntry,
Expand Down Expand Up @@ -205,6 +206,75 @@ class CommunityConfigResponse(BaseModel):
status: str = Field(..., description="Health status: healthy, degraded, or error")


class FAQEntryResponse(BaseModel):
"""A single FAQ entry exposed via the public feed."""

question: str = Field(..., description="Synthesized question")
answer: str = Field(..., description="Synthesized answer")
tags: list[str] = Field(default_factory=list, description="Keyword tags")
category: str = Field(..., description="Entry category (how-to, troubleshooting, etc.)")
quality_score: float = Field(..., description="LLM quality score (0.0-1.0)")
message_count: int = Field(..., description="Number of source messages in the thread")
first_message_date: str = Field(..., description="Date of the first message in the thread")
thread_url: str = Field(..., description="URL of the source discussion thread")


class FAQFeedResponse(BaseModel):
"""Paginated public FAQ feed for a community."""

community_id: str = Field(..., description="Community identifier")
total: int = Field(..., description="Total entries matching the filters")
limit: int = Field(..., description="Page size used for this response")
offset: int = Field(..., description="Offset used for this response")
entries: list[FAQEntryResponse] = Field(default_factory=list, description="FAQ entries")


class CitationsFeedResponse(BaseModel):
"""Public citation dashboard data for a community's canonical papers."""

community_id: str = Field(..., description="Community identifier")
total: int = Field(..., description="Total citing papers with a recorded canonical link")
per_year: dict[str, int] = Field(
default_factory=dict, description="Citing-paper count per year across all papers"
)
by_paper: dict[str, dict[str, int]] = Field(
default_factory=dict,
description="Stacked breakdown: canonical DOI -> year -> citing-paper count",
)
canonical_dois: list[str] = Field(
default_factory=list, description="Canonical DOIs tracked for this community"
)


# Matches bare email addresses so they can be stripped from the public feed.
_EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")


def _redact_emails(text: str) -> str:
"""Replace any email address in ``text`` with a redaction marker.

The FAQ feed is derived from public mailing-list content. The summarizer
strips most personal data, but a handful of entries still embed addresses
(mostly vendor support lines). A public JSON feed should not emit raw
addresses, so they are redacted at serialization time.
"""
return _EMAIL_PATTERN.sub("[email redacted]", text)


def _faq_result_to_response(entry: FAQResult) -> FAQEntryResponse:
"""Convert a knowledge-layer FAQResult into a public response model."""
return FAQEntryResponse(
question=_redact_emails(entry.question),
answer=_redact_emails(entry.answer),
tags=[_redact_emails(tag) for tag in entry.tags],
category=entry.category,
quality_score=entry.quality_score,
message_count=entry.message_count,
first_message_date=entry.first_message_date,
thread_url=entry.thread_url,
)


# ---------------------------------------------------------------------------
# Session Management (In-Memory, per-community isolation)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -1502,6 +1572,116 @@ async def community_usage_public(
detail="Metrics database is temporarily unavailable.",
)

@router.get("/faq", response_model=FAQFeedResponse)
async def community_faq(
response: Response,
q: str | None = Query(
default=None,
description="Optional full-text search phrase. If omitted, browses all entries.",
max_length=200,
),
category: str | None = Query(
default=None,
description="Filter by category (how-to, troubleshooting, reference, etc.)",
max_length=50,
),
min_quality: float = Query(
default=0.0, ge=0.0, le=1.0, description="Minimum quality score"
),
limit: int = Query(default=50, ge=1, le=200, description="Page size"),
offset: int = Query(default=0, ge=0, description="Pagination offset"),
) -> FAQFeedResponse:
"""Public, read-only FAQ feed for this community.

Returns synthesized question/answer entries generated from the
community's mailing-list and forum archives. Disabled by default;
a community opts in via ``public_feeds.faq: true`` in its config.
Email addresses are redacted from the output. ``total`` is the full
match count before pagination, in both browse and search modes.
"""
config = info.community_config
if config is None or config.public_feeds is None or not config.public_feeds.faq:
raise HTTPException(
status_code=404,
detail="Public FAQ feed is not enabled for this community.",
)

try:
entries, total = list_faq_entries(
project=community_id,
limit=limit,
offset=offset,
query=q,
category=category,
min_quality=min_quality,
)
except sqlite3.Error:
logger.exception("Failed to query FAQ feed for community %s", community_id)
raise HTTPException(
status_code=503,
detail="Knowledge database is temporarily unavailable.",
)
except Exception:
logger.exception("Unexpected error serving FAQ feed for community %s", community_id)
raise HTTPException(
status_code=500,
detail="An unexpected error occurred while building the FAQ feed.",
)

# Public, read-only data; cacheable like the other /…/public endpoints.
response.headers["Cache-Control"] = "public, max-age=3600"
return FAQFeedResponse(
community_id=community_id,
total=total,
limit=limit,
offset=offset,
entries=[_faq_result_to_response(e) for e in entries],
)

@router.get("/citations", response_model=CitationsFeedResponse)
async def community_citations(response: Response) -> CitationsFeedResponse:
"""Public, read-only citation dashboard for this community.

Returns per-year counts of papers citing the community's canonical
works, plus a stacked breakdown keyed by the cited DOI (the shape
behind a citations-per-year chart). Disabled by default; a community
opts in via ``public_feeds.citations: true`` in its config.
"""
config = info.community_config
if config is None or config.public_feeds is None or not config.public_feeds.citations:
raise HTTPException(
status_code=404,
detail="Public citations feed is not enabled for this community.",
)

try:
stats = get_citation_stats(project=community_id)
except sqlite3.Error:
logger.exception("Failed to query citations for community %s", community_id)
raise HTTPException(
status_code=503,
detail="Knowledge database is temporarily unavailable.",
)
except Exception:
logger.exception(
"Unexpected error serving citations feed for community %s", community_id
)
raise HTTPException(
status_code=500,
detail="An unexpected error occurred while building the citations feed.",
)

canonical_dois = list(config.citations.dois) if config.citations else []

response.headers["Cache-Control"] = "public, max-age=3600"
return CitationsFeedResponse(
community_id=community_id,
total=stats.total,
per_year=stats.per_year,
by_paper=stats.by_paper,
canonical_dois=canonical_dois,
)

return router


Expand Down
20 changes: 20 additions & 0 deletions src/core/config/community.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,23 @@ def validate_agent_roles(self) -> "FAQGenerationConfig":
return self


class PublicFeedsConfig(BaseModel):
"""Opt-in flags for exposing community data as public, read-only JSON feeds.

Both feeds are off by default. Enabling a feed publishes already-synced
data (FAQ entries, citation counts) at unauthenticated endpoints so
communities can build their own frontends on top of it.
"""

model_config = ConfigDict(extra="forbid")

faq: bool = False
"""Expose generated FAQ entries at GET /{community_id}/faq."""

citations: bool = False
"""Expose canonical-paper citation counts at GET /{community_id}/citations."""


class BudgetConfig(BaseModel):
"""Budget limits and alert thresholds for a community.

Expand Down Expand Up @@ -918,6 +935,9 @@ def validate_id(cls, v: str) -> str:
faq_generation: FAQGenerationConfig | None = None
"""FAQ generation configuration from threaded discussions (mailman, discourse, etc.)."""

public_feeds: PublicFeedsConfig | None = None
"""Opt-in flags for exposing FAQ/citation data as public JSON feeds."""

sync: SyncConfig | None = None
"""Per-community sync schedule configuration.

Expand Down
45 changes: 41 additions & 4 deletions src/knowledge/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ def active_mirror_context(mirror_id: str) -> Iterator[None]:
url TEXT NOT NULL,
created_at TEXT,
synced_at TEXT NOT NULL,
-- Canonical DOI this paper cites, when discovered via citation sync.
-- NULL for papers found through keyword search rather than a citation link.
cites_doi TEXT,
UNIQUE(source, external_id)
);

Expand Down Expand Up @@ -409,6 +412,8 @@ def active_mirror_context(mirror_id: str) -> Iterator[None]:
CREATE INDEX IF NOT EXISTS idx_github_items_status ON github_items(status);
CREATE INDEX IF NOT EXISTS idx_github_items_type ON github_items(item_type);
CREATE INDEX IF NOT EXISTS idx_papers_source ON papers(source);
-- idx_papers_cites_doi is created in _migrate_db, after the cites_doi column
-- is ensured, so init_db stays safe on databases predating that column.
CREATE INDEX IF NOT EXISTS idx_docstrings_repo ON docstrings(repo);
CREATE INDEX IF NOT EXISTS idx_docstrings_language ON docstrings(language);
CREATE INDEX IF NOT EXISTS idx_messages_list ON mailing_list_messages(list_name);
Expand Down Expand Up @@ -507,6 +512,28 @@ def _migrate_db(conn: sqlite3.Connection) -> None:
# Table doesn't exist yet - this is fine, schema will create it
logger.debug("Docstrings table not found during migration (will be created): %s", e)

# Migration: Add cites_doi column to papers table (added 2026-06-09).
# The index lives here (not in SCHEMA_SQL) so executescript never references
# cites_doi on a database created before the column existed.
try:
cursor = conn.execute("PRAGMA table_info(papers)")
columns = [row[1] for row in cursor.fetchall()]
except sqlite3.OperationalError as e:
# Only the PRAGMA is guarded here: a missing papers table is fine since
# SCHEMA_SQL creates it. DDL errors below (locked DB, I/O fault) must
# propagate rather than be swallowed and leave the table un-indexed.
logger.debug("Papers table not found during migration (will be created): %s", e)
columns = []

if columns: # papers table exists; migrate it in place
if "cites_doi" not in columns:
logger.info("Migrating papers table: adding cites_doi column")
conn.execute("ALTER TABLE papers ADD COLUMN cites_doi TEXT")
logger.info("Migration complete: cites_doi column added to papers")
# Ensure the index exists for both new and migrated databases.
conn.execute("CREATE INDEX IF NOT EXISTS idx_papers_cites_doi ON papers(cites_doi)")
conn.commit()


def init_db(project: str = "hed") -> None:
"""Initialize database schema for a project.
Expand Down Expand Up @@ -586,6 +613,7 @@ def upsert_paper(
first_message: str | None,
url: str,
created_at: str | None,
cites_doi: str | None = None,
) -> None:
"""Insert or update a paper.

Expand All @@ -597,6 +625,14 @@ def upsert_paper(
first_message: Abstract (limited to ~2000 chars)
url: URL to the paper (DOI or source URL)
created_at: Publication date (ISO 8601 or year string)
cites_doi: Canonical DOI this paper cites, when known from a citation
sync. ``None`` for keyword-search results. On conflict the first
recorded link is kept (COALESCE), so a later keyword sync passing
``None`` never erases an existing citation link, and a re-sync
backfills the link onto rows stored before this column existed.
A single column holds one link: a paper citing two tracked DOIs is
attributed to whichever was synced first (it is still counted once
in the per-year total, only its by-paper bucket is approximate).
"""
# Limit first_message size
if first_message and len(first_message) > 2000:
Expand All @@ -605,14 +641,15 @@ def upsert_paper(
conn.execute(
"""
INSERT INTO papers (source, external_id, title, first_message,
status, url, created_at, synced_at)
VALUES (?, ?, ?, ?, 'published', ?, ?, ?)
status, url, created_at, synced_at, cites_doi)
VALUES (?, ?, ?, ?, 'published', ?, ?, ?, ?)
ON CONFLICT(source, external_id) DO UPDATE SET
title=excluded.title,
first_message=excluded.first_message,
synced_at=excluded.synced_at
synced_at=excluded.synced_at,
cites_doi=COALESCE(papers.cites_doi, excluded.cites_doi)
""",
(source, external_id, title, first_message, url, created_at, _now_iso()),
(source, external_id, title, first_message, url, created_at, _now_iso(), cites_doi),
)


Expand Down
6 changes: 5 additions & 1 deletion src/knowledge/papers_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def _store_papers(
project: str,
*,
force_source: str | None = None,
cites_doi: str | None = None,
) -> dict[str, int]:
"""Upsert opencite papers into the knowledge DB, returning counts by source.

Expand All @@ -167,6 +168,8 @@ def _store_papers(
force_source: When set (a single-source sync), record this OSA source
label using its native identifier; falls back to the priority
mapping if that identifier is missing.
cites_doi: Canonical DOI these papers cite, recorded on each row when
storing the results of a citation sync. ``None`` for keyword search.
"""
counts: dict[str, int] = {}
with get_connection(project) as conn:
Expand All @@ -193,6 +196,7 @@ def _store_papers(
first_message=paper.abstract or None,
url=_paper_url(paper),
created_at=paper.publication_date or (str(paper.year) if paper.year else None),
cites_doi=cites_doi,
)
counts[source] = counts.get(source, 0) + 1
conn.commit()
Expand Down Expand Up @@ -420,7 +424,7 @@ def sync_citing_papers(
total = 0
for doi, papers in cited:
try:
counts = _store_papers(papers, project)
counts = _store_papers(papers, project, cites_doi=doi)
count = sum(counts.values())
update_sync_metadata("papers", f"citing_{doi}", count, project)
logger.info("Synced %d papers citing %s", count, doi)
Expand Down
Loading
Loading