Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
e12aa6b
feat: add a base version of the translation pipeline
fmueller Dec 15, 2025
9db47df
build: add `sacremoses` as dependency
fmueller Dec 16, 2025
242c04b
feat: ensure Markdown structure preservation in LLM outputs
fmueller Dec 16, 2025
ace2d76
feat: add options for LLM post-edit model and temperature in CLI
fmueller Dec 16, 2025
e67f0e2
feat: add `--device` option to CLI for translation model execution
fmueller Dec 16, 2025
17ab30b
feat: add `--verbose` CLI option and progress reporting in translation
fmueller Dec 16, 2025
23a61fb
feat: improve device detection for translation pipelines
fmueller Dec 16, 2025
999e431
Merge branch 'main' into feat/add-translation-pipeline
fmueller Dec 16, 2025
36d1bf1
refactor: simplify device detection and update `brief` imports in tests
fmueller Dec 16, 2025
319011b
feat: add project-level defaults and improve CLI flexibility
fmueller Dec 16, 2025
3e6ffd0
refactor: remove `voice` and `humor` options from CLI and tone profile
fmueller Dec 16, 2025
4d33db2
feat: add input trimming and guardrails for LLM post-editing
fmueller Dec 16, 2025
61978ea
feat: add output validation for LLM post-editing
fmueller Dec 16, 2025
76fada5
feat: add batch translation support and improve progress reporting
fmueller Dec 16, 2025
a97e9fd
test: strip ANSI color codes in CLI error message assertions
fmueller Dec 16, 2025
8e64ebe
feat: add language detection and validation to LLM post-editing
fmueller Dec 16, 2025
0d08634
refactor: defer transformers import for faster CLI startup
fmueller Dec 16, 2025
a6dab71
refactor: enhance post-editing constraints and input handling
fmueller Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ dependencies = [
"typer>=0.20.0",
"pyyaml>=6.0.2",
"python-frontmatter>=1.1.0",
"transformers>=4.46.3",
"torch>=2.5.1",
"sentencepiece>=0.2.0",
"sacremoses>=0.1.1",
"fast-langdetect>=1.0.0",
"fasttext>=0.9.3",
]

[project.scripts]
Expand Down
5 changes: 3 additions & 2 deletions src/scribae/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

import typer

from . import brief
from .brief_cli import brief_command
from .idea_cli import idea_command
from .meta_cli import meta_command
from .translate_cli import translate_command
from .write_cli import write_command

app = typer.Typer(help="Scribae CLI — generate writing briefs from local notes.")

__all__ = ["app", "main", "brief"]
__all__ = ["app", "main"]


@app.callback(invoke_without_command=True)
Expand All @@ -20,6 +20,7 @@ def app_callback() -> None:
app.command("brief", help="Create a structured creative brief from a Markdown note.")(brief_command)
app.command("write", help="Generate article body from a note + SeoBrief.")(write_command)
app.command("meta", help="Generate final article metadata/frontmatter.")(meta_command)
app.command("translate", help="Translate Markdown locally (MT + post-edit).")(translate_command)
app.command("idea", help="Brainstorm content ideas from a Markdown note.")(idea_command)


Expand Down
20 changes: 20 additions & 0 deletions src/scribae/translate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .markdown_segmenter import MarkdownSegmenter, ProtectedText, TextBlock
from .model_registry import ModelRegistry, ModelSpec, RouteStep
from .mt import MTTranslator
from .pipeline import ToneProfile, TranslationConfig, TranslationPipeline
from .postedit import LLMPostEditor, PostEditValidationError

# Explicit public API of the scribae.translate package.
__all__ = [
    "MarkdownSegmenter",
    "ProtectedText",
    "TextBlock",
    "ModelRegistry",
    "ModelSpec",
    "RouteStep",
    "MTTranslator",
    "ToneProfile",
    "TranslationConfig",
    "TranslationPipeline",
    "LLMPostEditor",
    "PostEditValidationError",
]
148 changes: 148 additions & 0 deletions src/scribae/translate/markdown_segmenter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import Any


@dataclass
class TextBlock:
    """One structural unit extracted from a markdown document.

    Attributes:
        kind: Category of the unit, e.g. "frontmatter", "heading",
            "list_item", "code_block", "blank", or "paragraph".
        text: The raw markdown text of the unit.
        meta: Optional extra details (e.g. heading level/title, list marker).
    """

    kind: str
    text: str
    meta: dict[str, Any] = field(default_factory=dict)


@dataclass
class ProtectedText:
    """Text whose protected spans were swapped for sentinel tokens.

    Attributes:
        text: The text with each protected span replaced by a sentinel.
        placeholders: Mapping from sentinel token to the original span.
    """

    text: str
    placeholders: dict[str, str]

    def restore(self, translated: str) -> str:
        """Swap every sentinel token in *translated* back to its original span."""
        result = translated
        for sentinel, original_span in self.placeholders.items():
            result = result.replace(sentinel, original_span)
        return result


class MarkdownSegmenter:
    """Lightweight markdown segmenter with protection helpers.

    Splits markdown into TextBlock units (frontmatter, headings, list items,
    fenced code blocks, blank lines, paragraphs) and can shield
    non-translatable spans (inline code, URLs, template placeholders) behind
    sentinel tokens before the text is handed to a translator.
    """

    # Spans that must never be altered by translation.
    DEFAULT_PATTERNS = [
        r"`[^`]+`",  # inline code
        r"\{[^{}]+\}",  # placeholders
        r"\{\{[^{}]+\}\}",  # double-brace template placeholders
        r":[a-z0-9_-]+:",  # emoji shortcodes
        r"https?://[^\s\]]+",  # bare URLs
    ]

    def __init__(self, protected_patterns: list[str] | None = None) -> None:
        """Create a segmenter.

        Args:
            protected_patterns: Extra regexes to protect in addition to
                ``DEFAULT_PATTERNS``.
        """
        self.protected_patterns = self.DEFAULT_PATTERNS + (protected_patterns or [])

    def segment(self, text: str) -> list[TextBlock]:
        """Split markdown text into blocks while preserving structure."""
        remaining = text
        blocks: list[TextBlock] = []
        # YAML frontmatter is only recognized at the very start of the text.
        frontmatter_match = re.match(r"^---\n(.+?)\n---\n?", remaining, flags=re.DOTALL)
        if frontmatter_match:
            fm_text = frontmatter_match.group(0)
            fm_body = frontmatter_match.group(1)
            blocks.append(TextBlock(kind="frontmatter", text=fm_text, meta={"body": fm_body}))
            remaining = remaining[len(fm_text) :]

        blocks.extend(self._segment_body(remaining))
        return blocks

    def _segment_body(self, text: str) -> list[TextBlock]:
        """Split non-frontmatter markdown into line-based blocks.

        Emits one block per heading, list item, blank line, or fenced code
        block, and groups runs of other lines into paragraph blocks.
        """
        lines = text.splitlines()
        blocks: list[TextBlock] = []
        buffer: list[str] = []  # lines accumulated for the block being built
        current_kind = "paragraph"
        in_code = False  # currently inside a fenced code block
        fence = ""  # marker ("```" or "~~~") that opened the current fence

        def _flush() -> None:
            # Emit buffered lines as one block and reset to paragraph mode.
            nonlocal buffer, current_kind
            if buffer:
                blocks.append(TextBlock(kind=current_kind, text="\n".join(buffer)))
            buffer = []
            current_kind = "paragraph"

        for line in lines:
            fence_match = re.match(r"^(```|~~~)", line)
            if fence_match:
                marker = fence_match.group(1)
                if in_code and marker == fence:
                    # Closing fence: include it, then emit the code block.
                    buffer.append(line)
                    _flush()
                    in_code = False
                    fence = ""
                    continue
                if in_code:
                    # A different fence style inside the block is literal content.
                    buffer.append(line)
                    continue
                # Opening fence: end the current block and start a code block.
                _flush()
                in_code = True
                fence = marker
                current_kind = "code_block"
                buffer.append(line)
                continue

            if in_code:
                # Everything between fences is kept verbatim.
                buffer.append(line)
                continue

            if line.startswith("#"):
                _flush()
                # Heading level = number of leading '#' characters.
                level = len(line) - len(line.lstrip("#"))
                heading_text = line.lstrip("#").strip()
                blocks.append(TextBlock(kind="heading", text=line, meta={"level": level, "title": heading_text}))
                continue

            # Bullet ("-", "*", "+") or ordered ("1.") list item. NOTE(review):
            # only bullets may be indented here (`\s*` is inside the first
            # alternative only) — indented ordered items fall through to
            # paragraph handling; confirm whether that is intended.
            if re.match(r"^(\s*[-*+]|[0-9]+\.)\s+", line):
                # NOTE(review): current_kind is reset to "paragraph" right after
                # each list item below, so this guard always flushes; kept as-is.
                if current_kind not in {"list_item"}:
                    _flush()
                current_kind = "list_item"
                blocks.append(TextBlock(kind="list_item", text=line, meta={"marker": line.split()[0]}))
                current_kind = "paragraph"
                continue

            if line.strip() == "":
                # Blank lines become explicit blocks so reconstruct() can
                # restore the original vertical spacing.
                _flush()
                blocks.append(TextBlock(kind="blank", text=""))
                continue

            buffer.append(line)

        _flush()
        return blocks

    def reconstruct(self, blocks: list[TextBlock]) -> str:
        """Join block texts back into one document (trailing newlines stripped)."""
        parts = [block.text for block in blocks]
        return "\n".join(parts).rstrip("\n")

    def protect_text(self, text: str, extra_patterns: list[str] | None = None) -> ProtectedText:
        """Return text where protected spans are replaced with sentinels."""
        patterns = self.protected_patterns + (extra_patterns or [])
        placeholders: dict[str, str] = {}
        # Each pattern becomes its own group; at a given position the regex
        # engine picks the first alternative that matches, not the longest.
        pattern = re.compile("|".join(f"({p})" for p in patterns), flags=re.IGNORECASE)

        def _replace(match: re.Match[str]) -> str:
            # Sentinels are numbered in order of appearance so restore() can
            # map each token back to its original span.
            token = f"<<<PROTECTED_{len(placeholders)}>>>"
            placeholders[token] = match.group(0)
            return token

        replaced = pattern.sub(_replace, text)
        return ProtectedText(text=replaced, placeholders=placeholders)

    def extract_links(self, text: str) -> list[str]:
        """Return every http(s) URL found in *text*."""
        return re.findall(r"https?://[^\s\])>]+", text)

    def extract_numbers(self, text: str) -> list[str]:
        """Return numeric tokens, including joined forms like 1.2, 3/4, or 10:30."""
        return re.findall(r"\d+(?:[.,:/-]\d+)*", text)


# Explicit public API of this module.
__all__ = ["MarkdownSegmenter", "ProtectedText", "TextBlock"]
128 changes: 128 additions & 0 deletions src/scribae/translate/model_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from __future__ import annotations

from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from typing import Literal

# Runtime family a model is served by: MarianMT pair models or multilingual NLLB.
Backend = Literal["marian", "nllb"]


@dataclass(frozen=True)
class ModelSpec:
    """Immutable description of a single translation model.

    Attributes:
        model_id: Hugging Face model identifier.
        src_lang: Language code the model translates from.
        tgt_lang: Language code the model translates to.
        backend: Runtime family that serves the model ("marian" or "nllb").
        disabled: When True, routing lookups skip this model.
    """

    model_id: str
    src_lang: str
    tgt_lang: str
    backend: Backend
    disabled: bool = False


@dataclass(frozen=True)
class RouteStep:
    """One hop of a translation route: src_lang -> tgt_lang using *model*.

    Attributes:
        src_lang: Source language code for this hop.
        tgt_lang: Target language code for this hop.
        model: The model spec resolved for this hop.
    """

    src_lang: str
    tgt_lang: str
    model: ModelSpec


class ModelRegistry:
    """Registry for deterministic routing between language pairs.

    Routing preference order: a direct model for the pair, then a two-step
    pivot through English, then (when the backend policy allows it) the
    multilingual NLLB fallback.
    """

    def __init__(
        self,
        specs: Iterable[ModelSpec] | None = None,
        *,
        nllb_model_id: str | None = None,
    ) -> None:
        """Create a registry.

        Args:
            specs: Model specs to route over; defaults to the built-in
                MarianMT pairs from ``_default_specs()``.
            nllb_model_id: Override for the multilingual fallback model id.
        """
        self._specs: list[ModelSpec] = list(specs) if specs else _default_specs()
        self._nllb_model_id = nllb_model_id or "facebook/nllb-200-distilled-600M"

    def normalize_lang(self, lang: str) -> str:
        """Canonicalize a language code for comparison (lowercasing only)."""
        return lang.lower()

    def find_direct(self, src_lang: str, tgt_lang: str) -> ModelSpec | None:
        """Return the first enabled spec translating src_lang -> tgt_lang, else None."""
        src = self.normalize_lang(src_lang)
        tgt = self.normalize_lang(tgt_lang)
        for spec in self._specs:
            if spec.disabled:
                continue
            if self.normalize_lang(spec.src_lang) == src and self.normalize_lang(spec.tgt_lang) == tgt:
                return spec
        return None

    def nllb_spec(self) -> ModelSpec:
        """Return the multilingual NLLB fallback spec ("multi" -> "multi")."""
        return ModelSpec(
            model_id=self._nllb_model_id,
            src_lang="multi",
            tgt_lang="multi",
            backend="nllb",
        )

    def supported_pairs(self) -> set[tuple[str, str]]:
        """Return normalized (src, tgt) pairs that have an enabled direct model.

        Disabled specs are excluded so this agrees with ``find_direct``,
        which never returns a disabled model; previously a disabled pair was
        reported as supported yet was unroutable directly.
        """
        return {
            (self.normalize_lang(spec.src_lang), self.normalize_lang(spec.tgt_lang))
            for spec in self._specs
            if not spec.disabled
        }

    def route(
        self,
        src_lang: str,
        tgt_lang: str,
        *,
        allow_pivot: bool = True,
        backend: str = "marian_then_nllb",
    ) -> list[RouteStep]:
        """Return deterministic route for a language pair.

        Args:
            src_lang: Source language code.
            tgt_lang: Target language code.
            allow_pivot: Permit a two-step route through English.
            backend: Backend policy string; the NLLB fallback is used only
                when it contains "nllb".

        Raises:
            ValueError: If no direct, pivot, or fallback route exists.
        """
        src = self.normalize_lang(src_lang)
        tgt = self.normalize_lang(tgt_lang)

        direct = self.find_direct(src, tgt)
        if direct:
            return [RouteStep(src_lang=src, tgt_lang=tgt, model=direct)]

        pivot_steps = self._pivot_route(src, tgt, allow_pivot=allow_pivot)
        if pivot_steps:
            return pivot_steps

        if "nllb" in backend:
            nllb = self.nllb_spec()
            return [RouteStep(src_lang=src, tgt_lang=tgt, model=nllb)]

        raise ValueError(f"No route found for {src}->{tgt}")

    def _pivot_route(self, src_lang: str, tgt_lang: str, *, allow_pivot: bool) -> list[RouteStep] | None:
        """Return a src->en->tgt two-step route when both legs exist, else None."""
        if not allow_pivot:
            return None
        # A pair already touching English cannot pivot through English.
        if src_lang == "en" or tgt_lang == "en":
            return None
        first = self.find_direct(src_lang, "en")
        second = self.find_direct("en", tgt_lang)
        if first and second:
            return [
                RouteStep(src_lang=src_lang, tgt_lang="en", model=first),
                RouteStep(src_lang="en", tgt_lang=tgt_lang, model=second),
            ]
        return None


def _default_specs() -> list[ModelSpec]:
    """Default MarianMT pairs for Scribae.

    Covers English paired with de/es/fr/it/pt in both directions, plus
    direct German -> es/fr/it/pt models.
    """
    catalog: Sequence[tuple[str, str, str]] = (
        ("en", "de", "Helsinki-NLP/opus-mt-en-de"),
        ("de", "en", "Helsinki-NLP/opus-mt-de-en"),
        ("en", "es", "Helsinki-NLP/opus-mt-en-es"),
        ("es", "en", "Helsinki-NLP/opus-mt-es-en"),
        ("en", "fr", "Helsinki-NLP/opus-mt-en-fr"),
        ("fr", "en", "Helsinki-NLP/opus-mt-fr-en"),
        ("en", "it", "Helsinki-NLP/opus-mt-en-it"),
        ("it", "en", "Helsinki-NLP/opus-mt-it-en"),
        ("en", "pt", "Helsinki-NLP/opus-mt-en-pt"),
        ("pt", "en", "Helsinki-NLP/opus-mt-pt-en"),
        ("de", "es", "Helsinki-NLP/opus-mt-de-es"),
        ("de", "fr", "Helsinki-NLP/opus-mt-de-fr"),
        ("de", "it", "Helsinki-NLP/opus-mt-de-it"),
        ("de", "pt", "Helsinki-NLP/opus-mt-de-pt"),
    )
    specs: list[ModelSpec] = []
    for source, target, identifier in catalog:
        specs.append(ModelSpec(model_id=identifier, src_lang=source, tgt_lang=target, backend="marian"))
    return specs


# Explicit public API of this module.
__all__ = ["ModelRegistry", "ModelSpec", "RouteStep"]
Loading