Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
aa60e67
fix(engine): reject plaintext secrets at connector registration
zc277584121 Jul 2, 2026
b1255c1
fix(cli): correct cat/head/tail output byte-fidelity and --range parsing
zc277584121 Jul 2, 2026
68b9cde
fix(api): reject non-object locator JSON on cat before it reaches the…
zc277584121 Jul 2, 2026
a9c4c55
fix(api): reject unknown chunk kinds on search instead of silently no…
zc277584121 Jul 2, 2026
e2547a4
fix(cli): validate profile URLs at add time
zc277584121 Jul 2, 2026
19f0867
fix(cli): recognize file:// targets as local in add's cost-estimate c…
zc277584121 Jul 2, 2026
53ee694
fix(cli): normalize file:// URI path scoping for search/cat/ls/grep/etc
zc277584121 Jul 2, 2026
787b059
fix(api): 404 grep on paths that don't exist under a connector
zc277584121 Jul 2, 2026
edd0fdb
fix(cli): reject --upload/--force-upload together with --no-upload
zc277584121 Jul 2, 2026
120abcd
fix(connectors/file): stop absolute-path leak from read() on missing …
zc277584121 Jul 2, 2026
0b69917
fix(cli): reject oversized search/grep queries before hitting reqwest
zc277584121 Jul 2, 2026
73a9336
fix(engine): reject implicit config drift on already-registered conne…
zc277584121 Jul 2, 2026
96e8c4e
fix(engine): dedupe same-batch chunk_id collisions and stop lying abo…
zc277584121 Jul 2, 2026
fc37ae9
docs(search): note that bm25 grep matches are keyword-based, not exac…
zc277584121 Jul 2, 2026
1ae82a1
cli(job list): surface a truncated error snippet on failed rows
zc277584121 Jul 2, 2026
8574a32
fix(connectors): add real healthcheck for web and s3, closing the pro…
zc277584121 Jul 2, 2026
b8979ea
fix(cli/serve): stop start/restart from racing an out-of-band server
zc277584121 Jul 2, 2026
77fdc54
fix(engine): probe/estimate reuse stored config when --config is omitted
zc277584121 Jul 2, 2026
d2ba90c
fix(connectors/web): port-aware domain matching + fail loud on zero-c…
zc277584121 Jul 2, 2026
83b598e
fix(engine): job cancel now stops future embed batches, not just the …
zc277584121 Jul 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions cli/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use std::process::Command;

fn main() {
    // Build-script entry point: embeds a short git commit id into the crate as
    // the `MFS_GIT_SHA` compile-time env var, so the binary can report which
    // build it is even when several install paths (cargo install / uv tool /
    // a worktree's own target dir) carry different builds.
    //
    // Any failure along the way -- git missing, non-zero exit (e.g. a source
    // tarball with no checkout), non-UTF-8 output, or empty output -- collapses
    // to `None`, which is stamped as "unknown".
    let git_sha = match Command::new("git")
        .args(["rev-parse", "--short", "HEAD"])
        .output()
    {
        Ok(out) if out.status.success() => String::from_utf8(out.stdout)
            .ok()
            .map(|raw| raw.trim().to_owned())
            .filter(|trimmed| !trimmed.is_empty()),
        _ => None,
    };
    let stamp = git_sha.as_deref().unwrap_or("unknown");
    println!("cargo:rustc-env=MFS_GIT_SHA={stamp}");

    // Ask cargo to re-run this script when the checkout's HEAD or index file
    // changes, so the stamp is refreshed.
    for tracked in ["../.git/HEAD", "../.git/index"] {
        println!("cargo:rerun-if-changed={tracked}");
    }
}
300 changes: 277 additions & 23 deletions cli/src/main.rs

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion docs/search-and-browse.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ Grep JSON is smaller:
```

`via` can identify how grep found the match, for example pushdown, BM25, linear
scan, or a notice. A `bm25` match is keyword-based, not an exact-literal or
regex match — expect token-level matching rather than character-for-character
matches on database-backed sources.

## Reopen File-Like Hits

Expand Down
28 changes: 26 additions & 2 deletions server/python/src/mfs_server/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import asyncio
from contextlib import asynccontextmanager
from typing import Literal
from typing import Literal, get_args

from fastapi import FastAPI, HTTPException, Request
from fastapi.exceptions import RequestValidationError
Expand All @@ -21,6 +21,7 @@
from .. import __version__
from ..common.logging import configure_logging
from ..config import ServerConfig, load_server_config
from ..connectors.base import ChunkKind
from ..engine.engine import Engine
from .models import (
AddRequest,
Expand All @@ -43,6 +44,8 @@
StatusResponse,
)

_VALID_CHUNK_KINDS = get_args(ChunkKind)

# Canonical error codes -> suggested next actions. The endpoints
# raise HTTPException with the canonical code as `detail` for these cases; the handler
# below turns that into the stable {code, detail, suggestions} envelope SDKs switch on.
Expand All @@ -54,6 +57,10 @@
"tail_unsupported": ["head", "cat --range"],
"locator_not_found": ["re-search; the record may have changed"],
"since_unsupported": ["drop --since"],
"config_required": [
"pass --config with the connector's full config; omitting it would silently drop "
"the existing stored configuration"
],
"sync_already_running": ["mfs job list", "mfs job cancel JOB_ID"],
"connector_removing": ["wait for removal to finish, then retry"],
"remove_requires_connector_root": [
Expand Down Expand Up @@ -477,7 +484,17 @@ async def search(
if path:
connector_uri, object_prefix = await eng().resolve_connector_uri(path)
# comma-separated chunk_kinds, e.g. ?kind=body,directory_summary
chunk_kinds = [k.strip() for k in kind.split(",") if k.strip()] if kind else None
chunk_kinds = None
if kind is not None:
chunk_kinds = [k.strip() for k in kind.split(",")]
invalid = sorted({k for k in chunk_kinds if k not in _VALID_CHUNK_KINDS})
if invalid:
bad = ", ".join(repr(k) for k in invalid)
raise HTTPException(
400,
f"unknown chunk kind(s): {bad} -- valid kinds are: "
f"{', '.join(_VALID_CHUNK_KINDS)}",
)
try:
results = await eng().search(
q,
Expand Down Expand Up @@ -555,6 +572,13 @@ async def cat(
loc = _json.loads(locator)
except ValueError:
raise HTTPException(400, "invalid locator JSON")
# A syntactically valid JSON value that isn't an object (array, number,
# string, bool, or null) can never match a record's locator_fields --
# reject it as malformed rather than letting eng().cat() either crash
# on non-dict.get()/`in` or (for null) silently fall through to the
# "no locator given" path, which would hide a real client-side bug.
if not isinstance(loc, dict):
raise HTTPException(400, "invalid locator JSON")
try:
out = await eng().cat(path, range=rg, meta=meta, density=density, locator=loc)
except IsADirectoryError:
Expand Down
5 changes: 3 additions & 2 deletions server/python/src/mfs_server/connectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,8 +484,9 @@ async def healthcheck(self) -> HealthStatus:
# only constructs an httpx.AsyncClient at connect() time will say
# ok=True even with a 401 token until the first real call. Override
# this with a cheap round-trip (GitHub /repos/{o}/{r}, Slack
# auth.test, S3 head_bucket, etc.) when correctness matters for the
# probe UX. Most connectors do; check a given plugin's own file
# before assuming it still inherits this no-op.
return HealthStatus(ok=True)

async def introspect_for_wizard(self) -> dict[str, dict]:
Expand Down
2 changes: 2 additions & 0 deletions server/python/src/mfs_server/connectors/file/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,8 @@ async def list(self, path: str) -> list[Entry]:
# --- read ---
async def read(self, path: str, range: Optional[Range] = None) -> AsyncIterator[bytes]:
real = self._real(path)
if not real.exists():
raise FileNotFoundError(path)
if range is None:
with open(real, "rb") as f:
while chunk := f.read(65536):
Expand Down
25 changes: 25 additions & 0 deletions server/python/src/mfs_server/connectors/s3/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
Capabilities,
ConnectorPlugin,
Entry,
HealthStatus,
ObjectChange,
ObjectKind,
PathStat,
Expand Down Expand Up @@ -73,6 +74,30 @@ def _client_kwargs(self) -> dict:
kw["endpoint_url"] = self._cfg("endpoint_url")
return kw

async def healthcheck(self) -> HealthStatus:
    """Probe the configured bucket with a one-key listing and report the outcome.

    Returns a failing status when no bucket is configured, when the listing
    raises a ``ClientError`` (the detail carries the service error code), or when
    it raises a ``BotoCoreError``; otherwise returns a passing status.
    """
    # Rationale carried over from the original author: list_objects_v2(MaxKeys=1)
    # is preferred over head_bucket because head_bucket was observed to collapse
    # "bad credentials" and "bucket doesn't exist" into an undifferentiated 403,
    # whereas list_objects_v2 surfaces the actual error code (e.g.
    # InvalidAccessKeyId) at the same cost.
    from botocore.exceptions import BotoCoreError, ClientError

    bucket_name = self._bucket()
    if not bucket_name:
        return HealthStatus(ok=False, detail="no bucket configured")

    try:
        async with self._session().client("s3", **self._client_kwargs()) as client:
            await client.list_objects_v2(Bucket=bucket_name, MaxKeys=1)
    except ClientError as err:
        # Service-side rejection: report just the error code, "?" if absent.
        error_info = err.response.get("Error", {})
        error_code = error_info.get("Code", "?")
        return HealthStatus(ok=False, detail=f"{bucket_name}: {error_code}")
    except BotoCoreError as err:
        return HealthStatus(ok=False, detail=f"network error reaching {bucket_name}: {err}")
    else:
        return HealthStatus(ok=True, detail=f"bucket {bucket_name} reachable")

def object_kind_of(self, path: str) -> ObjectKind:
ext = os.path.splitext(path)[1].lower()
if ext in CODE_EXT:
Expand Down
57 changes: 56 additions & 1 deletion server/python/src/mfs_server/connectors/web/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
Capabilities,
ConnectorPlugin,
Entry,
HealthStatus,
ObjectChange,
ObjectKind,
PathStat,
Expand Down Expand Up @@ -81,7 +82,49 @@ def url_to_path(url: str) -> str:

def _allowed(self, url: str) -> bool:
domains = self._cfg("allowed_domains", []) or []
return urlparse(url).netloc in domains if domains else True
if not domains:
return True
u = urlparse(url)
# An `allowed_domains` entry with a port (e.g. "127.0.0.1:18080")
# must match the URL's exact host:port. An entry with no port (e.g.
# "example.com") matches that host on ANY port, via `.hostname`
# (port-stripped) -- previously every entry was compared against
# the port-inclusive `netloc`, so a bare-host entry never matched a
# non-default-port URL, including the connector's own seed URL.
for d in domains:
if ":" in d:
if u.netloc == d:
return True
elif u.hostname == d:
return True
return False

async def healthcheck(self) -> HealthStatus:
    """Check that the seed URLs are permitted and that the first one answers.

    Fails when ``start_urls`` is empty, when any seed is rejected by
    ``_allowed()``, when the first seed responds with HTTP status >= 400, or
    when the request raises; otherwise reports success.
    """
    # A probe that makes no request would pass even when a seed URL is excluded
    # by allowed_domains or is unreachable, so apply the _allowed() gate and then
    # issue one cheap GET against the first seed.
    import aiohttp

    seeds = list(self._cfg("start_urls", []) or [])
    if len(seeds) == 0:
        return HealthStatus(ok=False, detail="no start_urls configured")

    rejected = []
    for candidate in seeds:
        if not self._allowed(candidate):
            rejected.append(candidate)
    if rejected:
        # Only the first three offenders are listed to keep the detail short.
        summary = (
            f"allowed_domains excludes {len(rejected)} of {len(seeds)} start_urls: {rejected[:3]}"
        )
        return HealthStatus(ok=False, detail=summary)

    seed = seeds[0]
    try:
        async with aiohttp.ClientSession(headers={"User-Agent": "mfs-web/0.4"}) as http:
            async with http.get(seed, timeout=aiohttp.ClientTimeout(total=10)) as reply:
                if reply.status >= 400:
                    return HealthStatus(ok=False, detail=f"{seed} returned HTTP {reply.status}")
    except Exception as err:  # noqa: BLE001
        # Deliberately broad: any failure while probing becomes a failed status
        # instead of propagating out of the healthcheck.
        return HealthStatus(ok=False, detail=f"network error reaching {seed}: {err}")
    return HealthStatus(ok=True, detail=f"{len(seeds)} start_url(s) allowed, seed reachable")

def object_kind_of(self, path: str) -> ObjectKind:
    """Classify ``path``: names ending in ``.md`` are documents, all others directories."""
    if path.endswith(".md"):
        return "document"
    return "directory"
Expand Down Expand Up @@ -236,4 +279,16 @@ async def sync(self, opts: SyncOptions) -> AsyncIterator[ObjectChange]:
if link not in visited:
queue.append(link)
crawled += 1
if crawled == 0 and start and not any(self._allowed(u) for u in start):
# Every seed URL was excluded by allowed_domains -- this connector
# can never crawl anything as configured. That's almost never
# intentional (vs. e.g. a since-filter or transient fetch errors
# legitimately yielding zero *new* pages this run), so fail the
# job loudly instead of silently persisting an empty, "succeeded"
# connector -- previously the only way to notice was to realize
# search was returning nothing.
raise ValueError(
"0 pages crawled: every start_url was excluded by allowed_domains "
f"(start_urls={start!r}, allowed_domains={self._cfg('allowed_domains')!r})"
)
await self.state.set("pages", pages)
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,35 @@ class CredentialService:
_CONN_URI_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9+.\-]*://[^/\s:@]+:[^/\s@]+@")
_REDACTED = "<redacted: use credential_ref=env:VAR>"

# --- validate ---

@classmethod
def validate_no_plaintext_secrets(cls, value: Any, key_path: str = "") -> None:
    """Walk a config and raise if it carries an inline plaintext secret.

    Dicts are descended key by key (extending the dotted ``key_path``); lists
    are descended element by element under the list's own ``key_path``.
    Non-string and empty-string leaves are accepted, as are strings starting
    with one of ``_CRED_REF_PREFIXES``. Any other string is rejected when its
    leaf key satisfies ``is_secret_key`` or when the value matches
    ``_CONN_URI_RE`` (an inline ``user:pass@host`` connection string).

    Raises:
        ValueError: naming the offending dotted config field.
    """
    if isinstance(value, dict):
        for name, child in value.items():
            child_path = f"{key_path}.{name}" if key_path else str(name)
            cls.validate_no_plaintext_secrets(child, child_path)
        return
    if isinstance(value, list):
        for item in value:
            cls.validate_no_plaintext_secrets(item, key_path)
        return

    # Only non-empty strings can be a literal secret.
    if not isinstance(value, str) or value == "":
        return
    if value.startswith(cls._CRED_REF_PREFIXES):
        return  # a safe credential reference

    # Last dotted segment of the path is the key the value was stored under.
    leaf_key = key_path.rpartition(".")[2]
    looks_secret = cls.is_secret_key(leaf_key) or cls._CONN_URI_RE.search(value) is not None
    if not looks_secret:
        return
    raise ValueError(
        f"plaintext secret in config field {key_path!r}: use "
        "credential_ref=env:VAR or file:/abs/path, not a literal value"
    )

# --- redact ---

@classmethod
Expand All @@ -150,20 +179,24 @@ def is_secret_key(cls, key: str) -> bool:
def redact(cls, value: Any, key_is_secret: bool = False) -> Any:
    """Recursively redact raw inline secrets from a config before persistence.

    A credential reference (a string starting with one of
    ``_CRED_REF_PREFIXES``) is kept as-is. Anything else under a secret-looking
    key is replaced with ``None`` -- never a placeholder string a plugin could
    mistake for a real value. Recurses into dicts and lists so nested OAuth
    token dicts don't leak. Migrated from ``_redact_config``, except that the
    placeholder is ``None`` instead of a literal sentinel string. Callers
    should already have rejected plaintext secrets via
    ``validate_no_plaintext_secrets``; this is defense-in-depth only.

    Args:
        value: the config (or sub-value) to sanitise.
        key_is_secret: whether ``value`` sits under a secret-looking key; for
            dict children this is recomputed per key via ``is_secret_key``.

    Returns:
        A redacted copy for dicts/lists; otherwise ``value`` itself or ``None``.
    """
    if isinstance(value, dict):
        return {k: cls.redact(v, cls.is_secret_key(k)) for k, v in value.items()}
    if isinstance(value, list):
        # List elements inherit the secrecy of the key holding the list.
        return [cls.redact(v, key_is_secret) for v in value]
    if isinstance(value, str) and value.startswith(cls._CRED_REF_PREFIXES):
        return value  # a safe credential reference, keep as-is
    if key_is_secret and value not in (None, "", [], {}):
        return None
    # value-level catch: an inline connection string carrying a password leaks
    # via a field name (dsn/uri/url/connection) that doesn't look secret --
    # redact by shape.
    if isinstance(value, str) and cls._CONN_URI_RE.search(value):
        return None
    return value

# --- resolve ---
Expand All @@ -177,6 +210,14 @@ def resolve(value: Any) -> Any:
a working ref and silently fail auth. Verbatim migration of ``_resolve_ref``."""
if not isinstance(value, str):
return value
if value == CredentialService._REDACTED:
# defense-in-depth: a pre-fix row (or any other path that still produced
# the old sentinel string) must never be resolved as a literal credential.
raise ValueError(
f"credential_ref {value!r}: this is a redaction placeholder, not a "
"real credential — re-register the connector with credential_ref=env:VAR "
"or file:/abs/path"
)
if value.startswith("env:"):
name = value[4:]
if name not in os.environ:
Expand Down Expand Up @@ -367,6 +408,12 @@ def resolve_target(self, target: str) -> TargetResolution:

# --- credentials (single security entry point) ---

def validate_credentials(self, config: Any) -> None:
    """Reject plaintext secrets in a caller-supplied config.

    Delegates to the credential service's ``validate_no_plaintext_secrets``.
    Callers that register or update a connector MUST call this before
    ``redact``, i.e. before the config is ever redacted or persisted.
    """
    credential_service = self._creds
    credential_service.validate_no_plaintext_secrets(config)

def redact(self, config: Any) -> Any:
"""Recursively redact inline secrets before persistence.
ObjectRepository MUST call this before writing a connectors row."""
Expand Down
Loading