add autocheck synthetic path accessibility

CiaoHe · CiaoHe · commit 4c00b251a4c7 · 2025-11-13T17:57:05.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -52,3 +52,4 @@ submit*.sh
 qual.sh
 
 collectors/
+data/compounds
diff --git a/README_accessibility.md b/README_accessibility.md
@@ -0,0 +1,86 @@
+# 可购性检查 CLI 使用说明
+
+本说明介绍如何使用本仓库内的可购性检查工具，基于两部分：
+- `scripts/check_accessible.py`：单个 SMILES 可购性检查（本地库存优先，其次 PubChem）。
+- `inference.py`：命令行工具（CLI）。可对模型预测得到的合成路径（paths）进行筛选：若存在“全可购”路径优先保留；若不存在，则保留“不可购节点最少”的路径集合。也可先调用模型生成 paths 再筛选。
+
+适用场景：你已有模型输出的 `path_string` 列表，想快速过滤出所有节点都可购的路线；或希望一步生成+筛选。
+
+目录定位（默认）：
+- 可购库存清单：`data/compounds/buyables-stock.txt`
+- 可视化/模型配置：`data/configs/dms_dictionary.yaml`
+- 模型权重目录：`data/checkpoints/`
+
+注意：仅“检查模式”（只用 `--paths-file`）不需要安装 torch；只有选择“先生成再检查”时才会加载模型相关依赖。输入 paths 支持两种形式：
+- 字符串形式的 path_string，按 Python 字面量解析，单引号或双引号均可（如 "{'smiles':'...','children':[...]}"）
+- 已解析的字典对象路径（形如 `{smiles, children}` 的树）
+
+一、快速开始（仅检查已有 paths）
+- 文件格式支持：
+  - JSON 数组：元素可以是 `path_string` 字符串，或已解析好的路径字典对象
+  - 行分隔文本：每行一个 `path_string` 字符串
+
+示例：
+- 只检查根节点以下的所有节点（中间体与起始物料），不要求根节点（产物）可购：
+  - `python3 inference.py --paths-file my_paths.txt --children-only --output filtered.json`
+- 要求路径中包含根节点在内所有节点都可购：
+  - `python3 inference.py --paths-file my_paths.txt --include-root --output filtered.json`
+
+可选参数：
+- `--stock-file data/compounds/buyables-stock.txt` 覆盖默认库存清单
+- `--sleep 0.2` PubChem 查询限速（秒），默认 0.2
+
+二、生成 + 检查（需要模型与 checkpoint）
+示例：
+- `python3 inference.py --target "CNCc1cc(-c2ccccc2F)n(S(=O)(=O)c2cccnc2)c1" --model explorer --beam-size 32 --children-only --output filtered.json`
+- 可选：`--n-steps`, `--starting-material`, `--ckpt-dir data/checkpoints`, `--config data/configs/dms_dictionary.yaml`
+
+提示：脚本会在运行时自动将 `./src` 加入 `sys.path`，可直接以源码方式调用模块；若使用自定义安装环境，确保 `directmultistep` 模块可被导入。
+
+三、筛选策略与输出说明（JSON）
+- 筛选策略：
+  - 若存在“整条路径所有被检查节点都可购”的路径，优先保留这些（all_access）。
+  - 若不存在全可购路径，则保留“不可购节点数最少”的路径集合（min_inaccessible）。若并列，全部保留。
+- 输出字段：
+  - `total`: 候选路径数量
+  - `accessible`: 全可购路径数量（仅统计 all_access）
+  - `include_root`: 是否要求根节点一并检查
+  - `selection`: `all_access` 或 `min_inaccessible`
+  - `min_not_accessible`: 若为 `min_inaccessible` 策略，给出最小不可购节点数；否则为 null
+  - `paths`: 选中的路径列表（JSON 结构：每条路径为 `{smiles, children}` 的字典树，不是字符串）
+  - `statuses`: 每个出现过的 SMILES 的判定结果，形如 `{"status": "purchasable" | "no_vendors" | "not_found" | "error", "detail": "..."}`
+  - `path_reports`: 每条候选路径的诊断：`{"path": <JSON路径>, "not_accessible": <数量>, "not_accessible_smiles": [..]}`
+
+四、判定逻辑与来源
+- 先查本地清单（精确匹配）：`data/compounds/buyables-stock.txt`
+- 不在本地清单时，调用 PubChem PUG/PUG View：
+  - SMILES → CID；若 CID == 0 或未找到，判为 `not_found`
+  - CID → Chemical Vendors；若存在并为真，判为 `purchasable`；否则 `no_vendors`
+- 为减少对 PubChem 的压力，在网络查询间加入 `--sleep` 延迟。
+
+五、返回码（exit code）
+- 0：至少有一条路径被选中（全可购或“最少不可购”）
+- 3：没有路径被选中
+- 2：输入错误或文件不存在等问题
+
+六、常见用法小抄
+- 检查 children-only（最常用）：
+  - `python3 inference.py --paths-file my_paths.txt --children-only --output filtered.json`
+- 使用自定义库存清单：
+  - `python3 inference.py --paths-file my_paths.txt --children-only --stock-file path/to/stock.txt`
+- 生成+检查（含根节点）：
+  - `python3 inference.py --target "<SMILES>" --model explorer --include-root --output filtered.json`
+  - 若不存在全可购路径，输出会自动切换到 `min_inaccessible` 策略并保留不可购最少的路径集合。
+
+七、局限与建议
+- PubChem 作为可购性代理指标对常见试剂较准，但对盐型/互变异构体/异常表示可能返回 `not_found` 或 `no_vendors`。
+- 本地清单为精确字符串匹配；若需 SMILES 规范化后匹配，可扩展脚本进行标准化。
+- 批量大规模查询时适当增大 `--sleep`，避免过快触发限速。
+
+相关脚本
+- `inference.py`：CLI 主脚本
+- `scripts/check_accessible.py`：单个 SMILES 判定逻辑（可独立运行）
+
+问题反馈或扩展（建议）
+- 需要导出“不可购/未知”路径及不可购节点详细清单（CSV/JSON）？
+- 需要只检查叶子节点（起始原料）/只检查 children/包含根节点的不同策略？目前已支持 `--children-only` 与 `--include-root`。
diff --git a/inference.py b/inference.py
@@ -0,0 +1,229 @@
+# Allow running without installation by adding ./src to sys.path (done lazily in main before model import)
+
+from pathlib import Path
+from typing import Any, Dict, List, Sequence, Set, Tuple
+
+import argparse
+import json
+import ast
+import requests
+import sys
+import time
+
+# Reuse the per-SMILES checker from scripts/check_accessible.py
+from scripts.check_accessible import check_accessible  # type: ignore
+
+
+def _iter_all_smiles(node: Dict[str, Any]) -> List[str]:
+    """Collect every SMILES string in a route tree, parent before children.
+
+    A node contributes its ``"smiles"`` value only when that value is a
+    non-empty string. ``"children"`` may be missing, ``None`` or empty.
+    """
+    collected: List[str] = []
+
+    def _visit(current: Dict[str, Any]) -> None:
+        value = current.get("smiles")
+        if isinstance(value, str) and value:
+            collected.append(value)
+        # `or []` also covers an explicit ``"children": None``.
+        for sub in current.get("children", []) or []:
+            _visit(sub)
+
+    _visit(node)
+    return collected
+
+
+def _load_stock(stock_file: Path) -> Set[str]:
+    """Read the local purchasable list into a set of whitespace-stripped lines.
+
+    Blank lines and lines starting with ``#`` are ignored. When ``stock_file``
+    is not an existing regular file the result is an empty set.
+    """
+    if not stock_file.is_file():
+        return set()
+    with stock_file.open("r", encoding="utf-8") as handle:
+        entries = (raw.strip() for raw in handle)
+        return {entry for entry in entries if entry and not entry.startswith("#")}
+
+
+def filter_accessible_paths(
+    paths: Sequence[Any], stock_file: Path, include_root: bool = True, sleep: float = 0.2
+) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, str]], List[Dict[str, Any]]]:
+    """Split candidate routes into fully purchasable ones and per-route reports.
+
+    Each SMILES is looked up in the local stock file first and otherwise passed
+    to ``check_accessible()``. Results are cached, so a SMILES is resolved once.
+
+    Args:
+        paths: Candidate routes. Each item is a ``{smiles, children}`` mapping, or
+            a string holding such a tree as a Python literal or a JSON document.
+            Items that cannot be turned into a dict are skipped.
+        stock_file: Text file of purchasable SMILES, matched exactly.
+        include_root: When False the root node's own SMILES is not checked; every
+            node below the root still is.
+        sleep: Seconds to wait after each ``check_accessible()`` call.
+
+    Returns:
+        A 3-tuple ``(filtered, statuses, reports)``: the route trees whose checked
+        nodes are all purchasable, a ``SMILES -> {"status", "detail"}`` map for
+        every SMILES that was checked, and one report per parsed route with the
+        keys ``path``, ``not_accessible`` and ``not_accessible_smiles``.
+    """
+    stock = _load_stock(stock_file)
+    status_cache: Dict[str, Dict[str, str]] = {}
+    filtered: List[Dict[str, Any]] = []
+    reports: List[Dict[str, Any]] = []
+    session = requests.Session()
+
+    def is_accessible_smiles(smi: str) -> bool:
+        cached = status_cache.get(smi)
+        if cached is not None:
+            return cached["status"] == "purchasable"
+        if smi in stock:
+            status_cache[smi] = {"status": "purchasable", "detail": "found in stock file"}
+            return True
+        status, detail = check_accessible(smi, session)
+        # Be polite to PubChem: pause only after a real remote lookup.
+        time.sleep(sleep)
+        status_cache[smi] = {"status": status, "detail": detail}
+        return status == "purchasable"
+
+    try:
+        for path_item in paths:
+            node: Any = path_item
+            if isinstance(path_item, str):
+                try:
+                    node = ast.literal_eval(path_item)
+                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
+                    # JSON-only tokens (null/true/false) are not Python literals,
+                    # so give the JSON parser a chance before giving up.
+                    try:
+                        node = json.loads(path_item)
+                    except (ValueError, RecursionError):
+                        continue  # skip malformed path strings
+            elif not isinstance(path_item, dict):
+                # Best effort for unknown objects exposing a mapping-like API;
+                # their protocol methods may raise anything, so stay broad here.
+                try:
+                    node = dict(path_item)  # type: ignore[arg-type]
+                except Exception:
+                    continue
+            # A string may parse to a list, number or str; those are not routes.
+            if not isinstance(node, dict):
+                continue
+
+            if include_root:
+                smiles_chain = _iter_all_smiles(node)
+            else:
+                # Walk the children directly instead of slicing off the first
+                # collected SMILES: the root may not have contributed one.
+                smiles_chain = []
+                for child in node.get("children", []) or []:
+                    smiles_chain.extend(_iter_all_smiles(child))
+
+            bad_nodes: List[str] = [
+                smi for smi in smiles_chain if not is_accessible_smiles(smi)
+            ]
+            if not bad_nodes:
+                filtered.append(node)
+            # Collect a per-path report
+            reports.append({
+                "path": node,
+                "not_accessible": len(bad_nodes),
+                "not_accessible_smiles": bad_nodes,
+            })
+    finally:
+        # Release pooled HTTP connections even when a lookup raises.
+        session.close()
+    return filtered, status_cache, reports
+
+
+def _load_paths_from_file(p: Path) -> List[Any]:
+    """Load candidate paths from a text file.
+
+    The stripped file content is first tried as a JSON document. If it is a
+    list whose elements are all strings or dicts, that list is returned as is.
+    Otherwise every non-blank line of the content is returned as one string.
+    An empty (or whitespace-only) file yields an empty list.
+    """
+    content = p.read_text(encoding="utf-8").strip()
+    if content == "":
+        return []
+    try:
+        parsed = json.loads(content)
+    except Exception:
+        parsed = None
+    if isinstance(parsed, list) and all(isinstance(item, (str, dict)) for item in parsed):
+        return parsed
+    # Fallback: newline-delimited strings
+    return [row for row in content.splitlines() if row.strip()]
+
+
+def main() -> int:
+    """Command-line entry point: obtain candidate routes, filter them, write a JSON report.
+
+    Routes are read from ``--paths-file`` when it is given; otherwise they are
+    generated for ``--target`` via ``directmultistep.generate.generate_routes``.
+    Fully accessible routes are kept; if there are none, the routes with the
+    fewest inaccessible nodes are kept instead.
+
+    Returns:
+        0 when at least one route was selected, 3 when none was, and 2 when the
+        paths file does not exist or neither ``--paths-file`` nor ``--target``
+        was supplied.
+    """
+    data_root = Path("./data")
+
+    parser = argparse.ArgumentParser(description="Generate routes (optional) and filter paths where all nodes are purchasable.")
+
+    source_group = parser.add_argument_group("source of paths")
+    source_group.add_argument("--paths-file", type=Path, default=None, help="If provided, read candidate paths from file (JSON array or newline separated)")
+    source_group.add_argument("--target", default=None, help="Target SMILES. Required if not using --paths-file")
+    source_group.add_argument("--n-steps", type=int, default=None, help="Number of steps (None lets the model decide)")
+    source_group.add_argument("--starting-material", default=None, help="Optional starting material SMILES")
+    source_group.add_argument("--model", default="explorer", help="Model name or checkpoint; defaults to explorer")
+    source_group.add_argument("--beam-size", type=int, default=32, help="Beam size (default: 32)")
+    source_group.add_argument("--ckpt-dir", type=Path, default=data_root / "checkpoints", help="Checkpoint directory (default: data/checkpoints)")
+    source_group.add_argument("--config", type=Path, default=data_root / "configs" / "dms_dictionary.yaml", help="Config path (default: data/configs/dms_dictionary.yaml)")
+
+    filter_group = parser.add_argument_group("filter options")
+    filter_group.add_argument("--stock-file", type=Path, default=data_root / "compounds" / "buyables-stock.txt", help="Local purchasable SMILES list")
+    root_mode = filter_group.add_mutually_exclusive_group()
+    root_mode.add_argument("--include-root", action="store_true", help="Require root (product) to be purchasable")
+    root_mode.add_argument("--children-only", action="store_true", help="Only require children (reactants) to be purchasable")
+    filter_group.add_argument("--sleep", type=float, default=0.2, help="Delay between PubChem requests (default: 0.2s)")
+
+    parser.add_argument("--output", type=Path, default=data_root / "accessible_paths_from_inference.json", help="Output JSON file path")
+
+    args = parser.parse_args()
+
+    # The root is checked unless --children-only was given; --include-root
+    # merely spells out the default.
+    include_root = not args.children_only
+
+    # Load or generate candidate paths
+    if args.paths_file is None:
+        if not args.target:
+            print("Error: --target is required when --paths-file is not provided", file=sys.stderr)
+            return 2
+        # Lazy import to avoid requiring torch when only checking paths.
+        # Also add ./src to sys.path if available.
+        src_dir = Path(__file__).resolve().parent / "src"
+        if (src_dir / "directmultistep").exists():
+            sys.path.insert(0, str(src_dir))
+        from directmultistep.generate import generate_routes  # type: ignore
+        paths = generate_routes(
+            target=args.target,
+            n_steps=args.n_steps,
+            starting_material=args.starting_material,
+            model=args.model,
+            beam_size=args.beam_size,
+            config_path=args.config,
+            ckpt_dir=args.ckpt_dir,
+        )
+    else:
+        if not args.paths_file.is_file():
+            print(f"Error: paths file not found: {args.paths_file}", file=sys.stderr)
+            return 2
+        paths = _load_paths_from_file(args.paths_file)
+
+    # Filter by accessibility
+    filtered_paths, statuses, reports = filter_accessible_paths(
+        paths=paths,
+        stock_file=args.stock_file,
+        include_root=include_root,
+        sleep=args.sleep,
+    )
+
+    selection_mode = "all_access"
+    min_inacc = None
+    selected_paths: List[Dict[str, Any]] = filtered_paths
+    # If none fully accessible, pick those with minimal number of not-accessible nodes
+    if not filtered_paths and reports:
+        min_inacc = min(report["not_accessible"] for report in reports)
+        selection_mode = "min_inaccessible"
+        selected_paths = [report["path"] for report in reports if report["not_accessible"] == min_inacc]
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    summary = {
+        "total": len(paths),
+        "accessible": len(filtered_paths),
+        "include_root": include_root,
+        "selection": selection_mode,
+        "min_not_accessible": min_inacc,
+        "paths": selected_paths,
+        "statuses": statuses,
+        "path_reports": reports,
+    }
+    with args.output.open("w", encoding="utf-8") as out:
+        json.dump(summary, out, indent=2, ensure_ascii=True)
+    print(
+        f"Candidates: {len(paths)}; fully-accessible ({'including root' if include_root else 'children only'}): {len(filtered_paths)}; "
+        f"selected: {len(selected_paths)} (mode={selection_mode}, min_not_accessible={min_inacc}); saved: {args.output}"
+    )
+    if selected_paths:
+        return 0
+    return 3
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/check_accessible.py b/scripts/check_accessible.py

Original file line number	Diff line number	Diff line change
`@@ -52,3 +52,4 @@ submit*.sh`
`52`	`52`	`qual.sh`
`53`	`53`
`54`	`54`	`collectors/`
	`55`	`+data/compounds`