HKUDS · pancacake · Apr 7, 2026 · Apr 6, 2026 · Apr 6, 2026
diff --git a/deeptutor/tools/question/question_extractor.py b/deeptutor/tools/question/question_extractor.py
@@ -25,6 +25,35 @@
 from deeptutor.utils.json_parser import parse_json_response
 
 
+def _find_parsed_content_dir(paper_dir: Path) -> Path:
+    """Locate the MinerU output directory that contains parsed markdown artifacts.
+
+    Candidate directories are considered in priority order:
+
+    1. ``paper_dir / "auto"`` and ``paper_dir / "hybrid_auto"`` when they exist;
+    2. every other direct subdirectory of ``paper_dir``, sorted by path;
+    3. any directory under ``paper_dir`` (including ``paper_dir`` itself) that
+       holds a ``*.md`` or ``*_content_list.json`` file, sorted by path.
+
+    Args:
+        paper_dir: Root directory of one parsed paper.
+
+    Returns:
+        The first candidate that directly contains a ``*.md`` file. If none does,
+        the highest-priority candidate, or ``paper_dir`` when there are no
+        candidates at all. ``paper_dir`` is also returned unchanged when it is
+        not an existing directory.
+    """
+    if not paper_dir.is_dir():
+        # iterdir() would raise on a missing or non-directory path; hand the path
+        # back so the caller's ``if not md_files`` check can deal with it.
+        return paper_dir
+
+    preferred_dirs = [
+        preferred_dir
+        for preferred_dir in (paper_dir / name for name in ("auto", "hybrid_auto"))
+        if preferred_dir.is_dir()
+    ]
+    child_dirs = sorted(child for child in paper_dir.iterdir() if child.is_dir())
+    nested_artifact_dirs = sorted(
+        {
+            artifact.parent
+            for pattern in ("*.md", "*_content_list.json")
+            for artifact in paper_dir.rglob(pattern)
+        }
+    )
+
+    # dict.fromkeys drops duplicates while keeping the priority order above.
+    candidate_dirs = list(dict.fromkeys(preferred_dirs + child_dirs + nested_artifact_dirs))
+
+    for candidate_dir in candidate_dirs:
+        # any() stops at the first match instead of listing every markdown file.
+        if any(candidate_dir.glob("*.md")):
+            return candidate_dir
+
+    return candidate_dirs[0] if candidate_dirs else paper_dir
+
+
 def load_parsed_paper(paper_dir: Path) -> tuple[str | None, list[dict] | None, Path]:
     """
     Load MinerU-parsed exam paper files
@@ -35,9 +64,9 @@ def load_parsed_paper(paper_dir: Path) -> tuple[str | None, list[dict] | None, P
     Returns:
         (markdown_content, content_list, images_dir)
     """
-    auto_dir = paper_dir / "auto"
-    if not auto_dir.exists():
-        auto_dir = paper_dir
+    auto_dir = _find_parsed_content_dir(paper_dir)
+    if auto_dir != paper_dir:
+        print(f"📂 Using parsed content directory: {auto_dir.relative_to(paper_dir)}")
 
     md_files = list(auto_dir.glob("*.md"))
     if not md_files:

diff --git a/pyproject.toml b/pyproject.toml
@@ -40,7 +40,7 @@ deeptutor = "deeptutor_cli.main:main"
 anthropic = ["anthropic>=0.30.0"]
 dashscope = ["dashscope>=1.14.0"]
 search = ["perplexityai>=0.1.0"]
-oauth = ["oauth-cli-kit>=0.2.0"]
+oauth = ["oauth-cli-kit>=0.2.0; python_version >= '3.11'"]
 server = [
     "fastapi>=0.100.0",
     "uvicorn[standard]>=0.24.0",
@@ -52,7 +52,7 @@ all = [
     "anthropic>=0.30.0",
     "dashscope>=1.14.0",
     "perplexityai>=0.1.0",
-    "oauth-cli-kit>=0.2.0",
+    "oauth-cli-kit>=0.2.0; python_version >= '3.11'",
     "fastapi>=0.100.0",
     "uvicorn[standard]>=0.24.0",
     "websockets>=12.0",

diff --git a/requirements/cli.txt b/requirements/cli.txt
@@ -18,7 +18,7 @@ tiktoken>=0.5.0
 anthropic>=0.30.0
 dashscope>=1.14.0
 perplexityai>=0.1.0
-oauth-cli-kit>=0.1.1
+oauth-cli-kit>=0.1.1; python_version >= "3.11"
 
 # --- HTTP clients ---
 aiohttp>=3.9.4

diff --git a/tests/tools/test_question_extractor.py b/tests/tools/test_question_extractor.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import importlib.util
+import json
+from pathlib import Path
+import sys
+import types
+
+
+def _load_question_extractor_module():
+    """Load ``question_extractor.py`` from its file path with its imports stubbed.
+
+    Placeholder modules are installed in ``sys.modules`` for the project modules
+    named below while the file is executed, and the previous ``sys.modules``
+    entries are put back afterwards, whether or not loading succeeded.
+
+    Returns:
+        The executed module object, named ``question_extractor_under_test``.
+    """
+    repo_root = Path(__file__).resolve().parents[2]
+    module_path = repo_root / "deeptutor" / "tools" / "question" / "question_extractor.py"
+
+    # Fully qualified module name -> attributes the placeholder must expose.
+    stub_attributes = {
+        "deeptutor.services.config": {"get_agent_params": lambda *_args, **_kwargs: {}},
+        "deeptutor.services.llm": {"complete": lambda *_args, **_kwargs: None},
+        "deeptutor.services.llm.capabilities": {
+            "supports_response_format": lambda *_args, **_kwargs: False
+        },
+        "deeptutor.services.llm.config": {"get_llm_config": lambda: None},
+        "deeptutor.utils.json_parser": {
+            "parse_json_response": lambda *_args, **_kwargs: {}
+        },
+    }
+
+    # Remember what was registered before (None means "not present").
+    saved_entries: dict[str, types.ModuleType | None] = {
+        stub_name: sys.modules.get(stub_name) for stub_name in stub_attributes
+    }
+    for stub_name, attributes in stub_attributes.items():
+        placeholder = types.ModuleType(stub_name)
+        vars(placeholder).update(attributes)
+        sys.modules[stub_name] = placeholder
+
+    try:
+        spec = importlib.util.spec_from_file_location("question_extractor_under_test", module_path)
+        assert spec is not None and spec.loader is not None
+        loaded_module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(loaded_module)
+        return loaded_module
+    finally:
+        # Undo the stubbing: restore real modules, drop names that were absent.
+        for stub_name, previous in saved_entries.items():
+            if previous is not None:
+                sys.modules[stub_name] = previous
+            else:
+                sys.modules.pop(stub_name, None)
+
+
+def test_load_parsed_paper_supports_nested_hybrid_auto_output(tmp_path: Path) -> None:
+    """``load_parsed_paper`` reads markdown, content list and images from ``hybrid_auto``."""
+    expected_markdown = "# Exam content"
+    expected_items = [{"type": "text", "text": "Question 1"}]
+
+    # Lay out <paper>/hybrid_auto/{exam.md, exam_content_list.json, images/figure.png}.
+    exam_root = tmp_path / "mimic_exam"
+    hybrid_dir = exam_root / "hybrid_auto"
+    figures_dir = hybrid_dir / "images"
+    figures_dir.mkdir(parents=True)
+    (hybrid_dir / "exam.md").write_text(expected_markdown, encoding="utf-8")
+    (hybrid_dir / "exam_content_list.json").write_text(
+        json.dumps(expected_items, ensure_ascii=False),
+        encoding="utf-8",
+    )
+    (figures_dir / "figure.png").write_text("image-bytes", encoding="utf-8")
+
+    extractor = _load_question_extractor_module()
+    loaded_markdown, loaded_items, loaded_images_dir = extractor.load_parsed_paper(exam_root)
+
+    assert loaded_markdown == expected_markdown
+    assert loaded_items == expected_items
+    assert loaded_images_dir == figures_dir