Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions deeptutor/tools/question/question_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,35 @@
from deeptutor.utils.json_parser import parse_json_response


def _find_parsed_content_dir(paper_dir: Path) -> Path:
    """Locate the MinerU output directory that contains parsed markdown artifacts.

    Candidate directories are examined in priority order:

    1. the ``auto`` and ``hybrid_auto`` subdirectories of ``paper_dir``,
    2. every other immediate subdirectory, in sorted order,
    3. every directory at any depth (``paper_dir`` itself included) that
       holds a ``*.md`` or ``*_content_list.json`` entry, in sorted order.

    Args:
        paper_dir: Root directory of one parsed paper.

    Returns:
        The first candidate that contains at least one ``*.md`` entry. When
        no candidate has markdown, the highest-priority candidate is
        returned. ``paper_dir`` itself is returned when there are no
        candidates at all, or when ``paper_dir`` is not an existing
        directory.
    """
    # A missing or non-directory root cannot be scanned (iterdir() would
    # raise), so hand the path back unchanged and let the caller deal with
    # the absence of markdown files.
    if not paper_dir.is_dir():
        return paper_dir

    ordered: list[Path] = [
        preferred_dir
        for preferred_dir in (paper_dir / name for name in ("auto", "hybrid_auto"))
        if preferred_dir.is_dir()
    ]
    ordered.extend(sorted(child for child in paper_dir.iterdir() if child.is_dir()))
    ordered.extend(
        sorted(
            {
                artifact.parent
                for pattern in ("*.md", "*_content_list.json")
                for artifact in paper_dir.rglob(pattern)
            }
        )
    )

    # dict.fromkeys drops duplicates while keeping the first (highest
    # priority) occurrence of each directory.
    candidate_dirs = list(dict.fromkeys(ordered))

    for candidate_dir in candidate_dirs:
        # any() stops at the first match instead of listing every file.
        if any(candidate_dir.glob("*.md")):
            return candidate_dir

    return candidate_dirs[0] if candidate_dirs else paper_dir


def load_parsed_paper(paper_dir: Path) -> tuple[str | None, list[dict] | None, Path]:
"""
Load MinerU-parsed exam paper files
Expand All @@ -35,9 +64,9 @@ def load_parsed_paper(paper_dir: Path) -> tuple[str | None, list[dict] | None, P
Returns:
(markdown_content, content_list, images_dir)
"""
auto_dir = paper_dir / "auto"
if not auto_dir.exists():
auto_dir = paper_dir
auto_dir = _find_parsed_content_dir(paper_dir)
if auto_dir != paper_dir:
print(f"📂 Using parsed content directory: {auto_dir.relative_to(paper_dir)}")

md_files = list(auto_dir.glob("*.md"))
if not md_files:
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ deeptutor = "deeptutor_cli.main:main"
anthropic = ["anthropic>=0.30.0"]
dashscope = ["dashscope>=1.14.0"]
search = ["perplexityai>=0.1.0"]
oauth = ["oauth-cli-kit>=0.2.0"]
oauth = ["oauth-cli-kit>=0.2.0; python_version >= '3.11'"]
server = [
"fastapi>=0.100.0",
"uvicorn[standard]>=0.24.0",
Expand All @@ -52,7 +52,7 @@ all = [
"anthropic>=0.30.0",
"dashscope>=1.14.0",
"perplexityai>=0.1.0",
"oauth-cli-kit>=0.2.0",
"oauth-cli-kit>=0.2.0; python_version >= '3.11'",
"fastapi>=0.100.0",
"uvicorn[standard]>=0.24.0",
"websockets>=12.0",
Expand Down
2 changes: 1 addition & 1 deletion requirements/cli.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ tiktoken>=0.5.0
anthropic>=0.30.0
dashscope>=1.14.0
perplexityai>=0.1.0
oauth-cli-kit>=0.1.1
oauth-cli-kit>=0.1.1; python_version >= "3.11"

# --- HTTP clients ---
aiohttp>=3.9.4
Expand Down
77 changes: 77 additions & 0 deletions tests/tools/test_question_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from __future__ import annotations

import importlib.util
import json
from pathlib import Path
import sys
import types


def _load_question_extractor_module():
    """Load ``question_extractor.py`` from its file path with stubbed dependencies.

    Placeholder modules are installed in ``sys.modules`` under the names
    listed below before the source file is executed, and the previous
    ``sys.modules`` entries are put back afterwards, whether or not the
    load succeeds.

    Returns:
        The module object created from ``question_extractor.py``.
    """
    repo_root = Path(__file__).resolve().parents[2]
    source_file = repo_root / "deeptutor" / "tools" / "question" / "question_extractor.py"

    # Module name -> attributes the placeholder module exposes.
    fakes = {
        "deeptutor.services.config": {"get_agent_params": lambda *_args, **_kwargs: {}},
        "deeptutor.services.llm": {"complete": lambda *_args, **_kwargs: None},
        "deeptutor.services.llm.capabilities": {
            "supports_response_format": lambda *_args, **_kwargs: False
        },
        "deeptutor.services.llm.config": {"get_llm_config": lambda: None},
        "deeptutor.utils.json_parser": {
            "parse_json_response": lambda *_args, **_kwargs: {}
        },
    }

    # Remember what was registered before so it can be restored afterwards;
    # None marks a name that had no entry.
    saved: dict[str, types.ModuleType | None] = {}
    for name, members in fakes.items():
        saved[name] = sys.modules.get(name)
        placeholder = types.ModuleType(name)
        vars(placeholder).update(members)
        sys.modules[name] = placeholder

    try:
        spec = importlib.util.spec_from_file_location("question_extractor_under_test", source_file)
        assert spec and spec.loader
        loaded = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(loaded)
        return loaded
    finally:
        for name, previous in saved.items():
            if previous is not None:
                sys.modules[name] = previous
            else:
                sys.modules.pop(name, None)


def test_load_parsed_paper_supports_nested_hybrid_auto_output(tmp_path: Path) -> None:
    """Check that ``load_parsed_paper`` reads artifacts from a ``hybrid_auto`` subdirectory."""
    extractor = _load_question_extractor_module()

    expected_markdown = "# Exam content"
    expected_items = [{"type": "text", "text": "Question 1"}]

    # Lay out <paper>/hybrid_auto/{exam.md, exam_content_list.json, images/figure.png}.
    paper_dir = tmp_path / "mimic_exam"
    parsed_dir = paper_dir / "hybrid_auto"
    images_dir = parsed_dir / "images"
    images_dir.mkdir(parents=True)

    (parsed_dir / "exam.md").write_text(expected_markdown, encoding="utf-8")
    (parsed_dir / "exam_content_list.json").write_text(
        json.dumps(expected_items, ensure_ascii=False),
        encoding="utf-8",
    )
    (images_dir / "figure.png").write_text("image-bytes", encoding="utf-8")

    loaded = extractor.load_parsed_paper(paper_dir)
    markdown_content, content_list, discovered_images_dir = loaded

    assert markdown_content == expected_markdown
    assert content_list == expected_items
    assert discovered_images_dir == images_dir
Loading