ModelEngine-Group · yafengzhang2025 · May 25, 2026
diff --git a/runtime/datamate-python/tests/conftest.py b/runtime/datamate-python/tests/conftest.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _register_namespace(module_name: str, module_path: Path) -> None:
+    """Register a stub namespace package in ``sys.modules``.
+
+    The stub only carries ``__path__``, so submodules of ``module_name`` are
+    located under ``module_path`` without executing the real package's
+    ``__init__.py``. A module already registered under ``module_name`` is
+    left untouched.
+
+    Args:
+        module_name: Dotted name to register, e.g. ``"app.module"``.
+        module_path: Directory searched for the package's submodules.
+    """
+    namespace_pkg = ModuleType(module_name)
+    namespace_pkg.__path__ = [str(module_path)]  # type: ignore[attr-defined]
+    registered = sys.modules.setdefault(module_name, namespace_pkg)
+
+    # The import system returns early for names already in ``sys.modules`` and
+    # so never binds such a module onto its parent. Bind it here so that
+    # attribute-style traversal (e.g. ``mock.patch("pkg.sub.attr")``) can
+    # reach the stub whenever the parent is already loaded.
+    parent_name, _, child_name = module_name.rpartition(".")
+    parent = sys.modules.get(parent_name) if parent_name else None
+    if parent is not None and not hasattr(parent, child_name):
+        setattr(parent, child_name, registered)
+
+
+def pytest_sessionstart(session) -> None:
+    """Register stub packages for ``app.module`` at session start.
+
+    Keeps test imports of ``app.module.*`` from triggering the heavy
+    dependency loading in ``app/module/__init__.py``.
+    """
+    module_root = Path(__file__).resolve().parents[1] / "app" / "module"
+
+    # Package locations relative to ``app/module``; parents precede children.
+    stub_packages = (
+        (),
+        ("cleaning",),
+        ("cleaning", "service"),
+        ("rag",),
+        ("rag", "service"),
+        ("rag", "service", "common"),
+    )
+    for parts in stub_packages:
+        dotted_name = ".".join(("app", "module", *parts))
+        _register_namespace(dotted_name, module_root.joinpath(*parts))
diff --git a/runtime/datamate-python/tests/test_dataset_service.py b/runtime/datamate-python/tests/test_dataset_service.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+from unittest.mock import MagicMock, AsyncMock
+import sys
+import os
+
+# Ensure the runtime/datamate-python directory is on sys.path (inserted first).
+TEST_DIR = os.path.split(os.path.abspath(__file__))[0]
+APP_DIR = os.path.split(TEST_DIR)[0]
+sys.path[:0] = [APP_DIR]
+
+from app.module.dataset.service.service import Service
+from app.module.dataset.schema import DatasetResponse, PagedDatasetFileResponse, DatasetFileResponse
+from app.db.models import Dataset, DatasetFiles
+
+
+class TestDatasetService(unittest.IsolatedAsyncioTestCase):
+    """Unit tests for the dataset ``Service`` running against a mocked DB session."""
+
+    def setUp(self):
+        # Stand-in for the AsyncSession: the methods listed below are
+        # AsyncMocks, everything else (e.g. ``add``) stays a plain MagicMock.
+        session = MagicMock()
+        for awaited_method in ("execute", "commit", "rollback", "flush"):
+            setattr(session, awaited_method, AsyncMock())
+        self.mock_db = session
+
+        # Service under test, wired to the mocked session.
+        self.service = Service(self.mock_db)
+
+    def _stub_scalar_result(self, value):
+        """Make ``db.execute`` resolve to a result whose ``scalar_one_or_none()`` returns ``value``."""
+        query_result = MagicMock()
+        query_result.scalar_one_or_none.return_value = value
+        self.mock_db.execute.return_value = query_result
+
+    async def test_get_dataset_success(self):
+        """Fetching an existing dataset returns its details."""
+        # Row the mocked query hands back.
+        stored = Dataset(
+            id="test-dataset-id",
+            name="Test Dataset",
+            description="A test description",
+            dataset_type="TEXT",
+            status="DRAFT",
+            file_count=5,
+            size_bytes=1024,
+            created_by="system"
+        )
+        self._stub_scalar_result(stored)
+
+        # Exercise the service.
+        response = await self.service.get_dataset("test-dataset-id")
+
+        # The response must mirror the stored row.
+        self.assertIsNotNone(response)
+        self.assertEqual(response.id, "test-dataset-id")
+        self.assertEqual(response.name, "Test Dataset")
+        self.assertEqual(response.description, "A test description")
+        self.assertEqual(response.datasetType, "TEXT")
+        self.assertEqual(response.status, "DRAFT")
+        self.assertEqual(response.fileCount, 5)
+        self.assertEqual(response.totalSize, 1024)
+
+    async def test_get_dataset_not_found(self):
+        """Fetching a dataset that does not exist returns None."""
+        # The mocked query finds no row.
+        self._stub_scalar_result(None)
+
+        response = await self.service.get_dataset("non-existent-id")
+        self.assertIsNone(response)
+
+    async def test_create_dataset_success(self):
+        """Happy path for creating a dataset."""
+        # The lookup finds nothing (presumably the name-uniqueness check —
+        # confirm against the service implementation).
+        self._stub_scalar_result(None)
+
+        # Call the creation service.
+        payload = {
+            "name": "New Dataset",
+            "dataset_type": "IMAGE",
+            "description": "Testing create_dataset API",
+            "status": "PUBLISHED",
+        }
+        response = await self.service.create_dataset(**payload)
+
+        # The response must echo the requested values.
+        self.assertIsNotNone(response)
+        self.assertEqual(response.name, "New Dataset")
+        self.assertEqual(response.datasetType, "IMAGE")
+        self.assertEqual(response.description, "Testing create_dataset API")
+        self.assertEqual(response.status, "PUBLISHED")
+
+        # db.add and db.commit must each have been called exactly once.
+        self.mock_db.add.assert_called_once()
+        self.mock_db.commit.assert_called_once()
+
+    async def test_create_dataset_duplicated_name(self):
+        """Creating a dataset whose name is already taken raises."""
+        # The lookup returns a conflicting, already-existing dataset.
+        conflicting = Dataset(
+            id="existing-id",
+            name="Existing Dataset"
+        )
+        self._stub_scalar_result(conflicting)
+
+        # The raised exception's message must contain the expected phrase.
+        with self.assertRaises(Exception) as raised:
+            await self.service.create_dataset(
+                name="Existing Dataset",
+                dataset_type="AUDIO"
+            )
+        self.assertIn("already exists", str(raised.exception))
+
+        # Nothing may have been committed.
+        self.mock_db.commit.assert_not_called()
+
+
+# Run the tests with the unittest runner when this file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/runtime/datamate-python/tests/test_module_annotation.py b/runtime/datamate-python/tests/test_module_annotation.py
@@ -0,0 +1,132 @@
+from app.module.annotation.utils.config_validator import LabelStudioConfigValidator
+import pytest
+
+
+def test_validate_xml_success_with_object_and_control() -> None:
+    """A <View> holding an Image object and a Choices control validates cleanly."""
+    config_xml = """<View>
+    <Image name=\"image\" value=\"$image\"/>
+    <Choices name=\"label\" toName=\"image\">
+        <Choice value=\"Cat\"/>
+        <Choice value=\"Dog\"/>
+    </Choices>
+</View>"""
+
+    is_valid, message = LabelStudioConfigValidator.validate_xml(config_xml)
+
+    # Success is reported as (True, None).
+    assert is_valid is True
+    assert message is None
+
+
+def test_validate_xml_fails_when_no_controls() -> None:
+    """A config with only an Image tag is rejected for lacking annotation controls."""
+    config_xml = """<View><Image name=\"image\" value=\"$image\"/></View>"""
+
+    is_valid, message = LabelStudioConfigValidator.validate_xml(config_xml)
+
+    assert is_valid is False
+    # ``message or ""`` keeps the membership test safe if no message is returned.
+    assert "No annotation controls" in (message or "")
+
+
+def test_validate_configuration_json_rejects_unknown_object_reference() -> None:
+    """A label whose ``toName`` matches no declared object is rejected."""
+    dangling_label = {
+        "fromName": "sentiment",
+        "toName": "missing_object",
+        "type": "Choices",
+        "options": ["positive", "negative"],
+    }
+    declared_object = {"name": "text", "type": "Text", "value": "$text"}
+    config = {"labels": [dangling_label], "objects": [declared_object]}
+
+    is_valid, message = LabelStudioConfigValidator.validate_configuration_json(config)
+
+    assert is_valid is False
+    assert "unknown object" in (message or "")
+
+
+def test_extract_label_values() -> None:
+    """Choice values are returned keyed by the name of their Choices control."""
+    config_xml = """<View>
+    <Text name=\"text\" value=\"$text\"/>
+    <Choices name=\"sentiment\" toName=\"text\">
+        <Choice value=\"positive\"/>
+        <Choice value=\"negative\"/>
+    </Choices>
+</View>"""
+    expected = {"sentiment": ["positive", "negative"]}
+
+    extracted = LabelStudioConfigValidator.extract_label_values(config_xml)
+
+    assert extracted == expected
+
+
+def test_validate_xml_rejects_invalid_root() -> None:
+    """A document whose root element is not <View> is rejected."""
+    config_xml = """<Root><Text name=\"text\" value=\"$text\"/></Root>"""
+
+    is_valid, message = LabelStudioConfigValidator.validate_xml(config_xml)
+
+    assert is_valid is False
+    assert "Root element must be <View>" in (message or "")
+
+
+def test_validate_configuration_json_requires_labels() -> None:
+    """A configuration without a 'labels' key is rejected."""
+    config_without_labels = {"objects": []}
+
+    is_valid, message = LabelStudioConfigValidator.validate_configuration_json(config_without_labels)
+
+    assert is_valid is False
+    assert "Missing 'labels' field" in (message or "")
+
+
+def test_validate_xml_fails_for_invalid_xml() -> None:
+    """Malformed XML (an unclosed <Text> tag) is reported as a parse error."""
+    malformed_xml = "<View><Text></View>"
+
+    is_valid, message = LabelStudioConfigValidator.validate_xml(malformed_xml)
+
+    assert is_valid is False
+    assert "XML parse error" in (message or "")
+
+
+@pytest.mark.parametrize(
+    "label,error_text",
+    [
+        ({"toName": "obj", "type": "Choices", "options": ["A"]}, "fromName"),
+        ({"fromName": "lbl", "type": "Choices", "options": ["A"]}, "toName"),
+        ({"fromName": "lbl", "toName": "obj", "options": ["A"]}, "type"),
+    ],
+)
+def test_validate_label_definition_required_fields(label, error_text: str) -> None:
+    """Each case omits one key; the error must mention the omitted key."""
+    is_valid, message = LabelStudioConfigValidator._validate_label_definition(label)
+
+    assert is_valid is False
+    assert error_text in (message or "")
+
+
+def test_validate_label_definition_rejects_unsupported_type() -> None:
+    """A label with the control type "NotSupported" is rejected."""
+    unsupported_label = {"fromName": "x", "toName": "obj", "type": "NotSupported"}
+
+    is_valid, message = LabelStudioConfigValidator._validate_label_definition(unsupported_label)
+
+    assert is_valid is False
+    assert "Unsupported control type" in (message or "")
+
+
+def test_validate_object_definition_rejects_value_without_dollar_prefix() -> None:
+    """An object whose ``value`` lacks the leading '$' is rejected."""
+    object_definition = {"name": "txt", "type": "Text", "value": "text"}
+
+    is_valid, message = LabelStudioConfigValidator._validate_object_definition(object_definition)
+
+    assert is_valid is False
+    assert "must start with '$'" in (message or "")
+
+
+def test_extract_label_values_returns_empty_on_invalid_xml() -> None:
+    """Unparseable XML yields an empty mapping."""
+    malformed_xml = "<broken"
+    assert LabelStudioConfigValidator.extract_label_values(malformed_xml) == {}
+
+
+def test_validate_xml_requires_control_name_and_to_name() -> None:
+    """A Choices control without a ``name`` attribute is rejected."""
+    config_xml = """<View>
+    <Text name=\"text\" value=\"$text\"/>
+    <Choices toName=\"text\"><Choice value=\"A\"/></Choices>
+</View>"""
+
+    is_valid, message = LabelStudioConfigValidator.validate_xml(config_xml)
+
+    assert is_valid is False
+    assert "Missing 'name' attribute" in (message or "")
+
diff --git a/runtime/datamate-python/tests/test_module_cleaning.py b/runtime/datamate-python/tests/test_module_cleaning.py
@@ -0,0 +1,99 @@
+import pytest
+
+from app.core.exception import BusinessError
+from app.module.cleaning.schema.cleaning import OperatorInstanceDto
+from app.module.cleaning.service.cleaning_task_validator import CleaningTaskValidator
+from app.module.operator.constants import CATEGORY_DATAMATE_ID, CATEGORY_DATA_JUICER_ID
+
+
+def _op(
+    op_id: str,
+    inputs: str | None,
+    outputs: str | None,
+    categories: list[str] | None = None,
+) -> OperatorInstanceDto:
+    """Build an ``OperatorInstanceDto`` from an id, input/output types and categories."""
+    fields = {"id": op_id, "inputs": inputs, "outputs": outputs, "categories": categories}
+    return OperatorInstanceDto(**fields)
+
+
+def test_check_input_and_output_passes_with_multimodal() -> None:
+    """A "multimodal" output followed by an "image" input must be accepted."""
+    producer = _op("a", "text", "multimodal")
+    consumer = _op("b", "image", "text")
+
+    # Passing means the call returns without raising.
+    CleaningTaskValidator.check_input_and_output([producer, consumer])
+
+
+def test_check_input_and_output_raises_on_type_mismatch() -> None:
+    """An "image" output feeding a "text" input raises BusinessError."""
+    producer = _op("a", "text", "image")
+    consumer = _op("b", "text", "text")
+
+    with pytest.raises(BusinessError):
+        CleaningTaskValidator.check_input_and_output([producer, consumer])
+
+
+def test_check_and_get_executor_type_raises_when_mixed_categories() -> None:
+    """Mixing a DataMate operator with a Data-Juicer operator raises BusinessError."""
+    datamate_op = _op("a", None, None, [CATEGORY_DATAMATE_ID])
+    juicer_op = _op("b", None, None, [CATEGORY_DATA_JUICER_ID])
+
+    with pytest.raises(BusinessError):
+        CleaningTaskValidator.check_and_get_executor_type([datamate_op, juicer_op])
+
+
+def test_check_and_get_executor_type_defaults_to_datamate() -> None:
+    """An operator with no categories resolves to the "datamate" executor."""
+    uncategorized = _op("a", None, None, None)
+
+    resolved = CleaningTaskValidator.check_and_get_executor_type([uncategorized])
+
+    assert resolved == "datamate"
+
+
+def test_check_task_id_raises_when_empty() -> None:
+    """An empty task id raises BusinessError."""
+    empty_task_id = ""
+    with pytest.raises(BusinessError):
+        CleaningTaskValidator.check_task_id(empty_task_id)
+
+
+def test_check_task_id_accepts_normal_value() -> None:
+    """A non-empty task id passes; success means no exception is raised."""
+    task_id = "task-1"
+    CleaningTaskValidator.check_task_id(task_id)
+
+
+def test_check_input_and_output_returns_for_empty_instances() -> None:
+    """An empty operator list is accepted without raising."""
+    no_instances = []
+    CleaningTaskValidator.check_input_and_output(no_instances)
+
+
+def test_check_input_and_output_raises_when_current_has_no_outputs() -> None:
+    """An operator with ``outputs=None`` followed by another operator raises BusinessError."""
+    producer = _op("a", "text", None)
+    consumer = _op("b", "text", "text")
+
+    with pytest.raises(BusinessError):
+        CleaningTaskValidator.check_input_and_output([producer, consumer])
+
+
+def test_check_input_and_output_raises_when_next_has_no_inputs() -> None:
+    """A downstream operator with ``inputs=None`` raises BusinessError."""
+    producer = _op("a", "text", "text")
+    consumer = _op("b", None, "text")
+
+    with pytest.raises(BusinessError):
+        CleaningTaskValidator.check_input_and_output([producer, consumer])
+
+
+@pytest.mark.parametrize(
+    "out_type,in_type",
+    [
+        ("text", "text"),
+        (" image ", "image"),
+        ("AUDIO", "audio"),
+    ],
+)
+def test_check_input_and_output_allows_exact_match_with_normalization(out_type: str, in_type: str) -> None:
+    """Types differing only in surrounding whitespace or letter case must be accepted."""
+    producer = _op("a", "x", out_type)
+    consumer = _op("b", in_type, "y")
+
+    # Passing means the call returns without raising.
+    CleaningTaskValidator.check_input_and_output([producer, consumer])
+
+
+def test_check_and_get_executor_type_prefers_datajuicer_when_only_datajuicer() -> None:
+    """A pipeline holding only a Data-Juicer operator resolves to ``"default"``."""
+    # NOTE(review): the test name says "prefers datajuicer" but the expected
+    # executor string is "default" — confirm this is the intended value.
+    juicer_only = [_op("a", None, None, [CATEGORY_DATA_JUICER_ID])]
+    resolved = CleaningTaskValidator.check_and_get_executor_type(juicer_only)
+    assert resolved == "default"