Skip to content
23 changes: 23 additions & 0 deletions runtime/datamate-python/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from __future__ import annotations

import sys
from pathlib import Path
from types import ModuleType


def _register_namespace(module_name: str, module_path: Path) -> None:
    """Install a bare package stub for *module_name* in ``sys.modules``.

    The stub's ``__path__`` is the single directory *module_path*, which is
    where the import system looks for the package's submodules. If a module
    is already registered under *module_name*, it is left untouched.
    """
    if module_name in sys.modules:
        # Never replace a module that has already been imported/registered.
        return

    stub = ModuleType(module_name)
    setattr(stub, "__path__", [str(module_path)])
    sys.modules[module_name] = stub


def pytest_sessionstart(session) -> None:
    """Pre-register ``app.module`` packages as namespace stubs.

    Prevents test imports of ``app.module.*`` from triggering the heavy
    dependency loading done by ``app/module/__init__.py``.
    """
    module_root = Path(__file__).resolve().parents[1] / "app" / "module"

    # (dotted package name, path components below ``app/module``), parents first.
    namespaces = (
        ("app.module", ()),
        ("app.module.cleaning", ("cleaning",)),
        ("app.module.cleaning.service", ("cleaning", "service")),
        ("app.module.rag", ("rag",)),
        ("app.module.rag.service", ("rag", "service")),
        ("app.module.rag.service.common", ("rag", "service", "common")),
    )
    for dotted_name, sub_dirs in namespaces:
        _register_namespace(dotted_name, module_root.joinpath(*sub_dirs))
124 changes: 124 additions & 0 deletions runtime/datamate-python/tests/test_dataset_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-

import unittest
from unittest.mock import MagicMock, AsyncMock
import sys
import os

# Make sure the runtime/datamate-python directory is on sys.path.
TEST_DIR = os.path.split(os.path.abspath(__file__))[0]
APP_DIR = os.path.abspath(os.path.join(TEST_DIR, os.pardir))
sys.path.insert(0, APP_DIR)

from app.module.dataset.service.service import Service
from app.module.dataset.schema import DatasetResponse, PagedDatasetFileResponse, DatasetFileResponse
from app.db.models import Dataset, DatasetFiles


class TestDatasetService(unittest.IsolatedAsyncioTestCase):
    """Tests for the dataset ``Service`` running against a mocked DB session."""

    def setUp(self):
        # Mocked AsyncSession: the methods listed here are replaced by AsyncMocks
        # so they can be awaited; everything else stays a plain MagicMock.
        session = MagicMock()
        for awaited_method in ("execute", "commit", "rollback", "flush"):
            setattr(session, awaited_method, AsyncMock())
        self.mock_db = session

        # Service under test
        self.service = Service(session)

    def _stub_query_result(self, row):
        """Make ``db.execute`` return a result whose ``scalar_one_or_none()`` gives *row*."""
        query_result = MagicMock()
        query_result.scalar_one_or_none.return_value = row
        self.mock_db.execute.return_value = query_result

    async def test_get_dataset_success(self):
        """Fetching an existing dataset returns its details."""
        # The row the mocked query hands back
        self._stub_query_result(
            Dataset(
                id="test-dataset-id",
                name="Test Dataset",
                description="A test description",
                dataset_type="TEXT",
                status="DRAFT",
                file_count=5,
                size_bytes=1024,
                created_by="system",
            )
        )

        response = await self.service.get_dataset("test-dataset-id")

        # Every checked response attribute must mirror the stored row
        self.assertIsNotNone(response)
        expected_fields = {
            "id": "test-dataset-id",
            "name": "Test Dataset",
            "description": "A test description",
            "datasetType": "TEXT",
            "status": "DRAFT",
            "fileCount": 5,
            "totalSize": 1024,
        }
        for attr, expected in expected_fields.items():
            self.assertEqual(getattr(response, attr), expected)

    async def test_get_dataset_not_found(self):
        """Fetching a dataset that does not exist returns None."""
        # The mocked query finds no row
        self._stub_query_result(None)

        response = await self.service.get_dataset("non-existent-id")

        self.assertIsNone(response)

    async def test_create_dataset_success(self):
        """Creating a dataset succeeds when the name is not taken."""
        # 1. The name-existence check (select on Dataset.name) finds nothing
        self._stub_query_result(None)

        # 2. Call the create service
        response = await self.service.create_dataset(
            name="New Dataset",
            dataset_type="IMAGE",
            description="Testing create_dataset API",
            status="PUBLISHED",
        )

        # 3. Verify the returned payload
        self.assertIsNotNone(response)
        expected_fields = {
            "name": "New Dataset",
            "datasetType": "IMAGE",
            "description": "Testing create_dataset API",
            "status": "PUBLISHED",
        }
        for attr, expected in expected_fields.items():
            self.assertEqual(getattr(response, attr), expected)

        # db.add and db.commit must each have been called exactly once
        self.mock_db.add.assert_called_once()
        self.mock_db.commit.assert_called_once()

    async def test_create_dataset_duplicated_name(self):
        """Creating a dataset with an already-used name raises an exception."""
        # The name check returns a conflicting, already-stored dataset
        self._stub_query_result(
            Dataset(
                id="existing-id",
                name="Existing Dataset",
            )
        )

        # The raised exception must carry the expected keyword
        with self.assertRaises(Exception) as raised:
            await self.service.create_dataset(
                name="Existing Dataset",
                dataset_type="AUDIO",
            )
        self.assertIn("already exists", str(raised.exception))

        # Nothing may have been committed
        self.mock_db.commit.assert_not_called()


# Run this module's tests via unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
132 changes: 132 additions & 0 deletions runtime/datamate-python/tests/test_module_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from app.module.annotation.utils.config_validator import LabelStudioConfigValidator
import pytest


def test_validate_xml_success_with_object_and_control() -> None:
    """A <View> holding an Image plus a Choices control validates with no error."""
    config_xml = """<View>
<Image name=\"image\" value=\"$image\"/>
<Choices name=\"label\" toName=\"image\">
<Choice value=\"Cat\"/>
<Choice value=\"Dog\"/>
</Choices>
</View>"""

    outcome = LabelStudioConfigValidator.validate_xml(config_xml)
    is_valid, message = outcome

    assert is_valid is True
    assert message is None


def test_validate_xml_fails_when_no_controls() -> None:
    """A <View> containing only an Image tag is rejected for lacking controls."""
    config_xml = """<View><Image name=\"image\" value=\"$image\"/></View>"""

    outcome = LabelStudioConfigValidator.validate_xml(config_xml)
    is_valid, message = outcome

    assert is_valid is False
    assert "No annotation controls" in (message or "")


def test_validate_configuration_json_rejects_unknown_object_reference() -> None:
    """A label whose ``toName`` matches no declared object is rejected."""
    # The label targets "missing_object", but only "text" is declared below.
    dangling_label = {
        "fromName": "sentiment",
        "toName": "missing_object",
        "type": "Choices",
        "options": ["positive", "negative"],
    }
    text_object = {"name": "text", "type": "Text", "value": "$text"}
    payload = {"labels": [dangling_label], "objects": [text_object]}

    outcome = LabelStudioConfigValidator.validate_configuration_json(payload)
    is_valid, message = outcome

    assert is_valid is False
    assert "unknown object" in (message or "")


def test_extract_label_values() -> None:
    """Choice values are returned as a list keyed by the control's name."""
    config_xml = """<View>
<Text name=\"text\" value=\"$text\"/>
<Choices name=\"sentiment\" toName=\"text\">
<Choice value=\"positive\"/>
<Choice value=\"negative\"/>
</Choices>
</View>"""
    expected = {"sentiment": ["positive", "negative"]}

    extracted = LabelStudioConfigValidator.extract_label_values(config_xml)

    assert extracted == expected


def test_validate_xml_rejects_invalid_root() -> None:
    """A document whose root element is <Root> rather than <View> is rejected."""
    config_xml = """<Root><Text name=\"text\" value=\"$text\"/></Root>"""

    outcome = LabelStudioConfigValidator.validate_xml(config_xml)
    is_valid, message = outcome

    assert is_valid is False
    assert "Root element must be <View>" in (message or "")


def test_validate_configuration_json_requires_labels() -> None:
    """A configuration dict without a 'labels' key is rejected."""
    payload = {"objects": []}

    outcome = LabelStudioConfigValidator.validate_configuration_json(payload)
    is_valid, message = outcome

    assert is_valid is False
    assert "Missing 'labels' field" in (message or "")


def test_validate_xml_fails_for_invalid_xml() -> None:
    """Malformed XML (an unclosed <Text> tag) is reported as a parse error."""
    malformed_xml = "<View><Text></View>"

    outcome = LabelStudioConfigValidator.validate_xml(malformed_xml)
    is_valid, message = outcome

    assert is_valid is False
    assert "XML parse error" in (message or "")


@pytest.mark.parametrize(
    ("label", "error_text"),
    [
        ({"toName": "obj", "type": "Choices", "options": ["A"]}, "fromName"),
        ({"fromName": "lbl", "type": "Choices", "options": ["A"]}, "toName"),
        ({"fromName": "lbl", "toName": "obj", "options": ["A"]}, "type"),
    ],
)
def test_validate_label_definition_required_fields(label, error_text: str) -> None:
    """Each case omits one required key; the error text must mention that key."""
    outcome = LabelStudioConfigValidator._validate_label_definition(label)
    is_valid, message = outcome

    assert is_valid is False
    assert error_text in (message or "")


def test_validate_label_definition_rejects_unsupported_type() -> None:
    """A label definition with the type "NotSupported" is rejected."""
    definition = dict(fromName="x", toName="obj", type="NotSupported")

    outcome = LabelStudioConfigValidator._validate_label_definition(definition)
    is_valid, message = outcome

    assert is_valid is False
    assert "Unsupported control type" in (message or "")


def test_validate_object_definition_rejects_value_without_dollar_prefix() -> None:
    """An object whose ``value`` is "text" (no leading '$') is rejected."""
    definition = dict(name="txt", type="Text", value="text")

    outcome = LabelStudioConfigValidator._validate_object_definition(definition)
    is_valid, message = outcome

    assert is_valid is False
    assert "must start with '$'" in (message or "")


def test_extract_label_values_returns_empty_on_invalid_xml() -> None:
    """Unparseable XML yields an empty mapping."""
    broken_xml = "<broken"

    extracted = LabelStudioConfigValidator.extract_label_values(broken_xml)

    assert extracted == {}


def test_validate_xml_requires_control_name_and_to_name() -> None:
    """A Choices control without a ``name`` attribute is rejected."""
    config_xml = """<View>
<Text name=\"text\" value=\"$text\"/>
<Choices toName=\"text\"><Choice value=\"A\"/></Choices>
</View>"""

    outcome = LabelStudioConfigValidator.validate_xml(config_xml)
    is_valid, message = outcome

    assert is_valid is False
    assert "Missing 'name' attribute" in (message or "")

99 changes: 99 additions & 0 deletions runtime/datamate-python/tests/test_module_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import pytest

from app.core.exception import BusinessError
from app.module.cleaning.schema.cleaning import OperatorInstanceDto
from app.module.cleaning.service.cleaning_task_validator import CleaningTaskValidator
from app.module.operator.constants import CATEGORY_DATAMATE_ID, CATEGORY_DATA_JUICER_ID


def _op(op_id: str, inputs: str | None, outputs: str | None, categories: list[str] | None = None) -> OperatorInstanceDto:
    """Build an ``OperatorInstanceDto`` from an id, input/output types and categories."""
    dto_fields = {
        "id": op_id,
        "inputs": inputs,
        "outputs": outputs,
        "categories": categories,
    }
    return OperatorInstanceDto(**dto_fields)


def test_check_input_and_output_passes_with_multimodal() -> None:
    """A "multimodal" output followed by an "image" input does not raise.

    NOTE(review): presumably "multimodal" is treated as compatible with any
    input type — confirm against CleaningTaskValidator.
    """
    producer = _op("a", "text", "multimodal")
    consumer = _op("b", "image", "text")

    CleaningTaskValidator.check_input_and_output([producer, consumer])


def test_check_input_and_output_raises_on_type_mismatch() -> None:
    """An "image" output feeding a "text" input raises BusinessError."""
    producer = _op("a", "text", "image")
    consumer = _op("b", "text", "text")
    pipeline = [producer, consumer]

    with pytest.raises(BusinessError):
        CleaningTaskValidator.check_input_and_output(pipeline)


def test_check_and_get_executor_type_raises_when_mixed_categories() -> None:
    """Combining a DataMate operator with a Data-Juicer operator raises BusinessError."""
    datamate_op = _op("a", None, None, [CATEGORY_DATAMATE_ID])
    data_juicer_op = _op("b", None, None, [CATEGORY_DATA_JUICER_ID])
    pipeline = [datamate_op, data_juicer_op]

    with pytest.raises(BusinessError):
        CleaningTaskValidator.check_and_get_executor_type(pipeline)


def test_check_and_get_executor_type_defaults_to_datamate() -> None:
    """An operator with no categories resolves to the "datamate" executor."""
    uncategorised = _op("a", None, None, None)

    chosen = CleaningTaskValidator.check_and_get_executor_type([uncategorised])

    assert chosen == "datamate"


def test_check_task_id_raises_when_empty() -> None:
    """An empty task id raises BusinessError."""
    empty_task_id = ""

    with pytest.raises(BusinessError):
        CleaningTaskValidator.check_task_id(empty_task_id)


def test_check_task_id_accepts_normal_value() -> None:
    """A non-empty task id passes without raising."""
    task_id = "task-1"

    CleaningTaskValidator.check_task_id(task_id)


def test_check_input_and_output_returns_for_empty_instances() -> None:
    """An empty operator list passes without raising."""
    no_operators = []

    CleaningTaskValidator.check_input_and_output(no_operators)


def test_check_input_and_output_raises_when_current_has_no_outputs() -> None:
    """A first operator whose ``outputs`` is None raises BusinessError."""
    producer = _op("a", "text", None)
    consumer = _op("b", "text", "text")
    pipeline = [producer, consumer]

    with pytest.raises(BusinessError):
        CleaningTaskValidator.check_input_and_output(pipeline)


def test_check_input_and_output_raises_when_next_has_no_inputs() -> None:
    """A second operator whose ``inputs`` is None raises BusinessError."""
    producer = _op("a", "text", "text")
    consumer = _op("b", None, "text")
    pipeline = [producer, consumer]

    with pytest.raises(BusinessError):
        CleaningTaskValidator.check_input_and_output(pipeline)


@pytest.mark.parametrize(
    ("out_type", "in_type"),
    [
        ("text", "text"),
        (" image ", "image"),
        ("AUDIO", "audio"),
    ],
)
def test_check_input_and_output_allows_exact_match_with_normalization(out_type: str, in_type: str) -> None:
    """Output/input pairs that differ only by surrounding spaces or case do not raise."""
    producer = _op("a", "x", out_type)
    consumer = _op("b", in_type, "y")

    CleaningTaskValidator.check_input_and_output([producer, consumer])


def test_check_and_get_executor_type_prefers_datajuicer_when_only_datajuicer() -> None:
    """A single Data-Juicer operator resolves to the "default" executor.

    NOTE(review): "default" is presumably the identifier of the Data-Juicer
    executor — confirm against CleaningTaskValidator.
    """
    data_juicer_only = [_op("a", None, None, [CATEGORY_DATA_JUICER_ID])]

    chosen = CleaningTaskValidator.check_and_get_executor_type(data_juicer_only)

    assert chosen == "default"
Loading
Loading