diff --git a/runtime/datamate-python/tests/conftest.py b/runtime/datamate-python/tests/conftest.py new file mode 100644 index 00000000..9321d5f7 --- /dev/null +++ b/runtime/datamate-python/tests/conftest.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import sys +from pathlib import Path +from types import ModuleType + + +def _register_namespace(module_name: str, module_path: Path) -> None: + namespace_pkg = ModuleType(module_name) + namespace_pkg.__path__ = [str(module_path)] # type: ignore[attr-defined] + sys.modules.setdefault(module_name, namespace_pkg) + + +def pytest_sessionstart(session) -> None: + """避免测试导入 app.module.* 时触发 app/module/__init__.py 的重依赖加载。""" + root = Path(__file__).resolve().parents[1] / "app" / "module" + + _register_namespace("app.module", root) + _register_namespace("app.module.cleaning", root / "cleaning") + _register_namespace("app.module.cleaning.service", root / "cleaning" / "service") + _register_namespace("app.module.rag", root / "rag") + _register_namespace("app.module.rag.service", root / "rag" / "service") + _register_namespace("app.module.rag.service.common", root / "rag" / "service" / "common") diff --git a/runtime/datamate-python/tests/test_dataset_service.py b/runtime/datamate-python/tests/test_dataset_service.py new file mode 100644 index 00000000..fbfcd8d8 --- /dev/null +++ b/runtime/datamate-python/tests/test_dataset_service.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +import unittest +from unittest.mock import MagicMock, AsyncMock +import sys +import os + +# 确保 runtime/datamate-python 目录在 sys.path 中 +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) +APP_DIR = os.path.dirname(TEST_DIR) +sys.path.insert(0, APP_DIR) + +from app.module.dataset.service.service import Service +from app.module.dataset.schema import DatasetResponse, PagedDatasetFileResponse, DatasetFileResponse +from app.db.models import Dataset, DatasetFiles + + +class TestDatasetService(unittest.IsolatedAsyncioTestCase): + + def setUp(self): 
+ # 创建模拟的 AsyncSession 对象 + self.mock_db = MagicMock() + self.mock_db.execute = AsyncMock() + self.mock_db.commit = AsyncMock() + self.mock_db.rollback = AsyncMock() + self.mock_db.flush = AsyncMock() + + # 初始化 Service + self.service = Service(self.mock_db) + + async def test_get_dataset_success(self): + """测试正常获取数据集详情""" + # 准备 Mock 数据 + mock_dataset = Dataset( + id="test-dataset-id", + name="Test Dataset", + description="A test description", + dataset_type="TEXT", + status="DRAFT", + file_count=5, + size_bytes=1024, + created_by="system" + ) + + # 模拟 db.execute 返回值 + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = mock_dataset + self.mock_db.execute.return_value = mock_result + + # 执行测试 + response = await self.service.get_dataset("test-dataset-id") + + # 校验结果 + self.assertIsNotNone(response) + self.assertEqual(response.id, "test-dataset-id") + self.assertEqual(response.name, "Test Dataset") + self.assertEqual(response.description, "A test description") + self.assertEqual(response.datasetType, "TEXT") + self.assertEqual(response.status, "DRAFT") + self.assertEqual(response.fileCount, 5) + self.assertEqual(response.totalSize, 1024) + + async def test_get_dataset_not_found(self): + """测试获取不存在的数据集时返回 None""" + # 模拟数据库未找到数据 + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = None + self.mock_db.execute.return_value = mock_result + + # 执行并验证 + response = await self.service.get_dataset("non-existent-id") + self.assertIsNone(response) + + async def test_create_dataset_success(self): + """测试创建数据集成功流程""" + # 1. 模拟名称不存在检查 (select Dataset.name) -> 返回 None + mock_result_check = MagicMock() + mock_result_check.scalar_one_or_none.return_value = None + self.mock_db.execute.return_value = mock_result_check + + # 2. 调用创建服务 + response = await self.service.create_dataset( + name="New Dataset", + dataset_type="IMAGE", + description="Testing create_dataset API", + status="PUBLISHED" + ) + + # 3. 
验证结果 + self.assertIsNotNone(response) + self.assertEqual(response.name, "New Dataset") + self.assertEqual(response.datasetType, "IMAGE") + self.assertEqual(response.description, "Testing create_dataset API") + self.assertEqual(response.status, "PUBLISHED") + + # 确认 db.add 和 db.commit 被调用 + self.mock_db.add.assert_called_once() + self.mock_db.commit.assert_called_once() + + async def test_create_dataset_duplicated_name(self): + """测试创建重名的数据集时抛出异常""" + # 模拟冲突的已有数据集 + existing_dataset = Dataset( + id="existing-id", + name="Existing Dataset" + ) + mock_result = MagicMock() + mock_result.scalar_one_or_none.return_value = existing_dataset + self.mock_db.execute.return_value = mock_result + + # 检查是否正如预期抛出包含关键字 Exception + with self.assertRaises(Exception) as context: + await self.service.create_dataset( + name="Existing Dataset", + dataset_type="AUDIO" + ) + self.assertIn("already exists", str(context.exception)) + + # 校验事务有无进行 commit + self.mock_db.commit.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/runtime/datamate-python/tests/test_module_annotation.py b/runtime/datamate-python/tests/test_module_annotation.py new file mode 100644 index 00000000..5ae3a184 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_annotation.py @@ -0,0 +1,132 @@ +from app.module.annotation.utils.config_validator import LabelStudioConfigValidator +import pytest + + +def test_validate_xml_success_with_object_and_control() -> None: + xml = """ + + + + + +""" + + valid, error = LabelStudioConfigValidator.validate_xml(xml) + + assert valid is True + assert error is None + + +def test_validate_xml_fails_when_no_controls() -> None: + xml = """""" + + valid, error = LabelStudioConfigValidator.validate_xml(xml) + + assert valid is False + assert "No annotation controls" in (error or "") + + +def test_validate_configuration_json_rejects_unknown_object_reference() -> None: + config = { + "labels": [ + { + "fromName": "sentiment", + "toName": 
"missing_object", + "type": "Choices", + "options": ["positive", "negative"], + } + ], + "objects": [ + {"name": "text", "type": "Text", "value": "$text"} + ], + } + + valid, error = LabelStudioConfigValidator.validate_configuration_json(config) + + assert valid is False + assert "unknown object" in (error or "") + + +def test_extract_label_values() -> None: + xml = """ + + + + + +""" + + labels = LabelStudioConfigValidator.extract_label_values(xml) + + assert labels == {"sentiment": ["positive", "negative"]} + + +def test_validate_xml_rejects_invalid_root() -> None: + xml = """""" + + valid, error = LabelStudioConfigValidator.validate_xml(xml) + + assert valid is False + assert "Root element must be " in (error or "") + + +def test_validate_configuration_json_requires_labels() -> None: + valid, error = LabelStudioConfigValidator.validate_configuration_json({"objects": []}) + + assert valid is False + assert "Missing 'labels' field" in (error or "") + + +def test_validate_xml_fails_for_invalid_xml() -> None: + xml = "" + valid, error = LabelStudioConfigValidator.validate_xml(xml) + assert valid is False + assert "XML parse error" in (error or "") + + +@pytest.mark.parametrize( + "label,error_text", + [ + ({"toName": "obj", "type": "Choices", "options": ["A"]}, "fromName"), + ({"fromName": "lbl", "type": "Choices", "options": ["A"]}, "toName"), + ({"fromName": "lbl", "toName": "obj", "options": ["A"]}, "type"), + ], +) +def test_validate_label_definition_required_fields(label, error_text: str) -> None: + valid, error = LabelStudioConfigValidator._validate_label_definition(label) + assert valid is False + assert error_text in (error or "") + + +def test_validate_label_definition_rejects_unsupported_type() -> None: + label = { + "fromName": "x", + "toName": "obj", + "type": "NotSupported", + } + valid, error = LabelStudioConfigValidator._validate_label_definition(label) + assert valid is False + assert "Unsupported control type" in (error or "") + + +def 
test_validate_object_definition_rejects_value_without_dollar_prefix() -> None: + obj = {"name": "txt", "type": "Text", "value": "text"} + valid, error = LabelStudioConfigValidator._validate_object_definition(obj) + assert valid is False + assert "must start with '$'" in (error or "") + + +def test_extract_label_values_returns_empty_on_invalid_xml() -> None: + labels = LabelStudioConfigValidator.extract_label_values(" None: + xml = """ + + +""" + valid, error = LabelStudioConfigValidator.validate_xml(xml) + assert valid is False + assert "Missing 'name' attribute" in (error or "") + diff --git a/runtime/datamate-python/tests/test_module_cleaning.py b/runtime/datamate-python/tests/test_module_cleaning.py new file mode 100644 index 00000000..3e902041 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_cleaning.py @@ -0,0 +1,99 @@ +import pytest + +from app.core.exception import BusinessError +from app.module.cleaning.schema.cleaning import OperatorInstanceDto +from app.module.cleaning.service.cleaning_task_validator import CleaningTaskValidator +from app.module.operator.constants import CATEGORY_DATAMATE_ID, CATEGORY_DATA_JUICER_ID + + +def _op(op_id: str, inputs: str | None, outputs: str | None, categories: list[str] | None = None) -> OperatorInstanceDto: + return OperatorInstanceDto(id=op_id, inputs=inputs, outputs=outputs, categories=categories) + + +def test_check_input_and_output_passes_with_multimodal() -> None: + instances = [ + _op("a", "text", "multimodal"), + _op("b", "image", "text"), + ] + + CleaningTaskValidator.check_input_and_output(instances) + + +def test_check_input_and_output_raises_on_type_mismatch() -> None: + instances = [ + _op("a", "text", "image"), + _op("b", "text", "text"), + ] + + with pytest.raises(BusinessError): + CleaningTaskValidator.check_input_and_output(instances) + + +def test_check_and_get_executor_type_raises_when_mixed_categories() -> None: + instances = [ + _op("a", None, None, [CATEGORY_DATAMATE_ID]), + _op("b", 
None, None, [CATEGORY_DATA_JUICER_ID]), + ] + + with pytest.raises(BusinessError): + CleaningTaskValidator.check_and_get_executor_type(instances) + + +def test_check_and_get_executor_type_defaults_to_datamate() -> None: + instances = [_op("a", None, None, None)] + + executor = CleaningTaskValidator.check_and_get_executor_type(instances) + + assert executor == "datamate" + + +def test_check_task_id_raises_when_empty() -> None: + with pytest.raises(BusinessError): + CleaningTaskValidator.check_task_id("") + + +def test_check_task_id_accepts_normal_value() -> None: + CleaningTaskValidator.check_task_id("task-1") + + +def test_check_input_and_output_returns_for_empty_instances() -> None: + CleaningTaskValidator.check_input_and_output([]) + + +def test_check_input_and_output_raises_when_current_has_no_outputs() -> None: + instances = [ + _op("a", "text", None), + _op("b", "text", "text"), + ] + with pytest.raises(BusinessError): + CleaningTaskValidator.check_input_and_output(instances) + + +def test_check_input_and_output_raises_when_next_has_no_inputs() -> None: + instances = [ + _op("a", "text", "text"), + _op("b", None, "text"), + ] + with pytest.raises(BusinessError): + CleaningTaskValidator.check_input_and_output(instances) + + +@pytest.mark.parametrize( + "out_type,in_type", + [ + ("text", "text"), + (" image ", "image"), + ("AUDIO", "audio"), + ], +) +def test_check_input_and_output_allows_exact_match_with_normalization(out_type: str, in_type: str) -> None: + instances = [ + _op("a", "x", out_type), + _op("b", in_type, "y"), + ] + CleaningTaskValidator.check_input_and_output(instances) + + +def test_check_and_get_executor_type_prefers_datajuicer_when_only_datajuicer() -> None: + instances = [_op("a", None, None, [CATEGORY_DATA_JUICER_ID])] + assert CleaningTaskValidator.check_and_get_executor_type(instances) == "default" diff --git a/runtime/datamate-python/tests/test_module_collection.py b/runtime/datamate-python/tests/test_module_collection.py new file mode 
100644 index 00000000..072ddfe9 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_collection.py @@ -0,0 +1,125 @@ +import pytest +from types import SimpleNamespace +from app.module.collection.schema.collection import converter_to_response, converter_execution_to_response + +from app.module.collection.schema.collection import ( + CollectionConfig, + CollectionTaskCreate, + CollectionTaskUpdate, + SyncMode, + convert_for_create, +) + + +def test_collection_task_update_rejects_blank_schedule_expression() -> None: + with pytest.raises(ValueError): + CollectionTaskUpdate(schedule_expression=" ") + + +def test_collection_task_update_rejects_non_positive_timeout() -> None: + with pytest.raises(ValueError): + CollectionTaskUpdate(timeout_seconds=0) + + +def test_convert_for_create_handles_sync_mode_schedule_expression() -> None: + config = CollectionConfig(parameter={"k": "v"}) + + scheduled = CollectionTaskCreate( + name="task-scheduled", + sync_mode=SyncMode.SCHEDULED, + schedule_expression="0 0 * * *", + config=config, + template_id="tpl-1", + ) + once = CollectionTaskCreate( + name="task-once", + sync_mode=SyncMode.ONCE, + schedule_expression="0 0 * * *", + config=config, + template_id="tpl-1", + ) + + scheduled_record = convert_for_create(scheduled, "task-1") + once_record = convert_for_create(once, "task-2") + + assert scheduled_record.schedule_expression == "0 0 * * *" + assert once_record.schedule_expression is None + assert scheduled_record.target_path == "/dataset/local/task-1" + + +def test_collection_task_update_accepts_positive_timeout() -> None: + updated = CollectionTaskUpdate(timeout_seconds=30) + assert updated.timeout_seconds == 30 + + +def test_convert_for_create_sets_pending_status() -> None: + config = CollectionConfig(parameter={"k": "v"}) + once = CollectionTaskCreate( + name="task-once", + sync_mode=SyncMode.ONCE, + config=config, + template_id="tpl-1", + ) + + record = convert_for_create(once, "task-3") + + assert record.status == 
"PENDING" + + +def test_collection_task_update_accepts_none_fields() -> None: + updated = CollectionTaskUpdate() + assert updated.timeout_seconds is None + assert updated.config is None + + +def test_converter_to_response_maps_json_config() -> None: + task = SimpleNamespace( + id="t1", + name="task", + description="desc", + sync_mode="ONCE", + template_id="tpl", + template_name="template", + target_path="/dataset/local/t1", + config='{"parameter": {"a": 1}}', + schedule_expression=None, + status="PENDING", + retry_count=3, + timeout_seconds=60, + last_execution_id="e1", + created_at=None, + updated_at=None, + created_by="u", + updated_by="u", + ) + + response = converter_to_response(task) + + assert response.id == "t1" + assert response.config.parameter == {"a": 1} + assert response.status.value == "PENDING" + + +def test_converter_execution_to_response_maps_fields() -> None: + execution = SimpleNamespace( + id="e1", + task_id="t1", + task_name="task", + status="RUNNING", + log_path="/x.log", + started_at=None, + completed_at=None, + duration_seconds=1, + error_message=None, + created_at=None, + updated_at=None, + created_by="u", + updated_by="u", + ) + + response = converter_execution_to_response(execution) + + assert response.id == "e1" + assert response.task_id == "t1" + assert response.status == "RUNNING" + diff --git a/runtime/datamate-python/tests/test_module_dataset.py b/runtime/datamate-python/tests/test_module_dataset.py new file mode 100644 index 00000000..c860420c --- /dev/null +++ b/runtime/datamate-python/tests/test_module_dataset.py @@ -0,0 +1,145 @@ +import asyncio +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +from app.module.dataset.service.service import Service + + +def _run(coro): + return asyncio.run(coro) + + +def test_create_dataset_uses_default_status_when_not_provided() -> None: + db = MagicMock() + first_result = MagicMock() + first_result.scalar_one_or_none.return_value = None + db.execute = 
AsyncMock(return_value=first_result) + db.flush = AsyncMock() + db.commit = AsyncMock() + db.rollback = AsyncMock() + + service = Service(db) + response = _run(service.create_dataset(name="ds1", dataset_type="TEXT", description="desc")) + + assert response.status == "DRAFT" + assert response.name == "ds1" + db.commit.assert_called_once() + + +def test_get_dataset_returns_none_when_execute_fails() -> None: + db = MagicMock() + db.execute = AsyncMock(side_effect=RuntimeError("db unavailable")) + + service = Service(db) + response = _run(service.get_dataset("dataset-1")) + + assert response is None + + +def test_get_file_download_url_returns_file_path() -> None: + db = MagicMock() + fake_file = SimpleNamespace(file_path="/dataset/ds1/a.txt") + result = MagicMock() + result.scalar_one_or_none.return_value = fake_file + db.execute = AsyncMock(return_value=result) + + service = Service(db) + file_path = _run(service.get_file_download_url("ds1", "file1")) + + assert file_path == "/dataset/ds1/a.txt" + + +def test_create_dataset_raises_for_duplicated_name() -> None: + db = MagicMock() + duplicated = SimpleNamespace(id="d1", name="dup") + result = MagicMock() + result.scalar_one_or_none.return_value = duplicated + db.execute = AsyncMock(return_value=result) + db.rollback = AsyncMock() + + service = Service(db) + + try: + _run(service.create_dataset(name="dup", dataset_type="TEXT")) + raised = False + except Exception as exc: # noqa: BLE001 + raised = True + assert "already exists" in str(exc) + + assert raised is True + db.rollback.assert_called_once() + + +def test_get_file_download_url_returns_none_when_file_missing() -> None: + db = MagicMock() + result = MagicMock() + result.scalar_one_or_none.return_value = None + db.execute = AsyncMock(return_value=result) + + service = Service(db) + file_path = _run(service.get_file_download_url("ds1", "missing")) + + assert file_path is None + + +def test_get_dataset_returns_none_when_not_found() -> None: + db = MagicMock() + result 
= MagicMock() + result.scalar_one_or_none.return_value = None + db.execute = AsyncMock(return_value=result) + + service = Service(db) + response = _run(service.get_dataset("not-exist")) + + assert response is None + + +def test_get_dataset_files_returns_paged_response() -> None: + db = MagicMock() + count_result = MagicMock() + count_result.scalar_one.return_value = 2 + files_result = MagicMock() + files_result.scalars.return_value.all.return_value = [ + SimpleNamespace( + id="f1", + file_name="a.txt", + file_type="txt", + file_path="/dataset/a.txt", + file_size=12, + status="ACTIVE", + upload_time=None, + last_access_time=None, + tags=[], + tags_updated_at=None, + ), + SimpleNamespace( + id="f2", + file_name="b.txt", + file_type="txt", + file_path="/dataset/b.txt", + file_size=20, + status="ACTIVE", + upload_time=None, + last_access_time=None, + tags=[], + tags_updated_at=None, + ), + ] + db.execute = AsyncMock(side_effect=[count_result, files_result]) + + service = Service(db) + response = _run(service.get_dataset_files("ds1", page=0, size=10)) + + assert response is not None + assert response.totalElements == 2 + assert len(response.content) == 2 + assert response.content[0].fileName == "a.txt" + + +def test_get_dataset_files_returns_none_when_query_fails() -> None: + db = MagicMock() + db.execute = AsyncMock(side_effect=RuntimeError("query fail")) + service = Service(db) + + response = _run(service.get_dataset_files("ds1")) + assert response is None diff --git a/runtime/datamate-python/tests/test_module_evaluation.py b/runtime/datamate-python/tests/test_module_evaluation.py new file mode 100644 index 00000000..a95159c0 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_evaluation.py @@ -0,0 +1,48 @@ +from app.module.evaluation.schema.prompt import EVALUATION_PROMPT_TEMPLATE +from app.module.evaluation.service.prompt_template_service import PromptTemplateService + + +def test_get_prompt_templates_size_matches_source() -> None: + response = 
PromptTemplateService.get_prompt_templates() + assert len(response.templates) == len(EVALUATION_PROMPT_TEMPLATE) + + +def test_get_prompt_templates_dimensions_are_mapped() -> None: + response = PromptTemplateService.get_prompt_templates() + assert response.templates + + first = response.templates[0] + assert isinstance(first.evalType, str) + assert isinstance(first.prompt, str) + for dim in first.defaultDimensions: + assert isinstance(dim.dimension, str) + assert isinstance(dim.description, str) + + +def test_get_prompt_templates_all_items_have_eval_type_and_prompt() -> None: + response = PromptTemplateService.get_prompt_templates() + + assert all(item.evalType for item in response.templates) + assert all(isinstance(item.prompt, str) for item in response.templates) + + +def test_get_prompt_templates_preserves_eval_type_order() -> None: + response = PromptTemplateService.get_prompt_templates() + expected = [item.get("evalType", "") for item in EVALUATION_PROMPT_TEMPLATE] + actual = [item.evalType for item in response.templates] + assert actual == expected + + +def test_get_prompt_templates_handles_empty_dimensions() -> None: + response = PromptTemplateService.get_prompt_templates() + for idx, raw in enumerate(EVALUATION_PROMPT_TEMPLATE): + if not raw.get("defaultDimensions"): + assert response.templates[idx].defaultDimensions == [] + + +def test_prompt_template_dimension_fields_are_non_none() -> None: + response = PromptTemplateService.get_prompt_templates() + for item in response.templates: + for dim in item.defaultDimensions: + assert dim.dimension is not None + assert dim.description is not None diff --git a/runtime/datamate-python/tests/test_module_generation.py b/runtime/datamate-python/tests/test_module_generation.py new file mode 100644 index 00000000..50f92330 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_generation.py @@ -0,0 +1,51 @@ +import pytest + +from app.module.generation.schema.generation import SynthesisType +from 
app.module.generation.service.prompt import ( + ANSWER_GENERATOR_PROMPT, + COT_GENERATOR_PROMPT, + QUESTION_GENERATOR_PROMPT, + get_prompt, +) + + +def test_get_prompt_dispatches_by_synthesis_type() -> None: + assert get_prompt(SynthesisType.QA) == ANSWER_GENERATOR_PROMPT + assert get_prompt(SynthesisType.COT) == COT_GENERATOR_PROMPT + assert get_prompt(SynthesisType.QUESTION) == QUESTION_GENERATOR_PROMPT + + +def test_get_prompt_raises_for_unsupported_type() -> None: + with pytest.raises(ValueError): + get_prompt("UNKNOWN") + + +def test_synthesis_type_values_are_stable() -> None: + assert SynthesisType.QA.value == "QA" + assert SynthesisType.COT.value == "COT" + assert SynthesisType.QUESTION.value == "QUESTION" + + +def test_get_prompt_error_contains_unsupported_type() -> None: + with pytest.raises(ValueError) as exc: + get_prompt("X") + + assert "Unsupported synthesis type" in str(exc.value) + + +@pytest.mark.parametrize( + "synth_type,required_text", + [ + (SynthesisType.QA, "output"), + (SynthesisType.COT, "chain_of_thought"), + (SynthesisType.QUESTION, "JSON"), + ], +) +def test_get_prompt_contains_expected_keywords(synth_type: SynthesisType, required_text: str) -> None: + prompt = get_prompt(synth_type) + assert required_text in prompt + + +def test_synthesis_type_enum_values_are_unique() -> None: + values = [t.value for t in SynthesisType] + assert len(values) == len(set(values)) diff --git a/runtime/datamate-python/tests/test_module_operator.py b/runtime/datamate-python/tests/test_module_operator.py new file mode 100644 index 00000000..73f4d566 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_operator.py @@ -0,0 +1,66 @@ +from unittest.mock import MagicMock + +import pytest + +from app.module.operator.parsers.parser_holder import ParserHolder +from app.module.operator.parsers.zip_parser import ZipParser + + +def test_get_parser_returns_zip_parser() -> None: + holder = ParserHolder() + parser = holder.get_parser("abc.zip") + assert 
isinstance(parser, ZipParser) + + +def test_get_parser_raises_for_unsupported_file() -> None: + holder = ParserHolder() + with pytest.raises(ValueError): + holder.get_parser("abc.txt") + + +def test_extract_to_delegates_to_target_parser() -> None: + holder = ParserHolder() + fake_parser = MagicMock() + holder._parsers["zip"] = fake_parser + + holder.extract_to("zip", "archive.zip", "target") + + fake_parser.extract_to.assert_called_once_with("archive.zip", "target") + + +def test_get_parser_supports_uppercase_extension() -> None: + holder = ParserHolder() + parser = holder.get_parser("ABC.ZIP") + assert isinstance(parser, ZipParser) + + +def test_parse_yaml_from_archive_delegates_to_selected_parser() -> None: + holder = ParserHolder() + fake_parser = MagicMock() + fake_result = object() + fake_parser.parse_yaml_from_archive.return_value = fake_result + holder._parsers["zip"] = fake_parser + + result = holder.parse_yaml_from_archive("zip", "a.zip", "metadata.yml") + + assert result is fake_result + fake_parser.parse_yaml_from_archive.assert_called_once_with("a.zip", "metadata.yml", None, None) + + +@pytest.mark.parametrize("name", ["a.tar", "a.gz", "a.tgz"]) +def test_get_parser_supports_tar_like_extensions(name: str) -> None: + holder = ParserHolder() + parser = holder.get_parser(name) + assert parser is not None + + +def test_parse_yaml_from_archive_raises_when_type_unsupported() -> None: + holder = ParserHolder() + with pytest.raises(ValueError): + holder.parse_yaml_from_archive("rar", "a.rar", "metadata.yml") + + +def test_extract_to_raises_when_type_unsupported() -> None: + holder = ParserHolder() + with pytest.raises(ValueError): + holder.extract_to("rar", "a.rar", "tmp") diff --git a/runtime/datamate-python/tests/test_module_orchestration.py b/runtime/datamate-python/tests/test_module_orchestration.py new file mode 100644 index 00000000..a7432f15 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_orchestration.py @@ -0,0 +1,32 @@ +from pathlib 
import Path + + +def test_orchestration_module_has_no_python_sources_yet() -> None: + module_dir = Path(__file__).resolve().parents[1] / "app" / "module" / "orchestration" + py_files = list(module_dir.rglob("*.py")) + + assert py_files == [], ( + "orchestration 模块已有 Python 实现,请补充真实业务单测并删除该占位用例" + ) + + +def test_orchestration_module_scaffold_directories_exist() -> None: + root = Path(__file__).resolve().parents[1] / "app" / "module" / "orchestration" + + assert (root / "interface").exists() + assert (root / "schema").exists() + assert (root / "service").exists() + + +def test_orchestration_scaffold_contains_only_directories_or_cache() -> None: + root = Path(__file__).resolve().parents[1] / "app" / "module" / "orchestration" + names = {p.name for p in root.iterdir()} + assert "interface" in names + assert "schema" in names + assert "service" in names + + +def test_orchestration_module_path_exists() -> None: + root = Path(__file__).resolve().parents[1] / "app" / "module" / "orchestration" + assert root.exists() + assert root.is_dir() diff --git a/runtime/datamate-python/tests/test_module_rag.py b/runtime/datamate-python/tests/test_module_rag.py new file mode 100644 index 00000000..2a61e72b --- /dev/null +++ b/runtime/datamate-python/tests/test_module_rag.py @@ -0,0 +1,49 @@ +import pytest + +from app.module.rag.service.common.text_cleaner import TextCleaner + + +def test_clean_removes_control_chars_and_empty_lines() -> None: + raw = "Hello\x00 world\n\n\n\tLine2\n" + + cleaned = TextCleaner.clean(raw) + + assert cleaned == "Hello world\n Line2" + + +def test_has_printable_content() -> None: + assert TextCleaner.has_printable_content(" \n\t") is False + assert TextCleaner.has_printable_content(" 数据A ") is True + + +def test_clean_returns_empty_string_for_none_or_empty() -> None: + assert TextCleaner.clean(None) == "" + assert TextCleaner.clean("") == "" + + +def test_clean_normalizes_multiple_spaces() -> None: + cleaned = TextCleaner.clean("A B\t\tC") + assert cleaned 
== "A B C" + + +def test_remove_control_characters_private_method_behavior() -> None: + cleaned = TextCleaner._remove_control_characters("ab\x01\x02cd") + assert cleaned == "abcd" + + +def test_remove_empty_lines_private_method_behavior() -> None: + text = "line1\n\n \nline2\n" + assert TextCleaner._remove_empty_lines(text) == "line1\nline2" + + +@pytest.mark.parametrize( + "text,expected", + [ + ("", False), + ("\n\t ", False), + ("A", True), + (" 1 ", True), + ], +) +def test_has_printable_content_parametrized(text: str, expected: bool) -> None: + assert TextCleaner.has_printable_content(text) is expected diff --git a/runtime/datamate-python/tests/test_module_ratio.py b/runtime/datamate-python/tests/test_module_ratio.py new file mode 100644 index 00000000..323763b5 --- /dev/null +++ b/runtime/datamate-python/tests/test_module_ratio.py @@ -0,0 +1,88 @@ +import pytest + +from app.module.ratio.schema.ratio_task import CreateRatioTaskRequest, FilterCondition + + +def test_filter_condition_rejects_bad_date_range_order() -> None: + with pytest.raises(ValueError): + FilterCondition(dateRange=["2025-01-02", "2025-01-01"]) + + +def test_create_ratio_task_request_validates_numeric_totals() -> None: + with pytest.raises(ValueError): + CreateRatioTaskRequest(name="r1", totals="abc", config=[]) + + +def test_create_ratio_task_request_accepts_valid_numeric_values() -> None: + request = CreateRatioTaskRequest( + name="ratio-task", + totals="10", + config=[ + { + "datasetId": "ds-1", + "counts": "5", + "filterConditions": { + "dateRange": ["2025-01-01", "2025-01-31"], + "label": {"label": "intent", "value": "A"}, + }, + } + ], + ) + + assert request.totals == "10" + assert request.config[0].counts == "5" + + +def test_filter_condition_rejects_invalid_date_range_length() -> None: + with pytest.raises(ValueError): + FilterCondition(dateRange=["2025-01-01"]) + + +def test_create_ratio_task_request_rejects_non_numeric_counts() -> None: + with pytest.raises(ValueError): + 
CreateRatioTaskRequest( + name="ratio-task", + totals="10", + config=[ + { + "datasetId": "ds-1", + "counts": "x", + "filterConditions": {"dateRange": ["2025-01-01", "2025-01-02"]}, + } + ], + )
+
+
+def test_filter_condition_accepts_none_date_range() -> None:
+    cond = FilterCondition(dateRange=None)
+    assert cond.date_range is None
+
+
+def test_filter_condition_rejects_invalid_date_string() -> None:
+    with pytest.raises(ValueError):
+        FilterCondition(dateRange=["bad-date", "2025-01-01"])
+
+
+def test_create_ratio_task_request_accepts_zero_totals() -> None:
+    req = CreateRatioTaskRequest(name="r0", totals="0", config=[])
+    assert req.totals == "0"
+
+
+def test_create_ratio_task_request_rejects_negative_totals() -> None:
+    with pytest.raises(ValueError):
+        CreateRatioTaskRequest(name="r1", totals="-1", config=[])
+
+
+def test_create_ratio_task_request_alias_mapping_for_dataset_id() -> None:
+    req = CreateRatioTaskRequest(
+        name="ratio-task",
+        totals="2",
+        config=[
+            {
+                "datasetId": "ds-alias",
+                "counts": "1",
+                "filterConditions": {"dateRange": ["2025-01-01", "2025-01-02"]},
+            }
+        ],
+    )
+    assert req.config[0].dataset_id == "ds-alias"
diff --git a/runtime/datamate-python/tests/test_module_shared.py b/runtime/datamate-python/tests/test_module_shared.py
new file mode 100644
index 00000000..e81b7e49
--- /dev/null
+++ b/runtime/datamate-python/tests/test_module_shared.py
@@ -0,0 +1,77 @@
+import json
+
+import pytest
+
+from app.module.shared.util.structured_file import (
+    COTItemHandler,
+    ItemTypes,
+    QAItemHandler,
+    StructuredFileHandlerFactory,
+)
+
+
+def test_qa_handler_validate_json_accepts_alpaca_item() -> None:
+    handler = QAItemHandler()
+    assert handler.validate_json({"instruction": "i", "output": "o"}) is True
+
+
+def test_get_items_from_jsonl_skips_invalid_rows(tmp_path) -> None:
+    file_path = tmp_path / "qa.jsonl"
+    rows = [
+        {"instruction": "i1", "output": "o1"},
+        {"instruction": "missing_output"},
+        {"instruction": "i2", "output": "o2"},
+    ]
+    file_path.write_text("\n".join(json.dumps(r, ensure_ascii=False) for r in rows), encoding="utf-8")
+
+    handler = QAItemHandler()
+    items = handler.get_items_from_file(str(file_path))
+
+    assert len(items) == 2
+    assert items[0]["output"] == "o1"
+    assert items[1]["output"] == "o2"
+
+
+def test_factory_get_handler_rejects_unknown_item_type() -> None:
+    factory = StructuredFileHandlerFactory()
+    with pytest.raises(ValueError):
+        factory.get_handler("UNKNOWN")
+
+
+def test_qa_handler_validate_json_rejects_invalid_item() -> None:
+    handler = QAItemHandler()
+    assert handler.validate_json({"input": "x"}) is False
+
+
+def test_factory_get_handler_returns_qa_handler() -> None:
+    factory = StructuredFileHandlerFactory()
+    handler = factory.get_handler(ItemTypes.QA.value)
+    assert isinstance(handler, QAItemHandler)
+
+
+def test_get_items_from_json_file_for_qa(tmp_path) -> None:
+    file_path = tmp_path / "qa.json"
+    file_path.write_text(
+        json.dumps([
+            {"instruction": "q1", "output": "a1"},
+            {"instruction": "q2", "output": "a2"},
+        ], ensure_ascii=False),
+        encoding="utf-8",
+    )
+
+    handler = QAItemHandler()
+    items = handler.get_items_from_file(str(file_path))
+
+    assert len(items) == 2
+    assert items[0]["instruction"] == "q1"
+
+
+def test_cot_handler_validate_json_requires_question_field() -> None:
+    handler = COTItemHandler()
+    assert handler.validate_json({"instruction": "x", "output": "y"}) is False
+
+
+def test_factory_get_handler_returns_cot_handler() -> None:
+    factory = StructuredFileHandlerFactory()
+    handler = factory.get_handler(ItemTypes.COT.value)
+    assert isinstance(handler, COTItemHandler)
diff --git a/runtime/datamate-python/tests/test_module_synthesis.py b/runtime/datamate-python/tests/test_module_synthesis.py
new file mode 100644
index 00000000..2417308d
--- /dev/null
+++ b/runtime/datamate-python/tests/test_module_synthesis.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+
+
+def test_synthesis_module_has_no_python_sources_yet() -> None:
+    module_dir = Path(__file__).resolve().parents[1] / "app" / "module" / "synthesis"
+    py_files = list(module_dir.rglob("*.py"))
+
+    assert py_files == [], (
+        "synthesis 模块已有 Python 实现,请补充真实业务单测并删除该占位用例"
+    )
+
+
+def test_synthesis_module_scaffold_directories_exist() -> None:
+    root = Path(__file__).resolve().parents[1] / "app" / "module" / "synthesis"
+
+    assert (root / "interface").exists()
+    assert (root / "schema").exists()
+    assert (root / "service").exists()
+
+
+def test_synthesis_scaffold_contains_only_expected_directories_or_cache() -> None:
+    root = Path(__file__).resolve().parents[1] / "app" / "module" / "synthesis"
+    names = {p.name for p in root.iterdir()}
+    assert "interface" in names
+    assert "schema" in names
+    assert "service" in names
+
+
+def test_synthesis_module_path_exists() -> None:
+    root = Path(__file__).resolve().parents[1] / "app" / "module" / "synthesis"
+    assert root.exists()
+    assert root.is_dir()
diff --git a/runtime/datamate-python/tests/test_module_system.py b/runtime/datamate-python/tests/test_module_system.py
new file mode 100644
index 00000000..01349fef
--- /dev/null
+++ b/runtime/datamate-python/tests/test_module_system.py
@@ -0,0 +1,67 @@
+import asyncio
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock
+
+from app.module.system.service.common_service import get_model_by_id
+
+
+def _run(coro):
+    return asyncio.run(coro)
+
+
+def test_get_model_by_id_returns_model_when_found() -> None:
+    db = MagicMock()
+    model = SimpleNamespace(id="m1")
+    result = MagicMock()
+    result.scalar_one_or_none.return_value = model
+    db.execute = AsyncMock(return_value=result)
+
+    fetched = _run(get_model_by_id(db, "m1"))
+
+    assert fetched is model
+
+
+def test_get_model_by_id_returns_none_when_missing() -> None:
+    db = MagicMock()
+    result = MagicMock()
+    result.scalar_one_or_none.return_value = None
+    db.execute = AsyncMock(return_value=result)
+
+    fetched = _run(get_model_by_id(db, "missing"))
+
+    assert fetched is None
+
+
+def test_get_model_by_id_invokes_db_execute_once() -> None:
+    db = MagicMock()
+    result = MagicMock()
+    result.scalar_one_or_none.return_value = None
+    db.execute = AsyncMock(return_value=result)
+
+    _run(get_model_by_id(db, "m2"))
+
+    db.execute.assert_called_once()
+
+
+def test_get_model_by_id_passes_query_object_to_execute() -> None:
+    db = MagicMock()
+    result = MagicMock()
+    result.scalar_one_or_none.return_value = None
+    db.execute = AsyncMock(return_value=result)
+
+    _run(get_model_by_id(db, "model-xyz"))
+
+    args, _ = db.execute.call_args
+    assert len(args) == 1
+    assert args[0] is not None
+
+
+def test_get_model_by_id_returns_exact_scalar_object() -> None:
+    db = MagicMock()
+    model_obj = SimpleNamespace(id="m100", endpoint="x")
+    result = MagicMock()
+    result.scalar_one_or_none.return_value = model_obj
+    db.execute = AsyncMock(return_value=result)
+
+    fetched = _run(get_model_by_id(db, "m100"))
+    assert fetched is model_obj