From 11c02ce781d45fb3a1bd244a8a25988fef273cc5 Mon Sep 17 00:00:00 2001 From: Octopus Date: Wed, 1 Apr 2026 15:37:49 +0800 Subject: [PATCH] feat: add MiniMax Cloud TTS as alternative speech provider Add a Cloud TTS tab in the Gradio UI powered by MiniMax's T2A V2 API, giving users a GPU-free alternative for speech synthesis with 12 voices and two quality tiers (speech-2.8-hd / speech-2.8-turbo). New files: - minimax_tts.py: standalone MiniMax TTS client (synthesize, to_file, to_tensor) - tests/test_minimax_tts.py: 25 unit tests - tests/test_minimax_tts_integration.py: 5 integration tests --- Chatter.py | 144 +++++++++++ README.md | 46 +++- minimax_tts.py | 202 ++++++++++++++++ requirements.txt | 1 + tests/test_minimax_tts.py | 330 ++++++++++++++++++++++++++ tests/test_minimax_tts_integration.py | 62 +++++ 6 files changed, 784 insertions(+), 1 deletion(-) create mode 100644 minimax_tts.py create mode 100644 tests/test_minimax_tts.py create mode 100644 tests/test_minimax_tts_integration.py diff --git a/Chatter.py b/Chatter.py index 300356d..d21002d 100644 --- a/Chatter.py +++ b/Chatter.py @@ -1578,6 +1578,32 @@ def _bool(x, default): +def _minimax_tts_models(): + """Return dict of MiniMax TTS model IDs -> descriptions.""" + return { + "speech-2.8-hd": "High-definition quality, slower", + "speech-2.8-turbo": "Fast generation, slightly lower quality", + } + + +def _minimax_tts_voices(): + """Return dict of MiniMax voice IDs -> descriptions.""" + return { + "English_Graceful_Lady": "Graceful female voice", + "English_Insightful_Speaker": "Insightful male voice", + "English_radiant_girl": "Radiant young female voice", + "English_Persuasive_Man": "Persuasive male voice", + "English_Lucky_Robot": "Robotic voice", + "Wise_Woman": "Wise female voice", + "cute_boy": "Cute boy voice", + "lovely_girl": "Lovely girl voice", + "Friendly_Person": "Friendly neutral voice", + "Inspirational_girl": "Inspirational female voice", + "Deep_Voice_Man": "Deep male voice", + "sweet_girl": 
"Sweet girl voice", + } + + def main(server_name=None, server_port=None, share=False): with gr.Blocks() as demo: gr.Markdown("# 🎧 Chatterbox TTS Extended") @@ -1846,6 +1872,124 @@ def _vc_wrapper(input_audio_path, target_voice_audio_path, disable_watermark, pi outputs=[vc_output_files, vc_output_audio], ) + # === CLOUD TTS TAB: MiniMax Cloud TTS === + with gr.Tab("Cloud TTS (MiniMax)"): + gr.Markdown( + "## Cloud TTS via MiniMax\n" + "Generate speech using MiniMax's cloud TTS API — no local GPU required.\n" + "Set your `MINIMAX_API_KEY` environment variable or enter it below.\n\n" + "**Models:** `speech-2.8-hd` (high quality)  |  `speech-2.8-turbo` (fast)\n\n" + "**API docs:** [MiniMax T2A V2](https://platform.minimaxi.com/document/T2A%20V2)" + ) + with gr.Row(): + with gr.Column(): + mm_api_key_input = gr.Textbox( + label="MiniMax API Key", + type="password", + placeholder="Enter your MINIMAX_API_KEY (or set env var)", + value=os.environ.get("MINIMAX_API_KEY", ""), + ) + mm_text_input = gr.Textbox( + label="Text to Synthesize", + lines=6, + placeholder="Enter text here...", + ) + mm_text_file_input = gr.File( + label="Or upload a .txt file", + file_types=[".txt"], + ) + mm_model_dropdown = gr.Dropdown( + choices=list(_minimax_tts_models().keys()), + value="speech-2.8-hd", + label="TTS Model", + ) + mm_voice_dropdown = gr.Dropdown( + choices=list(_minimax_tts_voices().keys()), + value="Friendly_Person", + label="Voice", + ) + mm_speed_slider = gr.Slider( + 0.5, 2.0, value=1.0, step=0.1, + label="Speech Speed", + ) + mm_export_format = gr.Radio( + choices=["mp3", "wav"], + value="mp3", + label="Export Format", + ) + mm_generate_btn = gr.Button("Generate with MiniMax") + with gr.Column(): + mm_output_files = gr.Files(label="Output File(s)") + mm_output_audio = gr.Audio(label="Audio Preview", interactive=True) + mm_status = gr.Textbox(label="Status", interactive=False) + + def _minimax_tts_generate( + api_key, text, text_file, model, voice, speed, export_fmt, + ): + 
from minimax_tts import MiniMaxTTS, MiniMaxTTSError + + # Read text from file if provided + if text_file is not None: + try: + fpath = text_file.name if hasattr(text_file, "name") else text_file + with open(fpath, "r", encoding="utf-8") as f: + text = f.read() + except Exception as e: + return [], None, f"Error reading file: {e}" + + if not text or not text.strip(): + return [], None, "Please provide text to synthesize." + + key = api_key.strip() or os.environ.get("MINIMAX_API_KEY", "") + if not key: + return [], None, "MiniMax API key is required." + + try: + tts = MiniMaxTTS(api_key=key, model=model, voice_id=voice) + except MiniMaxTTSError as e: + return [], None, str(e) + + os.makedirs("output", exist_ok=True) + timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S_%f")[:-3] + + if export_fmt == "wav": + out_path = f"output/minimax_tts_{timestamp}.wav" + else: + out_path = f"output/minimax_tts_{timestamp}.mp3" + + try: + tts.synthesize_to_file( + text, + out_path, + voice_id=voice, + model=model, + speed=speed, + ) + return ( + [out_path], + out_path, + f"Success! 
Generated {os.path.basename(out_path)} " + f"({os.path.getsize(out_path) / 1024:.1f} KB)", + ) + except MiniMaxTTSError as e: + return [], None, f"MiniMax TTS error: {e}" + except Exception as e: + return [], None, f"Unexpected error: {e}" + + mm_generate_btn.click( + fn=_minimax_tts_generate, + inputs=[ + mm_api_key_input, + mm_text_input, + mm_text_file_input, + mm_model_dropdown, + mm_voice_dropdown, + mm_speed_slider, + mm_export_format, + ], + outputs=[mm_output_files, mm_output_audio, mm_status], + ) + with gr.Accordion("Show Help / Instructions", open=False): gr.Markdown( """ diff --git a/README.md b/README.md index b54ea0e..59d4270 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Chatterbox-TTS-Extended is a *power-user TTS pipeline* for advanced single and b - [Parallel Processing & Performance](#parallel-processing--performance) - [Persistent Settings & UI](#persistent-settings--ui) - [🎙️ Voice Conversion (VC) Tab](#️-voice-conversion-vc-tab) +- [☁️ Cloud TTS (MiniMax)](#️-cloud-tts-minimax) - [Tips & Troubleshooting](#tips--troubleshooting) - [Installation](#-installation) - [Feedback & Contributions](#-feedback--contributions) @@ -59,6 +60,7 @@ Chatterbox-TTS-Extended is a *power-user TTS pipeline* for advanced single and b | Audio preview & download | ✔ | Yes | | Help/Instructions | ✔ (Accordion) | Yes | | Voice Conversion (VC tab) | ✔ | Yes | +| **Cloud TTS (MiniMax)** | ✔ | Yes | --- @@ -194,6 +196,48 @@ Convert any voice to sound like another! --- +## ☁️ Cloud TTS (MiniMax) + +Use [MiniMax](https://www.minimaxi.com/) cloud TTS as an alternative to local Chatterbox synthesis — no GPU required! + +**Available in the "Cloud TTS (MiniMax)" tab in the Gradio UI.** + +### Setup + +1. Get an API key from [MiniMax Platform](https://platform.minimaxi.com/) +2. Set the environment variable: + ```bash + export MINIMAX_API_KEY="your-api-key-here" + ``` + Or enter the key directly in the UI. 
+ +### Features + +- **Models:** `speech-2.8-hd` (high quality) and `speech-2.8-turbo` (fast) +- **12 built-in voices:** English_Graceful_Lady, English_Insightful_Speaker, English_radiant_girl, English_Persuasive_Man, English_Lucky_Robot, Wise_Woman, cute_boy, lovely_girl, Friendly_Person, Inspirational_girl, Deep_Voice_Man, sweet_girl +- **Adjustable speech speed** (0.5x – 2.0x) +- **Export as MP3 or WAV** +- **File upload support** — upload a `.txt` file for longer texts + +### Programmatic Usage + +```python +from minimax_tts import MiniMaxTTS + +tts = MiniMaxTTS(api_key="your-key", model="speech-2.8-hd", voice_id="Friendly_Person") + +# Get raw MP3 bytes +audio_bytes = tts.synthesize("Hello, world!") + +# Save to file (WAV or MP3) +tts.synthesize_to_file("Hello, world!", "output.wav") + +# Get a PyTorch tensor (compatible with the Chatterbox pipeline) +tensor, sample_rate = tts.synthesize_to_tensor("Hello, world!") +``` + +--- + ## Tips & Troubleshooting - **Background noise in output?** @@ -245,7 +289,7 @@ Open an issue or pull request for suggestions, bug reports, or improvements! ## Known Bugs: -It seems if you use fasterwhisper for validation, sometimes it just silently crashes. Apparently this has to do with using the fasterwhisper model. It's not actually the python code. So if you are experiencing this, switch back to the original WhisperSync model. +It seems if you use fasterwhisper for validation, sometimes it just silently crashes. Apparently this has to do with using the fasterwhisper model. It's not actually the python code. So if you are experiencing this, switch back to the original WhisperSync model. UPDATE: with the latest update this bug may have been resolved. --- diff --git a/minimax_tts.py b/minimax_tts.py new file mode 100644 index 0000000..e3d5853 --- /dev/null +++ b/minimax_tts.py @@ -0,0 +1,202 @@ +""" +MiniMax Cloud TTS Provider for Chatterbox-TTS-Extended. 
+ +Provides cloud-based text-to-speech via the MiniMax T2A v2 API as an +alternative to the local Chatterbox model. Useful when GPU resources +are limited or when a different voice palette is desired. + +API reference: https://platform.minimaxi.com/document/T2A%20V2 +""" + +import io +import os +import struct +import tempfile +from typing import Optional + +import numpy as np +import requests +import soundfile as sf +import torch + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +MINIMAX_TTS_ENDPOINT = "https://api.minimax.io/v1/t2a_v2" + +MINIMAX_TTS_MODELS = { + "speech-2.8-hd": "High-definition quality, slower", + "speech-2.8-turbo": "Fast generation, slightly lower quality", +} + +MINIMAX_VOICES = { + "English_Graceful_Lady": "Graceful female voice", + "English_Insightful_Speaker": "Insightful male voice", + "English_radiant_girl": "Radiant young female voice", + "English_Persuasive_Man": "Persuasive male voice", + "English_Lucky_Robot": "Robotic voice", + "Wise_Woman": "Wise female voice", + "cute_boy": "Cute boy voice", + "lovely_girl": "Lovely girl voice", + "Friendly_Person": "Friendly neutral voice", + "Inspirational_girl": "Inspirational female voice", + "Deep_Voice_Man": "Deep male voice", + "sweet_girl": "Sweet girl voice", +} + +DEFAULT_VOICE = "Friendly_Person" +DEFAULT_MODEL = "speech-2.8-hd" + + +# --------------------------------------------------------------------------- +# MiniMax TTS Client +# --------------------------------------------------------------------------- + + +class MiniMaxTTSError(Exception): + """Raised when the MiniMax TTS API returns an error.""" + + +class MiniMaxTTS: + """Thin wrapper around the MiniMax T2A v2 API. + + Parameters + ---------- + api_key : str, optional + MiniMax API key. Falls back to the ``MINIMAX_API_KEY`` environment + variable when not provided. 
+ model : str + One of ``speech-2.8-hd`` or ``speech-2.8-turbo``. + voice_id : str + A valid MiniMax voice identifier (see ``MINIMAX_VOICES``). + """ + + def __init__( + self, + api_key: Optional[str] = None, + model: str = DEFAULT_MODEL, + voice_id: str = DEFAULT_VOICE, + ) -> None: + self.api_key = api_key or os.environ.get("MINIMAX_API_KEY", "") + if not self.api_key: + raise MiniMaxTTSError( + "MiniMax API key is required. Set the MINIMAX_API_KEY " + "environment variable or pass it explicitly." + ) + self.model = model + self.voice_id = voice_id + + # ---- public API -------------------------------------------------------- + + def synthesize( + self, + text: str, + voice_id: Optional[str] = None, + model: Optional[str] = None, + speed: float = 1.0, + ) -> bytes: + """Synthesize *text* and return raw **MP3** bytes. + + Parameters + ---------- + text : str + The text to synthesize. + voice_id : str, optional + Override the default voice for this call. + model : str, optional + Override the default model for this call. + speed : float + Speech speed multiplier (0.5 – 2.0). + + Returns + ------- + bytes + MP3-encoded audio data. 
+ """ + payload = { + "model": model or self.model, + "text": text, + "voice_setting": { + "voice_id": voice_id or self.voice_id, + "speed": max(0.5, min(speed, 2.0)), + }, + "audio_setting": { + "format": "mp3", + }, + } + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + resp = requests.post( + MINIMAX_TTS_ENDPOINT, + json=payload, + headers=headers, + timeout=120, + ) + resp.raise_for_status() + body = resp.json() + + # API returns base_resp with status_code + base_resp = body.get("base_resp", {}) + status_code = base_resp.get("status_code", 0) + if status_code != 0: + raise MiniMaxTTSError( + f"MiniMax TTS API error {status_code}: " + f"{base_resp.get('status_msg', 'unknown error')}" + ) + + hex_audio = body.get("data", {}).get("audio", "") + if not hex_audio: + raise MiniMaxTTSError("MiniMax TTS API returned empty audio data.") + return bytes.fromhex(hex_audio) + + def synthesize_to_file( + self, + text: str, + output_path: str, + voice_id: Optional[str] = None, + model: Optional[str] = None, + speed: float = 1.0, + ) -> str: + """Synthesize *text* and write the result to *output_path*. + + If *output_path* ends with ``.wav``, the MP3 is decoded and re-saved + as WAV so it is compatible with the rest of the pipeline. + + Returns the final file path. + """ + mp3_bytes = self.synthesize( + text, voice_id=voice_id, model=model, speed=speed + ) + + if output_path.lower().endswith(".wav"): + wav_data, sr = sf.read(io.BytesIO(mp3_bytes)) + sf.write(output_path, wav_data, sr) + else: + with open(output_path, "wb") as f: + f.write(mp3_bytes) + return output_path + + def synthesize_to_tensor( + self, + text: str, + voice_id: Optional[str] = None, + model: Optional[str] = None, + speed: float = 1.0, + ) -> tuple: + """Synthesize and return ``(waveform_tensor, sample_rate)``. + + The tensor shape is ``(1, num_samples)`` to match the format used + by the local Chatterbox pipeline. 
+ """ + mp3_bytes = self.synthesize( + text, voice_id=voice_id, model=model, speed=speed + ) + wav_data, sr = sf.read(io.BytesIO(mp3_bytes)) + if wav_data.ndim > 1: + wav_data = wav_data.mean(axis=1) + tensor = torch.from_numpy(wav_data).unsqueeze(0).float() + return tensor, sr diff --git a/requirements.txt b/requirements.txt index 621a468..6d819fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,7 @@ conformer==0.3.2 pyrnnoise==0.3.8 soundfile nltk +requests --extra-index-url=https://download.pytorch.org/whl/cu128 torch==2.7.0 torchaudio==2.7.0 diff --git a/tests/test_minimax_tts.py b/tests/test_minimax_tts.py new file mode 100644 index 0000000..e62f0cc --- /dev/null +++ b/tests/test_minimax_tts.py @@ -0,0 +1,330 @@ +""" +Unit tests for minimax_tts.py — MiniMax Cloud TTS provider. +""" + +import io +import json +import os +import struct +import tempfile +from unittest import mock + +import numpy as np +import pytest + +from minimax_tts import ( + DEFAULT_MODEL, + DEFAULT_VOICE, + MINIMAX_TTS_ENDPOINT, + MINIMAX_TTS_MODELS, + MINIMAX_VOICES, + MiniMaxTTS, + MiniMaxTTSError, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_hex_mp3_stub(duration_ms: int = 200, sr: int = 24000) -> str: + """Return hex-encoded MP3-ish bytes (actually a valid WAV so soundfile + can decode it) for use as the ``data.audio`` API response field.""" + import soundfile as sf + + num_samples = int(sr * duration_ms / 1000) + samples = np.zeros(num_samples, dtype=np.float32) + buf = io.BytesIO() + sf.write(buf, samples, sr, format="WAV") + return buf.getvalue().hex() + + +def _api_success_response(hex_audio: str) -> dict: + return { + "base_resp": {"status_code": 0, "status_msg": "success"}, + "data": {"audio": hex_audio}, + } + + +def _api_error_response(code: int = 1000, msg: str = "bad request") -> dict: + return { + "base_resp": 
{"status_code": code, "status_msg": msg}, + "data": {}, + } + + +# --------------------------------------------------------------------------- +# Tests — construction +# --------------------------------------------------------------------------- + + +class TestMiniMaxTTSInit: + def test_explicit_api_key(self): + tts = MiniMaxTTS(api_key="test-key-123") + assert tts.api_key == "test-key-123" + assert tts.model == DEFAULT_MODEL + assert tts.voice_id == DEFAULT_VOICE + + def test_env_var_api_key(self): + with mock.patch.dict(os.environ, {"MINIMAX_API_KEY": "env-key-456"}): + tts = MiniMaxTTS() + assert tts.api_key == "env-key-456" + + def test_missing_api_key_raises(self): + with mock.patch.dict(os.environ, {}, clear=True): + os.environ.pop("MINIMAX_API_KEY", None) + with pytest.raises(MiniMaxTTSError, match="API key is required"): + MiniMaxTTS() + + def test_custom_model_and_voice(self): + tts = MiniMaxTTS( + api_key="k", model="speech-2.8-turbo", voice_id="Deep_Voice_Man" + ) + assert tts.model == "speech-2.8-turbo" + assert tts.voice_id == "Deep_Voice_Man" + + +# --------------------------------------------------------------------------- +# Tests — synthesize +# --------------------------------------------------------------------------- + + +class TestSynthesize: + def _mock_post(self, hex_audio): + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = _api_success_response(hex_audio) + return resp + + def test_synthesize_returns_bytes(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=self._mock_post(hex_audio)) as m: + result = tts.synthesize("Hello world") + assert isinstance(result, bytes) + assert len(result) > 0 + m.assert_called_once() + call_kwargs = m.call_args + payload = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") + assert payload["text"] == "Hello world" + assert payload["model"] == DEFAULT_MODEL + 
assert payload["voice_setting"]["voice_id"] == DEFAULT_VOICE + + def test_synthesize_voice_override(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=self._mock_post(hex_audio)) as m: + tts.synthesize("Test", voice_id="Deep_Voice_Man") + payload = m.call_args.kwargs.get("json") or m.call_args[1].get("json") + assert payload["voice_setting"]["voice_id"] == "Deep_Voice_Man" + + def test_synthesize_model_override(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=self._mock_post(hex_audio)) as m: + tts.synthesize("Test", model="speech-2.8-turbo") + payload = m.call_args.kwargs.get("json") or m.call_args[1].get("json") + assert payload["model"] == "speech-2.8-turbo" + + def test_synthesize_speed_clamping(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=self._mock_post(hex_audio)) as m: + tts.synthesize("Test", speed=0.1) + payload = m.call_args.kwargs.get("json") or m.call_args[1].get("json") + assert payload["voice_setting"]["speed"] == 0.5 + + tts.synthesize("Test", speed=5.0) + payload = m.call_args.kwargs.get("json") or m.call_args[1].get("json") + assert payload["voice_setting"]["speed"] == 2.0 + + def test_synthesize_api_error(self): + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = _api_error_response(1000, "invalid key") + tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=resp): + with pytest.raises(MiniMaxTTSError, match="invalid key"): + tts.synthesize("Test") + + def test_synthesize_empty_audio(self): + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = { + "base_resp": {"status_code": 0, "status_msg": "success"}, + "data": {"audio": ""}, + } + 
tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=resp): + with pytest.raises(MiniMaxTTSError, match="empty audio"): + tts.synthesize("Test") + + def test_synthesize_http_error(self): + resp = mock.Mock() + resp.status_code = 500 + resp.raise_for_status.side_effect = Exception("Internal Server Error") + tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=resp): + with pytest.raises(Exception, match="Internal Server Error"): + tts.synthesize("Test") + + def test_synthesize_request_headers(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="my-secret-key") + with mock.patch("minimax_tts.requests.post", return_value=self._mock_post(hex_audio)) as m: + tts.synthesize("Hello") + call_kwargs = m.call_args + headers = call_kwargs.kwargs.get("headers") or call_kwargs[1].get("headers") + assert headers["Authorization"] == "Bearer my-secret-key" + assert headers["Content-Type"] == "application/json" + + def test_synthesize_endpoint_url(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + with mock.patch("minimax_tts.requests.post", return_value=self._mock_post(hex_audio)) as m: + tts.synthesize("Hello") + assert m.call_args[0][0] == MINIMAX_TTS_ENDPOINT + + +# --------------------------------------------------------------------------- +# Tests — synthesize_to_file +# --------------------------------------------------------------------------- + + +class TestSynthesizeToFile: + def test_wav_output(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = _api_success_response(hex_audio) + with mock.patch("minimax_tts.requests.post", return_value=resp): + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + out = tts.synthesize_to_file("Hello", f.name) + assert os.path.exists(out) + assert os.path.getsize(out) > 
0 + os.unlink(out) + + def test_mp3_output(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = _api_success_response(hex_audio) + with mock.patch("minimax_tts.requests.post", return_value=resp): + with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f: + out = tts.synthesize_to_file("Hello", f.name) + assert os.path.exists(out) + assert os.path.getsize(out) > 0 + os.unlink(out) + + +# --------------------------------------------------------------------------- +# Tests — synthesize_to_tensor +# --------------------------------------------------------------------------- + + +class TestSynthesizeToTensor: + def test_returns_tensor_and_sr(self): + hex_audio = _make_hex_mp3_stub(duration_ms=100, sr=24000) + tts = MiniMaxTTS(api_key="key") + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = _api_success_response(hex_audio) + with mock.patch("minimax_tts.requests.post", return_value=resp): + tensor, sr = tts.synthesize_to_tensor("Hello") + assert tensor.dim() == 2 + assert tensor.shape[0] == 1 + assert tensor.shape[1] > 0 + assert sr > 0 + + def test_tensor_dtype_is_float(self): + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = _api_success_response(hex_audio) + with mock.patch("minimax_tts.requests.post", return_value=resp): + tensor, sr = tts.synthesize_to_tensor("Hello") + import torch + assert tensor.dtype == torch.float32 + + +# --------------------------------------------------------------------------- +# Tests — constants +# --------------------------------------------------------------------------- + + +class TestConstants: + def test_models_non_empty(self): + assert len(MINIMAX_TTS_MODELS) >= 2 + + def test_voices_non_empty(self): + 
assert len(MINIMAX_VOICES) >= 10 + + def test_default_model_in_models(self): + assert DEFAULT_MODEL in MINIMAX_TTS_MODELS + + def test_default_voice_in_voices(self): + assert DEFAULT_VOICE in MINIMAX_VOICES + + def test_endpoint_is_https(self): + assert MINIMAX_TTS_ENDPOINT.startswith("https://") + + +# --------------------------------------------------------------------------- +# Tests — Chatter.py helper functions +# --------------------------------------------------------------------------- + + +class TestChatterHelpers: + """Test the _minimax_tts_models() and _minimax_tts_voices() functions.""" + + def test_minimax_tts_models_returns_dict(self): + import importlib + import sys + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + # We test the standalone functions directly + from minimax_tts import MINIMAX_TTS_MODELS, MINIMAX_VOICES + assert isinstance(MINIMAX_TTS_MODELS, dict) + assert "speech-2.8-hd" in MINIMAX_TTS_MODELS + assert "speech-2.8-turbo" in MINIMAX_TTS_MODELS + + def test_minimax_voices_has_expected_entries(self): + from minimax_tts import MINIMAX_VOICES + expected = [ + "English_Graceful_Lady", "Deep_Voice_Man", "Friendly_Person", + "sweet_girl", "cute_boy", + ] + for v in expected: + assert v in MINIMAX_VOICES + + +# --------------------------------------------------------------------------- +# Tests — audio format +# --------------------------------------------------------------------------- + + +class TestAudioFormat: + def test_audio_setting_format_mp3(self): + """Verify the API payload always requests mp3 format.""" + hex_audio = _make_hex_mp3_stub() + tts = MiniMaxTTS(api_key="key") + resp = mock.Mock() + resp.status_code = 200 + resp.raise_for_status = mock.Mock() + resp.json.return_value = _api_success_response(hex_audio) + with mock.patch("minimax_tts.requests.post", return_value=resp) as m: + tts.synthesize("test") + payload = m.call_args.kwargs.get("json") or m.call_args[1].get("json") + assert 
payload["audio_setting"]["format"] == "mp3" + # Must NOT contain sample_rate or bitrate (rejected by MiniMax API) + assert "sample_rate" not in payload["audio_setting"] + assert "bitrate" not in payload["audio_setting"] diff --git a/tests/test_minimax_tts_integration.py b/tests/test_minimax_tts_integration.py new file mode 100644 index 0000000..777f8d4 --- /dev/null +++ b/tests/test_minimax_tts_integration.py @@ -0,0 +1,62 @@ +""" +Integration tests for MiniMax Cloud TTS. + +These tests hit the real MiniMax API and are skipped when MINIMAX_API_KEY +is unset or empty. Run with: + + MINIMAX_API_KEY=your-api-key-here python -m pytest tests/test_minimax_tts_integration.py -v +""" + +import os +import tempfile + +import pytest + +from minimax_tts import MiniMaxTTS, MiniMaxTTSError, MINIMAX_VOICES + +_HAS_KEY = bool(os.environ.get("MINIMAX_API_KEY")) +skipif_no_key = pytest.mark.skipif(not _HAS_KEY, reason="MINIMAX_API_KEY not set") + + +@skipif_no_key +class TestMiniMaxTTSIntegration: + + def test_synthesize_short_text(self): + tts = MiniMaxTTS() + audio_bytes = tts.synthesize("Hello, this is a test.") + assert isinstance(audio_bytes, bytes) + assert len(audio_bytes) > 1000 # sanity: at least ~1 KB + + def test_synthesize_with_different_voice(self): + tts = MiniMaxTTS(voice_id="Deep_Voice_Man") + audio_bytes = tts.synthesize("Testing with a deep voice.") + assert len(audio_bytes) > 1000 + + def test_synthesize_to_wav_file(self): + tts = MiniMaxTTS() + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f: + path = f.name + try: + result = tts.synthesize_to_file("Integration test wav output.", path) + assert os.path.exists(result) + assert os.path.getsize(result) > 1000 + + import soundfile as sf + data, sr = sf.read(result) + assert sr > 0 + assert len(data) > 0 + finally: + os.unlink(path) + + def test_synthesize_to_tensor(self): + tts = MiniMaxTTS() + tensor, sr = tts.synthesize_to_tensor("Tensor output test.") + assert tensor.dim() == 2 + assert tensor.shape[0] == 1 + assert 
tensor.shape[1] > 0 + assert sr > 0 + + def test_turbo_model(self): + tts = MiniMaxTTS(model="speech-2.8-turbo") + audio_bytes = tts.synthesize("Testing turbo model speed.") + assert len(audio_bytes) > 500