petermg · octo-patch · Apr 1, 2026
diff --git a/Chatter.py b/Chatter.py
@@ -1578,6 +1578,32 @@ def _bool(x, default):
 
 
 
+def _minimax_tts_models():
+    """Return dict of MiniMax TTS model IDs -> descriptions."""
+    return {
+        "speech-2.8-hd": "High-definition quality, slower",
+        "speech-2.8-turbo": "Fast generation, slightly lower quality",
+    }
+
+
+def _minimax_tts_voices():
+    """Return dict of MiniMax voice IDs -> descriptions."""
+    return {
+        "English_Graceful_Lady": "Graceful female voice",
+        "English_Insightful_Speaker": "Insightful male voice",
+        "English_radiant_girl": "Radiant young female voice",
+        "English_Persuasive_Man": "Persuasive male voice",
+        "English_Lucky_Robot": "Robotic voice",
+        "Wise_Woman": "Wise female voice",
+        "cute_boy": "Cute boy voice",
+        "lovely_girl": "Lovely girl voice",
+        "Friendly_Person": "Friendly neutral voice",
+        "Inspirational_girl": "Inspirational female voice",
+        "Deep_Voice_Man": "Deep male voice",
+        "sweet_girl": "Sweet girl voice",
+    }
+
+
 def main(server_name=None, server_port=None, share=False):
     with gr.Blocks() as demo:
         gr.Markdown("# 🎧 Chatterbox TTS Extended")
@@ -1846,6 +1872,124 @@ def _vc_wrapper(input_audio_path, target_voice_audio_path, disable_watermark, pi
                     outputs=[vc_output_files, vc_output_audio],
                 )
 
+            # === CLOUD TTS TAB: MiniMax Cloud TTS ===
+            with gr.Tab("Cloud TTS (MiniMax)"):
+                gr.Markdown(
+                    "## Cloud TTS via MiniMax\n"
+                    "Generate speech using MiniMax's cloud TTS API — no local GPU required.\n"
+                    "Set your `MINIMAX_API_KEY` environment variable or enter it below.\n\n"
+                    "**Models:** `speech-2.8-hd` (high quality) &nbsp;|&nbsp; `speech-2.8-turbo` (fast)\n\n"
+                    "**API docs:** [MiniMax T2A V2](https://platform.minimaxi.com/document/T2A%20V2)"
+                )
+                with gr.Row():
+                    with gr.Column():
+                        mm_api_key_input = gr.Textbox(
+                            label="MiniMax API Key",
+                            type="password",
+                            placeholder="Enter your MINIMAX_API_KEY (or set env var)",
+                            value=os.environ.get("MINIMAX_API_KEY", ""),
+                        )
+                        mm_text_input = gr.Textbox(
+                            label="Text to Synthesize",
+                            lines=6,
+                            placeholder="Enter text here...",
+                        )
+                        mm_text_file_input = gr.File(
+                            label="Or upload a .txt file",
+                            file_types=[".txt"],
+                        )
+                        mm_model_dropdown = gr.Dropdown(
+                            choices=list(_minimax_tts_models().keys()),
+                            value="speech-2.8-hd",
+                            label="TTS Model",
+                        )
+                        mm_voice_dropdown = gr.Dropdown(
+                            choices=list(_minimax_tts_voices().keys()),
+                            value="Friendly_Person",
+                            label="Voice",
+                        )
+                        mm_speed_slider = gr.Slider(
+                            0.5, 2.0, value=1.0, step=0.1,
+                            label="Speech Speed",
+                        )
+                        mm_export_format = gr.Radio(
+                            choices=["mp3", "wav"],
+                            value="mp3",
+                            label="Export Format",
+                        )
+                        mm_generate_btn = gr.Button("Generate with MiniMax")
+                    with gr.Column():
+                        mm_output_files = gr.Files(label="Output File(s)")
+                        mm_output_audio = gr.Audio(label="Audio Preview", interactive=True)
+                        mm_status = gr.Textbox(label="Status", interactive=False)
+
+                def _minimax_tts_generate(
+                    api_key, text, text_file, model, voice, speed, export_fmt,
+                ):
+                    from minimax_tts import MiniMaxTTS, MiniMaxTTSError
+
+                    # Read text from file if provided
+                    if text_file is not None:
+                        try:
+                            fpath = text_file.name if hasattr(text_file, "name") else text_file
+                            with open(fpath, "r", encoding="utf-8") as f:
+                                text = f.read()
+                        except Exception as e:
+                            return [], None, f"Error reading file: {e}"
+
+                    if not text or not text.strip():
+                        return [], None, "Please provide text to synthesize."
+
+                    key = api_key.strip() or os.environ.get("MINIMAX_API_KEY", "")
+                    if not key:
+                        return [], None, "MiniMax API key is required."
+
+                    try:
+                        tts = MiniMaxTTS(api_key=key, model=model, voice_id=voice)
+                    except MiniMaxTTSError as e:
+                        return [], None, str(e)
+
+                    os.makedirs("output", exist_ok=True)
+                    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S_%f")[:-3]
+
+                    if export_fmt == "wav":
+                        out_path = f"output/minimax_tts_{timestamp}.wav"
+                    else:
+                        out_path = f"output/minimax_tts_{timestamp}.mp3"
+
+                    try:
+                        tts.synthesize_to_file(
+                            text,
+                            out_path,
+                            voice_id=voice,
+                            model=model,
+                            speed=speed,
+                        )
+                        return (
+                            [out_path],
+                            out_path,
+                            f"Success! Generated {os.path.basename(out_path)} "
+                            f"({os.path.getsize(out_path) / 1024:.1f} KB)",
+                        )
+                    except MiniMaxTTSError as e:
+                        return [], None, f"MiniMax TTS error: {e}"
+                    except Exception as e:
+                        return [], None, f"Unexpected error: {e}"
+
+                mm_generate_btn.click(
+                    fn=_minimax_tts_generate,
+                    inputs=[
+                        mm_api_key_input,
+                        mm_text_input,
+                        mm_text_file_input,
+                        mm_model_dropdown,
+                        mm_voice_dropdown,
+                        mm_speed_slider,
+                        mm_export_format,
+                    ],
+                    outputs=[mm_output_files, mm_output_audio, mm_status],
+                )
+
         with gr.Accordion("Show Help / Instructions", open=False):
             gr.Markdown(
             """

diff --git a/README.md b/README.md
@@ -27,6 +27,7 @@ Chatterbox-TTS-Extended is a *power-user TTS pipeline* for advanced single and b
 - [Parallel Processing & Performance](#parallel-processing--performance)
 - [Persistent Settings & UI](#persistent-settings--ui)
 - [🎙️ Voice Conversion (VC) Tab](#️-voice-conversion-vc-tab)
+- [☁️ Cloud TTS (MiniMax)](#️-cloud-tts-minimax)
 - [Tips & Troubleshooting](#tips--troubleshooting)
 - [Installation](#-installation)
 - [Feedback & Contributions](#-feedback--contributions)
@@ -59,6 +60,7 @@ Chatterbox-TTS-Extended is a *power-user TTS pipeline* for advanced single and b
 | Audio preview & download                  | ✔             | Yes          |
 | Help/Instructions                         | ✔ (Accordion) | Yes          |
 | Voice Conversion (VC tab)                 | ✔             | Yes          |
+| **Cloud TTS (MiniMax)**                   | ✔             | Yes          |
 
 ---
 
@@ -194,6 +196,48 @@ Convert any voice to sound like another!
 
 ---
 
+## ☁️ Cloud TTS (MiniMax)
+
+Use [MiniMax](https://www.minimaxi.com/) cloud TTS as an alternative to local Chatterbox synthesis — no GPU required!
+
+**Available in the "Cloud TTS (MiniMax)" tab in the Gradio UI.**
+
+### Setup
+
+1. Get an API key from [MiniMax Platform](https://platform.minimaxi.com/)
+2. Set the environment variable:
+   ```bash
+   export MINIMAX_API_KEY="your-api-key-here"
+   ```
+   Or enter the key directly in the UI.
+
+### Features
+
+- **Models:** `speech-2.8-hd` (high quality) and `speech-2.8-turbo` (fast)
+- **12 built-in voices:** English_Graceful_Lady, English_Insightful_Speaker, English_radiant_girl, English_Persuasive_Man, English_Lucky_Robot, Wise_Woman, cute_boy, lovely_girl, Friendly_Person, Inspirational_girl, Deep_Voice_Man, sweet_girl
+- **Adjustable speech speed** (0.5x – 2.0x)
+- **Export as MP3 or WAV**
+- **File upload support** — upload a `.txt` file for longer texts
+
+### Programmatic Usage
+
+```python
+from minimax_tts import MiniMaxTTS
+
+tts = MiniMaxTTS(api_key="your-key", model="speech-2.8-hd", voice_id="Friendly_Person")
+
+# Get raw MP3 bytes
+audio_bytes = tts.synthesize("Hello, world!")
+
+# Save to file (WAV or MP3)
+tts.synthesize_to_file("Hello, world!", "output.wav")
+
+# Get a PyTorch tensor (compatible with the Chatterbox pipeline)
+tensor, sample_rate = tts.synthesize_to_tensor("Hello, world!")
+```
+
+---
+
 ## Tips & Troubleshooting
 
 - **Background noise in output?**
@@ -245,7 +289,7 @@ Open an issue or pull request for suggestions, bug reports, or improvements!
 
 ## Known Bugs:
 
-It seems if you use fasterwhisper for validation, sometimes it just silently crashes. Apparently this has to do with using the fasterwhisper model. It's not actually the python code. So if you are experiencing this, switch back to the original WhisperSync model. 
+It seems if you use fasterwhisper for validation, sometimes it just silently crashes. Apparently this has to do with using the fasterwhisper model. It's not actually the python code. So if you are experiencing this, switch back to the original WhisperSync model. 
 UPDATE: with the latest update this bug may have been resolved.
 
 ---