Skip to content

Commit 2c8e460

Browse files
committed
moss-tts: add first-class MOSS-TTS support
1 parent 34818ea commit 2c8e460

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+4461
-17
lines changed

convert_hf_to_gguf.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4628,6 +4628,73 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
46284628
yield from super().modify_tensors(data_torch, name, bid)
46294629

46304630

4631+
@ModelBase.register("MossTTSDelayModel", "MossTTSDelayForCausalLM")
class MossTTSDelayModel(Qwen3Model):
    """MOSS-TTS-Delay converter: a Qwen3 text backbone plus per-codebook audio
    embedding tables and output heads, exported under the MOSS_TTS_DELAY arch."""

    model_arch = gguf.MODEL_ARCH.MOSS_TTS_DELAY

    def __init__(self, *args, **kwargs):
        """Flatten the nested `language_config` into the top-level hparams.

        The MOSS config nests the Qwen3 backbone parameters under
        `language_config`; TextModel expects block_count / hidden_size /
        attention params at the root, so they are merged upward here while
        `architectures` and `model_type` are kept out to preserve the
        top-level MOSS architecture identity.
        """
        hparams = kwargs.get("hparams")
        if hparams is None:
            hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
        else:
            # Work on a copy so the caller's dict is never mutated.
            hparams = dict(hparams)

        lang_cfg = hparams.get("language_config")
        if isinstance(lang_cfg, dict):
            merged = dict(hparams)
            merged.update(
                (key, value)
                for key, value in lang_cfg.items()
                if key not in ("architectures", "model_type")
            )
            hparams = merged

        kwargs["hparams"] = hparams
        super().__init__(*args, **kwargs)

    def set_gguf_parameters(self):
        """Write the Qwen3 backbone params, then the MOSS-TTS audio metadata."""
        super().set_gguf_parameters()

        arch = self.gguf_writer.arch
        # Required audio hparams: a missing key raises KeyError, matching the
        # converter's fail-fast convention for mandatory config entries.
        required_audio_keys = (
            (gguf.Keys.LLM.N_VQ, "n_vq"),
            (gguf.Keys.LLM.AUDIO_VOCAB_SIZE, "audio_vocab_size"),
            (gguf.Keys.LLM.AUDIO_PAD_CODE, "audio_pad_code"),
            (gguf.Keys.LLM.AUDIO_START_TOKEN_ID, "audio_start_token_id"),
            (gguf.Keys.LLM.AUDIO_END_TOKEN_ID, "audio_end_token_id"),
            (gguf.Keys.LLM.AUDIO_USER_SLOT_TOKEN_ID, "audio_user_slot_token_id"),
            (gguf.Keys.LLM.AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID, "audio_assistant_gen_slot_token_id"),
            (gguf.Keys.LLM.AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID, "audio_assistant_delay_slot_token_id"),
        )
        for key_template, hparam_name in required_audio_keys:
            self.gguf_writer.add_uint32(key_template.format(arch=arch), self.hparams[hparam_name])

        # Sampling rate is optional in the source config.
        sampling_rate = self.hparams.get("sampling_rate")
        if sampling_rate is not None:
            self.gguf_writer.add_uint32(gguf.Keys.LLM.SAMPLING_RATE.format(arch=arch), sampling_rate)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map MOSS tensor names onto GGUF names.

        - `language_model.*` is stripped so the Qwen3 backbone mapping applies.
        - `emb_ext.<i>.weight` becomes the i-th audio token-embedding table.
        - `lm_heads.0.weight` is the text output head; `lm_heads.<i>.weight`
          (i > 0) becomes audio output head i-1.
        Everything else falls through to the Qwen3 mapping.
        """
        prefix = "language_model."
        if name.startswith(prefix):
            name = name[len(prefix):]

        emb_match = re.fullmatch(r"emb_ext\.(\d+)\.weight", name)
        if emb_match is not None:
            codebook = int(emb_match.group(1))
            base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD_AUDIO]
            yield (f"{base}.{codebook}.weight", data_torch)
            return

        head_match = re.fullmatch(r"lm_heads\.(\d+)\.weight", name)
        if head_match is not None:
            idx = int(head_match.group(1))
            if idx == 0:
                yield (f"{gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT]}.weight", data_torch)
            else:
                base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT_AUDIO]
                yield (f"{base}.{idx - 1}.weight", data_torch)
            return

        yield from super().modify_tensors(data_torch, name, bid)
46314698
@ModelBase.register("Qwen3MoeForCausalLM")
46324699
class Qwen3MoeModel(Qwen2MoeModel):
46334700
model_arch = gguf.MODEL_ARCH.QWEN3MOE

docs/moss-tts-firstclass-e2e.md

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
# MOSS-TTS First-Class End-to-End Inference Pipeline
2+
3+
[English](moss-tts-firstclass-e2e.md) | [简体中文](moss-tts-firstclass-e2e_zh.md)
4+
5+
This document describes the **first-class** MOSS-TTS end-to-end inference pipeline in the current `llama.cpp` repository.
6+
7+
This pipeline uses:
8+
9+
- **llama.cpp** and `llama-moss-tts` to run the first-class MOSS-TTS-Delay GGUF model
10+
- **ONNX Runtime** for reference-audio encoding and final waveform decoding
11+
- **Python helper scripts** for prompt construction and end-to-end orchestration
12+
- A local **MOSS-TTS** checkout that provides the prompt builder and ONNX tokenizer Python modules
13+
14+
Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository, this path moves multi-channel inputs, the transformer backbone, multi-head outputs, and delay-pattern decoding into `llama.cpp`. Python is only responsible for preparing inputs and invoking the ONNX audio tokenizer.
15+
16+
## Prerequisites
17+
18+
1. **llama.cpp** built from source with the `llama-moss-tts` target
19+
2. **Python >= 3.10**
20+
3. A local **MOSS-TTS** checkout, provided in any of the following ways:
21+
- available at `../MOSS-TTS` relative to the repository root
22+
- passed through `--moss-tts-dir`
23+
- passed through `MOSS_TTS_DIR` or `MOSS_TTS_ROOT`
24+
4. Python packages required by the helper scripts:
25+
- `numpy`
26+
- `soundfile`
27+
- `onnxruntime`
28+
29+
## Build
30+
31+
```bash
32+
cd /path/to/llama.cpp
33+
34+
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON
35+
cmake --build build --target llama-moss-tts -j
36+
```
37+
38+
The resulting binary is:
39+
40+
- `build/bin/llama-moss-tts`
41+
42+
If you do not want to build the binary manually, you can instead pass `--build` to the e2e script and it will build `llama-moss-tts` before running.
43+
44+
## Weight Preparation
45+
46+
### Step 1: Prepare the first-class GGUF model
47+
48+
You need a first-class MOSS-TTS-Delay GGUF model that already contains:
49+
50+
- text embedding tables
51+
- 32 audio embedding tables
52+
- Qwen3 backbone weights
53+
- a text output head
54+
- 32 audio output heads
55+
56+
For example:
57+
58+
- `out/stage1a_moss_delay_firstclass_f16.gguf`
59+
60+
### Step 2: Prepare the tokenizer directory
61+
62+
You need a tokenizer directory containing at least:
63+
64+
- `tokenizer.json`
65+
66+
For example:
67+
68+
- `weights/extracted/qwen3_backbone/`
69+
70+
### Step 3: Prepare the ONNX audio tokenizer
71+
72+
You need both ONNX files:
73+
74+
- `encoder.onnx`
75+
- `decoder.onnx`
76+
77+
For example:
78+
79+
- `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx`
80+
- `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx`
81+
82+
### Step 4: Make the MOSS-TTS repository visible
83+
84+
The helper scripts import:
85+
86+
- `moss_tts_delay.llama_cpp.processor`
87+
- `moss_audio_tokenizer.onnx`
88+
89+
You can provide the repository path like this:
90+
91+
```bash
92+
export MOSS_TTS_DIR=/path/to/MOSS-TTS
93+
```
94+
95+
or:
96+
97+
```bash
98+
python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ...
99+
```
100+
101+
## Usage
102+
103+
### CLI
104+
105+
```bash
106+
# Voice cloning: text + reference audio -> wav
107+
python tools/tts/moss-tts-firstclass-e2e.py \
108+
--model-gguf /path/to/moss_delay_firstclass.gguf \
109+
--moss-tts-dir /path/to/MOSS-TTS \
110+
--tokenizer-dir /path/to/tokenizer_dir \
111+
--onnx-encoder /path/to/encoder.onnx \
112+
--onnx-decoder /path/to/decoder.onnx \
113+
--text-file /path/to/text.txt \
114+
--reference-audio /path/to/reference_24k.wav \
115+
--output-wav /path/to/output.wav
116+
117+
# Direct generation without reference audio
118+
python tools/tts/moss-tts-firstclass-e2e.py \
119+
--model-gguf /path/to/moss_delay_firstclass.gguf \
120+
--moss-tts-dir /path/to/MOSS-TTS \
121+
--tokenizer-dir /path/to/tokenizer_dir \
122+
--onnx-encoder /path/to/encoder.onnx \
123+
--onnx-decoder /path/to/decoder.onnx \
124+
--text "Hello, world!" \
125+
--output-wav /path/to/output.wav
126+
127+
# Build llama-moss-tts before running
128+
python tools/tts/moss-tts-firstclass-e2e.py \
129+
--build \
130+
--model-gguf /path/to/moss_delay_firstclass.gguf \
131+
--moss-tts-dir /path/to/MOSS-TTS \
132+
--tokenizer-dir /path/to/tokenizer_dir \
133+
--onnx-encoder /path/to/encoder.onnx \
134+
--onnx-decoder /path/to/decoder.onnx \
135+
--text "Hello!" \
136+
--output-wav /path/to/output.wav
137+
```
138+
139+
## Key Options
140+
141+
| Option | Values | Description |
142+
|------|------|------|
143+
| `--model-gguf` | path | First-class MOSS-TTS GGUF model |
144+
| `--moss-tts-dir` | path | Local `MOSS-TTS` repository root |
145+
| `--tokenizer-dir` | path | Directory containing `tokenizer.json` |
146+
| `--onnx-encoder` | path | Audio tokenizer encoder ONNX |
147+
| `--onnx-decoder` | path | Audio tokenizer decoder ONNX |
148+
| `--text` / `--text-file` | string / path | Input text, choose exactly one |
149+
| `--reference-audio` | path | Optional 24 kHz reference audio |
150+
| `--language` | `zh` / `en` / tag | Language tag passed to the prompt builder |
151+
| `--max-new-tokens` | int | Maximum generation steps |
152+
| `--text-temperature` | float | Text-channel sampling temperature, default `1.5` |
153+
| `--audio-temperature` | float | Audio-channel sampling temperature, default `1.7` |
154+
| `--n-gpu-layers` | `-1` / `0` / `N` | GPU offload layers, default `-1` |
155+
| `--audio-decoder-cpu` | flag | Force ONNX waveform decoding on CPU |
156+
| `--cpu-audio-encode` | flag | Force ONNX reference-audio encoding on CPU |
157+
| `--build` | flag | Build `llama-moss-tts` before running |
158+
159+
## Architecture
160+
161+
```text
162+
Input text (+ optional reference wav)
163+
|
164+
v
165+
moss-tts-build-generation-ref.py
166+
|
167+
|- tokenizes text with the Qwen3 tokenizer
168+
|- optionally encodes the reference wav into audio codes with ONNX
169+
|- calls the prompt builder from the local MOSS-TTS repo
170+
v
171+
generation.ref.bin
172+
|
173+
v
174+
llama-moss-tts
175+
|
176+
|- loads the first-class GGUF model
177+
|- performs multi-channel embedding lookup in-graph
178+
|- runs the Qwen3 backbone inside llama.cpp
179+
|- samples multi-head logits
180+
|- performs delay-pattern decoding in C++
181+
v
182+
raw.codes.bin
183+
|
184+
v
185+
moss-tts-audio-decode.py
186+
|
187+
|- decodes raw audio codes into waveform with ONNX
188+
v
189+
wav
190+
```
191+
192+
## Temporary Artifacts
193+
194+
The e2e script creates a temporary directory and removes it automatically after the run.
195+
196+
The following intermediate files are not kept:
197+
198+
- `generation.ref.bin`
199+
- `raw.codes.bin`
200+
201+
The only visible artifact after the run is the output wav you requested.
202+
203+
## Output
204+
205+
At the end of a successful run, the script prints:
206+
207+
- `wav` — output path
208+
- `wav_info` — sample rate, channel count, frame count, and duration
209+
210+
## File Structure
211+
212+
```text
213+
llama.cpp/
214+
├── docs/
215+
│ ├── moss-tts-firstclass-e2e.md
216+
│ └── moss-tts-firstclass-e2e_zh.md
217+
├── tools/tts/
218+
│ ├── moss-tts-firstclass-e2e.py # End-to-end wrapper
219+
│ ├── moss-tts-build-generation-ref.py # Prompt / input builder
220+
│ ├── moss-tts-audio-decode.py # ONNX audio decode helper
221+
│ └── moss-tts.cpp # llama-moss-tts implementation
222+
└── build/bin/
223+
└── llama-moss-tts
224+
```

0 commit comments

Comments
 (0)