import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
# Pick the first CUDA device when torch reports one is available, else the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the model and its tokenizer from the same pretrained identifier.
repo_id = "CONCREE/Adia_TTS"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Wolof text to synthesize.
text = "Entreprenariat ci Senegal dafa am solo lool ci yokkuteg koom-koom, di gëna yokk liggéey ak indi gis-gis yu bees ci dëkk bi."

# Natural-language description of the desired voice style.
description = "A clear and educational voice, with a flow adapted to learning"

# Tokenize both strings as PyTorch tensors and move the ids to the chosen
# device: the style description is passed to generate() as `input_ids`,
# the text to be spoken as `prompt_input_ids`.
description_encoding = tokenizer(description, return_tensors="pt")
input_ids = description_encoding.input_ids.to(device)
text_encoding = tokenizer(text, return_tensors="pt")
prompt_ids = text_encoding.input_ids.to(device)

# Generate the speech.
audio = model.generate(input_ids=input_ids, prompt_input_ids=prompt_ids)

# Save the speech: bring the result back to the CPU, convert it to a NumPy
# array with singleton dimensions dropped, and write it to "output.wav" at
# the sampling rate recorded in the model config.
waveform = audio.cpu().numpy().squeeze()
sf.write("output.wav", waveform, model.config.sampling_rate)
@misc{CONCREE-2024-Adia_TTS,
author = {CONCREE},
title = {Adia_TTS},
year = {2025},
publisher = {Hugging Face},
journal = {Hugging Face repository},
howpublished = {\url{https://huggingface.co/CONCREE/Adia_TTS}}
}
@misc{lyth2024natural,
title={Natural language guidance of high-fidelity text-to-speech with synthetic annotations},
author={Dan Lyth and Simon King},
year={2024},
eprint={2402.01912},
archivePrefix={arXiv},
primaryClass={cs.SD}
}