import torch
import torchaudio
from transformers import pipeline
# Model repository on the Hugging Face Hub and the pipeline task name to load.
model_id = '11mlabs/indri-0.1-124m-tts'
task = 'indri-tts'

pipe = pipeline(
    task,
    model=model_id,
    # Use the first CUDA GPU when one is available; otherwise fall back to CPU
    # so the example does not crash on machines without a GPU.
    device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    # NOTE(review): trust_remote_code=True lets the pipeline execute code
    # downloaded with the model; presumably required because 'indri-tts' is
    # not a built-in transformers task - confirm before running untrusted models.
    trust_remote_code=True,
)

# Input is a list of texts; `speaker` is passed through to the pipeline
# (presumably a speaker/voice tag - see the model card for valid values).
output = pipe(['Hi, my name is Indri and I like to talk.'], speaker='[spkr_63]')

# Save the audio of the first result to disk with a 24000 Hz sample rate.
torchaudio.save('output.wav', output[0]['audio'][0], sample_rate=24000)
import torch
import torchaudio
from transformers import pipeline
# Model repository on the Hugging Face Hub and the pipeline task name to load.
model_id = '11mlabs/indri-0.1-124m-tts'
task = 'indri-tts'

pipe = pipeline(
    task,
    model=model_id,
    # Use the first CUDA GPU when one is available; otherwise fall back to CPU
    # so the example does not crash on machines without a GPU.
    device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    # NOTE(review): trust_remote_code=True lets the pipeline execute code
    # downloaded with the model; presumably required because 'indri-tts' is
    # not a built-in transformers task - confirm before running untrusted models.
    trust_remote_code=True,
)

# Input is a list of texts; `speaker` is passed through to the pipeline
# (presumably a speaker/voice tag - see the model card for valid values).
output = pipe(['Hi, my name is Indri and I like to talk.'], speaker='[spkr_63]')

# Save the audio of the first result to disk with a 24000 Hz sample rate.
torchaudio.save('output.wav', output[0]['audio'][0], sample_rate=24000)
@techreport{kyutai2024moshi,
title={Moshi: a speech-text foundation model for real-time dialogue},
author={Alexandre D{\'e}fossez and Laurent Mazar{\'e} and Manu Orsini and
Am{\'e}lie Royer and Patrick P{\'e}rez and Herv{\'e} J{\'e}gou and Edouard Grave and Neil Zeghidour},
year={2024},
eprint={2410.00037},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2410.00037},
}
@misc{radford2022whisper,
doi = {10.48550/ARXIV.2212.04356},
url = {https://arxiv.org/abs/2212.04356},
author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
title = {Robust Speech Recognition via Large-Scale Weak Supervision},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}