unit_hifigan_mhubert Open-Source Speech-to-Speech Translation Model - Free Deployment for Spanish-to-English Translation

Unit Hifigan Mhubert Vp En Es Fr It3 400k Layer11 Km1000 Lj Dur

Developed by facebook

A speech-to-speech translation model based on fairseq S2UT, supporting Spanish to English translation

Speech Synthesis English #Speech Translation #Multilingual Synthesis #Discrete Unit Conversion

Downloads 27

Release Time: 8/31/2022

Model Overview

This model is a discrete unit-based speech-to-speech translation model that uses the HiFiGAN vocoder for speech synthesis and supports translation from Spanish to English.

Model Features

Multilingual Support

Supports speech translation from Spanish to English

High-quality Speech Synthesis

Uses the HiFiGAN vocoder to generate high-quality speech output

Discrete Unit-based

Employs discrete unit representation for speech translation, improving efficiency and accuracy

Model Capabilities

Speech-to-speech translation

Text-to-speech

Multilingual processing

Use Cases

Speech Translation

Real-time Speech Translation

Translates Spanish speech into English speech in real time

High-quality English speech output

Speech Synthesis

Speech Content Generation

Converts text content into natural speech

Natural and fluent speech output

🚀 unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur

Speech-to-speech translation model that supports Spanish-English translation, trained on mTEDx, CoVoST 2, Europarl-ST and VoxPopuli.

🚀 Quick Start

Speech-to-speech translation model from fairseq S2UT (paper/code):

Spanish-English
Trained on mTEDx, CoVoST 2, Europarl-ST and VoxPopuli

💻 Usage Examples

Basic Usage

import json
import os
from pathlib import Path

import IPython.display as ipd
from fairseq import hub_utils
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.speech_to_text.hub_interface import S2THubInterface
from fairseq.models.text_to_speech import CodeHiFiGANVocoder
from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface

from huggingface_hub import snapshot_download
import torchaudio

# End-to-end example: translate Spanish speech into discrete units with the
# fairseq S2UT model, then turn those units into an English waveform with the
# CodeHiFiGAN vocoder from this repository.

# Cache directory for downloaded checkpoints; None when the variable is unset,
# in which case a fallback is chosen in the speech-synthesis stage below.
cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")

# speech-to-unit translation
# NOTE: this stage was commented out in the original snippet, which left
# `unit` undefined and made `tts_model.get_model_input(unit)` raise NameError.
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
    arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
    cache_dir=cache_dir,
)
model = models[0].cpu()
cfg["task"].cpu = True
generator = task.build_generator([model], cfg)

# requires 16000Hz mono channel audio
# Placeholder path: point this at your own Spanish audio file before running.
audio_path = "/path/to/an/audio/file"
audio, _ = torchaudio.load(audio_path)

sample = S2THubInterface.get_model_input(task, audio)
unit = S2THubInterface.get_prediction(task, model, generator, sample)

# speech synthesis
library_name = "fairseq"
# Fall back to ~/.cache/fairseq when HUGGINGFACE_HUB_CACHE is not set.
cache_dir = cache_dir or (Path.home() / ".cache" / library_name).as_posix()
# snapshot_download returns the local snapshot path, which replaces cache_dir.
cache_dir = snapshot_download(
    "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur",
    cache_dir=cache_dir,
    library_name=library_name,
)

x = hub_utils.from_pretrained(
    cache_dir,
    "model.pt",
    ".",
    archive_map=CodeHiFiGANVocoder.hub_models(),
    config_yaml="config.json",
    fp16=False,
    is_vocoder=True,
)

with open(f"{x['args']['data']}/config.json", encoding="utf-8") as f:
    vocoder_cfg = json.load(f)

# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(x["args"]["model_path"]) != 1:
    raise ValueError("Too many vocoder models in the input")

vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
tts_model = VocoderHubInterface(vocoder_cfg, vocoder)

tts_sample = tts_model.get_model_input(unit)
wav, sr = tts_model.get_prediction(tts_sample)

# In a notebook, the trailing expression renders an audio player.
ipd.Audio(wav, rate=sr)

Advanced Usage

# The original model card provides no separate advanced example; this snippet repeats the basic usage code.
import json
import os
from pathlib import Path

import IPython.display as ipd
from fairseq import hub_utils
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.speech_to_text.hub_interface import S2THubInterface
from fairseq.models.text_to_speech import CodeHiFiGANVocoder
from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface

from huggingface_hub import snapshot_download
import torchaudio

# End-to-end example: translate Spanish speech into discrete units with the
# fairseq S2UT model, then turn those units into an English waveform with the
# CodeHiFiGAN vocoder from this repository.

# Cache directory for downloaded checkpoints; None when the variable is unset,
# in which case a fallback is chosen in the speech-synthesis stage below.
cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")

# speech-to-unit translation
# NOTE: this stage was commented out in the original snippet, which left
# `unit` undefined and made `tts_model.get_model_input(unit)` raise NameError.
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
    arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
    cache_dir=cache_dir,
)
model = models[0].cpu()
cfg["task"].cpu = True
generator = task.build_generator([model], cfg)

# requires 16000Hz mono channel audio
# Placeholder path: point this at your own Spanish audio file before running.
audio_path = "/path/to/an/audio/file"
audio, _ = torchaudio.load(audio_path)

sample = S2THubInterface.get_model_input(task, audio)
unit = S2THubInterface.get_prediction(task, model, generator, sample)

# speech synthesis
library_name = "fairseq"
# Fall back to ~/.cache/fairseq when HUGGINGFACE_HUB_CACHE is not set.
cache_dir = cache_dir or (Path.home() / ".cache" / library_name).as_posix()
# snapshot_download returns the local snapshot path, which replaces cache_dir.
cache_dir = snapshot_download(
    "facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur",
    cache_dir=cache_dir,
    library_name=library_name,
)

x = hub_utils.from_pretrained(
    cache_dir,
    "model.pt",
    ".",
    archive_map=CodeHiFiGANVocoder.hub_models(),
    config_yaml="config.json",
    fp16=False,
    is_vocoder=True,
)

with open(f"{x['args']['data']}/config.json", encoding="utf-8") as f:
    vocoder_cfg = json.load(f)

# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(x["args"]["model_path"]) != 1:
    raise ValueError("Too many vocoder models in the input")

vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
tts_model = VocoderHubInterface(vocoder_cfg, vocoder)

tts_sample = tts_model.get_model_input(unit)
wav, sr = tts_model.get_prediction(tts_sample)

# In a notebook, the trailing expression renders an audio player.
ipd.Audio(wav, rate=sr)

📄 License

This project is licensed under the CC BY-NC 4.0 license.

Property	Details
Library Name	fairseq
Task	text-to-speech
Tags	fairseq, audio, text-to-speech
Datasets	mtedx, covost2, europarl_st, voxpopuli
Example Audio	Common Voice sample 1

Featured Recommended AI Models

Empowering the Future, Your AI Solution Knowledge Base

English | 简体中文 | 繁體中文 | にほんご