csm-1b：オープンソース音声生成モデル - テキストおよび音声入力に対応し、コンテキストを考慮してRVQオーディオコードを生成

ホーム

Csm 1b

eustlbによって開発

CSMはSesameが開発した1Bパラメータの音声生成モデルです。テキストと音声の入力からRVQオーディオコードを生成でき、コンテキストを考慮した音声生成をサポートします。

音声合成

Safetensors

言語: 英語 | オープンソースライセンス: Apache-2.0 | #対話音声生成 #複数話者対応 #コンテキスト認識

ダウンロード数 5,144

リリース日: 2025/03/26

モデル概要

Llamaバックボーンネットワークと軽量音声デコーダーを基盤とした音声生成モデルで、Mimi音声エンコードを出力可能、テキスト読み上げタスクに適しています。

モデル特徴

コンテキスト認識生成

過去の対話音声やテキストをコンテキスト入力として利用可能、現在の音声生成効果を最適化

効率的なアーキテクチャ設計

Llamaバックボーンネットワークと軽量デコーダーを組み合わせ、生成品質と計算効率のバランスを実現

マルチモーダル入力

テキストと音声入力を同時処理可能、より自然な音声インタラクションを実現

モデル能力

テキスト読み上げ生成

コンテキスト認識音声合成

複数話者音声生成

使用事例

インタラクティブ音声アプリケーション

音声アシスタント

対話システムに自然な音声出力を提供

デモケースでは感情的な抑揚を含む音声生成が可能

コンテンツ制作

音声コンテンツ生成

テキストコンテンツを自動的に音声に変換

license: apache-2.0
language:
- en
pipeline_tag: text-to-speech
tags:
- text-to-speech

CSM 1B

2025/03/13 - 1BパラメータのCSMバリアントをリリースしました。オリジナルコードはGitHubで公開されています: SesameAILabs/csm。

2025/05/07 - TransformersでCSMがサポートされました 🤗

CSM（Conversational Speech Model）は、Sesameが開発した音声生成モデルで、テキストと音声入力からRVQオーディオコードを生成します。モデルアーキテクチャはLlamaをバックボーンとし、Mimiオーディオコードを生成する小型のオーディオデコーダーを採用しています。

CSMのファインチューン版は、ブログ記事で紹介したインタラクティブ音声デモで使用されています。

音声生成をテストできるHuggingFaceスペースも公開されています。

使用方法

文章生成

import torch
from transformers import CsmForConditionalGeneration, AutoProcessor

model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# Prepare the inputs from a raw string
text = "[0]The past is just a story we tell ourselves." # `[0]` is speaker id 0
inputs = processor(text, add_special_tokens=True).to(device)

# Another, equivalent way to prepare the inputs, through the chat template.
# Note that this reassigns `inputs`, so it is this version that is passed to
# `generate` below.
conversation = [
    {"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# Run inference and save the generated audio as a WAV file
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_without_context.wav")

コンテキストを提供するとCSMは最高のパフォーマンスを発揮します

import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio

model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# Prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# Resample the audio column to 24 kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))

# Read the rows once: every `ds[...]` access fetches the requested rows again,
# so slicing separately per column would load the same four rows three times.
context_rows = ds[:4]
prompt_row = ds[4]

conversation = []

# 1. Context: the earlier turns, each with both its text and its audio
for text, audio, speaker_id in zip(context_rows["text"], context_rows["audio"], context_rows["speaker_id"]):
    conversation.append(
        {
            "role": f"{speaker_id}",
            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        }
    )

# 2. Text prompt: the turn to synthesise, given as text only
conversation.append({"role": f"{prompt_row['speaker_id']}", "content": [{"type": "text", "text": prompt_row["text"]}]})

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# Run inference and save the generated audio as a WAV file
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_with_context.wav")

バッチ推論 📦

CSMはバッチ推論をサポートしています:

コードスニペット

import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset, Audio

model_id = "eustlb/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# Prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# Resample the audio column to 24 kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))

# Read each row once instead of re-indexing the dataset for every field.
first_row = ds[0]
second_row = ds[1]

# Here a batch with two prompts:
#   - prompt 0: one context turn (text + audio) followed by a text-only turn
#   - prompt 1: a single text-only turn
conversation = [
    [
        {
            "role": f"{first_row['speaker_id']}",
            "content": [
                {"type": "text", "text": first_row["text"]},
                {"type": "audio", "path": first_row["audio"]["array"]},
            ],
        },
        {
            "role": f"{second_row['speaker_id']}",
            "content": [
                {"type": "text", "text": second_row["text"]},
            ],
        },
    ],
    [
        {
            "role": f"{first_row['speaker_id']}",
            "content": [
                {"type": "text", "text": first_row["text"]},
            ],
        }
    ],
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# Run inference; one output file is written per batch entry
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])

モデルを高速化する 🏎️

CSMはCUDAグラフによるフルグラフコンパイルをサポートしています！

コードスニペット

import torch
import copy
from transformers import CsmForConditionalGeneration, AutoProcessor
from datasets import load_dataset

model_id = "eustlb/csm-1b"
device = "cuda"

# Turn on logging for graph breaks, recompiles and CUDA graphs so that any
# unexpected recompilation or graph break shows up in the output
torch._logging.set_logs(graph_breaks=True, recompiles=True, cudagraphs=True)

# Load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

# Use a static cache; this automatically enables torch.compile with fullgraph and reduce-overhead
model.generation_config.max_length = 250 # large enough to avoid recompilation
model.generation_config.max_new_tokens = None # would take precedence over max_length
model.generation_config.cache_implementation = "static"
model.depth_decoder.generation_config.cache_implementation = "static"

# Generation parameters: sampling is disabled for both the backbone and the
# depth decoder
gen_kwargs = {
    "do_sample": False,
    "depth_decoder_do_sample": False,
    "temperature": 1.0,
    "depth_decoder_temperature": 1.0,
}

# Timing helper: a context manager (used with `with`), not a decorator
class TimerContext:
    """Time the enclosed `with` block using CUDA events and print the result.

    Args:
        name: Label printed in front of the measured time.

    On exit the manager records the end event, synchronizes CUDA, and prints
    ``"<name> time: <seconds> seconds"`` with four decimal places. Exceptions
    raised inside the block are not suppressed.
    """

    def __init__(self, name="Execution"):
        self.name = name
        # The events are only created once the `with` block is entered.
        self.start_event = self.end_event = None

    def __enter__(self):
        # CUDA events are used so that the measurement is taken on the GPU
        begin = torch.cuda.Event(enable_timing=True)
        finish = torch.cuda.Event(enable_timing=True)
        self.start_event, self.end_event = begin, finish
        begin.record()
        return self

    def __exit__(self, *args):
        self.end_event.record()
        # Wait for outstanding GPU work before reading the elapsed time
        torch.cuda.synchronize()
        milliseconds = self.start_event.elapsed_time(self.end_event)
        seconds = milliseconds / 1000.0
        print(f"{self.name} time: {seconds:.4f} seconds")

# Prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")

# Read the five rows used below once, instead of re-indexing the dataset for
# every single field.
rows = [ds[i] for i in range(5)]

# First conversation: two context turns (text + audio) and a text-only prompt
conversation = [
    {
        "role": f"{rows[0]['speaker_id']}",
        "content": [
            {"type": "text", "text": rows[0]["text"]},
            {"type": "audio", "path": rows[0]["audio"]["array"]},
        ],
    },
    {
        "role": f"{rows[1]['speaker_id']}",
        "content": [
            {"type": "text", "text": rows[1]["text"]},
            {"type": "audio", "path": rows[1]["audio"]["array"]},
        ],
    },
    {
        "role": f"{rows[2]['speaker_id']}",
        "content": [
            {"type": "text", "text": rows[2]["text"]},
        ],
    },
]

padded_inputs_1 = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# First call: this run includes compilation and CUDA graph recording
print("\n" + "="*50)
print("最初の生成 - CUDAグラフのコンパイルと記録中...")
with TimerContext("最初の生成"):
    _ = model.generate(**padded_inputs_1, **gen_kwargs)
print("="*50)

# Second call with the same inputs: expected to be fast
print("\n" + "="*50)
print("2回目の生成 - 高速!!!")
with TimerContext("2回目の生成"):
    _ = model.generate(**padded_inputs_1, **gen_kwargs)
print("="*50)

# Now with different inputs. Each turn takes its speaker id from the same row
# as its text/audio (rows 2, 3 and 4); previously the roles were read from
# rows 0, 1 and 2 while the content came from rows 2, 3 and 4.
conversation = [
    {
        "role": f"{rows[2]['speaker_id']}",
        "content": [
            {"type": "text", "text": rows[2]["text"]},
            {"type": "audio", "path": rows[2]["audio"]["array"]},
        ],
    },
    {
        "role": f"{rows[3]['speaker_id']}",
        "content": [
            {"type": "text", "text": rows[3]["text"]},
            {"type": "audio", "path": rows[3]["audio"]["array"]},
        ],
    },
    {
        "role": f"{rows[4]['speaker_id']}",
        "content": [
            {"type": "text", "text": rows[4]["text"]},
        ],
    },
]
padded_inputs_2 = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

print("\n" + "="*50)
print("異なる入力での生成!")
with TimerContext("異なる入力での生成"):
    _ = model.generate(**padded_inputs_2, **gen_kwargs)
print("="*50)

ファインチューニング & トレーニング 📉

CSMはTransformersのTrainerを使用してファインチューニングできます。

コードスニペット

from datasets import load_dataset, Audio
from transformers import (
    CsmForConditionalGeneration,
    TrainingArguments,
    CsmProcessor,
    Trainer
)

# Load the processor and the model, then put the model in training mode
processor = CsmProcessor.from_pretrained("eustlb/csm-1b")
model = CsmForConditionalGeneration.from_pretrained("eustlb/csm-1b")
model.train()

# Load the training split and resample its audio column to the sampling rate
# reported by the processor's feature extractor
ds = load_dataset("eustlb/dailytalk-conversations-grouped", split="train")
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))

def data_collator(samples):
    """Turn a list of dataset samples into model inputs with labels.

    Each sample is read through these keys:
      - ``sample["audio"]["array"]``: the sample's audio, sliced below
      - ``sample["audio_cut_idxs"]``: ``(start, end)`` pairs used to slice it
      - ``sample["speaker_ids"]`` and ``sample["texts"]``: zipped with the
        slices to form one turn each

    Returns:
        Whatever ``processor.apply_chat_template`` returns for the batch of
        conversations when called with ``tokenize=True``, ``return_dict=True``
        and ``output_labels=True``.
    """
    conversations = []

    for sample in samples:
        waveform = sample["audio"]["array"]
        # One audio slice per (start, end) pair
        segments = [waveform[start:end] for start, end in sample["audio_cut_idxs"]]

        # One turn per (speaker, text, audio slice) triple
        conversations.append(
            [
                {
                    "role": f"{speaker_id}",
                    "content": [
                        {"type": "text", "text": text},
                        {"type": "audio", "audio": segment},
                    ],
                }
                for speaker_id, text, segment in zip(sample["speaker_ids"], sample["texts"], segments)
            ]
        )

    return processor.apply_chat_template(
        conversations,
        tokenize=True,
        return_dict=True,
        output_labels=True,
    )

# Training configuration; "test-trainer" is the first positional argument.
# NOTE(review): remove_unused_columns=False is presumably needed so that the
# raw columns read by `data_collator` ("audio", "audio_cut_idxs",
# "speaker_ids", "texts") are still present when it is called — confirm.
training_args = TrainingArguments(
    "test-trainer",
    remove_unused_columns=False,
    gradient_checkpointing=True,
)

# Wire the model, the arguments, the dataset and the custom collator together
trainer = Trainer(
    model, 
    training_args,
    train_dataset=ds,
    data_collator=data_collator,
)

# Start fine-tuning
trainer.train()