from mlx_lm.sample_utils import make_sampler
from huggingface_hub import hf_hub_download
from csm_mlx import CSM, csm_1b, generate
import audiofile
import numpy as np
# Instantiate the model from the csm_1b() preset; the weights are loaded
# separately below.
csm = CSM(csm_1b())

# Download the checkpoint from the Hugging Face Hub. Per the original
# author's note, this repo/file pair is the first thing that differs from
# the baseline usage.
weight = hf_hub_download(
    repo_id="senstella/csm-expressiva-1b",
    filename="mlx-ckpt.safetensors",
)
csm.load_weights(weight)

# Sampling configuration handed to generate().
sampler = make_sampler(temp=0.8, top_k=50)

# The second difference noted by the original author: always pass speaker=4
# with this checkpoint, regardless of where inference is run.
audio = generate(
    csm,
    text="Hello from Sesame.",
    speaker=4,
    context=[],
    max_audio_length_ms=20_000,
    sampler=sampler,
)

# Convert the generated audio to a NumPy array and write it out as a WAV
# file; 24000 is passed as the sampling-rate argument of audiofile.write.
samples = np.asarray(audio)
audiofile.write("./audio.wav", samples, 24000)