# 🚀 Orpheus-3b FT 4-bit AWQ Quantized Version
This is a 4-bit AWQ-quantized version of Orpheus-3b FT. lmdeploy is the recommended inference backend: it is easy to install and very fast. Below you'll find code to load the model, tokenize reference audio for voice cloning, and generate speech.
## 🚀 Quick Start
### ✨ Features
- 4-bit AWQ-quantized weights for Orpheus-3b FT.
- lmdeploy backend for easy installation and fast inference.
- Supports voice cloning and speech generation via SNAC audio tokens.
### 📦 Installation
Install snac and lmdeploy with `pip install snac lmdeploy`.
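To confirm that the packages installed correctly and that a CUDA device is visible, a quick sanity check:

```python
# Quick check that the packages import and a GPU is available.
import torch
import lmdeploy
import snac

print(lmdeploy.__version__, "CUDA available:", torch.cuda.is_available())
```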
### 💻 Usage Examples
#### Basic Usage
Code to load the model, tokenizer, and SNAC codec:
```python
from lmdeploy import pipeline, TurbomindEngineConfig
from transformers import AutoTokenizer
from snac import SNAC

tp = 1                       # tensor parallelism (number of GPUs)
cache_max_entry_count = 0.2  # fraction of GPU memory reserved for the KV cache

engine_config = TurbomindEngineConfig(
    model_format='awq',
    dtype='float16',
    cache_max_entry_count=cache_max_entry_count,
    tp=tp,
    quant_policy=8,          # 8-bit KV cache quantization
)
pipe = pipeline("YaTharThShaRma999/orpheus_awq", backend_config=engine_config)

# Tokenizer from an Orpheus-3b FT checkpoint; SNAC decodes audio tokens
# back into a 24 kHz waveform.
tokeniser = AutoTokenizer.from_pretrained("unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to('cuda:0')
```
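For a quick smoke test without voice cloning, the pipeline can be prompted directly. This is a minimal sketch: the prompt format and stop tokens are taken from the voice-cloning example below, while the voice name and sampling settings are illustrative placeholders, not tuned values.

```python
# Minimal sketch: plain text-to-speech with a named voice, no cloning.
# Prompt format and stop tokens mirror the voice-cloning example below.
from lmdeploy import GenerationConfig

gen_config = GenerationConfig(top_p=0.9, temperature=0.6, max_new_tokens=1024,
                              stop_token_ids=[128009, 128001, 49158, 128258],
                              skip_special_tokens=False, do_sample=True)
voice_name = "zac"
prompt = "Hey there, this is a quick test."
response = pipe([f"<custom_token_3><|begin_of_text|>{voice_name}: {prompt}<|eot_id|>"
                 f"<custom_token_4><custom_token_5><custom_token_1>"],
                gen_config=gen_config)
print(response[0].text[:200])  # raw audio tokens; decode with SNAC as shown below
```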
#### Advanced Usage
Code to convert a reference audio file into SNAC tokens for voice cloning:
```python
import gc

import librosa
import torch

# Reference audio and its transcript. The transcript is prepended to the
# text prompt later so the model conditions on the cloned voice.
my_wav_file_is = "test.mp3"
and_the_transcript_is = ""

# The snac_24khz codec expects 24 kHz audio, so resample on load.
audio_array, sample_rate = librosa.load(my_wav_file_is, sr=24000)

def tokenise_audio(waveform):
    """Encode a mono waveform with SNAC and flatten the three codebook
    layers into Orpheus's interleaved 7-tokens-per-frame layout."""
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    waveform = waveform.unsqueeze(0).to('cuda:0')  # shape: (1, 1, samples)

    with torch.inference_mode():
        codes = snac_model.encode(waveform)

    # Layers 1/2/3 run at rates 1:2:4; each frame contributes 7 tokens,
    # offset into disjoint 4096-wide id ranges starting at 128266.
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2*i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4*i].item() + 128266 + (2*4096))
        all_codes.append(codes[2][0][4*i + 1].item() + 128266 + (3*4096))
        all_codes.append(codes[1][0][2*i + 1].item() + 128266 + (4*4096))
        all_codes.append(codes[2][0][4*i + 2].item() + 128266 + (5*4096))
        all_codes.append(codes[2][0][4*i + 3].item() + 128266 + (6*4096))
    return all_codes

myts = tokenise_audio(audio_array)
gc.collect()
torch.cuda.empty_cache()
```
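Each SNAC frame is flattened into exactly seven tokens, so a quick sanity check on the output is possible (the ~12 frames-per-second figure for the 24 kHz SNAC model is approximate):

```python
# Seven interleaved tokens per SNAC frame; the coarsest codebook runs at
# roughly 12 Hz for the 24 kHz model, giving a rough duration estimate.
assert len(myts) % 7 == 0, "token stream is not whole frames"
print(f"{len(myts)} audio tokens ~= {len(myts) / 7 / 12:.1f} s of reference audio")
```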
Code to generate speech and play it inline with IPython:
```python
import gc

import torch
from IPython.display import Audio, display
from lmdeploy import GenerationConfig

gen_config = GenerationConfig(
    top_p=0.7,
    top_k=50,
    temperature=0.2,
    max_new_tokens=1024,
    min_new_tokens=30,
    stop_token_ids=[128009, 128001, 49158, 128258],
    repetition_penalty=2.0,
    skip_special_tokens=False,
    do_sample=True,
    min_p=0.6,
)

# Prepend the reference transcript so the model continues in the cloned voice,
# then append the decoded SNAC tokens of the reference audio.
prompt = and_the_transcript_is + "<laugh> So um hey, like what's up??"
voice_name = "zac"
response2 = pipe(
    [f"<custom_token_3><|begin_of_text|>{voice_name}: {prompt}<|eot_id|>"
     f"<custom_token_4><custom_token_5><custom_token_1>" + tokeniser.decode(myts)],
    gen_config=gen_config,
)
gc.collect()
torch.cuda.empty_cache()

# Re-encode the generated text to token ids so the audio tokens can be extracted.
generated_ids = tokeniser.encode(response2[0].text, return_tensors='pt',
                                 add_special_tokens=False)

token_to_find = 128257    # start-of-audio marker
token_to_remove = 128258  # end-of-audio / padding marker

# Keep only the tokens after the last start-of-audio marker.
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
else:
    cropped_tensor = generated_ids

# Strip the end marker, trim to whole 7-token frames, and undo the id offset.
code_lists = []
for row in cropped_tensor:
    row = row[row != token_to_remove]
    new_length = (row.size(0) // 7) * 7
    code_lists.append([t.item() - 128266 for t in row[:new_length]])

def redistribute_codes(code_list):
    """De-interleave the flat 7-tokens-per-frame stream back into SNAC's
    three codebook layers and decode to a waveform."""
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7*i])
        layer_2.append(code_list[7*i + 1] - 4096)
        layer_3.append(code_list[7*i + 2] - (2*4096))
        layer_3.append(code_list[7*i + 3] - (3*4096))
        layer_2.append(code_list[7*i + 4] - (4*4096))
        layer_3.append(code_list[7*i + 5] - (5*4096))
        layer_3.append(code_list[7*i + 6] - (6*4096))
    codes = [torch.tensor(layer_1).unsqueeze(0).to('cuda:0'),
             torch.tensor(layer_2).unsqueeze(0).to('cuda:0'),
             torch.tensor(layer_3).unsqueeze(0).to('cuda:0')]
    with torch.inference_mode():
        audio_hat = snac_model.decode(codes)
    return audio_hat

# Decode and play every generated sample at 24 kHz.
my_samples = [redistribute_codes(code_list) for code_list in code_lists]
for samples in my_samples:
    display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))

del my_samples, code_lists, cropped_tensor
gc.collect()
torch.cuda.empty_cache()
```
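To keep a result outside the notebook, the decoded waveform can also be written to disk. A minimal sketch assuming the soundfile package is installed (`pip install soundfile`, an extra dependency not listed above):

```python
# Minimal sketch, assuming soundfile is installed and `samples` still holds
# the last decoded waveform from the loop above.
import soundfile as sf

audio_np = samples.detach().squeeze().to("cpu").numpy()
sf.write("output.wav", audio_np, samplerate=24000)  # SNAC operates at 24 kHz
```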
## 📄 License
This project is licensed under the Apache-2.0 license.