🚀 Whisper Turbo KSC2 Model
This is a fine-tuned model based on Whisper large-v3-turbo, designed for automatic speech recognition of Kazakh audio.
🚀 Quick Start
This model is Whisper large-v3-turbo fine-tuned on the Kazakh Speech Corpus 2 (KSC2), which contains about 1,000 hours of transcribed audio from diverse sources. After training on the train partition, it achieves a 9.16% WER on the test partition.
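As a rough sketch of how a WER score like this can be computed (the reference and predicted sentences below are placeholders, not the actual KSC2 evaluation data), the Hugging Face evaluate library's wer metric compares hypothesis transcripts against references:

    import evaluate

    wer_metric = evaluate.load("wer")
    # placeholder texts; in practice these come from the KSC2 test partition
    references = ["бұл мысал сөйлем"]
    predictions = ["бұл мысал сөйлем"]
    wer = wer_metric.compute(predictions=predictions, references=references)
    print(f"WER: {wer:.2%}")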
💻 Usage Examples
Basic Usage
import librosa
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration


class Transcriber:
    def __init__(
        self,
        model_path="abilmansplus/whisper-turbo-ksc2",
        device="cuda:0",
        sampling_rate=16_000,
        language="kazakh",
        task="transcribe",
        num_beams=5,
        chunk_length_s=30,
        stride_length_s=1,
    ):
        self.processor = WhisperProcessor.from_pretrained(
            model_path,
            language=language,
            task=task
        )
        self.model = WhisperForConditionalGeneration.from_pretrained(model_path)
        self.model = self.model.to(device)
        self.sr = sampling_rate
        self.language = language
        self.task = task
        self.num_beams = num_beams
        self.chunk_length_s = chunk_length_s  # length of each chunk in seconds
        self.stride_length_s = stride_length_s  # overlap between consecutive chunks in seconds

    def transcribe(self, audio_path: str) -> str:
        """Transcribes the audio chunk by chunk and merges the results.

        Args:
            audio_path (str): path to the audio file to be transcribed
        Returns:
            full_transcription (str): transcription of the entire audio
        """
        speech_array, sampling_rate = librosa.load(audio_path, sr=self.sr)
        audio_length_s = len(speech_array) / self.sr
        # Short audio fits into a single Whisper window, so no chunking is needed.
        if audio_length_s <= self.chunk_length_s:
            full_transcription = self._transcribe_chunk(speech_array)
            return full_transcription
        chunk_length_samples = int(self.chunk_length_s * self.sr)
        stride_length_samples = int(self.stride_length_s * self.sr)
        num_samples = len(speech_array)
        # Number of overlapping chunks needed to cover the whole signal.
        num_chunks = max(
            1,
            int(
                1 +
                np.ceil(
                    (num_samples - chunk_length_samples) /
                    (chunk_length_samples - stride_length_samples)
                )
            )
        )
        transcriptions = []
        for i in range(num_chunks):
            start = max(0, i * (chunk_length_samples - stride_length_samples))
            end = min(num_samples, start + chunk_length_samples)
            chunk = speech_array[start:end]
            chunk_transcription = self._transcribe_chunk(chunk)
            transcriptions.append(chunk_transcription)
        full_transcription = " ".join(transcriptions)
        return full_transcription

    def _transcribe_chunk(self, audio_chunk) -> str:
        # Convert raw audio into the log-mel input features expected by Whisper.
        inputs = self.processor(
            audio_chunk,
            sampling_rate=self.sr,
            return_tensors="pt"
        ).input_features.to(self.model.device)
        # Force decoding in the target language and task.
        forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language=self.language,
            task=self.task
        )
        # All-ones mask over the feature frames (a single input is not padded relative to other batch items).
        attention_mask = torch.ones_like(inputs[:, 0, :], dtype=torch.long)
        with torch.no_grad():
            generated_ids = self.model.generate(
                inputs,
                forced_decoder_ids=forced_decoder_ids,
                max_length=448,
                num_beams=self.num_beams,
                attention_mask=attention_mask,
            )
        transcription = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0]
        return transcription
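A minimal usage sketch of the class above (audio.wav is a placeholder path; pass device="cpu" if no GPU is available):

    transcriber = Transcriber(model_path="abilmansplus/whisper-turbo-ksc2", device="cuda:0")
    text = transcriber.transcribe("audio.wav")  # placeholder path to a local recording
    print(text)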
Advanced Usage
⚠️ Important Note
For longer audio (35+ seconds), divide it into 30-second chunks, transcribe each chunk separately, and then merge the results; this is what the Transcriber class above does, using a one-second overlap between consecutive chunks. An alternative sketch using the pipeline API is shown below.
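As an alternative sketch, the transformers automatic-speech-recognition pipeline can perform the chunking itself; the chunk_length_s value and the long_audio.wav path below are assumptions chosen to mirror the class above, not settings prescribed by this model card:

    import torch
    from transformers import pipeline

    asr = pipeline(
        "automatic-speech-recognition",
        model="abilmansplus/whisper-turbo-ksc2",
        device="cuda:0" if torch.cuda.is_available() else "cpu",
        chunk_length_s=30,  # assumed chunk size, matching the class above
    )
    result = asr(
        "long_audio.wav",  # placeholder path to a long recording
        generate_kwargs={"language": "kazakh", "task": "transcribe"},
    )
    print(result["text"])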
📄 License
This project is released under the MIT license.
📦 Model Information
| Property | Details |
|----------|---------|
| Model Type | Fine-tuned Whisper large-v3-turbo |
| Training Data | issai/Kazakh_Speech_Corpus_2 |
| Evaluation Metric | WER = 9.16% |
| Base Model | openai/whisper-large-v3-turbo |
| Pipeline Tag | automatic-speech-recognition |
| Library Name | transformers |