wav2vec2-large-xlsr-53-gender-recognition-librispeech开源模型

首页

Wav2vec2 Large Xlsr 53 Gender Recognition Librispeech

由 alefiury 开发

基于Librispeech-clean-100数据集微调的性别识别模型，在测试集上F1分数达0.9993

音频分类

Transformers

开源协议:Apache-2.0 #语音性别识别 #高精度F1 #Librispeech微调

下载量 182.33k

发布时间 : 4/24/2023

模型简介

该模型是基于wav2vec2-xls-r-300m架构的语音性别识别模型，专门用于从语音中识别说话者性别

模型特点

高精度性别识别

在Librispeech测试集上达到99.93%的F1分数

基于预训练模型微调

利用facebook/wav2vec2-xls-r-300m预训练模型进行迁移学习

高效训练配置

采用混合精度训练和梯度累积等技术优化训练效率

模型能力

语音性别分类

英语语音处理

使用案例

语音分析

说话者性别识别

从语音片段中识别说话者性别

测试集F1分数0.9993

🚀 wav2vec2-large-xlsr-53-gender-recognition-librispeech

该模型是 facebook/wav2vec2-xls-r-300m 在 Librispeech-clean-100 数据集上针对性别识别任务进行微调后的版本。它在评估集上取得了以下成绩：

损失值：0.0061
F1 分数：0.9993

🚀 快速开始

计算推理结果

import os
import random
from glob import glob
from typing import List, Optional, Union, Dict

import tqdm
import torch
import torchaudio
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    Wav2Vec2Processor
)

class CustomDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        dataset: List,
        basedir: Optional[str] = None,
        sampling_rate: int = 16000,
        max_audio_len: int = 5,
    ):
        self.dataset = dataset
        self.basedir = basedir

        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len

    def __len__(self):
        """
        Return the length of the dataset
        """
        return len(self.dataset)

    def __getitem__(self, index):
        if self.basedir is None:
            filepath = self.dataset[index]
        else:
            filepath = os.path.join(self.basedir, self.dataset[index])

        speech_array, sr = torchaudio.load(filepath)

        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)

        if sr != self.sampling_rate:
            transform = torchaudio.transforms.Resample(sr, self.sampling_rate)
            speech_array = transform(speech_array)
            sr = self.sampling_rate

        len_audio = speech_array.shape[1]

        # Pad or truncate the audio to match the desired length
        if len_audio < self.max_audio_len * self.sampling_rate:
            # Pad the audio if it's shorter than the desired length
            padding = torch.zeros(1, self.max_audio_len * self.sampling_rate - len_audio)
            speech_array = torch.cat([speech_array, padding], dim=1)
        else:
            # Truncate the audio if it's longer than the desired length
            speech_array = speech_array[:, :self.max_audio_len * self.sampling_rate]

        speech_array = speech_array.squeeze().numpy()

        return {"input_values": speech_array, "attention_mask": None}


class CollateFunc:
    def __init__(
        self,
        processor: Wav2Vec2Processor,
        padding: Union[bool, str] = True,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: bool = True,
        sampling_rate: int = 16000,
        max_length: Optional[int] = None,
    ):
        self.sampling_rate = sampling_rate
        self.processor = processor
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_attention_mask = return_attention_mask
        self.max_length = max_length

    def __call__(self, batch: List[Dict[str, np.ndarray]]):
        # Extract input_values from the batch
        input_values = [item["input_values"] for item in batch]

        batch = self.processor(
            input_values,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask
        )

        return {
            "input_values": batch.input_values,
            "attention_mask": batch.attention_mask if self.return_attention_mask else None
        }


def predict(test_dataloader, model, device: torch.device):
    """
    Predict the class of the audio
    """
    model.to(device)
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_dataloader):
            input_values, attention_mask = batch['input_values'].to(device), batch['attention_mask'].to(device)

            logits = model(input_values, attention_mask=attention_mask).logits
            scores = F.softmax(logits, dim=-1)

            pred = torch.argmax(scores, dim=1).cpu().detach().numpy()

            preds.extend(pred)

    return preds


def get_gender(model_name_or_path: str, audio_paths: List[str], label2id: Dict, id2label: Dict, device: torch.device):
    num_labels = 2

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
    model = AutoModelForAudioClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
    )

    test_dataset = CustomDataset(audio_paths, max_audio_len=5)  # for 5-second audio

    data_collator = CollateFunc(
        processor=feature_extractor,
        padding=True,
        sampling_rate=16000,
    )

    test_dataloader = DataLoader(
        dataset=test_dataset,
        batch_size=16,
        collate_fn=data_collator,
        shuffle=False,
        num_workers=2
    )

    preds = predict(test_dataloader=test_dataloader, model=model, device=device)

    return preds

model_name_or_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"

audio_paths = [] # Must be a list with absolute paths of the audios that will be used in inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

label2id = {
    "female": 0,
    "male": 1
}

id2label = {
    0: "female",
    1: "male"
}

num_labels = 2

preds = get_gender(model_name_or_path, audio_paths, label2id, id2label, device)