Model for Age and Gender Recognition based on Wav2vec 2.0 (24 layers)
This model takes a raw audio signal as input and outputs predictions for age (roughly 0...1, corresponding to 0...100 years) and gender, expressed as the probability of the speaker being a child, female, or male. In addition, it provides the pooled states of the last transformer layer.
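To make the outputs concrete: an age score of 0.42 corresponds to roughly 42 years, and the predicted gender is simply the most probable of the three classes. A minimal sketch of how the outputs might be interpreted (the values are made up, and the class order follows the description above):

```python
import numpy as np

# Hypothetical model outputs: an age score in 0...1 and three gender probabilities
age_score = 0.42                             # roughly 42 years
gender_probs = np.array([0.05, 0.80, 0.15])  # assumed order: child, female, male

age_years = age_score * 100
labels = ['child', 'female', 'male']
print(f'age: ~{age_years:.0f} years, gender: {labels[int(np.argmax(gender_probs))]}')
```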
🚀 Quick Start
The model was created by fine-tuning Wav2Vec2-Large-Robust on aGender, Mozilla Common Voice, Timit and Voxceleb 2. For this version of the model, all 24 transformer layers were trained. An ONNX export of the model is available from doi:10.5281/zenodo.7761387. Further details are given in the associated paper and tutorial.
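With the ONNX export, inference can be run via ONNX Runtime. A minimal sketch, not taken from the model card: the file name is a placeholder for whatever the Zenodo archive contains, and the graph's input name is queried rather than assumed. Depending on how the model was exported, the normalization otherwise done by `Wav2Vec2Processor` may still have to be applied first.

```python
import numpy as np
import onnxruntime as ort

# 'model.onnx' is a placeholder for the file from the Zenodo archive
session = ort.InferenceSession('model.onnx')

# Ask the exported graph for its actual input name instead of guessing it
input_name = session.get_inputs()[0].name

# One second of silence at 16 kHz as a dummy input
signal = np.zeros((1, 16000), dtype=np.float32)
outputs = session.run(None, {input_name: signal})
print([o.shape for o in outputs])  # hidden states, age, gender
```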
✨ Features
- Input & Output: Expects a raw audio signal as input and outputs age and gender predictions along with the pooled states of the last transformer layer.
- Fine-Tuning: Fine-tuned on multiple datasets: aGender, Mozilla Common Voice, Timit, and Voxceleb 2.
- ONNX Export: An ONNX export of the model is available.
📦 Installation
No installation instructions are provided in the original model card.
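That said, the usage example below only imports `numpy`, `torch`, and `transformers`, so a standard `pip install numpy torch transformers` should be sufficient (plus `onnxruntime` if you want to run the ONNX export).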
💻 Usage Examples
Basic Usage

```python
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class ModelHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config, num_labels):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class AgeGenderModel(Wav2Vec2PreTrainedModel):
    r"""Age and gender classifier."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)     # single regression output for age
        self.gender = ModelHead(config, 3)  # child / female / male
        self.init_weights()

    def forward(self, input_values):
        outputs = self.wav2vec2(input_values)
        # Pool the last transformer layer by averaging over time
        hidden_states = torch.mean(outputs[0], dim=1)
        logits_age = self.age(hidden_states)
        logits_gender = torch.softmax(self.gender(hidden_states), dim=1)
        return hidden_states, logits_age, logits_gender


# Load the model from the hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-24-ft-age-gender'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = AgeGenderModel.from_pretrained(model_name)

# Dummy signal: one second of silence at 16 kHz
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict age and gender or extract embeddings from raw audio signal."""
    # Normalize the signal and convert it to a batched tensor
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # Run through the model
    with torch.no_grad():
        y = model(y)
        if embeddings:
            y = y[0]
        else:
            y = torch.hstack([y[1], y[2]])

    # Convert to numpy
    y = y.detach().cpu().numpy()

    return y


# Age score (0...1) followed by the three gender probabilities
print(process_func(signal, sampling_rate))

# Pooled states of the last transformer layer, shape (1, 1024)
print(process_func(signal, sampling_rate, embeddings=True))
```
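The zero signal above is just a placeholder. For real audio, the input has to be mono and sampled at 16 kHz; a minimal sketch using `librosa` (an assumption, any loader that resamples works, and the file path is illustrative):

```python
import librosa

# Load an audio file as mono 16 kHz; librosa resamples automatically
signal, _ = librosa.load('speech.wav', sr=16000, mono=True)
signal = signal.reshape(1, -1)

print(process_func(signal, 16000))                   # age and gender
print(process_func(signal, 16000, embeddings=True))  # embeddings
```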
📚 Documentation
| Property | Details |
|----------|---------|
| Datasets | agender, mozillacommonvoice, timit, voxceleb2 |
| Inference | true |
| Tags | speech, audio, wav2vec2, audio-classification, age-recognition, gender-recognition |
| License | cc-by-nc-sa-4.0 |
| Base Model | facebook/wav2vec2-large-robust |
📄 License
The model is licensed under cc-by-nc-sa-4.0.