🚀 hubert-base-ch-speech-emotion-recognition
本模型以 [TencentGameMate/chinese-hubert-base](https://huggingface.co/TencentGameMate/chinese-hubert-base) 作为预训练模型,在 CASIA 数据集上进行训练。CASIA 数据集提供了 1200 个由演员用中文演绎 6 种不同情感的录音样本(该数据集官网共提供 9600 条数据,我使用的数据集可能并不完整),这 6 种情感分别为:
emotions = ['anger', 'fear', 'happy', 'neutral', 'sad', 'surprise']
🚀 快速开始
模型使用
import os
import random
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, Wav2Vec2FeatureExtractor, HubertPreTrainedModel, HubertModel
# Hugging Face Hub id of the fine-tuned emotion-recognition checkpoint.
model_name_or_path = "xmj2002/hubert-base-ch-speech-emotion-recognition"
# Every clip is padded/truncated to `duration` seconds at `sample_rate` Hz
# before being fed to the model (see `predict` below).
duration = 6
sample_rate = 16000
# Model configuration (hidden size, dropout, number of classes, ...)
# fetched from the Hub alongside the weights.
config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
)
def id2class(id):
    """Map a predicted class index to its emotion label.

    Indices 0-4 map to fixed emotions; any other value yields "surprise",
    mirroring the final ``else`` branch of the original if/elif chain.
    """
    labels = {0: "angry", 1: "fear", 2: "happy", 3: "neutral", 4: "sad"}
    return labels.get(id, "surprise")
def predict(path, processor, model):
    """Classify the emotion of a single audio file and print the result.

    Parameters
    ----------
    path : str
        Path to an audio file readable by librosa.
    processor : Wav2Vec2FeatureExtractor
        Feature extractor; pads/truncates clips to ``duration * sample_rate``
        samples (module-level constants).
    model : nn.Module
        Classifier producing one row of logits per input clip.
    """
    # Load mono audio resampled to the module-level target rate.
    waveform, sr = librosa.load(path=path, sr=sample_rate)
    # Fix every clip to the same `duration`-second window so the model
    # always receives inputs of identical length.
    inputs = processor(
        waveform,
        padding="max_length",
        truncation=True,
        max_length=duration * sr,
        return_tensors="pt",
        sampling_rate=sr,
    ).input_values
    with torch.no_grad():
        logits = model(inputs)
    probs = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    pred = torch.argmax(logits).cpu().numpy()
    print(f"file path: {path} \t predict: {id2class(pred)} \t score:{probs[pred]} ")
class HubertClassificationHead(nn.Module):
    """Classification head: dense -> tanh -> dropout -> projection to classes.

    Expects ``config`` to expose ``hidden_size``, ``classifier_dropout`` and
    ``num_class``. Input is a pooled hidden-state tensor of shape
    (batch, hidden_size); output is logits of shape (batch, num_class).
    """

    def __init__(self, config):
        super().__init__()
        # Attribute names are kept as-is: they are part of the checkpoint's
        # state-dict layout loaded via `from_pretrained`.
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_class)

    def forward(self, x):
        """Project pooled features to per-emotion logits."""
        hidden = torch.tanh(self.dense(x))
        return self.out_proj(self.dropout(hidden))
class HubertForSpeechClassification(HubertPreTrainedModel):
    """HuBERT encoder with a mean-pooled classification head on top."""

    def __init__(self, config):
        super().__init__(config)
        # Attribute names match the published checkpoint's state dict.
        self.hubert = HubertModel(config)
        self.classifier = HubertClassificationHead(config)
        self.init_weights()

    def forward(self, x):
        """Encode a waveform batch and return per-emotion logits."""
        # Last hidden state: (batch, frames, hidden_size).
        hidden_states = self.hubert(x)[0]
        # Mean-pool across the time axis before classifying.
        pooled = hidden_states.mean(dim=1)
        return self.classifier(pooled)
# Feature extractor handles padding/normalisation of raw waveforms.
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
model = HubertForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)
# Inference only: disable dropout.
model.eval()
# Pick one random file from the local `test_data` directory and classify it.
file_path = [f"test_data/{path}" for path in os.listdir("test_data")]
path = random.sample(file_path, 1)[0]
predict(path, processor, model)
训练设置
- 数据集分割比例:训练集 : 验证集 : 测试集 = 0.6 : 0.2 : 0.2
- 随机种子:34
- 批量大小:36
- 学习率:2e-4
- 优化器:AdamW(betas=(0.93, 0.98), weight_decay=0.2)
- 调度器:StepLR(step_size=10, gamma=0.3)
- 分类器丢弃率:0.1
- 优化器参数:
# Discriminative learning rates: the pretrained HuBERT backbone is updated
# at 0.2x the base rate, while the freshly initialised classifier head uses
# the full rate. `parameter` is the per-group list handed to the optimizer.
# Fix: the original snippet appended to `parameter` without initialising it.
parameter = []
for name, param in model.named_parameters():
    if "hubert" in name:
        # Backbone weights: smaller step to avoid destroying pretraining.
        parameter.append({'params': param, 'lr': 0.2 * lr})
    else:
        # Classifier head: full learning rate.
        parameter.append({'params': param, "lr": lr})
评估指标
- 测试集损失:0.1165
- 测试集准确率:0.972
训练集和验证集准确率曲线
训练集和验证集损失曲线
📄 许可证
本项目采用 Apache-2.0 许可证。