Llama-3-8B-Instruct-Chinese開源模型 - 專注增強中文任務處理能力

首頁

Llama 3 8B Instruct Chinese

由Rookie開發

Llama-3-8B-Instruct的中文自微調版本，專注於中文任務增強

大型語言模型 #中文指令微調 #多輪對話優化 #文化內容增強

下載量 453

發布時間 : 4/22/2024

模型概述

基於Llama-3-8B-Instruct模型進行中文指令微調的版本，增強了中文理解和生成能力

模型特點

中文指令微調

針對中文任務進行了專門優化，提升了中文理解和生成能力

多任務支持

支持多種中文NLP任務，包括問答、對話、文本生成等

高質量訓練數據

使用了包括firefly-train-1.1M、moss-003-sft-data等多個高質量中文數據集

模型能力

中文文本生成

多輪對話

問答系統

代碼生成

數學問題解答

文言文翻譯

使用案例

教育

數學問題解答

解答中小學數學問題

基於包含約25萬條數學問題的數據集（school_math_0.25M）訓練

文言文翻譯

將文言文翻譯為現代漢語

基於包含文言文內容的數據集訓練

娛樂

弱智吧問答

回答幽默、腦筋急轉彎類問題

能夠處理弱智吧問答數據

技術

代碼生成

生成Python等編程語言的代碼片段

能夠生成Transformer block、快速排序等代碼

🚀 Llama-3-8B-Instruct-中文聊天

這是Llama-3-8B-Instruct的中文自微調版本，可用於多種中文自然語言處理任務。

🚀 快速開始

默認情況下直接運行以下代碼即可體驗llama3中文對話，請自行修改model_name_or_path為你下載的模型路徑：

from transformers import AutoTokenizer, AutoConfig, AddedToken, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from dataclasses import dataclass
from typing import Dict
import torch
import copy

## Chat template definition
@dataclass
class Template:
    """Format strings and defaults that describe one chat prompt layout."""

    # Name under which the template is registered.
    template_name: str
    # Wrapper for the system message; formatted with ``content``.
    system_format: str
    # Wrapper for a user turn; formatted with ``content``.
    user_format: str
    # Wrapper for an assistant turn; formatted with ``content``.
    assistant_format: str
    # Default system message; may be None, meaning no system message.
    system: str
    # Text marking the end of a generated reply; may be None.
    stop_word: str

# Registry of chat templates, keyed by template name.
template_dict: Dict[str, Template] = {}


def register_template(template_name, system_format, user_format, assistant_format, system, stop_word=None):
    """Build a Template from the given fields and store it in ``template_dict``.

    Any entry already stored under ``template_name`` is replaced.
    """
    entry = Template(
        template_name=template_name,
        system_format=system_format,
        user_format=user_format,
        assistant_format=assistant_format,
        system=system,
        stop_word=stop_word,
    )
    template_dict[template_name] = entry

# The system prompt configured here is the one used during training; at inference
# time you may experiment with a different one. In this registration `system` is
# None, so no system message is emitted unless the caller supplies one.
register_template(
    template_name='llama3',
    system_format='<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>',
    user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
    assistant_format='{content}<|eot_id|>',
    system=None,
    stop_word='<|eot_id|>'
)


## Load the model
def load_model(model_name_or_path, load_in_4bit=False, adapter_name_or_path=None):
    """Load a causal language model in fp16, optionally 4-bit quantized.

    Args:
        model_name_or_path: Hub id or local path of the base model.
        load_in_4bit: When True, load the weights with 4-bit NF4 quantization.
        adapter_name_or_path: Optional PEFT adapter to attach to the base model.

    Returns:
        The loaded model; wrapped in a ``PeftModel`` when an adapter is given.
    """
    quantization_config = None
    if load_in_4bit:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )

    # Load the base model. Quantization is requested only through
    # ``quantization_config``: recent transformers releases raise a ValueError
    # when ``load_in_4bit`` is passed alongside it.
    # NOTE: trust_remote_code=True lets the model repository run its own code;
    # only use it with sources you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map='auto',
        quantization_config=quantization_config,
    )

    # Attach the adapter, if any.
    if adapter_name_or_path is not None:
        model = PeftModel.from_pretrained(model, adapter_name_or_path)

    return model

## Load the tokenizer
def load_tokenizer(model_name_or_path):
    """Load the tokenizer for ``model_name_or_path`` and make sure it can pad.

    Returns the tokenizer; when it has no pad token, the EOS token is used.
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False)

    # Fall back to EOS for padding when no pad token is configured.
    missing_pad = tok.pad_token is None
    if missing_pad:
        tok.pad_token = tok.eos_token

    return tok

## Build the prompt
def build_prompt(tokenizer, template, query, history, system=None):
    """Encode the system message, earlier turns and the new query into input ids.

    Args:
        tokenizer: Object providing ``encode(text, add_special_tokens=...)`` and
            an ``eos_token`` attribute.
        template: Template supplying ``system_format``, ``user_format``,
            ``assistant_format`` and the default ``system`` message.
        query: New user message; it becomes the final user turn of the prompt.
        history: Earlier turns as ``{'role': ..., 'message': ...}`` dicts.
            The list is not modified.
        system: System message overriding ``template.system`` when not None.

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``.
    """
    if system is None:
        system = template.system

    # Work on a new list so the caller's history is left untouched.
    turns = list(history) + [{"role": 'user', 'message': query}]

    input_ids = []

    # Prepend the system message when both a format and a message exist.
    if template.system_format is not None and system is not None:
        system_text = template.system_format.format(content=system)
        input_ids.extend(tokenizer.encode(system_text, add_special_tokens=False))

    # Append every turn, formatted according to its role.
    for turn in turns:
        fmt = template.user_format if turn['role'] == 'user' else template.assistant_format
        text = fmt.format(content=turn['message'], stop_token=tokenizer.eos_token)
        input_ids.extend(tokenizer.encode(text, add_special_tokens=False))

    return torch.tensor([input_ids], dtype=torch.long)


def main():
    """Run an interactive multi-turn chat session on the console.

    Loads the model and tokenizer, then repeatedly reads a user message,
    generates a reply with sampling, prints it and records both messages in
    the conversation history. The loop runs until the process is interrupted.

    Raises:
        ValueError: If the stop word does not encode to exactly one token.
    """
    model_name_or_path = 'NousResearch/Meta-Llama-3-8B'
    template_name = 'llama3'
    adapter_name_or_path = None

    template = template_dict[template_name]

    load_in_4bit = False

    # Generation settings.
    max_new_tokens = 500
    top_p = 0.9
    temperature = 0.35
    repetition_penalty = 1.1

    # Upper bound on the number of messages kept as context.
    max_history_messages = 12

    # Load the model.
    print(f'Loading model from: {model_name_or_path}')
    print(f'adapter_name_or_path: {adapter_name_or_path}')
    model = load_model(
        model_name_or_path,
        load_in_4bit=load_in_4bit,
        adapter_name_or_path=adapter_name_or_path
    ).eval()
    tokenizer = load_tokenizer(model_name_or_path if adapter_name_or_path is None else adapter_name_or_path)

    # Resolve the stop word locally instead of mutating the registered template.
    stop_word = template.stop_word if template.stop_word is not None else tokenizer.eos_token
    # Encode without special tokens: a tokenizer that prepends BOS would
    # otherwise return two ids for a single stop token.
    stop_token_ids = tokenizer.encode(stop_word, add_special_tokens=False)
    if len(stop_token_ids) != 1:
        raise ValueError(
            f'stop word {stop_word!r} must encode to exactly one token, got {stop_token_ids}'
        )
    stop_token_id = stop_token_ids[0]

    history = []

    while True:
        query = input('# User：').strip()
        # Pass a copy so the stored history is never modified by prompt construction.
        input_ids = build_prompt(tokenizer, template, query, copy.deepcopy(history), system=None).to(model.device)
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=model.device)
        outputs = model.generate(
            input_ids=input_ids, max_new_tokens=max_new_tokens, do_sample=True,
            top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty,
            eos_token_id=stop_token_id,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id
        )
        # Keep only the newly generated tokens (drop the echoed prompt).
        new_tokens = outputs.tolist()[0][len(input_ids[0]):]
        response = tokenizer.decode(new_tokens)
        response = response.strip().replace(stop_word, "").strip()

        # Record the exchange in the conversation history.
        history.append({"role": 'user', 'message': query})
        history.append({"role": 'assistant', 'message': response})

        # Keep the most recent messages; older ones are dropped.
        if len(history) > max_history_messages:
            history = history[-max_history_messages:]

        print("llama3 Chinese：{}".format(response))


if __name__ == '__main__':
    main()

✨ 主要特性

中文微調：基於Llama-3-8B-Instruct進行中文微調，更適合中文自然語言處理任務。
多輪問答：支持多輪對話，可處理複雜的交互場景。
多數據集訓練：使用多種中文數據集進行訓練，提升模型的泛化能力。

📦 訓練可用數據整理

屬性	詳情
數據集	firefly-train-1.1M、moss-003-sft-data、school_math_0.25M、ruozhiba
介紹	包含了23種常見的中文NLP任務的數據，並且構造了許多與中華文化相關的數據，如對聯、作詩、文言文翻譯、散文、金庸小說等；由復旦大學MOSS團隊開源的中英文多輪對話數據；由BELLE項目組開源的數學運算指令數據；弱智吧數據問答，據說比較鍛鍊模型的心智能力

歡迎補充數據集：要求為中文、一問一答形式，且適合用於提升llama3的任務能力。

📚 詳細文檔

github地址

Chat版模型下載

Instruct + 繼續中文sft版
huggingface地址

模型量化加速、部署

文檔中未詳細提及模型量化加速和部署的具體內容，可參考相關技術文檔進行操作。

💻 使用示例

多輪問答示例

User：你是誰
llama3 Chinese：我是一名AI語言模型，我的名字叫MOSS。

User：你會什麼
llama3 Chinese：我可以回答問題、提供信息、幫助您完成任務等等。請問您需要我做什麼？

User：兩千塊錢買什麼新車好？
llama3 Chinese：兩千塊錢買一輛新車可能比較困難，因為大多數新車的價格都比這個金額高得多。但是，有一些二手車或者低價新車可能在這個價格範圍內，您可以考慮購買這些車輛。但請注意，在購買二手車時，您應該仔細檢查車輛的狀況，並瞭解其歷史記錄，以確保您獲得了一個安全可靠的車輛。

User：火燒赤壁 曹操為何不撥打119求救
llama3 Chinese：這是一個歷史上的故事，赤壁之戰發生在三國時期，曹操並不是現代社會的人物，他沒有119這樣的電話系統，也不能像我們現在一樣撥打911求救。因此，他只能通過其他方式來組織部隊和進行戰鬥。

User：幫我寫一個transformer block
llama3 Chinese：當然可以！下面是一個簡單的Transformer Block示例：

```python
import torch.nn as nn
import torch

class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Self-Attention
        residual = x
        output = self.self_attn(x, x)
        output += residual
        output = self.dropout(output)

        # Feed Forward
        residual = output
        output = self.feed_forward(output)
        output += residual
        output = self.dropout(output)

        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, query, key, value):
        batch_size = query.size(0)
        head_dim = self.d_model // self.n_heads

        query = self.query_linear(query).view(batch_size, -1, self.n_heads, head_dim)
        key = self.key_linear(key).view(batch_size, -1, self.n_heads, head_dim)
        value = self.value_linear(value).view(batch_size, -1, self.n_heads, head_dim)

        attention_scores = torch.matmul(query.permute(0, 2, 1, 3), key.permute(0, 2, 3, 1)) / math.sqrt(head_dim)
        attention_scores = F.softmax(attention_scores, dim=-1)
        attention_scores = self.dropout(attention_scores)

        output = torch.matmul(attention_scores, value)
        output = output.view(batch_size, -1, self.d_model)

        return output

```

這個Transformer Block包含兩個主要部分：Self-Attention和Feed Forward。Self-Attention使用MultiHeadAttention模塊來計算輸入序列之間的注意力權重

User：能寫歌快速排序嗎
llama3 Chinese：當然可以，以下是Python實現的快速排序算法：

def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[0]
    left = [x for x in arr[1:] if x < pivot]
    right = [x for x in arr[1:] if x >= pivot]
    return quick_sort(left) + [pivot] + quick_sort(right)