llama-2-medical-consultation開源醫療諮詢模型 - 免費提供專業醫療問題解答

首頁

Llama 2 Medical Consultation

由Ashishkr開發

基於Llama-2-7b-chat-hf微調的醫療諮詢模型，專門用於回答醫療相關問題

大型語言模型

TensorBoard

#醫療問答微調 #神經科症狀分析 #術後康復諮詢

下載量 364

發布時間 : 8/23/2023

模型概述

該模型是針對醫療諮詢場景微調的大語言模型，能夠根據患者描述提供醫療建議

模型特點

醫療領域微調

專門針對醫療諮詢場景進行優化，能夠理解醫學術語和患者描述

資源高效

可在T4 GPU(16GB顯存)和CPU(32GB內存)上運行，適合多種部署環境

4位量化

支持4位量化技術，降低內存需求同時保持模型性能

模型能力

醫療問題解答

症狀分析

醫學術語理解

患者諮詢響應

使用案例

醫療諮詢

症狀分析

根據患者描述的症狀提供初步醫療建議

幫助患者瞭解可能的病因和應對措施

術後諮詢

回答患者關於術後恢復的問題

提供術後護理建議和注意事項

🚀 peft

本項目是基於meta-llama/Llama-2-7b-hf模型微調得到的醫療諮詢模型，可在T4 GPU（16GB VRAM）以及CPU（32GB RAM）上運行。

🚀 快速開始

運行環境說明

本模型支持在GPU和CPU上運行，下面分別給出相應的運行代碼示例。

在GPU上運行

import transformers
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from torch import cuda, bfloat16

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


hf_auth = "your-huggingface-access-token"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

config = PeftConfig.from_pretrained("Ashishkr/llama-2-medical-consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama-2-medical-consultation").to(device)

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

生成回覆的函數

def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92):

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(
        device
    )

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    return decoded_output[len(prompt) :]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks.\n

response:  """
# You can use the function as before
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)

在CPU上運行

import torch
import transformers
from torch import cuda, bfloat16
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload = True
)

import torch
hf_auth = "YOUR-HUGGINGFACE-ACCESS-TOKEN"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    # device_map='auto',
    use_auth_token=hf_auth
)

config = PeftConfig.from_pretrained("Ashishkr/llama-2-medical-consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama-2-medical-consultation").to(device)

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92):

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(
        device
    )

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.float32
    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    return decoded_output[len(prompt) :]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks.\n

response:  """
# You can use the function as before
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)