llama-2-medical-consultation开源医疗咨询模型 - 免费提供专业医疗问题解答

首页

Llama 2 Medical Consultation

由 Ashishkr 开发

基于Llama-2-7b-chat-hf微调的医疗咨询模型，专门用于回答医疗相关问题

大型语言模型

TensorBoard

#医疗问答微调 #神经科症状分析 #术后康复咨询

下载量 364

发布时间 : 8/23/2023

模型简介

该模型是针对医疗咨询场景微调的大语言模型，能够根据患者描述提供医疗建议

模型特点

医疗领域微调

专门针对医疗咨询场景进行优化，能够理解医学术语和患者描述

资源高效

可在T4 GPU(16GB显存)和CPU(32GB内存)上运行，适合多种部署环境

4位量化

支持4位量化技术，降低内存需求同时保持模型性能

模型能力

医疗问题解答

症状分析

医学术语理解

患者咨询响应

使用案例

医疗咨询

症状分析

根据患者描述的症状提供初步医疗建议

帮助患者了解可能的病因和应对措施

术后咨询

回答患者关于术后恢复的问题

提供术后护理建议和注意事项

🚀 peft

本项目是基于meta-llama/Llama-2-7b-hf模型微调得到的医疗咨询模型，可在T4 GPU（16GB VRAM）以及CPU（32GB RAM）上运行。

🚀 快速开始

运行环境说明

本模型支持在GPU和CPU上运行，下面分别给出相应的运行代码示例。

在GPU上运行

import transformers
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from torch import cuda, bfloat16

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


hf_auth = "your-huggingface-access-token"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

config = PeftConfig.from_pretrained("Ashishkr/llama-2-medical-consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama-2-medical-consultation").to(device)

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

生成回复的函数

def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92):

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(
        device
    )

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    return decoded_output[len(prompt) :]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks.\n

response:  """
# You can use the function as before
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)

在CPU上运行

import torch
import transformers
from torch import cuda, bfloat16
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload = True
)

import torch
hf_auth = "YOUR-HUGGINGFACE-ACCESS-TOKEN"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    # device_map='auto',
    use_auth_token=hf_auth
)

config = PeftConfig.from_pretrained("Ashishkr/llama-2-medical-consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama-2-medical-consultation").to(device)

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92):

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(
        device
    )

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.float32
    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    return decoded_output[len(prompt) :]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks.\n

response:  """
# You can use the function as before
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)