🚀 Qwen2-VL デバッグ用モデル
このモデルはデバッグ用です。Qwen/Qwen2-VL-7B-Instruct の設定を使用してランダムに初期化されていますが、サイズはより小さくなっています。
🚀 クイックスタート
モデルの使用方法
以下のコードで、このモデルを使った画像説明の生成を試すことができます。重みはランダムに初期化されているため、出力される文章に意味はなく、パイプラインの動作確認のみを目的としています。
💻 使用例
基本的な使用法
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# Load the tiny debug checkpoint and its matching processor.
model_id = "yujiepan/qwen2-vl-tiny-random"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)

# Download the demo image. The timeout keeps the example from hanging forever
# on a stalled connection, and raise_for_status() surfaces HTTP errors before
# PIL tries to parse an error page as an image.
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# A single user turn: one image placeholder followed by the text instruction.
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Render the chat template to a prompt string, then tokenize it together with
# the image.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
# The model was placed with device_map="auto", so follow the model's device
# instead of assuming CUDA is where it ended up.
inputs = inputs.to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the prompt tokens from the front of each returned sequence so that only
# the newly generated tokens are decoded.
generated_ids = [
    sequence[len(prompt_ids):]
    for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)
高度な使用法
以下のコードは、モデルをランダムに初期化し、保存して、推論を試すためのものです。
import os
from typing import Dict
import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
AutoTokenizer, GenerationConfig, pipeline, set_seed)
from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration
# Source checkpoint whose configuration is reused, and where the shrunken
# copy is written locally.
model_id = "Qwen/Qwen2-VL-7B-Instruct"
repo_id = "yujiepan/qwen2-vl-tiny-random"
save_path = f"/tmp/{repo_id}"

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# Shrink the language-model side of the configuration.
text_overrides = {
    "hidden_size": 16,
    "intermediate_size": 32,
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "num_key_value_heads": 1,
}
for field_name, field_value in text_overrides.items():
    setattr(config, field_name, field_value)

# Shrink the vision tower the same way.
vision_overrides = {
    "embed_dim": 16,
    "num_heads": 2,
    "hidden_size": 16,
    "depth": 2,
}
for field_name, field_value in vision_overrides.items():
    setattr(config.vision_config, field_name, field_value)

# NOTE(review): presumably the sections must add up to half of the per-head
# dimension (16 / 2 heads / 2 = 4) -- confirm against the model code.
config.rope_scaling['mrope_section'] = [1, 1, 2]

# Build the model from the reduced config (no pretrained weights are loaded
# here), then cast to bfloat16, move to the GPU and switch to eval mode.
model = Qwen2VLForConditionalGeneration(config=config)
model = model.to(torch.bfloat16)
model = model.cuda()
model = model.eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)

# Re-initialise every parameter uniformly in [-0.3, 0.3]. Seeding first and
# walking the parameters in sorted order keeps the draw order fixed.
set_seed(42)
with torch.no_grad():
    for _name, weight in sorted(model.named_parameters()):
        torch.nn.init.uniform_(weight, -0.3, 0.3)

# Save the model together with the original checkpoint's processor so the
# output directory can be loaded on its own.
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)

# List what was written, with sizes, as a quick visual check.
listing_command = f"ls -alh {save_path}"
os.system(listing_command)
def try_inference():
    """Reload the saved checkpoint and print a short generation for a demo image.

    Reads the module-level ``save_path``, loads the processor and model from
    it onto CUDA in bfloat16, asks the model to describe a downloaded demo
    image, and prints the decoded continuation (at most 16 new tokens).
    Returns nothing.
    """
    image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    demo_image = Image.open(requests.get(image_url, stream=True).raw)

    # One user turn: an image placeholder followed by the text instruction.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
    conversation = [user_turn]

    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        save_path,
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    )

    prompt = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
    )
    batch = processor(
        text=[prompt],
        images=[demo_image],
        padding=True,
        return_tensors="pt",
    )
    batch = batch.to("cuda")

    sequences = model.generate(**batch, max_new_tokens=16)

    # Keep only the tokens that follow each prompt in the returned sequences.
    completions = []
    for prompt_ids, sequence in zip(batch.input_ids, sequences):
        completions.append(sequence[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print(decoded)
# Smoke-test the pipeline by loading the checkpoint from save_path and generating.
try_inference()