🚀 Qwen2-VL Tiny Random
This is a small randomly initialized model based on the configuration of Qwen/Qwen2-VL-7B-Instruct, mainly used for debugging purposes.
🚀 Quick Start
This model is designed for debugging. It's randomly initialized using the configuration from Qwen/Qwen2-VL-7B-Instruct but with a smaller size.
💻 Usage Examples
Basic Usage
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
model_id = "yujiepan/qwen2-vl-tiny-random"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_id, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id)
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
conversation = [
{
"role": "user",
"content": [
{
"type": "image",
},
{"type": "text", "text": "Describe this image."},
],
}
]
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(
text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
inputs = inputs.to("cuda")
output_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids = [
output_ids[len(input_ids) :]
for input_ids, output_ids in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)
Advanced Usage
import os
from typing import Dict
import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
AutoTokenizer, GenerationConfig, pipeline, set_seed)
from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration
model_id = "Qwen/Qwen2-VL-7B-Instruct"
repo_id = "yujiepan/qwen2-vl-tiny-random"
save_path = f"/tmp/{repo_id}"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.hidden_size = 16
config.intermediate_size = 32
config.num_attention_heads = 2
config.num_hidden_layers = 2
config.num_key_value_heads = 1
config.vision_config.embed_dim = 16
config.vision_config.num_heads = 2
config.vision_config.hidden_size = 16
config.vision_config.depth = 2
config.rope_scaling['mrope_section'] = [1, 1, 2]
model = Qwen2VLForConditionalGeneration(config=config)
model = model.to(torch.bfloat16).cuda().eval()
model.generation_config = GenerationConfig.from_pretrained(
model_id, trust_remote_code=True,
)
set_seed(42)
with torch.no_grad():
for _, p in sorted(model.named_parameters()):
torch.nn.init.uniform_(p, -0.3, 0.3)
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")
def try_inference():
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
conversation = [
{
"role": "user",
"content": [
{
"type": "image",
},
{"type": "text", "text": "Describe this image."},
],
}
]
processor = AutoProcessor.from_pretrained(save_path)
model = Qwen2VLForConditionalGeneration.from_pretrained(
save_path, torch_dtype=torch.bfloat16, device_map='cuda')
text_prompt = processor.apply_chat_template(
conversation, add_generation_prompt=True)
inputs = processor(
text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
inputs = inputs.to("cuda")
output_ids = model.generate(**inputs, max_new_tokens=16)
generated_ids = [
output_ids[len(input_ids):]
for input_ids, output_ids in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)
try_inference()
📚 Documentation
Model Information
Property |
Details |
Library Name |
transformers |
Pipeline Tag |
text-generation |
Inference |
true |
Base Model |
Qwen/Qwen2-VL-7B-Instruct |
Widget Example
- Text: Hello!
- Example Title: Hello world
- Group: Python