🚀 InternVL3-14B-Instruct
InternVL3-14B-Instruct是一款先进的多模态大语言模型,在多模态感知、推理和语言处理等方面表现出色,拓展了多模态能力的应用范围。
🚀 快速开始
我们提供了使用 `transformers` 库运行 `InternVL3-14B` 的示例代码。
⚠️ 重要提示
请使用 `transformers>=4.37.2` 以确保模型正常工作。
模型加载
16位(bf16 / fp16)
```python
import torch
from transformers import AutoTokenizer, AutoModel

path = "OpenGVLab/InternVL3-14B"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
```
BNB 8位量化
```python
import torch
from transformers import AutoTokenizer, AutoModel

path = "OpenGVLab/InternVL3-14B"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval()
```
多GPU使用
以下代码的编写方式是为了避免在多GPU推理期间由于张量不在同一设备上而出现错误。通过确保大语言模型(LLM)的第一层和最后一层在同一设备上,我们可以防止此类错误。
```python
import math
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel

def split_model(model_path):
    device_map = {}
    world_size = torch.cuda.device_count()
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map

path = "OpenGVLab/InternVL3-14B"
device_map = split_model(path)
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map).eval()
```
使用Transformers进行推理
```python
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values
def split_model(model_path):
    device_map = {}
    world_size = torch.cuda.device_count()
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = i
            layer_cnt += 1
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map

# If you set `load_in_8bit=True`, you will need two 80GB GPUs.
# If you set `load_in_8bit=False`, you will need at least three 80GB GPUs.
path = 'OpenGVLab/InternVL3-14B'
device_map = split_model(path)
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# set the max number of tiles in `max_num`
pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)
# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# single-image single-round conversation (单图像单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# single-image multi-round conversation (单图像多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, combined images (多图像多轮对话,组合图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, separate images (多图像多轮对话,分离图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# batch inference, single image per sample (单图像批量推理)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
                             num_patches_list=num_patches_list,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')
# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])
    return frame_indices
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())
    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list
video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
```
流式输出
除了上述方法,你还可以使用以下代码实现流式输出。
```python
from transformers import TextIteratorStreamer
from threading import Thread
# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,
    history=None, return_history=False, generation_config=generation_config,
))
thread.start()
# Initialize an empty string to store the generated text
generated_text = ''
# Loop through the streamer to get the new text as it is generated
for new_text in streamer:
    if new_text == model.conv_template.sep:
        break
    generated_text += new_text
    print(new_text, end='', flush=True)  # Print each new chunk of generated text on the same line
```
✨ 主要特性
- 先进的多模态能力:与InternVL 2.5相比,InternVL3展现出更出色的多模态感知和推理能力,并将多模态能力扩展到工具使用、GUI代理、工业图像分析、3D视觉感知等领域。
- 原生多模态预训练:提出原生多模态预训练方法,将语言和视觉学习整合到一个预训练阶段,使模型能同时学习语言和多模态表示,增强处理视觉-语言任务的能力。
- 更好的长上下文理解:集成可变视觉位置编码(V2PE),利用更小、更灵活的位置增量处理视觉标记,使InternVL3在长上下文理解方面表现更优。
- 超越Qwen2.5的文本性能:得益于原生多模态预训练,InternVL3系列在整体文本性能上优于Qwen2.5系列。
📚 详细文档
模型介绍
这是InternVL3-14B的SFT版本,经过了原生多模态预训练和SFT,但未经过MPO。如果你不确定使用哪个版本,请使用InternVL3-14B版本。
InternVL3是一系列先进的多模态大语言模型(MLLM),整体性能优越。与InternVL 2.5相比,InternVL3在多模态感知和推理能力上表现更出色,并且进一步拓展了多模态能力,涵盖工具使用、GUI代理、工业图像分析、3D视觉感知等领域。
InternVL3家族
以下表格概述了InternVL3系列:
| 模型名称 | 视觉部分 | 语言部分 | Hugging Face链接 |
| --- | --- | --- | --- |
| InternVL3-1B | InternViT-300M-448px-V2_5 | Qwen2.5-0.5B | 链接 |
| InternVL3-2B | InternViT-300M-448px-V2_5 | Qwen2.5-1.5B | 链接 |
| InternVL3-8B | InternViT-300M-448px-V2_5 | Qwen2.5-7B | 链接 |
| InternVL3-9B | InternViT-300M-448px-V2_5 | internlm3-8b-instruct | 链接 |
| InternVL3-14B | InternViT-300M-448px-V2_5 | Qwen2.5-14B | 链接 |
| InternVL3-38B | InternViT-6B-448px-V2_5 | Qwen2.5-32B | 链接 |
| InternVL3-78B | InternViT-6B-448px-V2_5 | Qwen2.5-72B | 链接 |
模型架构
如下图所示,InternVL3保留了与InternVL 2.5及其前身InternVL 1.5和2.0相同的模型架构,遵循“ViT-MLP-LLM”范式。在这个新版本中,我们使用随机初始化的MLP投影器,将新的增量预训练的InternViT与各种预训练的LLM(包括InternLM 3和Qwen 2.5)集成在一起。
与之前的版本一样,我们应用了像素重排操作,将视觉标记的数量减少到原来的四分之一。此外,我们采用了与InternVL 1.5类似的动态分辨率策略,将图像划分为448×448像素的图块。从InternVL 2.0开始,关键的区别在于我们还增加了对多图像和视频数据的支持。
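下面给出像素重排(pixel unshuffle)操作的一个简化示意,说明如何把相邻 2×2 区域的特征折叠进通道维,从而将每个 448×448 图块的视觉标记从 1024 个压缩到 256 个。该代码仅为概念性草图,并非模型仓库中的确切实现:

```python
import torch

def pixel_unshuffle(x: torch.Tensor, scale_factor: float = 0.5) -> torch.Tensor:
    # x: (N, H, W, C) 的视觉特征;空间分辨率各缩小一半、通道数扩大 4 倍,
    # 视觉标记数量因此变为原来的四分之一
    n, h, w, c = x.size()
    x = x.view(n, h, int(w * scale_factor), int(c / scale_factor))
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(n, int(w * scale_factor), int(h * scale_factor),
               int(c / (scale_factor * scale_factor)))
    return x

# 448x448 图块经 ViT(patch 大小 14)得到 32x32=1024 个标记,重排后为 16x16=256 个
feats = torch.randn(1, 32, 32, 1024)
print(pixel_unshuffle(feats).shape)  # torch.Size([1, 16, 16, 4096])
```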
值得注意的是,在InternVL3中,我们集成了可变视觉位置编码(V2PE),它为视觉标记使用更小、更灵活的位置增量。得益于V2PE,InternVL3与前代模型相比,表现出更好的长上下文理解能力。
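为了直观理解 V2PE 的位置分配方式,下面给出一个概念性示意:文本标记的位置索引按 1 递增,而视觉标记使用更小的增量 \( \delta \)(此处 \( \delta=1/16 \) 仅为示例取值)。该示意与官方实现无关:

```python
def assign_position_ids(token_types, delta=1 / 16):
    """token_types 为序列中每个标记的类型('text' 或 'image'),返回分配的位置索引。"""
    position_ids, pos = [], 0.0
    for t in token_types:
        position_ids.append(pos)
        pos += 1.0 if t == 'text' else delta
    return position_ids

# 例:两段文本之间插入 4 个视觉标记
print(assign_position_ids(['text', 'text', 'image', 'image', 'image', 'image', 'text']))
# [0.0, 1.0, 2.0, 2.0625, 2.125, 2.1875, 2.25]
```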
训练策略
原生多模态预训练
我们提出了一种原生多模态预训练方法,将语言和视觉学习整合到一个预训练阶段。与先训练纯语言模型、再使其适配其他模态的标准范式不同,我们的方法将多模态数据(如图像-文本、视频-文本或图像-文本交错序列)与大规模文本语料库交织在一起。这种统一的训练方案使模型能够同时学习语言和多模态表示,最终增强其处理视觉-语言任务的能力,而无需单独的对齐或桥接模块。更多细节请参阅我们的论文。
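下面用一个极简的采样函数示意这种“把多模态数据与纯文本语料交织进同一训练流”的数据组织思路(字段名与采样比例均为假设,仅用于说明概念):

```python
import random

def sample_pretraining_example(text_corpus, multimodal_corpus, text_ratio=0.5):
    """按给定比例从纯文本与图文交错两类数据源中采样,返回统一格式的样本;
    两类样本都只以 next-token 预测作为训练目标。"""
    if random.random() < text_ratio:
        return {'tokens': random.choice(text_corpus), 'images': []}
    sample = random.choice(multimodal_corpus)
    return {'tokens': sample['text_with_image_placeholders'], 'images': sample['images']}
```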
监督微调
在这个阶段,InternVL2.5中提出的随机JPEG压缩、平方损失重新加权和多模态数据打包技术也被应用于InternVL3系列。与InternVL2.5相比,InternVL3的SFT阶段的主要进步在于使用了更高质量和更多样化的训练数据。具体来说,我们进一步扩展了工具使用、3D场景理解、GUI操作、长上下文任务、视频理解、科学图表、创意写作和多模态推理的训练样本。
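以其中的随机JPEG压缩为例,下面是一个简化的数据增强实现示意(概率与质量区间均为假设值,并非论文中的具体设置):

```python
import io
import random
from PIL import Image

def random_jpeg_compression(img: Image.Image, prob=0.5, quality_range=(75, 100)) -> Image.Image:
    """以概率 prob 将图像按随机质量重新编码为 JPEG,模拟真实场景中的压缩退化。"""
    if random.random() > prob:
        return img
    buffer = io.BytesIO()
    img.convert('RGB').save(buffer, format='JPEG', quality=random.randint(*quality_range))
    buffer.seek(0)
    return Image.open(buffer).convert('RGB')
```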
混合偏好优化
在预训练和SFT期间,模型根据先前的真实标记来预测下一个标记。然而,在推理期间,模型根据自己的先前输出预测每个标记。真实标记和模型预测标记之间的这种差异引入了分布偏移,这可能会损害模型的思维链(CoT)推理能力。为了缓解这个问题,我们采用了MPO,它引入了来自正样本和负样本的额外监督,以使模型响应分布与真实分布对齐,从而提高推理性能。具体来说,MPO的训练目标是偏好损失 \(\mathcal{L}_{\text{p}}\)、质量损失 \(\mathcal{L}_{\text{q}}\) 和生成损失 \(\mathcal{L}_{\text{g}}\) 的组合,可以表述如下:
$$ \mathcal{L}=w_{p}\cdot\mathcal{L}_{\text{p}} + w_{q}\cdot\mathcal{L}_{\text{q}} + w_{g}\cdot\mathcal{L}_{\text{g}}, $$
其中 \(w_{*}\) 表示每个损失分量的权重。有关MPO的更多细节,请参阅我们的论文。
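对应上式,总损失的组合方式可以写成如下草图(各损失项的具体实现与权重取值请以论文为准,此处权重仅为占位示例):

```python
def mpo_total_loss(preference_loss, quality_loss, generation_loss,
                   w_p=1.0, w_q=1.0, w_g=1.0):
    """MPO 总损失:偏好损失、质量损失与生成(语言建模)损失的加权和。"""
    return w_p * preference_loss + w_q * quality_loss + w_g * generation_loss
```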
测试时缩放
测试时缩放已被证明是增强LLM和MLLM推理能力的有效方法。在这项工作中,我们使用Best-of-N评估策略,并采用[VisualPRM-8B](https://huggingface.co/OpenGVLab/VisualPRM-8B)作为评估模型,以选择最佳响应进行推理和数学评估。
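Best-of-N 策略本身可以用如下草图概括:对同一问题采样 N 个候选回答,再由评判模型打分并保留得分最高者(其中 `score_with_prm` 是假设的接口,实际评估中由 VisualPRM-8B 给出分数):

```python
def best_of_n(generate_fn, score_with_prm, question, n=8):
    """采样 N 个候选回答并返回评判模型打分最高的一个。"""
    candidates = [generate_fn(question) for _ in range(n)]
    scores = [score_with_prm(question, answer) for answer in candidates]
    return candidates[scores.index(max(scores))]
```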
多模态能力评估
- 多模态推理和数学
- OCR、图表和文档理解
- 多图像和现实世界理解
- 综合多模态和幻觉评估
- 视觉定位
- 多模态多语言理解
- 视频理解
- GUI定位
- 空间推理
语言能力评估
我们将InternVL3与Qwen2.5聊天模型进行了比较,Qwen2.5的相应预训练基础模型被用作InternVL3中语言组件的初始化。得益于原生多模态预训练,InternVL3系列在整体文本性能上甚至优于Qwen2.5系列。请注意,Qwen2.5系列的评估分数可能与官方报告的不同,因为我们在所有数据集上采用了表中提供的提示版本进行OpenCompass评估。
消融研究
原生多模态预训练
我们在InternVL2-8B模型上进行了实验,同时保持其架构、初始化参数和训练数据完全不变。传统上,InternVL2-8B采用的训练流程是先进行MLP预热阶段以进行特征对齐,然后进行指令调优阶段。在我们的实验中,我们用原生多模态预训练过程取代了传统的MLP预热阶段。这种修改隔离了原生多模态预训练对模型整体多模态能力的贡献。
下图的评估结果表明,经过原生多模态预训练的模型在大多数基准测试中的性能与经过完整多阶段训练的InternVL2-8B基线相当。此外,当在更高质量的数据上进行指令调优时,该模型在评估的多模态任务中表现出进一步的性能提升。这些发现强调了原生多模态预训练在赋予MLLM强大多模态能力方面的效率。
混合偏好优化
如下表所示,与未使用MPO的模型相比,使用MPO进行微调的模型在七个多模态推理基准测试中表现出更优越的推理性能。具体来说,InternVL3-78B和InternVL3-38B分别比其对应模型高出4.1和4.5分。值得注意的是,用于MPO的训练数据是用于SFT的训练数据的子集,这表明性能提升主要源于训练算法而非训练数据。
可变视觉位置编码
如下表所示,引入V2PE在大多数评估指标上带来了显著的性能提升。此外,我们的消融研究通过改变位置增量 \( \delta \) 揭示,即使对于主要涉及传统上下文的任务,相对较小的 \( \delta \) 值也可以实现最佳性能。这些发现为未来改进MLLM中视觉标记的位置编码策略提供了重要见解。
🔧 技术细节
模型加载
在不同的硬件环境下,我们提供了多种模型加载方式,包括16位(bf16 / fp16)加载、BNB 8位量化加载以及多GPU加载。通过合理选择加载方式,可以充分利用硬件资源,提高模型的运行效率。
推理过程
在推理过程中,我们使用了一系列的图像处理和数据预处理方法,如构建图像变换、寻找最接近的宽高比、动态预处理图像等。这些方法确保了模型能够准确地处理不同类型的输入数据,包括单图像、多图像和视频数据。
流式输出
通过使用 `TextIteratorStreamer` 和多线程技术,我们实现了模型的流式输出。这种方式可以在生成文本的过程中实时显示结果,提高用户体验。
💻 使用示例
基础用法
```python
# 模型加载和推理示例
import torch
from transformers import AutoTokenizer, AutoModel

path = "OpenGVLab/InternVL3-14B"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# load_image 的定义见上文“使用Transformers进行推理”部分
pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
```
高级用法
```python
# 多图像多轮对话示例
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
```
📦 安装指南
LMDeploy安装
```bash
# 如果 lmdeploy<0.7.3,你需要显式设置 chat_template_config=ChatTemplateConfig(model_name='internvl2_5')
pip install "lmdeploy>=0.7.3"
```
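安装完成后,可以参考如下草图使用 LMDeploy 的 pipeline 接口进行图文推理(示例图片 URL 以及 session_len、tp 等参数仅为示意,请按实际环境调整):

```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

model = 'OpenGVLab/InternVL3-14B'
# 示例图片,可替换为任意本地路径或 URL
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=16384, tp=1))
response = pipe(('describe this image', image))
print(response.text)
```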
OpenAI库安装
```bash
pip install openai
```
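如果已经通过 LMDeploy 的 api_server(或其他 OpenAI 兼容服务)部署了模型,可以参考如下草图用 openai 库进行调用(服务地址、端口与图片 URL 均为示例):

```python
from openai import OpenAI

# 假设服务已在本地 23333 端口以 OpenAI 兼容接口启动(地址仅为示例)
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'describe this image'},
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/image.jpg'}},
        ],
    }],
    temperature=0.8)
print(response.choices[0].message.content)
```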
📄 许可证
本项目采用MIT许可证发布。本项目使用预训练的Qwen2.5作为组件,该组件遵循Apache 2.0许可证。
引用
如果你在研究中发现这个项目有用,请考虑引用:
```BibTeX
@article{chen2024expanding,
title={Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling},
author={Chen, Zhe and Wang, Weiyun and Cao, Yue and Liu, Yangzhou and Gao, Zhangwei and Cui, Erfei and Zhu, Jinguo and Ye, Shenglong and Tian, Hao and Liu, Zhaoyang and others},
journal={arXiv preprint arXiv:2412.05271},
year={2024}
}
@article{wang2024mpo,
title={Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization},
author={Wang, Weiyun and Chen, Zhe and Wang, Wenhai and Cao, Yue and Liu, Yangzhou and Gao, Zhangwei and Zhu, Jinguo and Zhu, Xizhou and Lu, Lewei and Qiao, Yu and Dai, Jifeng},
journal={arXiv preprint arXiv:2411.10442},
year={2024}
}
@article{chen2024far,
title={How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites},
author={Chen, Zhe and Wang, Weiyun and Tian, Hao and Ye, Shenglong and Gao, Zhangwei and Cui, Erfei and Tong, Wenwen and Hu, Kongzhi and Luo, Jiapeng and Ma, Zheng and others},
journal={arXiv preprint arXiv:2404.16821},
year={2024}
}
@inproceedings{chen2024internvl,
title={Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks},
author={Chen, Zhe and Wu, Jiannan and Wang, Wenhai and Su, Weijie and Chen, Guo and Xing, Sen and Zhong, Muyan and Zhang, Qinglong and Zhu, Xizhou and Lu, Lewei and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={24185--24198},
year={2024}
}
```