模型简介
模型特点
模型能力
使用案例
🚀 InternVL3-1B
InternVL3-1B 是先进的多模态大语言模型(MLLM)系列,相比前代模型,在多模态感知、推理等能力上有显著提升,还拓展了工具使用、GUI 代理等多模态能力。
项目链接
- 📂 GitHub
- 📜 InternVL 1.0
- 📜 InternVL 1.5
- 📜 InternVL 2.5
- 📜 InternVL2.5-MPO
- 📜 InternVL3
- 🆕 Blog
- 🗨️ Chat Demo
- 🤗 HF Demo
- 🚀 Quick Start
- 📖 Documents

✨ 主要特性
- 卓越的多模态性能:相比 InternVL 2.5,InternVL3 展现出更出色的多模态感知和推理能力,还将多模态能力拓展到工具使用、GUI 代理、工业图像分析、3D 视觉感知等领域。
- 统一的预训练方法:提出原生多模态预训练方法,将语言和视觉学习整合到一个预训练阶段,增强模型处理视觉-语言任务的能力。
- 灵活的视觉位置编码:集成可变视觉位置编码(V2PE),使模型在长上下文理解能力上优于前代。
📦 安装指南
LMDeploy 安装
# 如果 lmdeploy<0.7.3,需要显式设置 chat_template_config=ChatTemplateConfig(model_name='internvl2_5')
pip install "lmdeploy>=0.7.3"
OpenAI 安装
pip install openai
💻 使用示例
基础用法
# 模型加载 - 16 位 (bf16 / fp16)
import torch
from transformers import AutoTokenizer, AutoModel
path = "OpenGVLab/InternVL3-1B"
# Load the checkpoint in bf16, then switch to inference mode and move it to the GPU.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
)
model = model.eval().cuda()
高级用法
多 GPU 推理
import math
import torch
from transformers import AutoTokenizer, AutoModel
def split_model(model_name):
    """Build a ``device_map`` that spreads the LLM decoder layers over all visible GPUs.

    GPU 0 also hosts the vision model, so it is budgeted as half a GPU and
    receives about half as many decoder layers as each of the other GPUs.

    Args:
        model_name: Hugging Face repo id or local checkpoint directory. A bare
            name without a namespace (e.g. ``'InternVL3-1B'``) is looked up
            under the ``OpenGVLab`` organization.

    Returns:
        dict mapping module names to GPU indices, intended for the
        ``device_map`` argument of ``AutoModel.from_pretrained``.

    Raises:
        RuntimeError: if no CUDA device is visible.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        # Without this guard the per-GPU list below is empty and indexing it
        # fails with an unhelpful IndexError.
        raise RuntimeError('split_model requires at least one CUDA device')

    # Local imports: the surrounding snippet only imports AutoTokenizer and AutoModel.
    import os
    from transformers import AutoConfig

    # Callers in this document pass a bare name such as 'InternVL3-1B';
    # prefix the organization so the config can be resolved.
    if '/' in model_name or os.path.isdir(model_name):
        model_path = model_name
    else:
        model_path = f'OpenGVLab/{model_name}'
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)

    device_map = {}
    layer_cnt = 0
    for gpu_index, num_layer in enumerate(num_layers_per_gpu):
        for _ in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_index
            layer_cnt += 1

    # Everything that is not a decoder layer stays on GPU 0.
    # Both `tok_embeddings`/`output` and `embed_tokens`/`lm_head` are listed,
    # presumably to cover different LLM backbones' module names — TODO confirm.
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    # The last decoder layer is pinned to GPU 0 as well, overriding the loop above.
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map
path = "OpenGVLab/InternVL3-1B"
# Compute the module-to-GPU placement first, then hand it to the loader.
device_map = split_model('InternVL3-1B')
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map,
)
model = model.eval()
推理示例
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Return the preprocessing pipeline applied to every image tile.

    The pipeline converts the image to RGB if needed, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts it to
    a tensor and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    def ensure_rgb(img):
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (columns, rows) grid whose aspect ratio is closest to ``aspect_ratio``.

    Candidates are scanned in order. When a candidate ties with the current
    best, it replaces the best only if the source image area
    (``width * height``) exceeds half the area that candidate's grid would
    cover. Returns ``(1, 1)`` when ``target_ratios`` is empty.
    """
    source_area = width * height
    half_tile_area = 0.5 * image_size * image_size
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
        elif gap == smallest_gap and source_area > half_tile_area * cols * rows:
            chosen = candidate
    return chosen
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` to the best-matching tile grid and cut it into square tiles.

    Every (columns, rows) grid whose tile count lies in
    ``[min_num, max_num]`` is a candidate; the one closest to the image's
    aspect ratio is chosen via ``find_closest_aspect_ratio``. The image is
    resized to that grid and cropped into ``image_size`` x ``image_size``
    tiles in row-major order.

    Returns:
        list of tile images; when ``use_thumbnail`` is true and more than one
        tile was produced, a whole-image thumbnail is appended at the end.
    """
    src_w, src_h = image.size
    aspect_ratio = src_w / src_h

    # Enumerate the candidate grids, smallest tile count first.
    candidates = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    }
    candidates = sorted(candidates, key=lambda grid: grid[0] * grid[1])

    # Choose the grid closest to the source aspect ratio.
    grid = find_closest_aspect_ratio(aspect_ratio, candidates, src_w, src_h, image_size)

    # Target canvas size and tile count for that grid.
    target_width = image_size * grid[0]
    target_height = image_size * grid[1]
    blocks = grid[0] * grid[1]

    # Resize, then crop tile by tile.
    resized_img = image.resize((target_width, target_height))
    tiles = []
    for index in range(blocks):
        tiles_per_row = target_width // image_size
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(resized_img.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def load_image(image_file, input_size=448, max_num=12):
    """Open an image, tile it with a thumbnail, and return the stacked tile tensors.

    Args:
        image_file: anything accepted by ``Image.open``.
        input_size: side length of each square tile.
        max_num: upper bound on the number of tiles (the thumbnail is extra).
    """
    source = Image.open(image_file).convert('RGB')
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(source, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])
def split_model(model_name):
    """Build a ``device_map`` that spreads the LLM decoder layers over all visible GPUs.

    GPU 0 also hosts the vision model, so it is budgeted as half a GPU and
    receives about half as many decoder layers as each of the other GPUs.

    Args:
        model_name: Hugging Face repo id or local checkpoint directory. A bare
            name without a namespace (e.g. ``'InternVL3-1B'``) is looked up
            under the ``OpenGVLab`` organization.

    Returns:
        dict mapping module names to GPU indices, intended for the
        ``device_map`` argument of ``AutoModel.from_pretrained``.

    Raises:
        RuntimeError: if no CUDA device is visible.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        # Without this guard the per-GPU list below is empty and indexing it
        # fails with an unhelpful IndexError.
        raise RuntimeError('split_model requires at least one CUDA device')

    # Local imports: the surrounding snippet does not import AutoConfig or os.
    import os
    from transformers import AutoConfig

    # Callers in this document pass a bare name such as 'InternVL3-1B';
    # prefix the organization so the config can be resolved.
    if '/' in model_name or os.path.isdir(model_name):
        model_path = model_name
    else:
        model_path = f'OpenGVLab/{model_name}'
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * world_size
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)

    device_map = {}
    layer_cnt = 0
    for gpu_index, num_layer in enumerate(num_layers_per_gpu):
        for _ in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_index
            layer_cnt += 1

    # Everything that is not a decoder layer stays on GPU 0.
    # Both `tok_embeddings`/`output` and `embed_tokens`/`lm_head` are listed,
    # presumably to cover different LLM backbones' module names — TODO confirm.
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.model.rotary_emb'] = 0
    device_map['language_model.lm_head'] = 0
    # The last decoder layer is pinned to GPU 0 as well, overriding the loop above.
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
    return device_map
# NOTE(review): the two GPU-count remarks below are kept from the original text;
# they look sized for a much larger checkpoint than the 1B model loaded here — confirm.
# If you set `load_in_8bit=True`, you will need two 80GB GPUs.
# If you set `load_in_8bit=False`, you will need at least three 80GB GPUs.
path = 'OpenGVLab/InternVL3-1B'
# Spread the model over the visible GPUs, then load it in bf16 and switch to eval mode.
device_map = split_model('InternVL3-1B')
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
load_in_8bit=False,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# `max_num` sets the maximum number of tiles per image
pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# Generation settings shared by all the chat examples that follow.
generation_config = dict(max_new_tokens=1024, do_sample=True)
# Pure-text conversation (no image: pixel_values is None)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# Follow-up turn: pass the returned history back in.
question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# Single-image, single-round conversation
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# Single-image, multi-round conversation
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# Multi-image, multi-round conversation, combined images
# (the tiles of both images are concatenated and referenced by a single <image> placeholder)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# Multi-image, multi-round conversation, separate images
# (one <image> placeholder per image; num_patches_list gives each image's tile count)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# Batch inference, one image per sample
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
# The same question is asked once per image.
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
num_patches_list=num_patches_list,
questions=questions,
generation_config=generation_config)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')
# 视频多轮对话
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Return ``num_segments`` frame indices spread evenly over a time window.

    Args:
        bound: ``(start, end)`` pair, multiplied by ``fps`` to get frame
            positions; a falsy value selects the whole video.
        fps: frames per second used to convert ``bound`` to frame positions.
        max_frame: upper clamp for the end of the window.
        first_idx: lower clamp for the start of the window.
        num_segments: number of indices to produce.

    Returns:
        numpy array with one index per segment, each offset by half a segment
        from the segment's rounded start.
    """
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    # Half-segment offset from the window start, shared by every segment.
    first_center = start_idx + (seg_size / 2)
    return np.array([
        int(first_center + np.round(seg_size * segment))
        for segment in range(num_segments)
    ])
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    """Sample frames from a video and return their tile tensors.

    Frame indices come from ``get_index``; each sampled frame is tiled with
    ``dynamic_preprocess`` (thumbnail enabled) and transformed like a still image.

    Returns:
        tuple ``(pixel_values, num_patches_list)``: the tiles of all frames
        concatenated along the first dimension, and the tile count of each frame.
    """
    reader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(reader) - 1
    fps = float(reader.get_avg_fps())

    to_tensor = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)

    per_frame_tiles = []
    num_patches_list = []
    for frame_index in frame_indices:
        frame = Image.fromarray(reader[frame_index].asnumpy()).convert('RGB')
        tiles = dynamic_preprocess(frame, image_size=input_size, use_thumbnail=True, max_num=max_num)
        frame_tensor = torch.stack([to_tensor(tile) for tile in tiles])
        num_patches_list.append(frame_tensor.shape[0])
        per_frame_tiles.append(frame_tensor)
    return torch.cat(per_frame_tiles), num_patches_list
# Sample 8 frames from the clip, one tile per frame (max_num=1).
video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
# One "FrameN: <image>" line per sampled frame is prepended to the question.
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# Follow-up turn on the same video: reuse the returned history.
question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
流式输出
from transformers import TextIteratorStreamer
from threading import Thread
# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)

# Define the generation configuration
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)

# Start the model chat in a separate thread
thread = Thread(
    target=model.chat,
    kwargs={
        'tokenizer': tokenizer,
        'pixel_values': pixel_values,
        'question': question,
        'history': None,
        'return_history': False,
        'generation_config': generation_config,
    },
)
thread.start()

# Initialize an empty string to store the generated text
generated_text = ''
# Loop through the streamer to get the new text as it is generated
for new_text in streamer:
    # Stop once the conversation template's separator arrives.
    if new_text == model.conv_template.sep:
        break
    generated_text += new_text
    print(new_text, end='', flush=True)  # Print each new chunk of generated text on the same line
LMDeploy 使用示例
'Hello, world' 示例
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL3-1B'
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
# Build the inference pipeline with the internvl2_5 chat template.
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(session_len=16384, tp=1),
    chat_template_config=ChatTemplateConfig(model_name='internvl2_5'),
)
# A prompt is a (text, image) tuple.
response = pipe(('describe this image', image))
print(response.text)
多图像推理
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
from lmdeploy.vl.constants import IMAGE_TOKEN
model = 'OpenGVLab/InternVL3-1B'
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(session_len=16384, tp=1),
    chat_template_config=ChatTemplateConfig(model_name='internvl2_5'),
)

image_urls = [
    'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
    'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg',
]
images = [load_image(img_url) for img_url in image_urls]

# Numbering images improves multi-image conversations
response = pipe(
    (f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images)
)
print(response.text)
批量提示推理
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL3-1B'
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(session_len=16384, tp=1),
    chat_template_config=ChatTemplateConfig(model_name='internvl2_5'),
)

image_urls = [
    "https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg",
    "https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg",
]
# One (text, image) prompt per URL; the pipeline takes the whole list at once.
prompts = [('describe this image', load_image(img_url)) for img_url in image_urls]
response = pipe(prompts)
print(response)
多轮对话
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL3-1B'
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(session_len=16384, tp=1),
    chat_template_config=ChatTemplateConfig(model_name='internvl2_5'),
)

image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)

# First turn opens a session; later turns pass it back via `session=`.
sess = pipe.chat(('describe this image', image), gen_config=gen_config)
print(sess.response.text)
sess = pipe.chat('What is the woman doing?', session=sess, gen_config=gen_config)
print(sess.response.text)
服务启动
lmdeploy serve api_server OpenGVLab/InternVL3-1B --chat-template internvl2_5 --server-port 23333 --tp 1
OpenAI 风格接口调用
from openai import OpenAI
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
# Use the first model id the server lists.
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'describe this image'},
                {
                    'type': 'image_url',
                    'image_url': {
                        'url': 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg',
                    },
                },
            ],
        },
    ],
    temperature=0.8,
    top_p=0.8,
)
print(response)
📚 详细文档
模型架构
InternVL3 沿用了 InternVL 2.5 及其前代的 "ViT-MLP-LLM" 范式,通过随机初始化的 MLP 投影器,将新的增量预训练的 InternViT 与各种预训练的大语言模型(如 InternLM 3 和 Qwen 2.5)集成。同时,应用像素重排操作,减少视觉标记数量,并采用动态分辨率策略,支持多图像和视频数据。
训练策略
原生多模态预训练
提出原生多模态预训练方法,将语言和视觉学习整合到一个预训练阶段,避免了先训练纯语言模型再适应其他模态的传统范式,使模型能同时学习语言和多模态表示。
监督微调
在 InternVL3 系列中采用了 InternVL2.5 提出的随机 JPEG 压缩、平方损失重加权和多模态数据打包技术,并使用更高质量和更多样化的训练数据。
混合偏好优化
使用 MPO 方法,引入正负样本的额外监督,使模型响应分布与真实分布对齐,提高推理性能。
测试时缩放
采用 Best-of-N 评估策略,并使用 [VisualPRM-8B](https://huggingface.co/OpenGVLab/VisualPRM-8B) 作为评估模型,选择最佳响应进行推理和数学评估。
评估指标
多模态能力评估
包括多模态推理和数学、OCR、图表和文档理解、多图像和真实世界理解、综合多模态和幻觉评估、视觉定位、多模态多语言理解、视频理解、GUI 定位和空间推理等方面。
语言能力评估
将 InternVL3 与 Qwen2.5 Chat 模型进行比较,由于原生多模态预训练,InternVL3 系列在整体文本性能上优于 Qwen2.5 系列。
消融实验
原生多模态预训练
在 InternVL2-8B 模型上进行实验,将传统的 MLP 预热阶段替换为原生多模态预训练过程,结果表明该方法能有效赋予模型强大的多模态能力。
混合偏好优化
使用 MPO 微调的模型在七个多模态推理基准测试中表现出优于未使用 MPO 的模型的推理性能。
可变视觉位置编码
引入 V2PE 使模型在大多数评估指标上有显著性能提升,且较小的位置增量能在常规上下文任务中实现最佳性能。
🔧 技术细节
模型家族
模型名称 | 视觉部分 | 语言部分 | HF 链接 |
---|---|---|---|
InternVL3-1B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-0.5B](https://huggingface.co/Qwen/Qwen2.5-0.5B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3-1B) |
InternVL3-2B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-1.5B](https://huggingface.co/Qwen/Qwen2.5-1.5B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3-2B) |
InternVL3-8B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3-8B) |
InternVL3-9B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [internlm3-8b-instruct](https://huggingface.co/internlm/internlm3-8b-instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3-9B) |
InternVL3-14B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3-14B) |
InternVL3-38B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3-38B) |
InternVL3-78B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3-78B) |
模型评估
多模态能力评估
        
语言能力评估

消融实验结果
原生多模态预训练

混合偏好优化

可变视觉位置编码

📄 许可证
本项目采用 MIT 许可证发布。项目使用预训练的 Qwen2.5 作为组件,该组件遵循 Qwen 许可证。
引用
如果您在研究中发现本项目有用,请考虑引用以下文献:
@article{chen2024expanding,
title={Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling},
author={Chen, Zhe and Wang, Weiyun and Cao, Yue and Liu, Yangzhou and Gao, Zhangwei and Cui, Erfei and Zhu, Jinguo and Ye, Shenglong and Tian, Hao and Liu, Zhaoyang and others},
journal={arXiv preprint arXiv:2412.05271},
year={2024}
}
@article{wang2024mpo,
title={Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization},
author={Wang, Weiyun and Chen, Zhe and Wang, Wenhai and Cao, Yue and Liu, Yangzhou and Gao, Zhangwei and Zhu, Jinguo and Zhu, Xizhou and Lu, Lewei and Qiao, Yu and Dai, Jifeng},
journal={arXiv preprint arXiv:2411.10442},
year={2024}
}
@article{chen2024far,
title={How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites},
author={Chen, Zhe and Wang, Weiyun and Tian, Hao and Ye, Shenglong and Gao, Zhangwei and Cui, Erfei and Tong, Wenwen and Hu, Kongzhi and Luo, Jiapeng and Ma, Zheng and others},
journal={arXiv preprint arXiv:2404.16821},
year={2024}
}
@inproceedings{chen2024internvl,
title={Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks},
author={Chen, Zhe and Wu, Jiannan and Wang, Wenhai and Su, Weijie and Chen, Guo and Xing, Sen and Zhong, Muyan and Zhang, Qinglong and Zhu, Xizhou and Lu, Lewei and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={24185--24198},
year={2024}
}









