模型概述
模型特點
模型能力
使用案例
🚀 InternVL3-1B
InternVL3-1B 是先進的多模態大語言模型(MLLM)系列,相比前代模型,在多模態感知、推理等能力上有顯著提升,還拓展了工具使用、GUI 代理等多模態能力。
項目鏈接
- 📂 GitHub
- 📜 InternVL 1.0
- 📜 InternVL 1.5
- 📜 InternVL 2.5
- 📜 InternVL2.5-MPO
- 📜 InternVL3
- 🆕 Blog
- 🗨️ Chat Demo
- 🤗 HF Demo
- 🚀 Quick Start
- 📖 Documents

✨ 主要特性
- 卓越的多模態性能:相比 InternVL 2.5,InternVL3 展現出更出色的多模態感知和推理能力,還將多模態能力拓展到工具使用、GUI 代理、工業圖像分析、3D 視覺感知等領域。
- 統一的預訓練方法:提出原生多模態預訓練方法,將語言和視覺學習整合到一個預訓練階段,增強模型處理視覺 - 語言任務的能力。
- 靈活的視覺位置編碼:集成可變視覺位置編碼(V2PE),使模型在長上下文理解能力上優於前代。
📦 安裝指南
LMDeploy 安裝
# 如果 lmdeploy<0.7.3,需要顯式設置 chat_template_config=ChatTemplateConfig(model_name='internvl2_5')
pip install lmdeploy>=0.7.3
OpenAI 安裝
pip install openai
💻 使用示例
基礎用法
# 模型加載 - 16 位 (bf16 / fp16)
import torch
from transformers import AutoTokenizer, AutoModel
path = "OpenGVLab/InternVL3-1B"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval().cuda()
高級用法
多 GPU 推理
import math
import torch
from transformers import AutoTokenizer, AutoModel
def split_model(model_name):
device_map = {}
world_size = torch.cuda.device_count()
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
num_layers = config.llm_config.num_hidden_layers
# 由於第一個 GPU 將用於 ViT,將其視為半個 GPU。
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = i
layer_cnt += 1
device_map['vision_model'] = 0
device_map['mlp1'] = 0
device_map['language_model.model.tok_embeddings'] = 0
device_map['language_model.model.embed_tokens'] = 0
device_map['language_model.output'] = 0
device_map['language_model.model.norm'] = 0
device_map['language_model.model.rotary_emb'] = 0
device_map['language_model.lm_head'] = 0
device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
return device_map
path = "OpenGVLab/InternVL3-1B"
device_map = split_model('InternVL3-1B')
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
device_map=device_map).eval()
推理示例
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# 計算現有圖像寬高比
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# 找到最接近目標的寬高比
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# 計算目標寬度和高度
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# 調整圖像大小
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# 分割圖像
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
def split_model(model_name):
device_map = {}
world_size = torch.cuda.device_count()
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
num_layers = config.llm_config.num_hidden_layers
# 由於第一個 GPU 將用於 ViT,將其視為半個 GPU。
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = i
layer_cnt += 1
device_map['vision_model'] = 0
device_map['mlp1'] = 0
device_map['language_model.model.tok_embeddings'] = 0
device_map['language_model.model.embed_tokens'] = 0
device_map['language_model.output'] = 0
device_map['language_model.model.norm'] = 0
device_map['language_model.model.rotary_emb'] = 0
device_map['language_model.lm_head'] = 0
device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
return device_map
# 如果設置 `load_in_8bit=True`,需要兩個 80GB GPU。
# 如果設置 `load_in_8bit=False`,至少需要三個 80GB GPU。
path = 'OpenGVLab/InternVL3-1B'
device_map = split_model('InternVL3-1B')
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
load_in_8bit=False,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# 設置 `max_num` 中的最大圖塊數
pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)
# 純文本對話
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# 單圖單輪對話
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# 單圖多輪對話
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# 多圖多輪對話,拼接圖像
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# 多圖多輪對話,獨立圖像
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# 單圖批處理
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
num_patches_list=num_patches_list,
questions=questions,
generation_config=generation_config)
for question, response in zip(questions, responses):
print(f'User: {question}\nAssistant: {response}')
# 視頻多輪對話
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_segments
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(num_segments)
])
return frame_indices
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
pixel_values_list, num_patches_list = [], []
transform = build_transform(input_size=input_size)
frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(tile) for tile in img]
pixel_values = torch.stack(pixel_values)
num_patches_list.append(pixel_values.shape[0])
pixel_values_list.append(pixel_values)
pixel_values = torch.cat(pixel_values_list)
return pixel_values, num_patches_list
video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
流式輸出
from transformers import TextIteratorStreamer
from threading import Thread
# 初始化流處理器
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# 定義生成配置
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# 在單獨的線程中啟動模型對話
thread = Thread(target=model.chat, kwargs=dict(
tokenizer=tokenizer, pixel_values=pixel_values, question=question,
history=None, return_history=False, generation_config=generation_config,
))
thread.start()
# 初始化一個空字符串來存儲生成的文本
generated_text = ''
# 循環遍歷流處理器以獲取生成的新文本
for new_text in streamer:
if new_text == model.conv_template.sep:
break
generated_text += new_text
print(new_text, end='', flush=True) # 在同一行打印每個新生成的文本塊
LMDeploy 使用示例
'Hello, world' 示例
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL3-1B'
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=16384, tp=1), chat_template_config=ChatTemplateConfig(model_name='internvl2_5'))
response = pipe(('describe this image', image))
print(response.text)
多圖像推理
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
from lmdeploy.vl.constants import IMAGE_TOKEN
model = 'OpenGVLab/InternVL3-1B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=16384, tp=1), chat_template_config=ChatTemplateConfig(model_name='internvl2_5'))
image_urls=[
'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg'
]
images = [load_image(img_url) for img_url in image_urls]
# 為圖像編號有助於多圖像對話
response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
print(response.text)
批量提示推理
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL3-1B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=16384, tp=1), chat_template_config=ChatTemplateConfig(model_name='internvl2_5'))
image_urls=[
"https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg",
"https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg"
]
prompts = [('describe this image', load_image(img_url)) for img_url in image_urls]
response = pipe(prompts)
print(response)
多輪對話
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig, ChatTemplateConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL3-1B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=16384, tp=1), chat_template_config=ChatTemplateConfig(model_name='internvl2_5'))
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)
sess = pipe.chat(('describe this image', image), gen_config=gen_config)
print(sess.response.text)
sess = pipe.chat('What is the woman doing?', session=sess, gen_config=gen_config)
print(sess.response.text)
服務啟動
lmdeploy serve api_server OpenGVLab/InternVL3-1B --chat-template internvl2_5 --server-port 23333 --tp 1
OpenAI 風格接口調用
from openai import OpenAI
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
model=model_name,
messages=[{
'role':
'user',
'content': [{
'type': 'text',
'text': 'describe this image',
}, {
'type': 'image_url',
'image_url': {
'url':
'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg',
},
}],
}],
temperature=0.8,
top_p=0.8)
print(response)
📚 詳細文檔
模型架構
InternVL3 沿用了 InternVL 2.5 及其前代的 "ViT - MLP - LLM" 範式,通過隨機初始化的 MLP 投影器,將新的增量預訓練的 InternViT 與各種預訓練的大語言模型(如 InternLM 3 和 Qwen 2.5)集成。同時,應用像素重排操作,減少視覺標記數量,並採用動態分辨率策略,支持多圖像和視頻數據。
訓練策略
原生多模態預訓練
提出原生多模態預訓練方法,將語言和視覺學習整合到一個預訓練階段,避免了先訓練純語言模型再適應其他模態的傳統範式,使模型能同時學習語言和多模態表示。
監督微調
在 InternVL3 系列中採用了 InternVL2.5 提出的隨機 JPEG 壓縮、平方損失重加權和多模態數據打包技術,並使用更高質量和更多樣化的訓練數據。
混合偏好優化
使用 MPO 方法,引入正負樣本的額外監督,使模型響應分佈與真實分佈對齊,提高推理性能。
測試時縮放
採用 Best - of - N 評估策略,並使用 [VisualPRM - 8B](https://huggingface.co/OpenGVLab/VisualPRM - 8B) 作為評估模型,選擇最佳響應進行推理和數學評估。
評估指標
多模態能力評估
包括多模態推理和數學、OCR、圖表和文檔理解、多圖像和真實世界理解、綜合多模態和幻覺評估、視覺定位、多模態多語言理解、視頻理解、GUI 定位和空間推理等方面。
語言能力評估
將 InternVL3 與 Qwen2.5 Chat 模型進行比較,由於原生多模態預訓練,InternVL3 系列在整體文本性能上優於 Qwen2.5 系列。
消融實驗
原生多模態預訓練
在 InternVL2 - 8B 模型上進行實驗,將傳統的 MLP 預熱階段替換為原生多模態預訓練過程,結果表明該方法能有效賦予模型強大的多模態能力。
混合偏好優化
使用 MPO 微調的模型在七個多模態推理基準測試中表現出優於未使用 MPO 的模型的推理性能。
可變視覺位置編碼
引入 V2PE 使模型在大多數評估指標上有顯著性能提升,且較小的位置增量能在常規上下文任務中實現最佳性能。
🔧 技術細節
模型家族
模型名稱 | 視覺部分 | 語言部分 | HF 鏈接 |
---|---|---|---|
InternVL3 - 1B | [InternViT - 300M - 448px - V2_5](https://huggingface.co/OpenGVLab/InternViT - 300M - 448px - V2_5) | [Qwen2.5 - 0.5B](https://huggingface.co/Qwen/Qwen2.5 - 0.5B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3 - 1B) |
InternVL3 - 2B | [InternViT - 300M - 448px - V2_5](https://huggingface.co/OpenGVLab/InternViT - 300M - 448px - V2_5) | [Qwen2.5 - 1.5B](https://huggingface.co/Qwen/Qwen2.5 - 1.5B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3 - 2B) |
InternVL3 - 8B | [InternViT - 300M - 448px - V2_5](https://huggingface.co/OpenGVLab/InternViT - 300M - 448px - V2_5) | [Qwen2.5 - 7B](https://huggingface.co/Qwen/Qwen2.5 - 7B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3 - 8B) |
InternVL3 - 9B | [InternViT - 300M - 448px - V2_5](https://huggingface.co/OpenGVLab/InternViT - 300M - 448px - V2_5) | [internlm3 - 8b - instruct](https://huggingface.co/internlm/internlm3 - 8b - instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3 - 9B) |
InternVL3 - 14B | [InternViT - 300M - 448px - V2_5](https://huggingface.co/OpenGVLab/InternViT - 300M - 448px - V2_5) | [Qwen2.5 - 14B](https://huggingface.co/Qwen/Qwen2.5 - 14B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3 - 14B) |
InternVL3 - 38B | [InternViT - 6B - 448px - V2_5](https://huggingface.co/OpenGVLab/InternViT - 6B - 448px - V2_5) | [Qwen2.5 - 32B](https://huggingface.co/Qwen/Qwen2.5 - 32B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3 - 38B) |
InternVL3 - 78B | [InternViT - 6B - 448px - V2_5](https://huggingface.co/OpenGVLab/InternViT - 6B - 448px - V2_5) | [Qwen2.5 - 72B](https://huggingface.co/Qwen/Qwen2.5 - 72B) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL3 - 78B) |
模型評估
多模態能力評估
        
語言能力評估

消融實驗結果
原生多模態預訓練

混合偏好優化

可變視覺位置編碼

📄 許可證
本項目採用 MIT 許可證發佈。項目使用預訓練的 Qwen2.5 作為組件,該組件遵循 Qwen 許可證。
引用
如果您在研究中發現本項目有用,請考慮引用以下文獻:
@article{chen2024expanding,
title={Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling},
author={Chen, Zhe and Wang, Weiyun and Cao, Yue and Liu, Yangzhou and Gao, Zhangwei and Cui, Erfei and Zhu, Jinguo and Ye, Shenglong and Tian, Hao and Liu, Zhaoyang and others},
journal={arXiv preprint arXiv:2412.05271},
year={2024}
}
@article{wang2024mpo,
title={Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization},
author={Wang, Weiyun and Chen, Zhe and Wang, Wenhai and Cao, Yue and Liu, Yangzhou and Gao, Zhangwei and Zhu, Jinguo and Zhu, Xizhou and Lu, Lewei and Qiao, Yu and Dai, Jifeng},
journal={arXiv preprint arXiv:2411.10442},
year={2024}
}
@article{chen2024far,
title={How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites},
author={Chen, Zhe and Wang, Weiyun and Tian, Hao and Ye, Shenglong and Gao, Zhangwei and Cui, Erfei and Tong, Wenwen and Hu, Kongzhi and Luo, Jiapeng and Ma, Zheng and others},
journal={arXiv preprint arXiv:2404.16821},
year={2024}
}
@inproceedings{chen2024internvl,
title={Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks},
author={Chen, Zhe and Wu, Jiannan and Wang, Wenhai and Su, Weijie and Chen, Guo and Xing, Sen and Zhong, Muyan and Zhang, Qinglong and Zhu, Xizhou and Lu, Lewei and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={24185--24198},
year={2024}
}









