% arXiv preprint entry. The braced words in the title are model names and an
% acronym; the braces keep their capitalisation when a bibliography style
% applies sentence casing to titles.
@misc{blip3video-xgenmmvid,
  author        = {Ryoo, Michael S. and Zhou, Honglu and Kendre, Shrikant and Qin, Can and Xue, Le and Shu, Manli and Savarese, Silvio and Xu, Ran and Xiong, Caiming and Niebles, Juan Carlos},
  title         = {{xGen-MM-Vid} ({BLIP-3-Video}): You Only Need 32 Tokens to Represent a Video Even in {VLMs}},
  year          = {2024},
  eprint        = {2410.16267},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2410.16267},
}