🚀 Harmon: Harmonizing Visual Representations for Unified Multimodal Understanding and Generation
Harmon is a unified framework for multimodal understanding and generation. It harmonizes the visual representations of the two tasks through a shared MAR encoder, achieving strong performance on text-to-image generation benchmarks and competitive results on multimodal understanding tasks.
🚀 Quick Start
This repository provides inference code for both image-to-text (visual question answering) and text-to-image generation with two model variants, Harmon-0.5B and Harmon-1.5B.
✨ Features
- Unified Framework: Harmon handles both multimodal understanding and generation within a single model, breaking away from the common practice of using separate encoders for the two tasks.
- Shared MAR Encoder: A shared MAR encoder harmonizes the visual representations used for understanding and generation, leading to better performance on generation benchmarks and multimodal understanding tasks.
- Multiple Model Variants: Two model variants, Harmon-0.5B and Harmon-1.5B, are available, offering flexibility for different application scenarios.
📦 Installation
The original README does not include explicit installation steps. The examples below require PyTorch, transformers, einops, numpy, Pillow, and requests, and load the model code with trust_remote_code=True.
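As a minimal setup sketch, the dependencies can be installed with pip. The package list is inferred from the imports in the examples below; exact versions are not pinned by the authors:

```bash
# Inferred from the example imports; version pins are an assumption, not official.
pip install torch transformers einops numpy pillow requests
```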
💻 Usage Examples
Basic Usage
👁️ Image-to-text Generation
```python
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image
import requests

PROMPT_TEMPLATE = dict(
    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    SUFFIX='<|im_end|>',
    SUFFIX_AS_EOS=True,
    SEP='\n',
    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])


def expand2square(pil_img, background_color):
    """Pad a PIL image to a square canvas, centering the original content."""
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


@torch.no_grad()
def question_answer(question,
                    image,
                    model,
                    tokenizer,
                    max_new_tokens=512,
                    image_size=512):
    assert image_size == 512
    # Pad to square with mid-gray, resize to 512 x 512, and normalize to [-1, 1].
    image = expand2square(image, (127, 127, 127))
    image = image.resize(size=(image_size, image_size))
    image = torch.from_numpy(np.array(image)).to(dtype=model.dtype, device=model.device)
    image = rearrange(image, 'h w c -> c h w')[None]
    image = 2 * (image / 255) - 1

    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
    assert '<image>' in prompt
    # One "<image>" placeholder per visual token: (512/16)^2 patches plus the MAR buffer.
    image_length = (image_size // 16) ** 2 + model.mar.buffer_size
    prompt = prompt.replace('<image>', '<image>' * image_length)
    input_ids = tokenizer.encode(
        prompt, add_special_tokens=True, return_tensors='pt').cuda()

    # Encode the image and splice its features into the text embedding sequence.
    # image_token_idx is the module-level id of the "<image>" token, set below.
    _, z_enc = model.extract_visual_feature(model.encode(image))
    inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
    inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
    inputs_embeds[input_ids != image_token_idx] = model.llm.get_input_embeddings()(
        input_ids[input_ids != image_token_idx]
    )
    output = model.llm.generate(inputs_embeds=inputs_embeds,
                                use_cache=True,
                                do_sample=False,
                                max_new_tokens=max_new_tokens,
                                eos_token_id=tokenizer.eos_token_id,
                                pad_token_id=tokenizer.pad_token_id
                                if tokenizer.pad_token_id is not None else
                                tokenizer.eos_token_id)
    return tokenizer.decode(output[0])


harmon_tokenizer = AutoTokenizer.from_pretrained("wusize/Harmon-1_5B",
                                                 trust_remote_code=True)
harmon_model = AutoModel.from_pretrained("wusize/Harmon-1_5B",
                                         trust_remote_code=True).eval().cuda().bfloat16()

# Register "<image>" as a special token and look up its id.
special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1
image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}")

image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw).convert('RGB')

output_text = question_answer(question='Describe the image in detail.',
                              image=raw_image,
                              model=harmon_model,
                              tokenizer=harmon_tokenizer)
print(output_text)
```
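The same helper works with local files. A small sketch, where the file path and question are placeholders rather than values from the original README:

```python
# Hypothetical local-image variant of the example above; replace the path with your own.
local_image = Image.open("my_photo.jpg").convert("RGB")
answer = question_answer(question="How many animals are in the picture?",
                         image=local_image,
                         model=harmon_model,
                         tokenizer=harmon_tokenizer)
print(answer)
```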
🖼️ Text-to-image Generation
```python
import os
import torch
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image

PROMPT_TEMPLATE = dict(
    SYSTEM='<|im_start|>system\n{system}<|im_end|>\n',
    INSTRUCTION='<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    SUFFIX='<|im_end|>',
    SUFFIX_AS_EOS=True,
    SEP='\n',
    STOP_WORDS=['<|im_end|>', '<|endoftext|>'])

GENERATION_TEMPLATE = "Generate an image: {text}"


@torch.no_grad()
def generate_images(prompts,
                    negative_prompt,
                    tokenizer,
                    model,
                    output,
                    grid_size=2,    # will produce grid_size x grid_size images per prompt
                    num_steps=64, cfg_scale=3.0, temperature=1.0, image_size=512):
    assert image_size == 512
    m = n = image_size // 16    # latent grid of 32 x 32 tokens for a 512 x 512 image
    prompts = [
        PROMPT_TEMPLATE['INSTRUCTION'].format(input=prompt)
        for prompt in prompts
    ] * (grid_size ** 2)
    if cfg_scale != 1.0:
        # Append negative prompts for classifier-free guidance.
        prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
    inputs = tokenizer(
        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(model.device)
    images = model.sample(**inputs, num_iter=num_steps, cfg=cfg_scale, cfg_schedule="constant",
                          temperature=temperature, progress=True, image_shape=(m, n))
    # Tile the samples for each prompt into a grid, then map [-1, 1] back to uint8 pixels.
    images = rearrange(images, '(m n b) c h w -> b (m h) (n w) c', m=grid_size, n=grid_size)
    images = torch.clamp(
        127.5 * images + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()

    os.makedirs(output, exist_ok=True)
    for idx, image in enumerate(images):
        Image.fromarray(image).save(f"{output}/{idx:08d}.jpg")


harmon_tokenizer = AutoTokenizer.from_pretrained("wusize/Harmon-1_5B",
                                                 trust_remote_code=True)
harmon_model = AutoModel.from_pretrained("wusize/Harmon-1_5B",
                                         trust_remote_code=True).cuda().bfloat16().eval()

texts = ['a dog on the left and a cat on the right.',
         'a photo of a pink stop sign.']
pos_prompts = [GENERATION_TEMPLATE.format(text=text) for text in texts]
neg_prompt = 'Generate an image.'    # unconditional prompt for classifier-free guidance

generate_images(prompts=pos_prompts,
                negative_prompt=neg_prompt,
                tokenizer=harmon_tokenizer,
                model=harmon_model,
                output='output')
```
Advanced Usage
The original README does not document advanced usage scenarios.
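That said, the sampling parameters exposed by generate_images above can be varied. A sketch with illustrative values (the prompt, cfg_scale, and output directory here are assumptions, not author recommendations):

```python
# Hypothetical single-image run: grid_size=1 disables the 2 x 2 tiling.
generate_images(prompts=[GENERATION_TEMPLATE.format(text='a red bicycle leaning against a brick wall.')],
                negative_prompt=neg_prompt,
                tokenizer=harmon_tokenizer,
                model=harmon_model,
                output='output_single',
                grid_size=1,      # one sample per prompt
                num_steps=64,
                cfg_scale=4.0)    # illustrative; the example above defaults to 3.0
```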
📚 Documentation
The table below shows the details of the two model variants:

| Variant | LLM | MAR Encoder | Weights |
|---|---|---|---|
| Harmon-0.5B | Qwen2.5-0.5B-Instruct | MAR-Base | 🤗 Hugging Face Model |
| Harmon-1.5B | Qwen2.5-1.5B-Instruct | MAR-Huge | 🤗 Hugging Face Model |
Training data: not provided in the original README.
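Switching variants only changes the checkpoint id. A loading sketch for Harmon-0.5B, assuming its Hugging Face repository id mirrors the 1.5B naming (verify on the model page before use):

```python
from transformers import AutoTokenizer, AutoModel

# "wusize/Harmon-0_5B" is assumed by analogy with "wusize/Harmon-1_5B".
tokenizer_05b = AutoTokenizer.from_pretrained("wusize/Harmon-0_5B",
                                              trust_remote_code=True)
model_05b = AutoModel.from_pretrained("wusize/Harmon-0_5B",
                                      trust_remote_code=True).eval().cuda().bfloat16()
```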
🔧 Technical Details
The original README does not provide further technical details; see the paper cited below for the method description.
📄 License
This project is licensed under the NTU S-Lab License 1.0.
📖 Citation
If you find Harmon useful for your research or applications, please cite our paper using the following BibTeX:
```bibtex
@misc{wu2025harmon,
      title={Harmonizing Visual Representations for Unified Multimodal Understanding and Generation},
      author={Size Wu and Wenwei Zhang and Lumin Xu and Sheng Jin and Zhonghua Wu and Qingyi Tao and Wentao Liu and Wei Li and Chen Change Loy},
      year={2025},
      eprint={2503.21979},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2503.21979},
}
```