# Model Card for Segment Anything Model in High Quality (SAM-HQ)
SAM-HQ is an enhanced version of the Segment Anything Model (SAM). It generates higher-quality object masks from input prompts such as points or boxes, addressing the limitations of the original SAM with minimal extra parameters and computation cost.
## Quick Start
SAM-HQ can generate high-quality segmentation masks, even for objects with complex boundaries and thin structures. It retains SAM's original promptable design, efficiency, and zero-shot generalizability while significantly improving mask quality.
## Features
- High-Quality Output Token: a learnable token in SAM's mask decoder, trained to predict high-quality masks.
- Global-local Feature Fusion: fuses mask-decoder features with early and final ViT features, combining high-level semantic context with low-level boundary detail (a simplified sketch of the idea follows this list).
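The snippet below is a minimal, illustrative sketch of the global-local fusion idea only; the layer shapes, the 1x1 convolutions, and the simple additive fusion are assumptions made for illustration and do not reproduce the actual SAM-HQ implementation.

```python
import torch
import torch.nn as nn


class GlobalLocalFusionSketch(nn.Module):
    """Toy illustration: fuse early (boundary-rich) and final (semantic) ViT features
    with mask-decoder features. Dimensions are assumptions, not SAM-HQ's real ones."""

    def __init__(self, vit_dim=1024, decoder_dim=256):
        super().__init__()
        self.early_proj = nn.Conv2d(vit_dim, decoder_dim, kernel_size=1)
        self.final_proj = nn.Conv2d(vit_dim, decoder_dim, kernel_size=1)

    def forward(self, early_vit_feat, final_vit_feat, decoder_feat):
        # All inputs are (batch, channels, H, W); fuse by simple addition here.
        return decoder_feat + self.early_proj(early_vit_feat) + self.final_proj(final_vit_feat)


fusion = GlobalLocalFusionSketch()
early = torch.randn(1, 1024, 64, 64)
final = torch.randn(1, 1024, 64, 64)
dec = torch.randn(1, 256, 64, 64)
print(fusion(early, final, dec).shape)  # torch.Size([1, 256, 64, 64])
```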
## Installation
The upstream README does not list dedicated installation steps. The examples below assume the Hugging Face `transformers` library (with SAM-HQ support) and PyTorch are installed.
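A minimal setup sketch, assuming installation from PyPI; `pillow`, `matplotlib`, and `requests` are only needed for the image loading and visualization in the examples below:

```bash
pip install transformers torch pillow matplotlib requests
```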
## Usage Examples
### Basic Usage
#### Prompted-Mask-Generation
```python
from PIL import Image
import requests
import torch
from transformers import SamHQModel, SamHQProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-large").to(device)
processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-large")

img_url = "https://raw.githubusercontent.com/SysCV/sam-hq/refs/heads/main/demo/input_imgs/example1.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")

# A single bounding-box prompt in (x_min, y_min, x_max, y_max) pixel coordinates.
input_boxes = [[[306, 132, 925, 893]]]

inputs = processor(raw_image, input_boxes=input_boxes, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# Upscale the low-resolution mask logits back to the original image size.
masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
```
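Optionally (not part of the original example), you can sanity-check the post-processed outputs; `masks` is a per-image list of mask tensors and `scores` holds the predicted IoU scores:

```python
# Optional sanity check, reusing `masks` and `scores` from the snippet above.
print(len(masks), masks[0].shape)  # one mask tensor per input image
print(scores.shape)                # predicted IoU score per mask
```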
#### Automatic-Mask-Generation
```python
from transformers import pipeline

# The mask-generation pipeline prompts the model with a grid of points over the image.
# device=0 selects the first GPU; use device=-1 to run on CPU.
generator = pipeline("mask-generation", model="syscv-community/sam-hq-vit-large", device=0, points_per_batch=256)

image_url = "https://raw.githubusercontent.com/SysCV/sam-hq/refs/heads/main/demo/input_imgs/example1.png"
outputs = generator(image_url, points_per_batch=256)
```
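The pipeline returns a dictionary with the generated binary masks and their confidence scores; a small optional check (field names follow the standard `transformers` mask-generation pipeline output):

```python
# Optional: inspect the pipeline output from the snippet above.
print(len(outputs["masks"]))   # number of generated masks
print(outputs["scores"][:5])   # confidence scores for the first few masks
```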
### Advanced Usage
#### Complete Example with Visualization
```python
import numpy as np
import matplotlib.pyplot as plt


def show_mask(mask, ax, random_color=False):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2))
def show_boxes_on_image(raw_image, boxes):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis("on")
    plt.show()


def show_points_on_image(raw_image, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    plt.axis("on")
    plt.show()
def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis("on")
    plt.show()
def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color="green", marker="*", s=marker_size, edgecolor="white", linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color="red", marker="*", s=marker_size, edgecolor="white", linewidth=1.25)


def show_masks_on_image(raw_image, masks, scores):
    if len(masks.shape) == 4:
        masks = masks.squeeze()
    if scores.shape[0] == 1:
        scores = scores.squeeze()
    nb_predictions = scores.shape[-1]
    fig, axes = plt.subplots(1, nb_predictions, figsize=(15, 15))
    for i, (mask, score) in enumerate(zip(masks, scores)):
        mask = mask.cpu().detach()
        axes[i].imshow(np.array(raw_image))
        show_mask(mask, axes[i])
        axes[i].title.set_text(f"Mask {i+1}, Score: {score.item():.3f}")
        axes[i].axis("off")
    plt.show()


def show_masks_on_single_image(raw_image, masks, scores):
    if len(masks.shape) == 4:
        masks = masks.squeeze()
    if scores.shape[0] == 1:
        scores = scores.squeeze()
    image_np = np.array(raw_image)
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(image_np)
    # Overlay every predicted mask on the same image.
    for i, (mask, score) in enumerate(zip(masks, scores)):
        mask = mask.cpu().detach().numpy()
        show_mask(mask, ax)
    ax.set_title("Overlaid Masks with Scores")
    ax.axis("off")
    plt.show()
```
```python
import torch
from transformers import SamHQModel, SamHQProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-large").to(device)
processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-large")

from PIL import Image
import requests

img_url = "https://raw.githubusercontent.com/SysCV/sam-hq/refs/heads/main/demo/input_imgs/example1.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
plt.imshow(raw_image)

# Pre-compute the image embeddings once; they can be reused for multiple prompts.
inputs = processor(raw_image, return_tensors="pt").to(device)
image_embeddings, intermediate_embeddings = model.get_image_embeddings(inputs["pixel_values"])

# A single bounding-box prompt in (x_min, y_min, x_max, y_max) pixel coordinates.
input_boxes = [[[306, 132, 925, 893]]]
show_boxes_on_image(raw_image, input_boxes[0])

# Build the prompted inputs, then swap the pixel values for the cached embeddings.
inputs = processor(raw_image, input_boxes=input_boxes, return_tensors="pt").to(device)
inputs.pop("pixel_values", None)
inputs.update({"image_embeddings": image_embeddings})
inputs.update({"intermediate_embeddings": intermediate_embeddings})

with torch.no_grad():
    outputs = model(**inputs)

masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores

show_masks_on_single_image(raw_image, masks[0], scores)
show_masks_on_image(raw_image, masks[0], scores)
```
## Documentation
### Model Details
SAM-HQ builds on the original SAM architecture with two key innovations while keeping SAM's pretrained weights:
- High-Quality Output Token: a learnable token in the mask decoder, trained to predict high-quality masks.
- Global-local Feature Fusion: fuses mask-decoder features with early and final ViT features for better mask details.
SAM-HQ was trained on HQSeg-44K, a curated dataset of 44K fine-grained masks. Training takes only 4 hours on 8 GPUs and introduces less than 0.5% additional parameters compared to the original SAM model.
The model has been evaluated on 10 diverse segmentation datasets. It addresses the limitations of the original SAM model, such as coarse mask boundaries and incorrect predictions, making it valuable for applications requiring accurate image masks.
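As an optional, rough way to check the small parameter overhead mentioned above, you can compare parameter counts against the original SAM checkpoint; this sketch is not part of the original card and assumes both checkpoints download successfully:

```python
from transformers import SamModel, SamHQModel

# Count parameters of SAM-HQ and the corresponding original SAM checkpoint.
sam_hq = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-large")
sam = SamModel.from_pretrained("facebook/sam-vit-large")

n_hq = sum(p.numel() for p in sam_hq.parameters())
n_sam = sum(p.numel() for p in sam.parameters())
print(f"SAM-HQ: {n_hq / 1e6:.1f}M params, SAM: {n_sam / 1e6:.1f}M params, "
      f"overhead: {100 * (n_hq - n_sam) / n_sam:.2f}%")
```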
## Technical Details
SAM-HQ addresses two main problems of the original SAM model:
- Coarse mask boundaries, often neglecting thin object structures.
- Incorrect predictions, broken masks, or large errors in challenging cases.
## License
The model is licensed under the Apache-2.0 license.
## Citation
```bibtex
@misc{ke2023segmenthighquality,
      title={Segment Anything in High Quality},
      author={Lei Ke and Mingqiao Ye and Martin Danelljan and Yifan Liu and Yu-Wing Tai and Chi-Keung Tang and Fisher Yu},
      year={2023},
      eprint={2306.01567},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2306.01567},
}
```