# 🚀 Visual Question Answering Project

This project focuses on visual question answering. Built on the associated model and code, it supports question answering over images and visualizes the answers by drawing annotations on the image.

## 🚀 Quick Start

The quick-start steps and a code example are given below.

### Code Example
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from PIL import Image, ImageDraw
import re


def draw_circle(draw, center, radius=10, width=2, outline_color=(0, 255, 0),
                is_fill=False, bg_color=(0, 255, 0), transparency=80):
    """Draw a circle of the given radius around a center point."""
    x1 = center[0] - radius
    y1 = center[1] - radius
    x2 = center[0] + radius
    y2 = center[1] + radius
    bbox = (x1, y1, x2, y2)
    if is_fill:
        # Convert the transparency percentage into an alpha value for the fill color.
        alpha = int((1 - transparency / 100) * 255)
        fill_color = tuple(bg_color) + (alpha,)
        draw.ellipse(bbox, width=width, outline=outline_color, fill=fill_color)
    else:
        draw.ellipse(bbox, width=width, outline=outline_color)


def draw_point(draw, center, radius1=3, radius2=6, color=(0, 255, 0)):
    """Mark a point with two concentric circles."""
    draw_circle(draw, center, radius=radius1, outline_color=color)
    draw_circle(draw, center, radius=radius2, outline_color=color)


def draw_rectangle(draw, box_coords, width=2, outline_color=(0, 255, 0),
                   is_fill=False, bg_color=(0, 255, 0), transparency=80):
    """Draw a bounding box, optionally filled with a semi-transparent color."""
    if is_fill:
        alpha = int((1 - transparency / 100) * 255)
        fill_color = tuple(bg_color) + (alpha,)
        draw.rectangle(box_coords, width=width, outline=outline_color, fill=fill_color)
    else:
        draw.rectangle(box_coords, width=width, outline=outline_color)


def draw(path, out_path, response):
    """Parse <box> and <point> tags in the model response and draw them on the image."""
    img = Image.open(path).convert("RGB")
    drawer = ImageDraw.Draw(img)
    # Bounding boxes come as <box>(x1,y1),(x2,y2)</box>, with coordinates normalized to 0-1000.
    box_coords = re.findall(r"<box>(.*?)</box>", response)
    for box in box_coords:
        try:
            x1, y1, x2, y2 = box.replace("(", "").replace(")", "").split(",")
            x1, y1, x2, y2 = (float(x1) * img.width / 1000, float(y1) * img.height / 1000,
                              float(x2) * img.width / 1000, float(y2) * img.height / 1000)
            draw_rectangle(drawer, (x1, y1, x2, y2))
        except Exception:
            print("There were some errors while parsing the bounding box.")
    # Points come as <point>(x,y)</point>, also normalized to 0-1000.
    point_coords = re.findall(r"<point>(.*?)</point>", response)
    for point in point_coords:
        try:
            x1, y1 = point.replace("(", "").replace(")", "").split(",")
            x1, y1 = float(x1) * img.width / 1000, float(y1) * img.height / 1000
            draw_point(drawer, (x1, y1))
        except Exception:
            print("There were some errors while parsing the point.")
    img.save(out_path)


def load_model_and_tokenizer(path, device):
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(path, device_map=device,
                                                 trust_remote_code=True).eval()
    return model, tokenizer


def infer(model, tokenizer, image_path, text):
    # Build a multimodal query (image + text) and run a single-turn chat.
    query = tokenizer.from_list_format([
        {'image': image_path},
        {'text': text},
    ])
    response, history = model.chat(tokenizer, query=query, history=None)
    return response


if __name__ == "__main__":
    device = "cuda:0"
    model_path = "<your_model_path>"
    model, tokenizer = load_model_and_tokenizer(model_path, device)
    # Interactive loop: type "stop" at either prompt to exit.
    while True:
        image_path = input("image path >>>>> ")
        if image_path == "stop":
            break
        query = input("Human:")
        if query == "stop":
            break
        response = infer(model, tokenizer, image_path, query)
        # Annotate the queried image with the parsed boxes/points and save it as 1.jpg.
        draw(image_path, "1.jpg", response)
```
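As the parsing code above suggests, the model response is expected to contain grounding tags such as `<box>(x1,y1),(x2,y2)</box>` and `<point>(x,y)</point>`, with coordinates normalized to a 0-1000 grid that are rescaled to pixel space before drawing. Below is a minimal sketch, assuming that tag format, for exercising the drawing helpers with a hand-written response and a blank placeholder image, without loading the model; the module name `demo` is a hypothetical stand-in for wherever you saved the script above.

```python
# Minimal sketch: test the parsing/drawing logic with a fabricated response
# string instead of real model output. The tag format and the 0-1000
# coordinate normalization are assumptions taken from the parsing code above.
from PIL import Image
from demo import draw  # hypothetical module name for the script above

# Create a blank 1000x1000 placeholder image so the example is self-contained.
Image.new("RGB", (1000, 1000), (255, 255, 255)).save("blank.jpg")

# One box and one point in normalized (0-1000) coordinates.
sample_response = (
    "The icon is here <box>(100,200),(400,500)</box>, "
    "centered at <point>(250,350)</point>."
)
draw("blank.jpg", "annotated.jpg", sample_response)  # writes the annotated image
```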
## 📦 Related Information

| Property | Details |
|----------|---------|
| Model type | Visual question answering model |
| Training data | yiye2023/GUIChat, yiye2023/GUIAct |
## 📄 License

This project is released under the Apache-2.0 license.