# sci-gui-agent-benchmark/mm_agents/gta1/format_message.py


import base64
import os
from typing import Dict, Any, List, Union
import numpy as np
import cv2

class FormatMessage:
    """Factory for role-tagged chat message dictionaries.

    Each message is a dict with a ``"role"`` and a ``"content"`` list of
    typed parts. Text parts use ``self.text_key`` as their ``"type"`` and
    image parts use ``self.image_key``; assistant messages use the fixed
    ``"output_text"`` part type.
    """

    def __init__(self):
        # "type" tags placed on user/system content parts.
        self.text_key = "input_text"
        self.image_key = "input_image"

    def encode_image(self, image_content: bytes) -> str:
        """Return ``image_content`` as Base64 text."""
        b64_bytes = base64.b64encode(image_content)
        return b64_bytes.decode('utf-8')

    def format_image(self, image: bytes, detail: str="high") -> Dict[str, Any]:
        """Wrap raw image bytes in an image content part.

        The bytes are embedded as a ``data:image/png;base64,...`` URL. The
        PNG media type is always declared; the bytes are not inspected.

        Args:
            image: Raw image bytes to embed.
            detail: Value stored under the ``"detail"`` key.

        Returns:
            Dict with ``"type"``, ``"image_url"`` and ``"detail"`` keys.
        """
        payload = self.encode_image(image)
        data_url = f"data:image/png;base64,{payload}"
        return {"type": self.image_key, "image_url": data_url, "detail": detail}

    def format_text_message(self, text: str) -> Dict[str, Any]:
        """Wrap ``text`` in a text content part."""
        part = {"type": self.text_key}
        part["text"] = text
        return part

    def create_system_message(self, content: str) -> Dict[str, Any]:
        """Build a ``"system"`` message holding a single text part."""
        parts = [self.format_text_message(content)]
        return {"role": "system", "content": parts}

    def create_user_message(self, text: str=None, image: bytes=None, detail: str="high", image_first: bool=False) -> Dict[str, Any]:
        """Build a ``"user"`` message from text, an image, or both.

        Args:
            text: Optional text for a text part.
            image: Optional raw image bytes for an image part.
            detail: Passed through to ``format_image``.
            image_first: When truthy, the part order is reversed so the
                image part precedes the text part.

        Returns:
            Dict with ``"role": "user"`` and the ordered ``"content"`` list.

        Raises:
            ValueError: If both ``text`` and ``image`` are ``None``.
        """
        has_text = text is not None
        has_image = image is not None
        if not (has_text or has_image):
            raise ValueError("At least one of text or image must be provided")

        # Default ordering is text part, then image part.
        parts = []
        if has_text:
            parts.append(self.format_text_message(text))
        if has_image:
            parts.append(self.format_image(image, detail))

        ordered = parts[::-1] if image_first else parts
        return {"role": "user", "content": ordered}

    def create_assistant_message(self, text: str) -> Dict[str, Any]:
        """Build an ``"assistant"`` message holding one ``"output_text"`` part."""
        reply_part = {"type": "output_text", "text": text}
        return {"role": "assistant", "content": [reply_part]}


def encode_numpy_image_to_base64(image: np.ndarray) -> str:
    """Encode an image array as PNG and return it as Base64 text.

    Args:
        image: Array handed to ``cv2.imencode`` for PNG compression.

    Returns:
        Base64 string of the PNG-encoded bytes.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    ok, png_buffer = cv2.imencode('.png', image)
    if ok:
        # Encoded buffer -> raw bytes -> Base64 text.
        return base64.b64encode(png_buffer.tobytes()).decode('utf-8')
    raise ValueError("Failed to encode image to png format")

def encode_image_bytes(image_content):
    """Return the Base64 representation of ``image_content`` as a string."""
    b64_bytes = base64.b64encode(image_content)
    return str(b64_bytes, 'utf-8')