Files
sci-gui-agent-benchmark/mm_agents/uipath/grounder_client.py
alexandruilie7 5463d3bb89 uipath v2 (#413)
* submission v2

* small updates
2026-01-09 08:47:20 +08:00

46 lines
1.7 KiB
Python

import httpx
import mm_agents.uipath.utils as utils
import os
class GrounderClient(object):
    """Async HTTP client for a remote UI-grounding endpoint.

    ``predict`` posts a screenshot and an action description to ``self.url``
    and turns the JSON reply into a ``utils.GroundingOutput``.
    """

    def __init__(self, url: str = ""):
        """Store the endpoint address.

        Args:
            url: Address of the grounding endpoint. Defaults to the empty
                string (the original placeholder value); it has to be set to
                a real URL, here or via ``self.url``, before ``predict`` can
                reach a server.
        """
        # Proxy for hosting finetuned Qwen3VL + UiElementPredictor
        # Could be replaced with a VLLM server and grounder specific processing
        self.url = url

    async def predict(
        self,
        image_base64: str,
        action_description: str,
        action: str,
        element_description: str | None = None,
    ) -> utils.GroundingOutput:
        """Ask the grounding endpoint where to perform an action.

        Args:
            image_base64: Screenshot sent as the ``image_base64`` field.
            action_description: Text sent as the ``action_description`` field.
            action: Value sent as the ``action`` field.
            element_description: Optional value sent as the
                ``element_description`` field.

        Returns:
            A ``utils.GroundingOutput`` built from the ``description`` and
            ``position`` fields of the JSON response.

        Raises:
            ValueError: If the endpoint answers with a status other than 200.
            utils.GroundingOutputValidationException: If the returned
                position is ``(-1, -1)``, which is reported as the element
                not being found in the image.
        """
        # Route the arguments through the project's request model first
        # (presumably so it can validate them -- confirm in utils).
        request = utils.GroundingRequest(
            description=action_description,
            image_base64=image_base64,
            action_type=action,
            element_description=element_description,
        )

        # httpx rejects a header whose value is None, so the key header is
        # only attached when SERVICE_KEY is actually present in the
        # environment. An empty-but-set value is still forwarded unchanged.
        api_key = os.getenv("SERVICE_KEY")
        headers = {"X-API-KEY": api_key} if api_key is not None else {}

        payload = {
            "image_base64": request.image_base64,
            "action_description": request.description,
            "action": request.action_type,
            "element_description": request.element_description,
        }

        async with httpx.AsyncClient() as client:
            response = await client.post(
                self.url,
                json=payload,
                headers=headers,
                timeout=100.0,
            )

        if response.status_code != 200:
            raise ValueError(f"Prediction failed: {response.text}")

        data = response.json()
        position = tuple(data["position"])
        # (-1, -1) is treated as "element not found" rather than a coordinate.
        if position == (-1, -1):
            raise utils.GroundingOutputValidationException(
                f"Element {request.description} not found in image",
                request.description,
            )

        return utils.GroundingOutput(
            description=data["description"],
            position=position,
        )