Djlu/qwen3vl dash (#356)

* support dashscope sdk to call qwen3-vl-plus

---------

Co-authored-by: Timothyxxx <Timothyxxx@users.noreply.github.com>
Dunjie Lu
2025-10-13 16:31:06 +08:00
committed by GitHub
parent f9e9273b3b
commit d25464c203
2 changed files with 153 additions and 11 deletions
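
For readers skimming the diff: the change adds a DashScope code path alongside the existing OpenAI-compatible one. A minimal sketch of how the new constructor options would be used (the import path below is a placeholder; the argument names and defaults come from the diff that follows):

    from mm_agents.qwen3_vl import Qwen3VLAgent  # placeholder import path, not part of this diff

    agent = Qwen3VLAgent(
        model="qwen3-vl-plus",
        max_tokens=40960,
        coordinate_type="relative",
        api_backend="dashscope",   # or "openai" for the OpenAI-compatible endpoint
        enable_thinking=True,      # DashScope thinking mode
        thinking_budget=32768,     # token budget for reasoning
    )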

View File

@@ -6,6 +6,9 @@ import os
 from io import BytesIO
 from typing import Dict, List, Tuple
+from http import HTTPStatus
+import dashscope
+from dashscope import MultiModalConversation
 import backoff
 import openai
 from PIL import Image
@@ -40,7 +43,7 @@ def process_image(image_bytes):
         height=height,
         width=width,
         factor=32,
-        max_pixels=16 * 16 * 4 * 1280,
+        max_pixels=16 * 16 * 4 * 12800,
     )
     image = image.resize((resized_width, resized_height))
@@ -58,7 +61,7 @@ class Qwen3VLAgent:
         self,
         platform: str = "ubuntu",
         model: str = "qwen3-vl",
-        max_tokens: int = 1500,
+        max_tokens: int = 40960,
         top_p: float = 0.9,
         temperature: float = 0.0,
         action_space: str = "pyautogui",
@@ -66,6 +69,9 @@ class Qwen3VLAgent:
         history_n: int = 4,
         add_thought_prefix: bool = False,
         coordinate_type: str = "relative",
+        api_backend: str = "dashscope",  # "openai" or "dashscope"
+        enable_thinking: bool = True,  # Enable thinking mode for DashScope
+        thinking_budget: int = 32768,  # Token budget for reasoning
     ):
         self.platform = platform
         self.model = model
@@ -77,9 +83,13 @@ class Qwen3VLAgent:
         self.history_n = history_n
         self.add_thought_prefix = add_thought_prefix
         self.coordinate_type = coordinate_type
+        self.api_backend = api_backend
+        self.enable_thinking = enable_thinking
+        self.thinking_budget = thinking_budget
         assert action_space in ["pyautogui"], "Invalid action space"
         assert observation_type in ["screenshot"], "Invalid observation type"
+        assert api_backend in ["openai", "dashscope"], "Invalid API backend, must be 'openai' or 'dashscope'"

         self.thoughts = []
         self.actions = []
@@ -527,6 +537,70 @@ Previous actions:
         return low_level_instruction, pyautogui_code

+    @staticmethod
+    def _to_dashscope_messages(messages):
+        """
+        Convert messages built for OpenAI compat into DashScope MultiModalConversation format.
+        - "text" part -> {"text": "..."}
+        - "image_url" -> {"image": "<url-or-data-uri>"}
+        - "video_url" -> {"video": "<url-or-data-uri>"}
+        """
+        ds_msgs = []
+        for m in messages:
+            role = m.get("role", "")
+            parts = m.get("content", [])
+            ds_content = []
+            for p in parts:
+                ptype = p.get("type")
+                if ptype == "text":
+                    ds_content.append({"text": p.get("text", "")})
+                elif ptype == "image_url":
+                    url = (p.get("image_url") or {}).get("url", "")
+                    # DashScope accepts http(s), file://, or data:image/*; keep as-is
+                    ds_content.append({"image": url})
+                elif ptype == "video_url":
+                    url = (p.get("video_url") or {}).get("url", "")
+                    ds_content.append({"video": url})
+                else:
+                    # If you ever pass raw assistant strings (no parts), tolerate it
+                    if isinstance(p, str):
+                        ds_content.append({"text": p})
+            # Also tolerate plain-string content (rare)
+            if not ds_content and isinstance(m.get("content"), str):
+                ds_content = [{"text": m["content"]}]
+            ds_msgs.append({"role": role, "content": ds_content})
+        return ds_msgs
+
+    @staticmethod
+    def _extract_text_from_dashscope_response(resp):
+        """Join all 'text' parts from the first choice, including reasoning if present."""
+        if hasattr(resp, "output"):
+            out = resp.output
+        else:
+            out = resp.get("output") if isinstance(resp, dict) else None
+        if not out:
+            return None
+        choices = getattr(out, "choices", None) if not isinstance(out, dict) else out.get("choices")
+        if not choices:
+            return None
+        msg = getattr(choices[0], "message", None) if not isinstance(choices[0], dict) else choices[0].get("message")
+        if not msg:
+            return None
+        content = getattr(msg, "content", None) if not isinstance(msg, dict) else msg.get("content", [])
+        if not content:
+            return None
+        # Extract reasoning content if present (for thinking models)
+        reasoning_content = getattr(msg, "reasoning_content", None) if not isinstance(msg, dict) else msg.get("reasoning_content", None)
+        content_text = "".join(part.get("text", "") for part in content if isinstance(part, dict) and "text" in part)
+        # Format with thinking tags if reasoning exists
+        if reasoning_content is not None:
+            return f"<think>\n{reasoning_content}\n</think>\n\n{content_text}"
+        else:
+            return content_text
+
     @backoff.on_exception(
         backoff.constant,
         (
@@ -545,25 +619,93 @@ Previous actions:
     def call_llm(self, payload, model):
         messages = payload["messages"]
+
+        if self.api_backend == "openai":
+            return self._call_llm_openai(messages, model)
+        elif self.api_backend == "dashscope":
+            return self._call_llm_dashscope(messages, model)
+        else:
+            raise ValueError(f"Unknown API backend: {self.api_backend}")
+
+    def _call_llm_openai(self, messages, model):
+        """Call LLM using OpenAI SDK (compatible with OpenAI-compatible endpoints)."""
         base_url = "https://poc-dashscope.aliyuncs.com/compatible-mode/v1"
         api_key = "sk-123"
         client = openai.OpenAI(base_url=base_url, api_key=api_key)

-        for _ in range(MAX_RETRY_TIMES):
-            logger.info("Generating content with Qwen model: %s", model)
+        for attempt in range(1, MAX_RETRY_TIMES + 1):
+            logger.info(f"[OpenAI] Generating content with model: {model} (attempt {attempt}/{MAX_RETRY_TIMES})")
             try:
                 response = client.chat.completions.create(
                     model=model,
                     messages=messages,
                     max_tokens=self.max_tokens,
-                    temperature=self.temperature,
-                    top_p=self.top_p,
+                    # temperature=self.temperature,
+                    # top_p=self.top_p,
                 )
                 return response.choices[0].message.content
             except Exception as e:
-                logger.error(f"Error calling Qwen model: {e}")
-                time.sleep(5)
-                continue
+                logger.error(f"[OpenAI] Error calling model: {e}")
+                if attempt < MAX_RETRY_TIMES:
+                    time.sleep(5)
+                    continue
+                break
+        return ""
+
+    def _call_llm_dashscope(self, messages, model):
+        """Call LLM using DashScope SDK."""
+        dashscope.base_http_api_url = "https://poc-dashscope.aliyuncs.com/api/v1"
+        dashscope.api_key = "sk-123"
+
+        # Convert message schema
+        ds_messages = self._to_dashscope_messages(messages)
+
+        # Retry loop
+        last_err = None
+        for attempt in range(1, MAX_RETRY_TIMES + 1):
+            thinking_status = f" (thinking={self.enable_thinking})" if self.enable_thinking else ""
+            logger.info(f"[DashScope] Generating content with model: {model}, thinking_status: {thinking_status} (attempt {attempt}/{MAX_RETRY_TIMES})")
+            try:
+                # Build API call parameters
+                call_params = {
+                    "model": model,
+                    "messages": ds_messages,
+                    "max_tokens": min(self.max_tokens, 2048),
+                    # "temperature": self.temperature,
+                    # "top_p": self.top_p,
+                    "vl_high_resolution_images": True,
+                }
+                # Add thinking parameters if enabled
+                if self.enable_thinking:
+                    call_params["enable_thinking"] = True
+                    call_params["thinking_budget"] = self.thinking_budget
+
+                resp = MultiModalConversation.call(**call_params)
+
+                if getattr(resp, "status_code", None) not in (None, HTTPStatus.OK):
+                    code = getattr(resp, "code", "")
+                    msg = getattr(resp, "message", "")
+                    reqid = getattr(resp, "request_id", "")
+                    logger.warning(f"[DashScope] non-OK response (id={reqid}): {code} {msg}")
+                    last_err = RuntimeError(f"DashScope status {resp.status_code}: {code} {msg}")
+                    time.sleep(1.5 * attempt)
+                    continue
+
+                text = self._extract_text_from_dashscope_response(resp)
+                if not text:
+                    raise ValueError("DashScope response has no text content")
+                return text
+            except Exception as e:
+                last_err = e
+                logger.error(f"[DashScope] call failed: {e}")
+                if attempt < MAX_RETRY_TIMES:
+                    time.sleep(1.5 * attempt)
+                    continue
+                break
+
+        if last_err:
+            raise last_err
         return ""

     def reset(self, _logger=None):
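
Outside the diff, a minimal standalone sketch of the DashScope request pattern that the new _call_llm_dashscope method wraps, assuming a valid API key: message parts use the {"text": ...} / {"image": ...} schema produced by _to_dashscope_messages, and the thinking parameters mirror the ones passed above (the model name, file path, and key are placeholders):

    from http import HTTPStatus

    import dashscope
    from dashscope import MultiModalConversation

    dashscope.api_key = "sk-..."  # placeholder; supply a real DashScope key

    # One user turn with an image part and a text part, in DashScope schema
    messages = [
        {
            "role": "user",
            "content": [
                {"image": "file:///tmp/screenshot.png"},  # http(s), file://, or data: URI
                {"text": "Describe the next GUI action."},
            ],
        }
    ]

    resp = MultiModalConversation.call(
        model="qwen3-vl-plus",
        messages=messages,
        max_tokens=2048,
        vl_high_resolution_images=True,
        enable_thinking=True,      # reasoning is returned in message.reasoning_content
        thinking_budget=32768,
    )

    if resp.status_code == HTTPStatus.OK:
        message = resp.output.choices[0].message
        print(message.content)     # list of parts, e.g. [{"text": "..."}]
    else:
        print(resp.code, resp.message)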

View File

@@ -57,13 +57,13 @@ def config() -> argparse.Namespace:
parser.add_argument("--model", type=str, default="qwen3-vl") parser.add_argument("--model", type=str, default="qwen3-vl")
parser.add_argument("--temperature", type=float, default=0) parser.add_argument("--temperature", type=float, default=0)
parser.add_argument("--top_p", type=float, default=0.9) parser.add_argument("--top_p", type=float, default=0.9)
parser.add_argument("--max_tokens", type=int, default=1500) parser.add_argument("--max_tokens", type=int, default=40960)
parser.add_argument("--stop_token", type=str, default=None) parser.add_argument("--stop_token", type=str, default=None)
parser.add_argument( parser.add_argument(
"--coord", "--coord",
type=str, type=str,
choices=["absolute", "relative"], choices=["absolute", "relative"],
default="absolute", default="relative",
help="Coordinate system for agent outputs (absolute or relative)", help="Coordinate system for agent outputs (absolute or relative)",
) )
parser.add_argument( parser.add_argument(
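
The hunk above only changes defaults; the run script that consumes them is not part of this diff, so the wiring below is an assumed sketch using the flag names from the parser and the constructor parameters from the first file:

    # Assumed sketch: config() and Qwen3VLAgent are the functions/classes shown in this commit.
    args = config()
    agent = Qwen3VLAgent(
        model=args.model,            # default "qwen3-vl"
        max_tokens=args.max_tokens,  # default raised to 40960
        top_p=args.top_p,
        temperature=args.temperature,
        coordinate_type=args.coord,  # default now "relative"
    )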