Djlu/qwen3vl dash (#356)
* support dashscope sdk to call qwen3-vl-plus --------- Co-authored-by: Timothyxxx <Timothyxxx@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,9 @@ import os
|
||||
from io import BytesIO
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from http import HTTPStatus
|
||||
import dashscope
|
||||
from dashscope import MultiModalConversation
|
||||
import backoff
|
||||
import openai
|
||||
from PIL import Image
|
||||
@@ -40,7 +43,7 @@ def process_image(image_bytes):
|
||||
height=height,
|
||||
width=width,
|
||||
factor=32,
|
||||
max_pixels=16 * 16 * 4 * 1280,
|
||||
max_pixels=16 * 16 * 4 * 12800,
|
||||
)
|
||||
|
||||
image = image.resize((resized_width, resized_height))
|
||||
@@ -58,7 +61,7 @@ class Qwen3VLAgent:
|
||||
self,
|
||||
platform: str = "ubuntu",
|
||||
model: str = "qwen3-vl",
|
||||
max_tokens: int = 1500,
|
||||
max_tokens: int = 40960,
|
||||
top_p: float = 0.9,
|
||||
temperature: float = 0.0,
|
||||
action_space: str = "pyautogui",
|
||||
@@ -66,6 +69,9 @@ class Qwen3VLAgent:
|
||||
history_n: int = 4,
|
||||
add_thought_prefix: bool = False,
|
||||
coordinate_type: str = "relative",
|
||||
api_backend: str = "dashscope", # "openai" or "dashscope"
|
||||
enable_thinking: bool = True, # Enable thinking mode for DashScope
|
||||
thinking_budget: int = 32768, # Token budget for reasoning
|
||||
):
|
||||
self.platform = platform
|
||||
self.model = model
|
||||
@@ -77,9 +83,13 @@ class Qwen3VLAgent:
|
||||
self.history_n = history_n
|
||||
self.add_thought_prefix = add_thought_prefix
|
||||
self.coordinate_type = coordinate_type
|
||||
self.api_backend = api_backend
|
||||
self.enable_thinking = enable_thinking
|
||||
self.thinking_budget = thinking_budget
|
||||
|
||||
assert action_space in ["pyautogui"], "Invalid action space"
|
||||
assert observation_type in ["screenshot"], "Invalid observation type"
|
||||
assert api_backend in ["openai", "dashscope"], "Invalid API backend, must be 'openai' or 'dashscope'"
|
||||
|
||||
self.thoughts = []
|
||||
self.actions = []
|
||||
@@ -527,6 +537,70 @@ Previous actions:
|
||||
|
||||
return low_level_instruction, pyautogui_code
|
||||
|
||||
@staticmethod
|
||||
def _to_dashscope_messages(messages):
|
||||
"""
|
||||
Convert messages built for OpenAI compat into DashScope MultiModalConversation format.
|
||||
- "text" part -> {"text": "..."}
|
||||
- "image_url" -> {"image": "<url-or-data-uri>"}
|
||||
- "video_url" -> {"video": "<url-or-data-uri>"}
|
||||
"""
|
||||
ds_msgs = []
|
||||
for m in messages:
|
||||
role = m.get("role", "")
|
||||
parts = m.get("content", [])
|
||||
ds_content = []
|
||||
for p in parts:
|
||||
ptype = p.get("type")
|
||||
if ptype == "text":
|
||||
ds_content.append({"text": p.get("text", "")})
|
||||
elif ptype == "image_url":
|
||||
url = (p.get("image_url") or {}).get("url", "")
|
||||
# DashScope accepts http(s), file://, or data:image/*; keep as-is
|
||||
ds_content.append({"image": url})
|
||||
elif ptype == "video_url":
|
||||
url = (p.get("video_url") or {}).get("url", "")
|
||||
ds_content.append({"video": url})
|
||||
else:
|
||||
# If you ever pass raw assistant strings (no parts), tolerate it
|
||||
if isinstance(p, str):
|
||||
ds_content.append({"text": p})
|
||||
# Also tolerate plain-string content (rare)
|
||||
if not ds_content and isinstance(m.get("content"), str):
|
||||
ds_content = [{"text": m["content"]}]
|
||||
ds_msgs.append({"role": role, "content": ds_content})
|
||||
return ds_msgs
|
||||
|
||||
@staticmethod
|
||||
def _extract_text_from_dashscope_response(resp):
|
||||
"""Join all 'text' parts from the first choice, including reasoning if present."""
|
||||
if hasattr(resp, "output"):
|
||||
out = resp.output
|
||||
else:
|
||||
out = resp.get("output") if isinstance(resp, dict) else None
|
||||
if not out:
|
||||
return None
|
||||
choices = getattr(out, "choices", None) if not isinstance(out, dict) else out.get("choices")
|
||||
if not choices:
|
||||
return None
|
||||
msg = getattr(choices[0], "message", None) if not isinstance(choices[0], dict) else choices[0].get("message")
|
||||
if not msg:
|
||||
return None
|
||||
content = getattr(msg, "content", None) if not isinstance(msg, dict) else msg.get("content", [])
|
||||
if not content:
|
||||
return None
|
||||
|
||||
# Extract reasoning content if present (for thinking models)
|
||||
reasoning_content = getattr(msg, "reasoning_content", None) if not isinstance(msg, dict) else msg.get("reasoning_content", None)
|
||||
|
||||
content_text = "".join(part.get("text", "") for part in content if isinstance(part, dict) and "text" in part)
|
||||
|
||||
# Format with thinking tags if reasoning exists
|
||||
if reasoning_content is not None:
|
||||
return f"<think>\n{reasoning_content}\n</think>\n\n{content_text}"
|
||||
else:
|
||||
return content_text
|
||||
|
||||
@backoff.on_exception(
|
||||
backoff.constant,
|
||||
(
|
||||
@@ -545,25 +619,93 @@ Previous actions:
|
||||
def call_llm(self, payload, model):
    """Dispatch the chat request to the configured backend.

    Routes ``payload["messages"]`` to the OpenAI-compatible or DashScope
    call path based on ``self.api_backend``; raises ValueError for any
    other backend name.
    """
    messages = payload["messages"]

    backends = {
        "openai": self._call_llm_openai,
        "dashscope": self._call_llm_dashscope,
    }
    handler = backends.get(self.api_backend)
    if handler is None:
        raise ValueError(f"Unknown API backend: {self.api_backend}")
    return handler(messages, model)
|
||||
|
||||
def _call_llm_openai(self, messages, model):
    """Call LLM using OpenAI SDK (compatible with OpenAI-compatible endpoints).

    Retries up to MAX_RETRY_TIMES with a fixed 5s pause between attempts and
    returns the first choice's message content, or "" if every attempt fails.
    """
    # Prefer environment configuration over baked-in values; fall back to
    # the historical defaults so existing deployments keep working.
    # NOTE(review): "sk-123" is a placeholder credential — confirm real
    # deployments set OPENAI_API_KEY.
    base_url = os.getenv(
        "OPENAI_BASE_URL",
        "https://poc-dashscope.aliyuncs.com/compatible-mode/v1",
    )
    api_key = os.getenv("OPENAI_API_KEY", "sk-123")
    client = openai.OpenAI(base_url=base_url, api_key=api_key)

    for attempt in range(1, MAX_RETRY_TIMES + 1):
        logger.info(f"[OpenAI] Generating content with model: {model} (attempt {attempt}/{MAX_RETRY_TIMES})")
        try:
            # NOTE(review): sampling params (temperature/top_p) are not sent;
            # the server-side defaults apply — confirm this is intentional.
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"[OpenAI] Error calling model: {e}")
            if attempt < MAX_RETRY_TIMES:
                time.sleep(5)
                continue
            break
    return ""
|
||||
|
||||
def _call_llm_dashscope(self, messages, model):
    """Call LLM using DashScope SDK.

    Converts OpenAI-style messages into DashScope's schema, then retries up
    to MAX_RETRY_TIMES with linear backoff (1.5s * attempt). Returns the
    extracted response text (reasoning wrapped in <think> tags when present).
    Re-raises the last recorded error if every attempt fails; returns "" only
    if no error was ever captured.
    """
    # Prefer environment configuration over baked-in values; fall back to
    # the historical defaults so existing deployments keep working.
    # NOTE(review): "sk-123" is a placeholder credential — confirm real
    # deployments set DASHSCOPE_API_KEY.
    dashscope.base_http_api_url = os.getenv(
        "DASHSCOPE_BASE_HTTP_API_URL",
        "https://poc-dashscope.aliyuncs.com/api/v1",
    )
    dashscope.api_key = os.getenv("DASHSCOPE_API_KEY", "sk-123")

    # Convert message schema
    ds_messages = self._to_dashscope_messages(messages)

    # Retry loop
    last_err = None
    for attempt in range(1, MAX_RETRY_TIMES + 1):
        thinking_status = f" (thinking={self.enable_thinking})" if self.enable_thinking else ""
        logger.info(f"[DashScope] Generating content with model: {model}, thinking_status: {thinking_status} (attempt {attempt}/{MAX_RETRY_TIMES})")
        try:
            # Build API call parameters
            call_params = {
                "model": model,
                "messages": ds_messages,
                # NOTE(review): caps max_tokens at 2048 even though the agent
                # default is 40960 — presumably an API-side limit; confirm.
                "max_tokens": min(self.max_tokens, 2048),
                "vl_high_resolution_images": True,
            }

            # Add thinking parameters if enabled
            if self.enable_thinking:
                call_params["enable_thinking"] = True
                call_params["thinking_budget"] = self.thinking_budget

            resp = MultiModalConversation.call(**call_params)

            # Some SDK return paths carry no status_code; treat those as OK.
            if getattr(resp, "status_code", None) not in (None, HTTPStatus.OK):
                code = getattr(resp, "code", "")
                msg = getattr(resp, "message", "")
                reqid = getattr(resp, "request_id", "")
                logger.warning(f"[DashScope] non-OK response (id={reqid}): {code} {msg}")
                last_err = RuntimeError(f"DashScope status {resp.status_code}: {code} {msg}")
                time.sleep(1.5 * attempt)
                continue

            text = self._extract_text_from_dashscope_response(resp)
            if not text:
                raise ValueError("DashScope response has no text content")
            return text

        except Exception as e:
            last_err = e
            logger.error(f"[DashScope] call failed: {e}")
            if attempt < MAX_RETRY_TIMES:
                time.sleep(1.5 * attempt)
                continue
            break

    if last_err:
        raise last_err
    return ""
|
||||
|
||||
def reset(self, _logger=None):
|
||||
|
||||
@@ -57,13 +57,13 @@ def config() -> argparse.Namespace:
|
||||
parser.add_argument("--model", type=str, default="qwen3-vl")
|
||||
parser.add_argument("--temperature", type=float, default=0)
|
||||
parser.add_argument("--top_p", type=float, default=0.9)
|
||||
parser.add_argument("--max_tokens", type=int, default=1500)
|
||||
parser.add_argument("--max_tokens", type=int, default=40960)
|
||||
parser.add_argument("--stop_token", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"--coord",
|
||||
type=str,
|
||||
choices=["absolute", "relative"],
|
||||
default="absolute",
|
||||
default="relative",
|
||||
help="Coordinate system for agent outputs (absolute or relative)",
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user