Djlu/qwen3vl dash (#356)

* support dashscope sdk to call qwen3-vl-plus

---------

Co-authored-by: Timothyxxx <Timothyxxx@users.noreply.github.com>
Dunjie Lu
2025-10-13 16:31:06 +08:00
committed by GitHub
parent f9e9273b3b
commit d25464c203
2 changed files with 153 additions and 11 deletions
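
For readers skimming the diff: the change adds a DashScope code path alongside the existing OpenAI-compatible one. A minimal sketch of how the new constructor options would be used (the import path below is a placeholder; the argument names and defaults come from the diff that follows):

    from mm_agents.qwen3_vl import Qwen3VLAgent  # placeholder import path, not part of this diff

    agent = Qwen3VLAgent(
        model="qwen3-vl-plus",
        max_tokens=40960,
        coordinate_type="relative",
        api_backend="dashscope",   # or "openai" for the OpenAI-compatible endpoint
        enable_thinking=True,      # DashScope thinking mode
        thinking_budget=32768,     # token budget for reasoning
    )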

View File

@@ -6,6 +6,9 @@ import os
 from io import BytesIO
 from typing import Dict, List, Tuple
+from http import HTTPStatus
+import dashscope
+from dashscope import MultiModalConversation
 import backoff
 import openai
 from PIL import Image
@@ -40,7 +43,7 @@ def process_image(image_bytes):
         height=height,
         width=width,
         factor=32,
-        max_pixels=16 * 16 * 4 * 1280,
+        max_pixels=16 * 16 * 4 * 12800,
     )
     image = image.resize((resized_width, resized_height))
@@ -58,7 +61,7 @@ class Qwen3VLAgent:
         self,
         platform: str = "ubuntu",
         model: str = "qwen3-vl",
-        max_tokens: int = 1500,
+        max_tokens: int = 40960,
         top_p: float = 0.9,
         temperature: float = 0.0,
         action_space: str = "pyautogui",
@@ -66,6 +69,9 @@ class Qwen3VLAgent:
         history_n: int = 4,
         add_thought_prefix: bool = False,
         coordinate_type: str = "relative",
+        api_backend: str = "dashscope",  # "openai" or "dashscope"
+        enable_thinking: bool = True,  # Enable thinking mode for DashScope
+        thinking_budget: int = 32768,  # Token budget for reasoning
     ):
         self.platform = platform
         self.model = model
@@ -77,9 +83,13 @@ class Qwen3VLAgent:
         self.history_n = history_n
         self.add_thought_prefix = add_thought_prefix
         self.coordinate_type = coordinate_type
+        self.api_backend = api_backend
+        self.enable_thinking = enable_thinking
+        self.thinking_budget = thinking_budget
         assert action_space in ["pyautogui"], "Invalid action space"
         assert observation_type in ["screenshot"], "Invalid observation type"
+        assert api_backend in ["openai", "dashscope"], "Invalid API backend, must be 'openai' or 'dashscope'"

         self.thoughts = []
         self.actions = []
@@ -527,6 +537,70 @@ Previous actions:
         return low_level_instruction, pyautogui_code

+    @staticmethod
+    def _to_dashscope_messages(messages):
+        """
+        Convert messages built for OpenAI compat into DashScope MultiModalConversation format.
+        - "text" part -> {"text": "..."}
+        - "image_url" -> {"image": "<url-or-data-uri>"}
+        - "video_url" -> {"video": "<url-or-data-uri>"}
+        """
+        ds_msgs = []
+        for m in messages:
+            role = m.get("role", "")
+            parts = m.get("content", [])
+            ds_content = []
+            for p in parts:
+                ptype = p.get("type")
+                if ptype == "text":
+                    ds_content.append({"text": p.get("text", "")})
+                elif ptype == "image_url":
+                    url = (p.get("image_url") or {}).get("url", "")
+                    # DashScope accepts http(s), file://, or data:image/*; keep as-is
+                    ds_content.append({"image": url})
+                elif ptype == "video_url":
+                    url = (p.get("video_url") or {}).get("url", "")
+                    ds_content.append({"video": url})
+                else:
+                    # If you ever pass raw assistant strings (no parts), tolerate it
+                    if isinstance(p, str):
+                        ds_content.append({"text": p})
+            # Also tolerate plain-string content (rare)
+            if not ds_content and isinstance(m.get("content"), str):
+                ds_content = [{"text": m["content"]}]
+            ds_msgs.append({"role": role, "content": ds_content})
+        return ds_msgs
+
+    @staticmethod
+    def _extract_text_from_dashscope_response(resp):
+        """Join all 'text' parts from the first choice, including reasoning if present."""
+        if hasattr(resp, "output"):
+            out = resp.output
+        else:
+            out = resp.get("output") if isinstance(resp, dict) else None
+        if not out:
+            return None
+        choices = getattr(out, "choices", None) if not isinstance(out, dict) else out.get("choices")
+        if not choices:
+            return None
+        msg = getattr(choices[0], "message", None) if not isinstance(choices[0], dict) else choices[0].get("message")
+        if not msg:
+            return None
+        content = getattr(msg, "content", None) if not isinstance(msg, dict) else msg.get("content", [])
+        if not content:
+            return None
+        # Extract reasoning content if present (for thinking models)
+        reasoning_content = getattr(msg, "reasoning_content", None) if not isinstance(msg, dict) else msg.get("reasoning_content", None)
+        content_text = "".join(part.get("text", "") for part in content if isinstance(part, dict) and "text" in part)
+        # Format with thinking tags if reasoning exists
+        if reasoning_content is not None:
+            return f"<think>\n{reasoning_content}\n</think>\n\n{content_text}"
+        else:
+            return content_text
+
     @backoff.on_exception(
         backoff.constant,
         (
@@ -545,25 +619,93 @@ Previous actions:
     def call_llm(self, payload, model):
         messages = payload["messages"]
+
+        if self.api_backend == "openai":
+            return self._call_llm_openai(messages, model)
+        elif self.api_backend == "dashscope":
+            return self._call_llm_dashscope(messages, model)
+        else:
+            raise ValueError(f"Unknown API backend: {self.api_backend}")
+
+    def _call_llm_openai(self, messages, model):
+        """Call LLM using OpenAI SDK (compatible with OpenAI-compatible endpoints)."""
         base_url = "https://poc-dashscope.aliyuncs.com/compatible-mode/v1"
         api_key = "sk-123"
         client = openai.OpenAI(base_url=base_url, api_key=api_key)

-        for _ in range(MAX_RETRY_TIMES):
-            logger.info("Generating content with Qwen model: %s", model)
+        for attempt in range(1, MAX_RETRY_TIMES + 1):
+            logger.info(f"[OpenAI] Generating content with model: {model} (attempt {attempt}/{MAX_RETRY_TIMES})")
             try:
                 response = client.chat.completions.create(
                     model=model,
                     messages=messages,
                     max_tokens=self.max_tokens,
-                    temperature=self.temperature,
-                    top_p=self.top_p,
+                    # temperature=self.temperature,
+                    # top_p=self.top_p,
                 )
                 return response.choices[0].message.content
             except Exception as e:
-                logger.error(f"Error calling Qwen model: {e}")
-                time.sleep(5)
-                continue
+                logger.error(f"[OpenAI] Error calling model: {e}")
+                if attempt < MAX_RETRY_TIMES:
+                    time.sleep(5)
+                    continue
+                break
+        return ""
+
+    def _call_llm_dashscope(self, messages, model):
+        """Call LLM using DashScope SDK."""
+        dashscope.base_http_api_url = "https://poc-dashscope.aliyuncs.com/api/v1"
+        dashscope.api_key = "sk-123"
+
+        # Convert message schema
+        ds_messages = self._to_dashscope_messages(messages)
+
+        # Retry loop
+        last_err = None
+        for attempt in range(1, MAX_RETRY_TIMES + 1):
+            thinking_status = f" (thinking={self.enable_thinking})" if self.enable_thinking else ""
+            logger.info(f"[DashScope] Generating content with model: {model}, thinking_status: {thinking_status} (attempt {attempt}/{MAX_RETRY_TIMES})")
+            try:
+                # Build API call parameters
+                call_params = {
+                    "model": model,
+                    "messages": ds_messages,
+                    "max_tokens": min(self.max_tokens, 2048),
+                    # "temperature": self.temperature,
+                    # "top_p": self.top_p,
+                    "vl_high_resolution_images": True,
+                }
+                # Add thinking parameters if enabled
+                if self.enable_thinking:
+                    call_params["enable_thinking"] = True
+                    call_params["thinking_budget"] = self.thinking_budget
+
+                resp = MultiModalConversation.call(**call_params)
+
+                if getattr(resp, "status_code", None) not in (None, HTTPStatus.OK):
+                    code = getattr(resp, "code", "")
+                    msg = getattr(resp, "message", "")
+                    reqid = getattr(resp, "request_id", "")
+                    logger.warning(f"[DashScope] non-OK response (id={reqid}): {code} {msg}")
+                    last_err = RuntimeError(f"DashScope status {resp.status_code}: {code} {msg}")
+                    time.sleep(1.5 * attempt)
+                    continue
+
+                text = self._extract_text_from_dashscope_response(resp)
+                if not text:
+                    raise ValueError("DashScope response has no text content")
+                return text
+            except Exception as e:
+                last_err = e
+                logger.error(f"[DashScope] call failed: {e}")
+                if attempt < MAX_RETRY_TIMES:
+                    time.sleep(1.5 * attempt)
+                    continue
+                break
+
+        if last_err:
+            raise last_err
         return ""

     def reset(self, _logger=None):
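
Outside the diff, a minimal standalone sketch of the DashScope request pattern that the new _call_llm_dashscope method wraps, assuming a valid API key: message parts use the {"text": ...} / {"image": ...} schema produced by _to_dashscope_messages, and the thinking parameters mirror the ones passed above (the model name, file path, and key are placeholders):

    from http import HTTPStatus

    import dashscope
    from dashscope import MultiModalConversation

    dashscope.api_key = "sk-..."  # placeholder; supply a real DashScope key

    # One user turn with an image part and a text part, in DashScope schema
    messages = [
        {
            "role": "user",
            "content": [
                {"image": "file:///tmp/screenshot.png"},  # http(s), file://, or data: URI
                {"text": "Describe the next GUI action."},
            ],
        }
    ]

    resp = MultiModalConversation.call(
        model="qwen3-vl-plus",
        messages=messages,
        max_tokens=2048,
        vl_high_resolution_images=True,
        enable_thinking=True,      # reasoning is returned in message.reasoning_content
        thinking_budget=32768,
    )

    if resp.status_code == HTTPStatus.OK:
        message = resp.output.choices[0].message
        print(message.content)     # list of parts, e.g. [{"text": "..."}]
    else:
        print(resp.code, resp.message)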

View File

@@ -57,13 +57,13 @@ def config() -> argparse.Namespace:
parser.add_argument("--model", type=str, default="qwen3-vl") parser.add_argument("--model", type=str, default="qwen3-vl")
parser.add_argument("--temperature", type=float, default=0) parser.add_argument("--temperature", type=float, default=0)
parser.add_argument("--top_p", type=float, default=0.9) parser.add_argument("--top_p", type=float, default=0.9)
parser.add_argument("--max_tokens", type=int, default=1500) parser.add_argument("--max_tokens", type=int, default=40960)
parser.add_argument("--stop_token", type=str, default=None) parser.add_argument("--stop_token", type=str, default=None)
parser.add_argument( parser.add_argument(
"--coord", "--coord",
type=str, type=str,
choices=["absolute", "relative"], choices=["absolute", "relative"],
default="absolute", default="relative",
help="Coordinate system for agent outputs (absolute or relative)", help="Coordinate system for agent outputs (absolute or relative)",
) )
parser.add_argument( parser.add_argument(
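
The hunk above only changes defaults; the run script that consumes them is not part of this diff, so the wiring below is an assumed sketch using the flag names from the parser and the constructor parameters from the first file:

    # Assumed sketch: config() and Qwen3VLAgent are the functions/classes shown in this commit.
    args = config()
    agent = Qwen3VLAgent(
        model=args.model,            # default "qwen3-vl"
        max_tokens=args.max_tokens,  # default raised to 40960
        top_p=args.top_p,
        temperature=args.temperature,
        coordinate_type=args.coord,  # default now "relative"
    )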