Djlu/qwen3vl dash (#356)
* support dashscope sdk to call qwen3-vl-plus --------- Co-authored-by: Timothyxxx <Timothyxxx@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,9 @@ import os
|
||||
from io import BytesIO
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from http import HTTPStatus
|
||||
import dashscope
|
||||
from dashscope import MultiModalConversation
|
||||
import backoff
|
||||
import openai
|
||||
from PIL import Image
|
||||
@@ -40,7 +43,7 @@ def process_image(image_bytes):
|
||||
height=height,
|
||||
width=width,
|
||||
factor=32,
|
||||
max_pixels=16 * 16 * 4 * 1280,
|
||||
max_pixels=16 * 16 * 4 * 12800,
|
||||
)
|
||||
|
||||
image = image.resize((resized_width, resized_height))
|
||||
@@ -58,7 +61,7 @@ class Qwen3VLAgent:
|
||||
self,
|
||||
platform: str = "ubuntu",
|
||||
model: str = "qwen3-vl",
|
||||
max_tokens: int = 1500,
|
||||
max_tokens: int = 40960,
|
||||
top_p: float = 0.9,
|
||||
temperature: float = 0.0,
|
||||
action_space: str = "pyautogui",
|
||||
@@ -66,6 +69,9 @@ class Qwen3VLAgent:
|
||||
history_n: int = 4,
|
||||
add_thought_prefix: bool = False,
|
||||
coordinate_type: str = "relative",
|
||||
api_backend: str = "dashscope", # "openai" or "dashscope"
|
||||
enable_thinking: bool = True, # Enable thinking mode for DashScope
|
||||
thinking_budget: int = 32768, # Token budget for reasoning
|
||||
):
|
||||
self.platform = platform
|
||||
self.model = model
|
||||
@@ -77,9 +83,13 @@ class Qwen3VLAgent:
|
||||
self.history_n = history_n
|
||||
self.add_thought_prefix = add_thought_prefix
|
||||
self.coordinate_type = coordinate_type
|
||||
self.api_backend = api_backend
|
||||
self.enable_thinking = enable_thinking
|
||||
self.thinking_budget = thinking_budget
|
||||
|
||||
assert action_space in ["pyautogui"], "Invalid action space"
|
||||
assert observation_type in ["screenshot"], "Invalid observation type"
|
||||
assert api_backend in ["openai", "dashscope"], "Invalid API backend, must be 'openai' or 'dashscope'"
|
||||
|
||||
self.thoughts = []
|
||||
self.actions = []
|
||||
@@ -527,6 +537,70 @@ Previous actions:
|
||||
|
||||
return low_level_instruction, pyautogui_code
|
||||
|
||||
@staticmethod
|
||||
def _to_dashscope_messages(messages):
|
||||
"""
|
||||
Convert messages built for OpenAI compat into DashScope MultiModalConversation format.
|
||||
- "text" part -> {"text": "..."}
|
||||
- "image_url" -> {"image": "<url-or-data-uri>"}
|
||||
- "video_url" -> {"video": "<url-or-data-uri>"}
|
||||
"""
|
||||
ds_msgs = []
|
||||
for m in messages:
|
||||
role = m.get("role", "")
|
||||
parts = m.get("content", [])
|
||||
ds_content = []
|
||||
for p in parts:
|
||||
ptype = p.get("type")
|
||||
if ptype == "text":
|
||||
ds_content.append({"text": p.get("text", "")})
|
||||
elif ptype == "image_url":
|
||||
url = (p.get("image_url") or {}).get("url", "")
|
||||
# DashScope accepts http(s), file://, or data:image/*; keep as-is
|
||||
ds_content.append({"image": url})
|
||||
elif ptype == "video_url":
|
||||
url = (p.get("video_url") or {}).get("url", "")
|
||||
ds_content.append({"video": url})
|
||||
else:
|
||||
# If you ever pass raw assistant strings (no parts), tolerate it
|
||||
if isinstance(p, str):
|
||||
ds_content.append({"text": p})
|
||||
# Also tolerate plain-string content (rare)
|
||||
if not ds_content and isinstance(m.get("content"), str):
|
||||
ds_content = [{"text": m["content"]}]
|
||||
ds_msgs.append({"role": role, "content": ds_content})
|
||||
return ds_msgs
|
||||
|
||||
@staticmethod
|
||||
def _extract_text_from_dashscope_response(resp):
|
||||
"""Join all 'text' parts from the first choice, including reasoning if present."""
|
||||
if hasattr(resp, "output"):
|
||||
out = resp.output
|
||||
else:
|
||||
out = resp.get("output") if isinstance(resp, dict) else None
|
||||
if not out:
|
||||
return None
|
||||
choices = getattr(out, "choices", None) if not isinstance(out, dict) else out.get("choices")
|
||||
if not choices:
|
||||
return None
|
||||
msg = getattr(choices[0], "message", None) if not isinstance(choices[0], dict) else choices[0].get("message")
|
||||
if not msg:
|
||||
return None
|
||||
content = getattr(msg, "content", None) if not isinstance(msg, dict) else msg.get("content", [])
|
||||
if not content:
|
||||
return None
|
||||
|
||||
# Extract reasoning content if present (for thinking models)
|
||||
reasoning_content = getattr(msg, "reasoning_content", None) if not isinstance(msg, dict) else msg.get("reasoning_content", None)
|
||||
|
||||
content_text = "".join(part.get("text", "") for part in content if isinstance(part, dict) and "text" in part)
|
||||
|
||||
# Format with thinking tags if reasoning exists
|
||||
if reasoning_content is not None:
|
||||
return f"<think>\n{reasoning_content}\n</think>\n\n{content_text}"
|
||||
else:
|
||||
return content_text
|
||||
|
||||
@backoff.on_exception(
|
||||
backoff.constant,
|
||||
(
|
||||
@@ -545,25 +619,93 @@ Previous actions:
|
||||
def call_llm(self, payload, model):
    """Dispatch the chat request to the configured backend.

    Routes ``payload["messages"]`` to the OpenAI-compatible or DashScope
    call path based on ``self.api_backend``; raises ValueError for any
    other backend name.
    """
    messages = payload["messages"]

    backends = {
        "openai": self._call_llm_openai,
        "dashscope": self._call_llm_dashscope,
    }
    handler = backends.get(self.api_backend)
    if handler is None:
        raise ValueError(f"Unknown API backend: {self.api_backend}")
    return handler(messages, model)
|
||||
|
||||
def _call_llm_openai(self, messages, model):
    """Call LLM using OpenAI SDK (compatible with OpenAI-compatible endpoints).

    Retries up to MAX_RETRY_TIMES with a fixed 5s pause between attempts and
    returns the first choice's message content, or "" if every attempt fails.
    """
    # Prefer environment configuration over baked-in values; fall back to
    # the historical defaults so existing deployments keep working.
    # NOTE(review): "sk-123" is a placeholder credential — confirm real
    # deployments set OPENAI_API_KEY.
    base_url = os.getenv(
        "OPENAI_BASE_URL",
        "https://poc-dashscope.aliyuncs.com/compatible-mode/v1",
    )
    api_key = os.getenv("OPENAI_API_KEY", "sk-123")
    client = openai.OpenAI(base_url=base_url, api_key=api_key)

    for attempt in range(1, MAX_RETRY_TIMES + 1):
        logger.info(f"[OpenAI] Generating content with model: {model} (attempt {attempt}/{MAX_RETRY_TIMES})")
        try:
            # NOTE(review): sampling params (temperature/top_p) are not sent;
            # the server-side defaults apply — confirm this is intentional.
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"[OpenAI] Error calling model: {e}")
            if attempt < MAX_RETRY_TIMES:
                time.sleep(5)
                continue
            break
    return ""
|
||||
|
||||
def _call_llm_dashscope(self, messages, model):
    """Call LLM using DashScope SDK.

    Converts OpenAI-style messages into DashScope's schema, then retries up
    to MAX_RETRY_TIMES with linear backoff (1.5s * attempt). Returns the
    extracted response text (reasoning wrapped in <think> tags when present).
    Re-raises the last recorded error if every attempt fails; returns "" only
    if no error was ever captured.
    """
    # Prefer environment configuration over baked-in values; fall back to
    # the historical defaults so existing deployments keep working.
    # NOTE(review): "sk-123" is a placeholder credential — confirm real
    # deployments set DASHSCOPE_API_KEY.
    dashscope.base_http_api_url = os.getenv(
        "DASHSCOPE_BASE_HTTP_API_URL",
        "https://poc-dashscope.aliyuncs.com/api/v1",
    )
    dashscope.api_key = os.getenv("DASHSCOPE_API_KEY", "sk-123")

    # Convert message schema
    ds_messages = self._to_dashscope_messages(messages)

    # Retry loop
    last_err = None
    for attempt in range(1, MAX_RETRY_TIMES + 1):
        thinking_status = f" (thinking={self.enable_thinking})" if self.enable_thinking else ""
        logger.info(f"[DashScope] Generating content with model: {model}, thinking_status: {thinking_status} (attempt {attempt}/{MAX_RETRY_TIMES})")
        try:
            # Build API call parameters
            call_params = {
                "model": model,
                "messages": ds_messages,
                # NOTE(review): caps max_tokens at 2048 even though the agent
                # default is 40960 — presumably an API-side limit; confirm.
                "max_tokens": min(self.max_tokens, 2048),
                "vl_high_resolution_images": True,
            }

            # Add thinking parameters if enabled
            if self.enable_thinking:
                call_params["enable_thinking"] = True
                call_params["thinking_budget"] = self.thinking_budget

            resp = MultiModalConversation.call(**call_params)

            # Some SDK return paths carry no status_code; treat those as OK.
            if getattr(resp, "status_code", None) not in (None, HTTPStatus.OK):
                code = getattr(resp, "code", "")
                msg = getattr(resp, "message", "")
                reqid = getattr(resp, "request_id", "")
                logger.warning(f"[DashScope] non-OK response (id={reqid}): {code} {msg}")
                last_err = RuntimeError(f"DashScope status {resp.status_code}: {code} {msg}")
                time.sleep(1.5 * attempt)
                continue

            text = self._extract_text_from_dashscope_response(resp)
            if not text:
                raise ValueError("DashScope response has no text content")
            return text

        except Exception as e:
            last_err = e
            logger.error(f"[DashScope] call failed: {e}")
            if attempt < MAX_RETRY_TIMES:
                time.sleep(1.5 * attempt)
                continue
            break

    if last_err:
        raise last_err
    return ""
|
||||
|
||||
def reset(self, _logger=None):
|
||||
|
||||
@@ -57,13 +57,13 @@ def config() -> argparse.Namespace:
|
||||
parser.add_argument("--model", type=str, default="qwen3-vl")
|
||||
parser.add_argument("--temperature", type=float, default=0)
|
||||
parser.add_argument("--top_p", type=float, default=0.9)
|
||||
parser.add_argument("--max_tokens", type=int, default=1500)
|
||||
parser.add_argument("--max_tokens", type=int, default=40960)
|
||||
parser.add_argument("--stop_token", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"--coord",
|
||||
type=str,
|
||||
choices=["absolute", "relative"],
|
||||
default="absolute",
|
||||
default="relative",
|
||||
help="Coordinate system for agent outputs (absolute or relative)",
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user