diff --git a/mm_agents/qwen3vl_agent.py b/mm_agents/qwen3vl_agent.py index 4877995..e0ee85f 100644 --- a/mm_agents/qwen3vl_agent.py +++ b/mm_agents/qwen3vl_agent.py @@ -6,6 +6,9 @@ import os from io import BytesIO from typing import Dict, List, Tuple +from http import HTTPStatus +import dashscope +from dashscope import MultiModalConversation import backoff import openai from PIL import Image @@ -40,7 +43,7 @@ def process_image(image_bytes): height=height, width=width, factor=32, - max_pixels=16 * 16 * 4 * 1280, + max_pixels=16 * 16 * 4 * 12800, ) image = image.resize((resized_width, resized_height)) @@ -58,7 +61,7 @@ class Qwen3VLAgent: self, platform: str = "ubuntu", model: str = "qwen3-vl", - max_tokens: int = 1500, + max_tokens: int = 40960, top_p: float = 0.9, temperature: float = 0.0, action_space: str = "pyautogui", @@ -66,6 +69,9 @@ class Qwen3VLAgent: history_n: int = 4, add_thought_prefix: bool = False, coordinate_type: str = "relative", + api_backend: str = "dashscope", # "openai" or "dashscope" + enable_thinking: bool = True, # Enable thinking mode for DashScope + thinking_budget: int = 32768, # Token budget for reasoning ): self.platform = platform self.model = model @@ -77,9 +83,13 @@ class Qwen3VLAgent: self.history_n = history_n self.add_thought_prefix = add_thought_prefix self.coordinate_type = coordinate_type + self.api_backend = api_backend + self.enable_thinking = enable_thinking + self.thinking_budget = thinking_budget assert action_space in ["pyautogui"], "Invalid action space" assert observation_type in ["screenshot"], "Invalid observation type" + assert api_backend in ["openai", "dashscope"], "Invalid API backend, must be 'openai' or 'dashscope'" self.thoughts = [] self.actions = [] @@ -527,6 +537,70 @@ Previous actions: return low_level_instruction, pyautogui_code + @staticmethod + def _to_dashscope_messages(messages): + """ + Convert messages built for OpenAI compat into DashScope MultiModalConversation format. + - "text" part -> {"text": "..."} + - "image_url" -> {"image": ""} + - "video_url" -> {"video": ""} + """ + ds_msgs = [] + for m in messages: + role = m.get("role", "") + parts = m.get("content", []) + ds_content = [] + for p in parts: + ptype = p.get("type") + if ptype == "text": + ds_content.append({"text": p.get("text", "")}) + elif ptype == "image_url": + url = (p.get("image_url") or {}).get("url", "") + # DashScope accepts http(s), file://, or data:image/*; keep as-is + ds_content.append({"image": url}) + elif ptype == "video_url": + url = (p.get("video_url") or {}).get("url", "") + ds_content.append({"video": url}) + else: + # If you ever pass raw assistant strings (no parts), tolerate it + if isinstance(p, str): + ds_content.append({"text": p}) + # Also tolerate plain-string content (rare) + if not ds_content and isinstance(m.get("content"), str): + ds_content = [{"text": m["content"]}] + ds_msgs.append({"role": role, "content": ds_content}) + return ds_msgs + + @staticmethod + def _extract_text_from_dashscope_response(resp): + """Join all 'text' parts from the first choice, including reasoning if present.""" + if hasattr(resp, "output"): + out = resp.output + else: + out = resp.get("output") if isinstance(resp, dict) else None + if not out: + return None + choices = getattr(out, "choices", None) if not isinstance(out, dict) else out.get("choices") + if not choices: + return None + msg = getattr(choices[0], "message", None) if not isinstance(choices[0], dict) else choices[0].get("message") + if not msg: + return None + content = getattr(msg, "content", None) if not isinstance(msg, dict) else msg.get("content", []) + if not content: + return None + + # Extract reasoning content if present (for thinking models) + reasoning_content = getattr(msg, "reasoning_content", None) if not isinstance(msg, dict) else msg.get("reasoning_content", None) + + content_text = "".join(part.get("text", "") for part in content if isinstance(part, dict) and "text" in part) + + # Format with thinking tags if reasoning exists + if reasoning_content is not None: + return f"\n{reasoning_content}\n\n\n{content_text}" + else: + return content_text + @backoff.on_exception( backoff.constant, ( @@ -545,25 +619,93 @@ Previous actions: def call_llm(self, payload, model): messages = payload["messages"] + if self.api_backend == "openai": + return self._call_llm_openai(messages, model) + elif self.api_backend == "dashscope": + return self._call_llm_dashscope(messages, model) + else: + raise ValueError(f"Unknown API backend: {self.api_backend}") + + def _call_llm_openai(self, messages, model): + """Call LLM using OpenAI SDK (compatible with OpenAI-compatible endpoints).""" base_url = "https://poc-dashscope.aliyuncs.com/compatible-mode/v1" api_key = "sk-123" client = openai.OpenAI(base_url=base_url, api_key=api_key) - for _ in range(MAX_RETRY_TIMES): - logger.info("Generating content with Qwen model: %s", model) + for attempt in range(1, MAX_RETRY_TIMES + 1): + logger.info(f"[OpenAI] Generating content with model: {model} (attempt {attempt}/{MAX_RETRY_TIMES})") try: response = client.chat.completions.create( model=model, messages=messages, max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, + # temperature=self.temperature, + # top_p=self.top_p, ) return response.choices[0].message.content except Exception as e: - logger.error(f"Error calling Qwen model: {e}") - time.sleep(5) - continue + logger.error(f"[OpenAI] Error calling model: {e}") + if attempt < MAX_RETRY_TIMES: + time.sleep(5) + continue + break + return "" + + def _call_llm_dashscope(self, messages, model): + """Call LLM using DashScope SDK.""" + dashscope.base_http_api_url = "https://poc-dashscope.aliyuncs.com/api/v1" + dashscope.api_key = "sk-123" + + # Convert message schema + ds_messages = self._to_dashscope_messages(messages) + + # Retry loop + last_err = None + for attempt in range(1, MAX_RETRY_TIMES + 1): + thinking_status = f" (thinking={self.enable_thinking})" if self.enable_thinking else "" + logger.info(f"[DashScope] Generating content with model: {model}, thinking_status: {thinking_status} (attempt {attempt}/{MAX_RETRY_TIMES})") + try: + # Build API call parameters + call_params = { + "model": model, + "messages": ds_messages, + "max_tokens": min(self.max_tokens, 2048), + # "temperature": self.temperature, + # "top_p": self.top_p, + "vl_high_resolution_images": True, + } + + # Add thinking parameters if enabled + if self.enable_thinking: + call_params["enable_thinking"] = True + call_params["thinking_budget"] = self.thinking_budget + + resp = MultiModalConversation.call(**call_params) + + if getattr(resp, "status_code", None) not in (None, HTTPStatus.OK): + code = getattr(resp, "code", "") + msg = getattr(resp, "message", "") + reqid = getattr(resp, "request_id", "") + logger.warning(f"[DashScope] non-OK response (id={reqid}): {code} {msg}") + last_err = RuntimeError(f"DashScope status {resp.status_code}: {code} {msg}") + time.sleep(1.5 * attempt) + continue + + text = self._extract_text_from_dashscope_response(resp) + if not text: + raise ValueError("DashScope response has no text content") + return text + + except Exception as e: + last_err = e + logger.error(f"[DashScope] call failed: {e}") + if attempt < MAX_RETRY_TIMES: + time.sleep(1.5 * attempt) + continue + break + + if last_err: + raise last_err return "" def reset(self, _logger=None): diff --git a/run_multienv_qwen3vl.py b/run_multienv_qwen3vl.py index 1b9e167..2d9c0f6 100644 --- a/run_multienv_qwen3vl.py +++ b/run_multienv_qwen3vl.py @@ -57,13 +57,13 @@ def config() -> argparse.Namespace: parser.add_argument("--model", type=str, default="qwen3-vl") parser.add_argument("--temperature", type=float, default=0) parser.add_argument("--top_p", type=float, default=0.9) - parser.add_argument("--max_tokens", type=int, default=1500) + parser.add_argument("--max_tokens", type=int, default=40960) parser.add_argument("--stop_token", type=str, default=None) parser.add_argument( "--coord", type=str, choices=["absolute", "relative"], - default="absolute", + default="relative", help="Coordinate system for agent outputs (absolute or relative)", ) parser.add_argument(