feat: refactor run_multienv_qwen25vl.py and qwen25vl_agent.py for improved logging and task management

- Introduced signal handling for graceful shutdown of environments and processes (see the first sketch after this list).
- Enhanced logging configuration to support dynamic log levels and structured output (see the second sketch after this list).
- Updated argument parsing to include new parameters for model selection and task execution.
- Refactored task distribution logic to streamline environment task management.
- Improved error handling during task execution and environment cleanup.
- Adjusted Qwen25VLAgent initialization to support new model and thought prefix options.
- Reduced max tries for LLM calls from 10 to 5 so persistent failures surface sooner.
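
The graceful-shutdown change follows the standard `signal`-module pattern: register a handler for SIGINT/SIGTERM that closes every live environment before exiting. A minimal sketch, assuming a hypothetical `active_envs` registry (the real script tracks its own environments and worker processes):

import signal
import sys

active_envs = []  # hypothetical registry; the real script keeps its own list

def shutdown_handler(signum, frame):
    # Close every environment so VMs and subprocesses are not leaked.
    print(f"Received signal {signum}, shutting down environments...")
    for env in active_envs:
        try:
            env.close()
        except Exception as e:
            print(f"Failed to close environment: {e}")
    sys.exit(0)

signal.signal(signal.SIGINT, shutdown_handler)   # Ctrl+C
signal.signal(signal.SIGTERM, shutdown_handler)  # kill <pid>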
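Likewise, dynamic log levels with structured output need nothing beyond the standard library; a sketch under the assumption that the level comes from an environment variable (the script's actual flag or variable name may differ):

import logging
import os

def configure_logging():
    # Pick the level at runtime instead of hardcoding it.
    level = os.getenv("LOG_LEVEL", "INFO").upper()
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(
        "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    ))
    logging.basicConfig(level=getattr(logging, level, logging.INFO), handlers=[handler])

configure_logging()
logger = logging.getLogger("run_multienv_qwen25vl")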
Author: yuanmengqi
Date: 2025-07-22 19:46:42 +00:00
Parent: 4a5d48000f
Commit: 82c3cdd590
2 changed files with 383 additions and 207 deletions


@@ -66,25 +66,24 @@ class Qwen25VLAgent:
     def __init__(
         self,
         platform="ubuntu",
-        planner_model="gpt-4o",
-        executor_model="qwen2.5vl",
+        model="qwen2.5-vl-72b-instruct",
         max_tokens=1500,
         top_p=0.9,
         temperature=0.5,
         action_space="pyautogui",
         observation_type="screenshot",
         history_n=4,  # Number of previous interactions to include in full detail
+        add_thought_prefix=False,
     ):
         self.platform = platform
-        self.planner_model = planner_model
-        self.executor_model = executor_model
-        assert self.executor_model is not None, "Executor model cannot be None"
+        self.model = model
         self.max_tokens = max_tokens
         self.top_p = top_p
         self.temperature = temperature
         self.action_space = action_space
         self.observation_type = observation_type
         self.history_n = history_n  # Control how many previous interactions to include
+        self.add_thought_prefix = add_thought_prefix
         assert action_space in ["pyautogui"], "Invalid action space"
         assert observation_type in ["screenshot"], "Invalid observation type"
         self.thoughts = []
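
With this hunk applied, the agent takes a single model name instead of separate planner/executor models, plus the new prefix flag. A usage sketch (the import path is an assumption):

from qwen25vl_agent import Qwen25VLAgent  # assumed module path

agent = Qwen25VLAgent(
    platform="ubuntu",
    model="qwen2.5-vl-72b-instruct",  # replaces planner_model/executor_model
    add_thought_prefix=False,         # set True to prefill "Thought:" before generation
)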
@@ -277,19 +276,20 @@ Previous actions:
         })
         # append_text = f"""Step {current_step+1}: Thought:"""
-        append_text = f"""Thought:"""
-        messages.append({"role": "assistant", "content": [{"type": "text", "text": append_text}]})
+        if self.add_thought_prefix:
+            append_text = f"""Thought:"""
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": append_text}]})
         # Call the LLM
         response = self.call_llm(
             {
-                "model": self.executor_model,
+                "model": self.model,
                 "messages": messages,
                 "max_tokens": self.max_tokens,
                 "top_p": self.top_p,
                 "temperature": self.temperature,
             },
-            self.executor_model,
+            self.model,
         )
         logger.info(f"Qwen25VL Output: {response}")
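
The `add_thought_prefix` branch is an assistant-prefill technique: a partial assistant turn ending in `Thought:` is appended so the model continues that turn instead of starting a fresh one. Whether a serving stack honors a trailing assistant message as a prefill depends on the backend; schematically, the request ends like this when the flag is set (contents abbreviated):

messages = [
    {"role": "system", "content": [{"type": "text", "text": "..."}]},
    {"role": "user", "content": [{"type": "text", "text": "..."}]},  # plus the screenshot
    # Prefilled partial turn; the completion should continue "Thought: ... Action: ..."
    {"role": "assistant", "content": [{"type": "text", "text": "Thought:"}]},
]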
@@ -483,10 +483,10 @@ Previous actions:
                 continue
             # Handle lines inside tool call markers
-            if line.startswith("<tool_call>"):
+            if line.startswith("<tool_call>") or line.startswith("") or line.startswith("📐"):  # Yeah, it's a bug during data processing
                 inside_tool_call = True
                 continue
-            elif line.startswith("</tool_call>"):
+            elif line.startswith("</tool_call>") or line.startswith("") or line.startswith("📐"):  # Yeah, it's a bug during data processing
                 if current_tool_call:
                     # Process the collected tool call
                     process_tool_call("\n".join(current_tool_call))
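
For context, the surrounding parser walks the response line by line and buffers everything between `<tool_call>` markers; a simplified, self-contained sketch of that loop (the extra marker checks above work around stray tokens left over from data processing):

def extract_tool_calls(response):
    # Collect the text between <tool_call> ... </tool_call> marker lines.
    tool_calls, current_tool_call = [], []
    inside_tool_call = False
    for line in response.splitlines():
        line = line.strip()
        if line.startswith("<tool_call>"):
            inside_tool_call = True
        elif line.startswith("</tool_call>"):
            if current_tool_call:
                tool_calls.append("\n".join(current_tool_call))
                current_tool_call = []
            inside_tool_call = False
        elif inside_tool_call:
            current_tool_call.append(line)
    return tool_calls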
@@ -540,12 +540,13 @@ Previous actions:
             # todo: check
         ),
         interval=30,
-        max_tries=10,
+        max_tries=5,
     )
     def call_llm(self, payload, model):
         messages = payload["messages"]
-        base_url = "your_base_url"
-        api_key = "your_api_key"
+        base_url = os.getenv('DASHSCOPE_BASE_URL', "https://dashscope.aliyuncs.com/compatible-mode/v1")
+        api_key = os.getenv('DASHSCOPE_API_KEY', "sk-123")
         client = openai.OpenAI(
             base_url=base_url,
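
The decorator tail above (interval=30, max_tries=5) matches the signature of `backoff.on_exception` with `backoff.constant`, though that library choice is an assumption here, and the exception tuple from the real code is elided to a bare `Exception`; the `os.getenv` change also presumes an `import os` added elsewhere in the commit. A sketch of the whole retry-plus-credentials pattern:

import os

import backoff  # assumed; any retry decorator with the same kwargs would fit
import openai

@backoff.on_exception(backoff.constant, Exception, interval=30, max_tries=5)
def call_llm(payload):
    # Credentials come from the environment now; export DASHSCOPE_API_KEY
    # before running rather than relying on the "sk-123" placeholder.
    client = openai.OpenAI(
        base_url=os.getenv("DASHSCOPE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
        api_key=os.getenv("DASHSCOPE_API_KEY", "sk-123"),
    )
    response = client.chat.completions.create(
        model=payload["model"],
        messages=payload["messages"],
        max_tokens=payload["max_tokens"],
        top_p=payload["top_p"],
        temperature=payload["temperature"],
    )
    return response.choices[0].message.content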