diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py
index 096dc54..daba009 100644
--- a/desktop_env/desktop_env.py
+++ b/desktop_env/desktop_env.py
@@ -111,6 +111,7 @@ class DesktopEnv(gym.Env):
             os_type: str = "Ubuntu",
             enable_proxy: bool = False,
             client_password: str = "",
+            eval_model: str = "gpt-5.2-chat-latest"
     ):
         """
         Args:
@@ -127,6 +128,7 @@ class DesktopEnv(gym.Env):
             require_terminal (bool): whether to require terminal output
             os_type (str): operating system type, default to "Ubuntu"
             enable_proxy (bool): whether to enable proxy support, default to False
+            eval_model (str): evaluation model to use, default to "gpt-5.2-chat-latest"
         """
         # Initialize VM manager and vitualization provider
         self.region = region
@@ -179,6 +181,9 @@ class DesktopEnv(gym.Env):
         self.require_a11y_tree = require_a11y_tree
         self.require_terminal = require_terminal
 
+        # Evaluation model
+        self.eval_model = eval_model
+
         # Initialize emulator and controller
         logger.info("Initializing...")
         self._start_emulator()
@@ -425,7 +430,7 @@ class DesktopEnv(gym.Env):
 
         return observation, reward, done, info
 
-    def evaluate(self):
+    def evaluate(self, result_dir: Optional[str] = None):
         """
         Evaluate whether the task is successfully completed.
         """
@@ -448,6 +453,20 @@ class DesktopEnv(gym.Env):
         if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
             return 0
 
+        if self.evaluator['func'] == "vllm_eval":
+            logger.info("Preparing vllm_eval metric options...")
+            screenshot_bytes = self.controller.get_screenshot()
+
+            import base64
+            self.metric_options["instruction"] = self.instruction
+            self.metric_options["eval_model"] = self.eval_model
+
+            if result_dir:
+                self.metric_options["result_dir"] = result_dir
+                logger.info(f"Using result_dir for vllm_eval: {result_dir}")
+
+            logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")
+
         if type(self.metric) == list:
             # Multiple metrics to evaluate whether the task is successfully completed
             results = []
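
For reference, a minimal usage sketch of the surface area this diff adds (Python). The task file name and result directory below are placeholders rather than paths from the repository, and the agent loop is elided:

import json

from desktop_env.desktop_env import DesktopEnv

# Placeholder task file; real task configs are JSON files shipped with the benchmark.
with open("task.json") as f:
    task_config = json.load(f)

# eval_model is stored on the env and forwarded to metric_options
# when the task's evaluator function is "vllm_eval".
env = DesktopEnv(eval_model="gpt-5.2-chat-latest")

obs = env.reset(task_config=task_config)
# ... drive the agent with env.step(action) until the episode ends ...

# result_dir is optional; when given, it is passed through to vllm_eval
# via metric_options so the judge can read and write run artifacts there.
score = env.evaluate(result_dir="./results/example_task")

One observation on the new vllm_eval branch: screenshot_bytes and import base64 are never used in it. Presumably the screenshot is meant to be base64-encoded and attached to metric_options for the judge model, but that wiring is not part of this diff.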