From bd6efcfc4d40bc6aef25e78f0d6c5e74f64910d7 Mon Sep 17 00:00:00 2001 From: Timothyxxx Date: Sun, 10 Aug 2025 14:40:18 +0000 Subject: [PATCH] fix: enhance screenshot retrieval in PythonController - Added a static method to validate image responses for PNG and JPEG formats using magic bytes. - Improved error handling in the get_screenshot method to log invalid payloads and retry attempts. - Updated the requests call to include a timeout for better reliability. --- README.md | 2 +- desktop_env/controllers/python.py | 32 +++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e9beee8..4487f46 100644 --- a/README.md +++ b/README.md @@ -256,6 +256,6 @@ If you find this environment useful, please consider citing our work: ## Acknowledgement for OSWorld-Verified Special thanks to the following institutions that provided feedback and participated in the fixes (as well as institutions that provided feedback during the process): [MoonShot AI, a.k.a. Kimi](https://www.moonshot.ai/),[Human Data](https://www.hud.so/), [OpenAI](https://openai.com/), [ByteDance Seed TARS](https://seed-tars.com/), [Anthropic](https://www.anthropic.com/), [Simular](https://www.simular.ai/), [HKU Data Intelligence Lab](https://sites.google.com/view/chaoh) -Special thanks to the following students who participated in the specific fixes: [Mengqi Yuan](https://yuanmengqi.github.io/), [Danyang Zhang](https://zdy023.github.io/), [Xinzhuang Xiong](https://thisisxxz.com/), [Zhennan Shen](https://scholar.google.com/citations?user=JPwg5MwAAAAJ&hl=en), [Zilong Zhou](https://github.com/adlsdztony), Yanxu Chen, [JIaqi Deng](https://www.linkedin.com/in/jiaqideng), [Tianbao Xie](https://tianbaoxie.com/), Junda Chen, [Jixuan Chen](https://chenjix.github.io/), [Haoyuan Wu](https://www.linkedin.com/in/haoyuan-wu-240878291/). +Special thanks to the following students who participated in the specific fixes: [Mengqi Yuan](https://yuanmengqi.github.io/), [Danyang Zhang](https://zdy023.github.io/), [Xinzhuang Xiong](https://thisisxxz.com/), [Zhennan Shen](https://scholar.google.com/citations?user=JPwg5MwAAAAJ&hl=en), [Zilong Zhou](https://github.com/adlsdztony), Yanxu Chen, [Jiaqi Deng](https://millank0817.github.io/), [Tianbao Xie](https://tianbaoxie.com/), Junda Chen, [Jixuan Chen](https://chenjix.github.io/), [Haoyuan Wu](https://www.linkedin.com/in/haoyuan-wu-240878291/). Special thanks to the following students who participated in running the re-evaluation: [Mengqi Yuan](https://yuanmengqi.github.io/), [Zilong Zhou](https://github.com/adlsdztony), [Xinyuan Wang](https://xinyuanwangcs.github.io/), [Bowen Wang](https://bowenbryanwang.github.io/). diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py index 743dd09..667658d 100644 --- a/desktop_env/controllers/python.py +++ b/desktop_env/controllers/python.py @@ -20,17 +20,41 @@ class PythonController: self.retry_times = 3 self.retry_interval = 5 + @staticmethod + def _is_valid_image_response(content_type: str, data: Optional[bytes]) -> bool: + """Quick validation for PNG/JPEG payload using magic bytes; Content-Type is advisory. + Returns True only when bytes look like a real PNG or JPEG. + """ + if not isinstance(data, (bytes, bytearray)) or not data: + return False + # PNG magic + if len(data) >= 8 and data[:8] == b"\x89PNG\r\n\x1a\n": + return True + # JPEG magic + if len(data) >= 3 and data[:3] == b"\xff\xd8\xff": + return True + # If server explicitly marks as image, accept as a weak fallback (some environments strip magic) + if content_type and ("image/png" in content_type or "image/jpeg" in content_type or "image/jpg" in content_type): + return True + return False + def get_screenshot(self) -> Optional[bytes]: """ Gets a screenshot from the server. With the cursor. None -> no screenshot or unexpected error. """ - for _ in range(self.retry_times): + for attempt_idx in range(self.retry_times): try: - response = requests.get(self.http_server + "/screenshot") + response = requests.get(self.http_server + "/screenshot", timeout=10) if response.status_code == 200: - logger.info("Got screenshot successfully") - return response.content + content_type = response.headers.get("Content-Type", "") + content = response.content + if self._is_valid_image_response(content_type, content): + logger.info("Got screenshot successfully") + return content + else: + logger.error("Invalid screenshot payload (attempt %d/%d).", attempt_idx + 1, self.retry_times) + logger.info("Retrying to get screenshot.") else: logger.error("Failed to get screenshot. Status code: %d", response.status_code) logger.info("Retrying to get screenshot.")