feat: Add Volcengine provider support for desktop environment. (#307)

Co-authored-by: lisailong <lisailong.ze@bytedance.com>
This commit is contained in:
SaiLong Li
2025-08-15 18:53:13 +08:00
committed by GitHub
parent 6ecbcf006b
commit cc6eddb466
7 changed files with 541 additions and 1 deletions

View File

@@ -158,7 +158,7 @@ class DesktopEnv(gym.Env):
# Track whether environment has been used (step/setup) to optimize snapshot revert
# docker, aws, gcp, azure are always unused as the emulator starts from a clean state
# vmware, virtualbox are always used as the emulator starts from a dirty state
if self.provider_name in {"docker", "aws", "gcp", "azure", "aliyun"}:
if self.provider_name in {"docker", "aws", "gcp", "azure", "aliyun", "volcengine"}:
self.is_environment_used = False
elif self.provider_name in {"vmware", "virtualbox"}:
self.is_environment_used = True

View File

@@ -35,5 +35,9 @@ def create_vm_manager_and_provider(provider_name: str, region: str, use_proxy: b
from desktop_env.providers.aliyun.manager import AliyunVMManager
from desktop_env.providers.aliyun.provider import AliyunProvider
return AliyunVMManager(), AliyunProvider()
elif provider_name == "volcengine":
from desktop_env.providers.volcengine.manager import VolcengineVMManager
from desktop_env.providers.volcengine.provider import VolcengineProvider
return VolcengineVMManager(), VolcengineProvider()
else:
raise NotImplementedError(f"{provider_name} not implemented!")

View File

@@ -0,0 +1,127 @@
# 火山引擎ECS提供商配置指南
本指南介绍如何为OSWorld桌面环境配置和使用火山引擎ECS。
## 配置流程
1. **火山引擎账户**:您需要一个有效的火山引擎账户。本脚本默认以按量付费方式拉起ECS,需保证账户余额在100以上。
2. **访问密钥**:在火山引擎IAM控制台中创建AccessKey ID和SecretAccessKey,并授权ECS控制权限。
3. **VPC设置**:在目标地域创建VPC、子网和安全组。
4. **自定义镜像**:创建OSWorld自定义镜像。
5. 建议手动完成一次ECS创建流程后记录所有需要的环境变量信息。
## 环境变量
在您的`.env`文件中设置以下环境变量:
```bash
# 火山引擎访问凭证
VOLCENGINE_ACCESS_KEY_ID=your_access_key_id
VOLCENGINE_SECRET_ACCESS_KEY=your_secret_access_key
# ECS配置信息
VOLCENGINE_REGION=eu-central-1
VOLCENGINE_IMAGE_ID=your_image_id
VOLCENGINE_INSTANCE_TYPE=ecs.e-c1m2.large
VOLCENGINE_SUBNET_ID=subnet-xxxxxxxxx
VOLCENGINE_SECURITY_GROUP_ID=sg-xxxxxxxxx
VOLCENGINE_ZONE_ID=zone-xxxxxxxxx
VOLCENGINE_DEFAULT_PASSWORD=your_default_password
```
## 所需火山引擎资源
### 1. VPC和子网
- 在目标地域创建VPC
- 在VPC内创建子网
- 确保子网具有互联网访问能力以支持VNC连接
### 2. 安全组
**⚠️ 重要提示**:请严格按照以下端口设置,以防止OSWorld任务因连接问题而失败。
#### 入方向规则(需要8条规则)
| 类型 | 协议 | 端口范围 | 源地址 | 描述 |
|------|------|----------|--------|------|
| SSH | TCP | 22 | 0.0.0.0/0 | SSH访问 |
| HTTP | TCP | 80 | 172.31.0.0/16 | HTTP流量 |
| 自定义TCP | TCP | 5000 | 172.31.0.0/16 | OSWorld后端服务 |
| 自定义TCP | TCP | 5910 | 0.0.0.0/0 | NoVNC可视化端口 |
| 自定义TCP | TCP | 8006 | 172.31.0.0/16 | VNC服务端口 |
| 自定义TCP | TCP | 8080 | 172.31.0.0/16 | VLC服务端口 |
| 自定义TCP | TCP | 8081 | 172.31.0.0/16 | 附加服务端口 |
| 自定义TCP | TCP | 9222 | 172.31.0.0/16 | Chrome控制端口 |
#### 出方向规则(需要1条规则)
| 类型 | 协议 | 端口范围 | 目标地址 | 描述 |
|------|------|----------|----------|------|
| 全部流量 | 全部 | 全部 | 0.0.0.0/0 | 允许所有出站流量 |
### 3. 自定义镜像
您需要为火山引擎ECS创建自定义OSWorld镜像。请按照"为OSWorld创建自定义ECS镜像"部分的说明进行操作。
## 为OSWorld创建自定义ECS镜像
本部分提供如何创建OSWorld桌面环境所需的自定义ECS镜像的指导。该过程包括设置带有桌面环境和VNC服务器的基础实例,然后从中创建自定义镜像。
### 分步镜像创建过程
#### 步骤1:上传现有qcow2镜像到火山引擎
- 从`desktop_env/providers/docker/manager.py`中的链接下载提供的qcow2镜像:https://huggingface.co/datasets/xlangai/ubuntu_osworld/resolve/main/Ubuntu.qcow2.zip
- 解压下载的文件并上传到火山引擎对象存储服务(TOS)。确保TOS与您要启动ECS实例的目标地域在同一地域。
- 在您的ECS控制台中转到"镜像"页面,您将看到"导入镜像"按钮。点击它并按照说明从TOS导入qcow2镜像。
- 导入完成后,您将在"镜像"列表中看到导入的镜像。
#### 步骤2:创建新镜像
请注意,您在步骤1中创建的镜像分辨率与您想要用于OSWorld的分辨率(1920x1080)不同。我们需要自定义镜像以具有正确的分辨率并设置noVNC。
- 转到"实例"选项卡,使用导入的镜像创建新实例。
- 通过VNC连接到正在运行的实例。
- 连接到实例后,请打开终端并下载此配置脚本:`https://gist.githubusercontent.com/qykong/bea58ff98f20057d3a69921276dd4553/raw/cd1a91a0840c4192d793f43cfb90553370343b08/config.sh`
- 运行脚本并重启您的实例。
- 重启后,实例将具有正确的分辨率和noVNC设置。您可以通过"http://<your_instance_public_ip>:5910/vnc.html"连接到实例(请确保您的安全组允许端口5910)。
- 将正在运行的实例保存为新镜像。新镜像将用作OSWorld镜像。
## 使用说明
### 启动OSWorld任务
```bash
python run_multienv_qwen25vl.py \
--provider_name volcengine \
--region eu-central-1 \
--num_envs 2 \
--model qwen2.5-vl-72b-instruct \
--action_space pyautogui \
--observation_type screenshot \
--max_steps 15 \
--domain chrome
```
### 监控实例状态
- 在火山引擎控制台中查看实例运行状态
- 通过VNC URL访问远程桌面:`http://<public_ip>:5910/vnc.html`
- 查看日志文件了解任务执行情况
### 清理资源
- 任务完成后,实例会自动终止
- 如需手动清理,可在控制台中删除相关实例
- 建议定期清理不再使用的镜像以节省存储成本
## 故障排除
### 常见问题
1. **实例启动失败**:检查账户余额、镜像ID和实例类型配置
2. **VNC连接失败**:确认安全组端口5910已开放
3. **网络连接问题**:验证子网配置和路由表设置
4. **镜像导入失败**:检查TOS权限和镜像格式
### 日志查看
- 查看控制台日志:`logs/normal-YYYYMMDD@HHMMSS.log`
- 查看调试日志:`logs/debug-YYYYMMDD@HHMMSS.log`
- 查看实例系统日志:通过VNC连接查看系统日志
## 成本优化建议
1. **实例类型选择**:根据任务复杂度选择合适的实例类型
2. **镜像优化**:定期清理未使用的镜像
3. **批量任务**:合理设置并发数量以平衡性能和成本
4. **监控使用**:定期查看资源使用情况,及时调整配置

View File

@@ -0,0 +1,221 @@
import os
import logging
import signal
import dotenv
import time
import volcenginesdkcore
import volcenginesdkecs.models as ecs_models
from volcenginesdkecs.api import ECSApi
from desktop_env.providers.base import VMManager
# Merge any settings found in a local .env file into the process environment.
dotenv.load_dotenv()

# Every setting listed here is mandatory: importing this module fails fast if
# one of them is unset or empty.
for env_name in (
    "VOLCENGINE_ACCESS_KEY_ID",
    "VOLCENGINE_SECRET_ACCESS_KEY",
    "VOLCENGINE_REGION",
    "VOLCENGINE_SUBNET_ID",
    "VOLCENGINE_SECURITY_GROUP_ID",
    "VOLCENGINE_INSTANCE_TYPE",
    "VOLCENGINE_IMAGE_ID",
    "VOLCENGINE_ZONE_ID",
    "VOLCENGINE_DEFAULT_PASSWORD",
):
    if os.getenv(env_name):
        continue
    raise EnvironmentError(f"{env_name} must be set in the environment variables.")

logger = logging.getLogger("desktopenv.providers.volcengine.VolcengineVMManager")
logger.setLevel(logging.INFO)

# Credentials used to authenticate against the Volcengine API.
VOLCENGINE_ACCESS_KEY_ID = os.environ.get("VOLCENGINE_ACCESS_KEY_ID")
VOLCENGINE_SECRET_ACCESS_KEY = os.environ.get("VOLCENGINE_SECRET_ACCESS_KEY")

# Placement and shape of the instances launched by _allocate_vm.
VOLCENGINE_REGION = os.environ.get("VOLCENGINE_REGION")
VOLCENGINE_SUBNET_ID = os.environ.get("VOLCENGINE_SUBNET_ID")
VOLCENGINE_SECURITY_GROUP_ID = os.environ.get("VOLCENGINE_SECURITY_GROUP_ID")
VOLCENGINE_INSTANCE_TYPE = os.environ.get("VOLCENGINE_INSTANCE_TYPE")
VOLCENGINE_IMAGE_ID = os.environ.get("VOLCENGINE_IMAGE_ID")
VOLCENGINE_ZONE_ID = os.environ.get("VOLCENGINE_ZONE_ID")
VOLCENGINE_DEFAULT_PASSWORD = os.environ.get("VOLCENGINE_DEFAULT_PASSWORD")
def _allocate_vm(screen_size=(1920, 1080)):
    """Launch a Volcengine ECS instance and return its instance ID.

    The instance is created from the module-level ``VOLCENGINE_*`` settings and
    polled every 5 seconds until it reports ``RUNNING``. While the call is in
    progress, SIGINT/SIGTERM handlers are installed so that an interrupted
    launch deletes the half-created instance; the previous handlers are
    restored before the function returns or raises.

    Cleanup is best-effort and happens at most once: a failure to delete the
    instance is logged and never replaces the exception that triggered the
    cleanup.

    Args:
        screen_size: Accepted for interface compatibility; not used here.

    Returns:
        The ID of the newly created instance.

    Raises:
        KeyboardInterrupt: If SIGINT arrives during allocation.
        Exception: If the instance reports ``STOPPED`` or ``ERROR`` while
            starting, or if an SDK call fails.
    """
    # Configure the SDK globally; ECSApi() picks up the default configuration.
    configuration = volcenginesdkcore.Configuration()
    configuration.region = VOLCENGINE_REGION
    configuration.ak = VOLCENGINE_ACCESS_KEY_ID
    configuration.sk = VOLCENGINE_SECRET_ACCESS_KEY
    configuration.client_side_validation = True
    volcenginesdkcore.Configuration.set_default(configuration)
    api_instance = ECSApi()

    instance_id = None
    # Set once a delete request succeeded, so the signal handler and the
    # except-branches below never delete the same instance twice.
    instance_deleted = False
    original_sigint_handler = signal.getsignal(signal.SIGINT)
    original_sigterm_handler = signal.getsignal(signal.SIGTERM)

    def delete_instance_once():
        """Send the delete request unless an earlier one already succeeded."""
        nonlocal instance_deleted
        if instance_deleted:
            return
        api_instance.delete_instance(ecs_models.DeleteInstanceRequest(
            instance_id=instance_id,
        ))
        instance_deleted = True

    def cleanup_after_failure(reason):
        """Best-effort cleanup that logs, rather than raises, on failure."""
        if not instance_id or instance_deleted:
            return
        logger.info(f"Terminating instance {instance_id} due to {reason}.")
        try:
            delete_instance_once()
        except Exception as cleanup_error:
            logger.error(f"Failed to terminate instance {instance_id}: {str(cleanup_error)}")

    def signal_handler(sig, frame):
        if instance_id:
            signal_name = "SIGINT" if sig == signal.SIGINT else "SIGTERM"
            logger.warning(f"Received {signal_name} signal, terminating instance {instance_id}...")
            try:
                delete_instance_once()
                logger.info(f"Successfully terminated instance {instance_id} after {signal_name}.")
            except Exception as cleanup_error:
                logger.error(f"Failed to terminate instance {instance_id} after {signal_name}: {str(cleanup_error)}")

        # Restore original signal handlers
        signal.signal(signal.SIGINT, original_sigint_handler)
        signal.signal(signal.SIGTERM, original_sigterm_handler)

        if sig == signal.SIGINT:
            raise KeyboardInterrupt
        else:
            import sys
            sys.exit(0)

    try:
        # Set up signal handlers
        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

        # Instance creation parameters
        create_instance_params = ecs_models.RunInstancesRequest(
            image_id=VOLCENGINE_IMAGE_ID,
            instance_type=VOLCENGINE_INSTANCE_TYPE,
            network_interfaces=[ecs_models.NetworkInterfaceForRunInstancesInput(
                subnet_id=VOLCENGINE_SUBNET_ID,
                security_group_ids=[VOLCENGINE_SECURITY_GROUP_ID],
            )],
            eip_address=ecs_models.EipAddressForRunInstancesInput(
                bandwidth_mbps=5,
                charge_type="PayByBandwidth",
            ),
            instance_name=f"osworld-{os.getpid()}-{int(time.time())}",
            volumes=[ecs_models.VolumeForRunInstancesInput(
                volume_type="ESSD_PL0",
                size=30,
            )],
            zone_id=VOLCENGINE_ZONE_ID,
            password=VOLCENGINE_DEFAULT_PASSWORD,  # default login password
            description="OSWorld evaluation instance",
        )

        # Create the instance
        response = api_instance.run_instances(create_instance_params)
        instance_id = response.instance_ids[0]

        logger.info(f"Waiting for instance {instance_id} to be running...")
        # Wait for the instance to run.
        # NOTE(review): this poll has no upper bound; an instance that never
        # leaves a transitional state keeps the caller waiting indefinitely.
        while True:
            instance_info = api_instance.describe_instances(ecs_models.DescribeInstancesRequest(
                instance_ids=[instance_id]
            ))
            status = instance_info.instances[0].status
            if status == 'RUNNING':
                break
            elif status in ['STOPPED', 'ERROR']:
                raise Exception(f"Instance {instance_id} failed to start, status: {status}")
            time.sleep(5)

        logger.info(f"Instance {instance_id} is ready.")

        # Report the instance addresses; failures here are only logged because
        # the instance itself is already usable.
        try:
            instance_info = api_instance.describe_instances(ecs_models.DescribeInstancesRequest(
                instance_ids=[instance_id]
            ))
            public_ip = instance_info.instances[0].eip_address.ip_address
            private_ip = instance_info.instances[0].network_interfaces[0].primary_ip_address

            if public_ip:
                vnc_url = f"http://{public_ip}:5910/vnc.html"
                logger.info("="*80)
                logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
                logger.info(f"📡 Public IP: {public_ip}")
                logger.info(f"🏠 Private IP: {private_ip}")
                logger.info(f"🆔 Instance ID: {instance_id}")
                logger.info("="*80)
                print(f"\n🌐 VNC Web Access URL: {vnc_url}")
                print(f"📍 Please open the above address in the browser for remote desktop access\n")
        except Exception as e:
            logger.warning(f"Failed to get VNC address for instance {instance_id}: {e}")
    except KeyboardInterrupt:
        logger.warning("VM allocation interrupted by user (SIGINT).")
        # The signal handler usually deleted the instance already; this only
        # acts when that earlier attempt did not succeed.
        cleanup_after_failure("interruption")
        raise
    except Exception as e:
        logger.error(f"Failed to allocate VM: {e}", exc_info=True)
        cleanup_after_failure("an error")
        raise
    finally:
        # Restore original signal handlers
        signal.signal(signal.SIGINT, original_sigint_handler)
        signal.signal(signal.SIGTERM, original_sigterm_handler)

    return instance_id
class VolcengineVMManager(VMManager):
    """VM manager backed by Volcengine ECS.

    Instances are allocated on demand, so no VM registry is kept: every
    registry-related method is a no-op that returns ``None``. Only
    ``get_vm_path`` does real work.
    """

    def __init__(self, **kwargs):
        self.initialize_registry()

    def initialize_registry(self, **kwargs):
        """No-op; there is no registry to initialize."""
        return None

    def add_vm(self, vm_path, lock_needed=True, **kwargs):
        """No-op; VMs are not registered."""
        return None

    def _add_vm(self, vm_path):
        """No-op counterpart of ``add_vm``."""
        return None

    def delete_vm(self, vm_path, lock_needed=True, **kwargs):
        """No-op; VMs are not registered."""
        return None

    def _delete_vm(self, vm_path):
        """No-op counterpart of ``delete_vm``."""
        return None

    def occupy_vm(self, vm_path, pid, lock_needed=True, **kwargs):
        """No-op; occupancy is not tracked."""
        return None

    def _occupy_vm(self, vm_path, pid):
        """No-op counterpart of ``occupy_vm``."""
        return None

    def check_and_clean(self, lock_needed=True, **kwargs):
        """No-op; there is nothing to clean."""
        return None

    def _check_and_clean(self):
        """No-op counterpart of ``check_and_clean``."""
        return None

    def list_free_vms(self, lock_needed=True, **kwargs):
        """No-op; returns ``None`` rather than a list."""
        return None

    def _list_free_vms(self):
        """No-op counterpart of ``list_free_vms``."""
        return None

    def get_vm_path(self, screen_size=(1920, 1080), **kwargs):
        """Allocate a fresh instance and return the value from ``_allocate_vm``.

        Args:
            screen_size: Forwarded unchanged to ``_allocate_vm``.
            **kwargs: Accepted and ignored.
        """
        announcement = "Allocating a new VM in region: {region}".format(region=VOLCENGINE_REGION)
        logger.info(announcement)
        return _allocate_vm(screen_size=screen_size)

View File

@@ -0,0 +1,188 @@
import os
import time
import logging
import volcenginesdkcore
import volcenginesdkautoscaling
import volcenginesdkecs.models as ecs_models
from volcenginesdkcore.rest import ApiException
from volcenginesdkecs.api import ECSApi
from desktop_env.providers.base import Provider
from desktop_env.providers.volcengine.manager import _allocate_vm
# Module-level logger for the Volcengine provider, with its level set to INFO.
logger = logging.getLogger("desktopenv.providers.volcengine.VolcengineProvider")
logger.setLevel(logging.INFO)
# Seconds slept before each status poll while start_emulator waits for a
# started instance to report RUNNING.
WAIT_DELAY = 15
# Number of status polls start_emulator performs before raising a timeout error.
MAX_ATTEMPTS = 10
class VolcengineProvider(Provider):
    """Provider that controls Volcengine ECS instances through the ECS SDK.

    Throughout this class ``path_to_vm`` is the ECS instance ID.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.region = os.getenv("VOLCENGINE_REGION", "eu-central-1")
        self.client = self._create_client()

    def _create_client(self) -> ECSApi:
        """Build an ECS API client from the ``VOLCENGINE_*`` environment variables.

        The configuration is also installed as the SDK-wide default.
        """
        configuration = volcenginesdkcore.Configuration()
        configuration.ak = os.getenv('VOLCENGINE_ACCESS_KEY_ID')
        configuration.sk = os.getenv('VOLCENGINE_SECRET_ACCESS_KEY')
        configuration.region = os.getenv('VOLCENGINE_REGION')
        configuration.client_side_validation = True
        # set default configuration
        volcenginesdkcore.Configuration.set_default(configuration)
        return ECSApi()

    def _describe_instance(self, instance_id: str):
        """Return the first entry of the SDK's description for ``instance_id``."""
        response = self.client.describe_instances(ecs_models.DescribeInstancesRequest(
            instance_ids=[instance_id]
        ))
        return response.instances[0]

    def start_emulator(self, path_to_vm: str, headless: bool, *args, **kwargs):
        """Start the instance if it is stopped and wait until it is running.

        A running instance is left alone; an instance in any state other than
        ``RUNNING`` or ``STOPPED`` only produces a warning.

        Args:
            path_to_vm: ECS instance ID.
            headless: Accepted for interface compatibility; not used here.

        Raises:
            ApiException: If an SDK call fails (logged, then re-raised).
            Exception: If the instance reports ``ERROR`` or is not running
                after ``MAX_ATTEMPTS`` polls spaced ``WAIT_DELAY`` seconds apart.
        """
        logger.info("Starting Volcengine VM...")

        try:
            # Check the current instance status
            status = self._describe_instance(path_to_vm).status
            logger.info(f"Instance {path_to_vm} current status: {status}")

            if status == 'RUNNING':
                logger.info(f"Instance {path_to_vm} is already running. Skipping start.")
                return

            if status != 'STOPPED':
                logger.warning(f"Instance {path_to_vm} is in status '{status}' and cannot be started.")
                return

            # The singular start_instance call takes a StartInstanceRequest
            # with a single instance_id, mirroring delete_instance below.
            self.client.start_instance(ecs_models.StartInstanceRequest(instance_id=path_to_vm))
            logger.info(f"Instance {path_to_vm} is starting...")

            # Wait for the instance to run
            for _ in range(MAX_ATTEMPTS):
                time.sleep(WAIT_DELAY)
                status = self._describe_instance(path_to_vm).status
                if status == 'RUNNING':
                    logger.info(f"Instance {path_to_vm} is now running.")
                    return
                if status == 'ERROR':
                    raise Exception(f"Instance {path_to_vm} failed to start")
            raise Exception(f"Instance {path_to_vm} failed to start within timeout")

        except ApiException as e:
            logger.error(f"Failed to start the Volcengine VM {path_to_vm}: {str(e)}")
            raise

    def get_ip_address(self, path_to_vm: str) -> str:
        """Return the instance's primary private IP address.

        The public (EIP) address, when present, is only used to log and print
        the noVNC URL; it is not returned.

        Raises:
            ApiException: If the describe call fails (logged, then re-raised).
        """
        logger.info("Getting Volcengine VM IP address...")

        try:
            instance = self._describe_instance(path_to_vm)

            # An instance without an EIP has no eip_address object; treat that
            # like an empty public IP instead of failing with AttributeError.
            eip_address = instance.eip_address
            public_ip = eip_address.ip_address if eip_address else None
            private_ip = instance.network_interfaces[0].primary_ip_address

            if public_ip:
                vnc_url = f"http://{public_ip}:5910/vnc.html"
                logger.info("=" * 80)
                logger.info(f"🖥️ VNC Web Access URL: {vnc_url}")
                logger.info(f"📡 Public IP: {public_ip}")
                logger.info(f"🏠 Private IP: {private_ip}")
                logger.info("=" * 80)
                print(f"\n🌐 VNC Web Access URL: {vnc_url}")
                print(f"📍 Please open the above address in the browser for remote desktop access\n")
            else:
                logger.warning("No public IP address available for VNC access")

            return private_ip

        except ApiException as e:
            logger.error(f"Failed to retrieve IP address for the instance {path_to_vm}: {str(e)}")
            raise

    def save_state(self, path_to_vm: str, snapshot_name: str):
        """Create an image from the instance and return the new image ID.

        Raises:
            ApiException: If the create-image call fails (logged, then re-raised).
        """
        logger.info("Saving Volcengine VM state...")

        try:
            # Create the image.
            # NOTE(review): snapshot_name is sent as snapshot_id and no
            # image_name is supplied — confirm against the CreateImage API.
            response = self.client.create_image(ecs_models.CreateImageRequest(
                snapshot_id=snapshot_name,
                instance_id=path_to_vm,
                description=f"OSWorld snapshot: {snapshot_name}"
            ))
            # SDK responses are model objects read by attribute, as everywhere
            # else in this file; subscripting them is not supported.
            image_id = response.image_id
            logger.info(f"Image {image_id} created successfully from instance {path_to_vm}.")
            return image_id
        except ApiException as e:
            logger.error(f"Failed to create image from the instance {path_to_vm}: {str(e)}")
            raise

    def revert_to_snapshot(self, path_to_vm: str, snapshot_name: str):
        """Replace the instance with a freshly allocated one and return its ID.

        The old instance is deleted and a new one is created via
        ``_allocate_vm()``. ``snapshot_name`` is only used in log messages; it
        is not passed to the allocation call.

        Raises:
            ApiException: If an SDK call made here fails (logged, then re-raised).
            Exception: If the new instance reports ``STOPPED`` or ``ERROR``.
        """
        logger.info(f"Reverting Volcengine VM to snapshot: {snapshot_name}...")

        try:
            # Delete the original instance
            self.client.delete_instance(ecs_models.DeleteInstanceRequest(
                instance_id=path_to_vm,
            ))
            logger.info(f"Old instance {path_to_vm} has been deleted.")

            # Create the replacement instance
            new_instance_id = _allocate_vm()
            logger.info(f"New instance {new_instance_id} launched from image {snapshot_name}.")
            logger.info(f"Waiting for instance {new_instance_id} to be running...")

            # Wait for the new instance to run
            while True:
                status = self._describe_instance(new_instance_id).status
                if status == 'RUNNING':
                    break
                if status in ['STOPPED', 'ERROR']:
                    raise Exception(f"New instance {new_instance_id} failed to start, status: {status}")
                time.sleep(5)

            logger.info(f"Instance {new_instance_id} is ready.")

            # Report the new instance's address; failures are only logged.
            try:
                public_ip = self._describe_instance(new_instance_id).eip_address.ip_address
                if public_ip:
                    vnc_url = f"http://{public_ip}:5910/vnc.html"
                    logger.info("=" * 80)
                    logger.info(f"🖥️ New Instance VNC Web Access URL: {vnc_url}")
                    logger.info(f"📡 Public IP: {public_ip}")
                    logger.info(f"🆔 New Instance ID: {new_instance_id}")
                    logger.info("=" * 80)
                    print(f"\n🌐 New Instance VNC Web Access URL: {vnc_url}")
                    print(f"📍 Please open the above address in the browser for remote desktop access\n")
            except Exception as e:
                logger.warning(f"Failed to get VNC address for new instance {new_instance_id}: {e}")

            return new_instance_id

        except ApiException as e:
            logger.error(f"Failed to revert to snapshot {snapshot_name} for the instance {path_to_vm}: {str(e)}")
            raise

    def stop_emulator(self, path_to_vm, region=None):
        """Delete the instance.

        Args:
            path_to_vm: ECS instance ID.
            region: Accepted for interface compatibility; not used here.

        Raises:
            ApiException: If the delete call fails (logged, then re-raised).
        """
        logger.info(f"Stopping Volcengine VM {path_to_vm}...")

        try:
            self.client.delete_instance(ecs_models.DeleteInstanceRequest(
                instance_id=path_to_vm,
            ))
            logger.info(f"Instance {path_to_vm} has been terminated.")
        except ApiException as e:
            logger.error(f"Failed to stop the Volcengine VM {path_to_vm}: {str(e)}")
            raise