init commit
This commit is contained in:
277
nimbus/scheduler/inner_pipe.py
Normal file
277
nimbus/scheduler/inner_pipe.py
Normal file
@@ -0,0 +1,277 @@
|
||||
import math
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
|
||||
import ray
|
||||
|
||||
from nimbus.daemon.status_monitor import StatusMonitor
|
||||
from nimbus.scheduler.stages import DedumpStage, DumpStage
|
||||
from nimbus.utils.logging import configure_logging
|
||||
from nimbus.utils.random import set_all_seeds
|
||||
from nimbus.utils.types import MONITOR_CHECK_INTERVAL, STATUS_TIMEOUTS, StageInput
|
||||
from nimbus.utils.utils import init_env, pipe_consume_stage
|
||||
|
||||
|
||||
def iter_to_obj(iter_obj):
|
||||
return pipe_consume_stage(iter_obj), True
|
||||
|
||||
|
||||
def _consume_N(iter_obj, N=1):
|
||||
print("consume: ", iter_obj)
|
||||
results = []
|
||||
finish = False
|
||||
for _ in range(N):
|
||||
try:
|
||||
obj = next(iter_obj)
|
||||
results.append(obj)
|
||||
except StopIteration:
|
||||
finish = True
|
||||
return results, finish
|
||||
|
||||
|
||||
def consume_N(stage_input):
|
||||
finish = False
|
||||
if hasattr(stage_input, "Args"):
|
||||
stage_input.Args, finish = _consume_N(stage_input.Args[0])
|
||||
if hasattr(stage_input, "Kwargs"):
|
||||
if stage_input.Kwargs is not None:
|
||||
stage_input.Kwargs = {key: _consume_N(value) for key, value in stage_input.Kwargs.items()}
|
||||
return stage_input, finish
|
||||
|
||||
|
||||
class PipeWorkerGroup:
|
||||
"""
|
||||
Manages a group of pipe workers and their supervisors.
|
||||
Supports dynamic worker spawning for worker_schedule feature.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pipe_name,
|
||||
exp_name,
|
||||
pipe_num,
|
||||
stage_list,
|
||||
master_seed,
|
||||
supervisor_class,
|
||||
inner_pipe_class,
|
||||
initial_instances=0,
|
||||
):
|
||||
self.workers = {}
|
||||
self._next_worker_idx = 0
|
||||
self.pipe_name = pipe_name
|
||||
self.exp_name = exp_name
|
||||
self.pipe_num = pipe_num
|
||||
self.stage_list = stage_list
|
||||
self.master_seed = master_seed
|
||||
self.supervisor_class = supervisor_class
|
||||
self.inner_pipe_class = inner_pipe_class
|
||||
|
||||
if initial_instances > 0:
|
||||
self.spawn(initial_instances)
|
||||
|
||||
def spawn(self, count):
|
||||
"""
|
||||
Spawn new workers dynamically.
|
||||
Returns list of (name, bundle) tuples for created workers.
|
||||
"""
|
||||
created = []
|
||||
for _ in range(count):
|
||||
name = f"p{self.pipe_num}_w{self._next_worker_idx}"
|
||||
worker_seed = self.master_seed + self._next_worker_idx if self.master_seed is not None else None
|
||||
supervisor = self.supervisor_class.remote(name)
|
||||
pipe_actor = self.inner_pipe_class.remote(self.stage_list, name, supervisor, seed=worker_seed)
|
||||
ray.get(supervisor.set_pipe.remote(pipe_actor))
|
||||
supervisor.run.remote()
|
||||
bundle = {"worker": pipe_actor, "supervisor": supervisor}
|
||||
self.workers[name] = bundle
|
||||
created.append((name, bundle))
|
||||
self._next_worker_idx += 1
|
||||
time.sleep(3)
|
||||
|
||||
if created:
|
||||
print(f"{self.pipe_name}: spawned {len(created)} workers - {[name for name, _ in created]}")
|
||||
return created
|
||||
|
||||
def items(self):
|
||||
"""Return items view of workers dictionary."""
|
||||
return self.workers.items()
|
||||
|
||||
def values(self):
|
||||
"""Return values view of workers dictionary."""
|
||||
return self.workers.values()
|
||||
|
||||
def keys(self):
|
||||
"""Return keys view of workers dictionary."""
|
||||
return self.workers.keys()
|
||||
|
||||
def __len__(self):
|
||||
"""Return number of workers in the group."""
|
||||
return len(self.workers)
|
||||
|
||||
def __repr__(self):
|
||||
worker_names = list(self.workers.keys())
|
||||
return f"PipeWorkerGroup({worker_names})"
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""Support dictionary-style access."""
|
||||
return self.workers[key]
|
||||
|
||||
def remove(self, name, logger):
|
||||
"""Remove a worker from the group."""
|
||||
ray.kill(self.workers[name]["worker"])
|
||||
logger.info(f"killed worker actor {name} to release GPU resouces")
|
||||
ray.kill(self.workers[name]["supervisor"])
|
||||
logger.info(f"Supervisor {name} killed successfully")
|
||||
if name in self.workers:
|
||||
del self.workers[name]
|
||||
|
||||
|
||||
def make_pipe(pipe_name, exp_name, pipe_num, stage_list, dev, instance_num, total_processes, config, master_seed=None):
|
||||
gpu_num = 0
|
||||
if dev == "gpu":
|
||||
resources = ray.cluster_resources()
|
||||
total_gpus = resources.get("GPU", 0)
|
||||
assert total_gpus > 0, "not enough gpu resources"
|
||||
processes_per_gpu = math.ceil(total_processes / total_gpus)
|
||||
gpu_num = 1.0 / processes_per_gpu
|
||||
|
||||
@ray.remote
|
||||
class Supervisor:
|
||||
def __init__(self, name):
|
||||
self.name = "supervisor_" + name
|
||||
self.pipe_worker = None
|
||||
self.logger = configure_logging(exp_name, self.name)
|
||||
self.logger.info("Supervisor started")
|
||||
self.monitor = StatusMonitor.get_instance()
|
||||
self.monitor.set_logger(self.logger)
|
||||
|
||||
self._last_status_check = 0.0
|
||||
self.check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)
|
||||
self.logger.info(f"Monitor check interval: {self.check_interval} seconds")
|
||||
if config.get(STATUS_TIMEOUTS, None) is not None:
|
||||
self.monitor.set_component_timeouts(config[STATUS_TIMEOUTS])
|
||||
|
||||
def set_pipe(self, pipe_worker):
|
||||
self.logger.info("set pipe worker")
|
||||
self.pipe_worker = pipe_worker
|
||||
|
||||
def set_queue(self, input_queue, output_queue):
|
||||
self.input_queue = input_queue
|
||||
self.output_queue = output_queue
|
||||
|
||||
def _restart_worker(self):
|
||||
try:
|
||||
ray.kill(self.pipe_worker, no_restart=False)
|
||||
self.logger.info("trigger restart of the actor")
|
||||
except Exception as ke:
|
||||
self.logger.error(f"restart actor error: {ke}")
|
||||
|
||||
def update_component_state(self, components_state):
|
||||
for _, state in components_state.items():
|
||||
self.monitor.register_update(state)
|
||||
|
||||
def _start_daemon(self):
|
||||
miss_cnt = 0
|
||||
while True:
|
||||
now = time.time()
|
||||
if now - self._last_status_check >= self.check_interval:
|
||||
try:
|
||||
timeout_components = self.monitor.check_and_update_timeouts()
|
||||
if len(timeout_components) > 0:
|
||||
self.logger.warning(f"Components timeout: {timeout_components}, restart the pipe worker")
|
||||
self._restart_worker()
|
||||
self.monitor.clear()
|
||||
else:
|
||||
if self.monitor.get_components_length() == 0:
|
||||
miss_cnt += 1
|
||||
self.logger.info(f"No components timeout detected, miss count: {miss_cnt}")
|
||||
if miss_cnt >= 5:
|
||||
self.logger.info("No components detected for 5 consecutive checks, restart pipe worker")
|
||||
self._restart_worker()
|
||||
self.monitor.clear()
|
||||
miss_cnt = 0
|
||||
except Exception as e:
|
||||
self.logger.error(f"Get components status failed: {e}")
|
||||
self._restart_worker()
|
||||
self.monitor.clear()
|
||||
self._last_status_check = now
|
||||
time.sleep(1)
|
||||
|
||||
def run(self):
|
||||
assert self.pipe_worker is not None, "pipe worker is not set"
|
||||
thread = threading.Thread(target=self._start_daemon, daemon=True)
|
||||
thread.start()
|
||||
|
||||
@ray.remote(num_gpus=gpu_num, max_restarts=3, max_task_retries=3)
|
||||
class InnerPipe:
|
||||
def __init__(self, stage_list, name, supervisor, seed=None):
|
||||
if seed is not None:
|
||||
set_all_seeds(seed)
|
||||
self.stages = stage_list
|
||||
self.name = name
|
||||
self.supervisor = supervisor
|
||||
init_env()
|
||||
self.logger = configure_logging(exp_name, self.name)
|
||||
self.logger.info(f"Working on gpu {os.environ.get('CUDA_VISIBLE_DEVICES')}")
|
||||
if ray.get_runtime_context().was_current_actor_reconstructed is True:
|
||||
msg = (
|
||||
f"{'='*80}\n"
|
||||
"!!! ATTENTION !!!\n"
|
||||
f"!!! InnerPipe {name} WAS RECONSTRUCTED due to SYSTEM ERROR !!!\n"
|
||||
"!!! Please CHECK LOGS in /tmp/ray/session_latest/logs/ for details !!!\n"
|
||||
f"{'='*80}\n"
|
||||
)
|
||||
self.logger.info(msg)
|
||||
|
||||
self.monitor = StatusMonitor.get_instance()
|
||||
self.monitor.set_logger(self.logger)
|
||||
|
||||
self.monitor_check_interval = config.get(MONITOR_CHECK_INTERVAL, 120)
|
||||
|
||||
def _update_supervisor(self):
|
||||
while True:
|
||||
for _ in range(self.monitor_check_interval):
|
||||
time.sleep(1)
|
||||
components_status = self.monitor.get_all_status()
|
||||
ray.get(self.supervisor.update_component_state.remote(components_status))
|
||||
|
||||
def run(self, input_queue, output_queue):
|
||||
self.logger.info(f"[InnerPipe stages]: {self.stages}")
|
||||
|
||||
thread = threading.Thread(target=self._update_supervisor, daemon=True)
|
||||
thread.start()
|
||||
self.logger.info("Reporter started, start running pipe")
|
||||
|
||||
mid_results = StageInput()
|
||||
# if input_queue is None:
|
||||
# mid_results = StageInput()
|
||||
# else:
|
||||
# mid_results = StageInput((input_queue,), {})
|
||||
for _, stage in enumerate(self.stages):
|
||||
if isinstance(stage, DumpStage):
|
||||
mid_results = stage.run(mid_results, output_queue)
|
||||
elif isinstance(stage, DedumpStage):
|
||||
mid_results = stage.run(mid_results, input_queue)
|
||||
else:
|
||||
mid_results = stage.run(mid_results)
|
||||
result, finish = iter_to_obj(mid_results)
|
||||
self.logger.info("====================================")
|
||||
self.logger.info(f"result: {result}, finish: {finish}")
|
||||
self.logger.info("====================================")
|
||||
ray.kill(self.supervisor)
|
||||
self.logger.info("actor finished")
|
||||
return finish
|
||||
|
||||
group = PipeWorkerGroup(
|
||||
pipe_name=pipe_name,
|
||||
exp_name=exp_name,
|
||||
pipe_num=pipe_num,
|
||||
stage_list=stage_list,
|
||||
master_seed=master_seed,
|
||||
supervisor_class=Supervisor,
|
||||
inner_pipe_class=InnerPipe,
|
||||
initial_instances=instance_num,
|
||||
)
|
||||
print(pipe_name, group)
|
||||
return group
|
||||
Reference in New Issue
Block a user