import asyncio import logging import csv import os.path from enum import Enum from time import time from sipyco.sync_struct import Notifier from sipyco.asyncio_tools import TaskObject, Condition from artiq.master.worker import Worker, log_worker_exception from artiq.tools import asyncio_wait_or_cancel logger = logging.getLogger(__name__) class RunStatus(Enum): pending = 0 flushing = 1 preparing = 2 prepare_done = 3 running = 4 run_done = 5 analyzing = 6 deleting = 7 paused = 8 def _mk_worker_method(name): async def worker_method(self, *args, **kwargs): if self.worker.closed.is_set(): return True m = getattr(self.worker, name) try: return await m(*args, **kwargs) except Exception as e: if isinstance(e, asyncio.CancelledError): raise if self.worker.closed.is_set(): logger.debug("suppressing worker exception of terminated run", exc_info=True) # Return completion on termination return True else: raise return worker_method class Run: def __init__(self, rid, pipeline_name, wd, expid, priority, due_date, flush, pool, **kwargs): # called through pool self.rid = rid self.pipeline_name = pipeline_name self.wd = wd self.expid = expid self.priority = priority self.due_date = due_date self.flush = flush self.worker = Worker(pool.worker_handlers) self.termination_requested = False self._status = RunStatus.pending notification = { "pipeline": self.pipeline_name, "expid": self.expid, "priority": self.priority, "due_date": self.due_date, "flush": self.flush, "status": self._status.name } notification.update(kwargs) self._notifier = pool.notifier self._notifier[self.rid] = notification self._state_changed = pool.state_changed @property def status(self): return self._status @status.setter def status(self, value): self._status = value if not self.worker.closed.is_set(): self._notifier[self.rid]["status"] = self._status.name self._state_changed.notify() def priority_key(self): """Return a comparable value that defines a run priority order. Applies only to runs the due date of which has already elapsed. """ return (self.priority, -(self.due_date or 0), -self.rid) async def close(self): # called through pool await self.worker.close() del self._notifier[self.rid] _build = _mk_worker_method("build") async def build(self): await self._build(self.rid, self.pipeline_name, self.wd, self.expid, self.priority) prepare = _mk_worker_method("prepare") run = _mk_worker_method("run") resume = _mk_worker_method("resume") analyze = _mk_worker_method("analyze") class RunPool: def __init__(self, ridc, worker_handlers, notifier, experiment_db, log_submissions): self.runs = dict() self.state_changed = Condition() self.ridc = ridc self.worker_handlers = worker_handlers self.notifier = notifier self.experiment_db = experiment_db self.log_submissions = log_submissions def log_submission(self, rid, expid): start_time = time() with open(self.log_submissions, 'a', newline='') as f: writer = csv.writer(f) writer.writerow([rid, start_time, expid["file"]]) def submit(self, expid, priority, due_date, flush, pipeline_name): """ Submits an experiment to be run by this pool If expid has the attribute `repo_rev`, treat it as a git revision or reference and resolve into a unique git hash before submission """ # mutates expid to insert head repository revision if None and # replaces relative path with the absolute one. # called through scheduler. rid = self.ridc.get() if "repo_rev" in expid: repo_rev_or_ref = expid["repo_rev"] or self.experiment_db.cur_rev wd, repo_msg, repo_rev = self.experiment_db.repo_backend.request_rev(repo_rev_or_ref) # Mutate expid's repo_rev to that returned from request_rev, in case # a branch was passed instead of a hash expid["repo_rev"] = repo_rev else: if "file" in expid: expid["file"] = os.path.abspath(expid["file"]) wd, repo_msg = None, None run = Run(rid, pipeline_name, wd, expid, priority, due_date, flush, self, repo_msg=repo_msg) if self.log_submissions is not None: self.log_submission(rid, expid) self.runs[rid] = run self.state_changed.notify() return rid async def delete(self, rid): # called through deleter if rid not in self.runs: return run = self.runs[rid] await run.close() if "repo_rev" in run.expid: self.experiment_db.repo_backend.release_rev(run.expid["repo_rev"]) del self.runs[rid] class PrepareStage(TaskObject): def __init__(self, pool, delete_cb): self.pool = pool self.delete_cb = delete_cb def _get_run(self): """If a run should get prepared now, return it. Otherwise, return a float giving the time until the next check, or None if no time-based check is required. The latter can be the case if there are no due-date runs, or none of them are going to become next-in-line before further pool state changes (which will also cause a re-evaluation). """ pending_runs = list( filter(lambda r: r.status == RunStatus.pending, self.pool.runs.values())) now = time() def is_runnable(r): return (r.due_date or 0) < now prepared_max = max((r.priority_key() for r in self.pool.runs.values() if r.status == RunStatus.prepare_done), default=None) def takes_precedence(r): return prepared_max is None or r.priority_key() > prepared_max candidate = max(filter(is_runnable, pending_runs), key=lambda r: r.priority_key(), default=None) if candidate is not None and takes_precedence(candidate): return candidate return min((r.due_date - now for r in pending_runs if (not is_runnable(r) and takes_precedence(r))), default=None) async def _do(self): while True: run = self._get_run() if run is None: await self.pool.state_changed.wait() elif isinstance(run, float): await asyncio_wait_or_cancel([self.pool.state_changed.wait()], timeout=run) else: if run.flush: run.status = RunStatus.flushing while not all(r.status in (RunStatus.pending, RunStatus.deleting) or r.priority < run.priority or r is run for r in self.pool.runs.values()): ev = [self.pool.state_changed.wait(), run.worker.closed.wait()] await asyncio_wait_or_cancel( ev, return_when=asyncio.FIRST_COMPLETED) if run.worker.closed.is_set(): break if run.worker.closed.is_set(): continue run.status = RunStatus.preparing try: await run.build() await run.prepare() except Exception: logger.error("got worker exception in prepare stage, " "deleting RID %d", run.rid) log_worker_exception() self.delete_cb(run.rid) else: run.status = RunStatus.prepare_done class RunStage(TaskObject): def __init__(self, pool, delete_cb): self.pool = pool self.delete_cb = delete_cb def _get_run(self): prepared_runs = filter(lambda r: r.status == RunStatus.prepare_done, self.pool.runs.values()) try: r = max(prepared_runs, key=lambda r: r.priority_key()) except ValueError: # prepared_runs is an empty sequence r = None return r async def _do(self): stack = [] while True: next_irun = self._get_run() if not stack or ( next_irun is not None and next_irun.priority_key() > stack[-1].priority_key()): while next_irun is None: await self.pool.state_changed.wait() next_irun = self._get_run() stack.append(next_irun) run = stack.pop() try: if run.status == RunStatus.paused: run.status = RunStatus.running # clear "termination requested" flag now # so that if it is set again during the resume, this # results in another exception. request_termination = run.termination_requested run.termination_requested = False completed = await run.resume(request_termination) else: run.status = RunStatus.running completed = await run.run() except Exception: logger.error("got worker exception in run stage, " "deleting RID %d", run.rid) log_worker_exception() self.delete_cb(run.rid) else: if completed: run.status = RunStatus.run_done else: run.status = RunStatus.paused stack.append(run) class AnalyzeStage(TaskObject): def __init__(self, pool, delete_cb): self.pool = pool self.delete_cb = delete_cb def _get_run(self): run_runs = filter(lambda r: r.status == RunStatus.run_done, self.pool.runs.values()) try: r = max(run_runs, key=lambda r: r.priority_key()) except ValueError: # run_runs is an empty sequence r = None return r async def _do(self): while True: run = self._get_run() while run is None: await self.pool.state_changed.wait() run = self._get_run() run.status = RunStatus.analyzing try: await run.analyze() except Exception: logger.error("got worker exception in analyze stage of RID %d.", run.rid) log_worker_exception() self.delete_cb(run.rid) class Pipeline: def __init__(self, ridc, deleter, worker_handlers, notifier, experiment_db, log_submissions): self.pool = RunPool(ridc, worker_handlers, notifier, experiment_db, log_submissions) self._prepare = PrepareStage(self.pool, deleter.delete) self._run = RunStage(self.pool, deleter.delete) self._analyze = AnalyzeStage(self.pool, deleter.delete) def start(self, *, loop=None): self._prepare.start(loop=loop) self._run.start(loop=loop) self._analyze.start(loop=loop) async def stop(self): # NB: restart of a stopped pipeline is not supported await self._analyze.stop() await self._run.stop() await self._prepare.stop() class Deleter(TaskObject): """Provides a synchronous interface for instigating deletion of runs. :meth:`RunPool.delete` is an async function (it needs to close the worker connection, etc.), so we maintain a queue of RIDs to delete on a background task. """ def __init__(self, pipelines): self._pipelines = pipelines self._queue = asyncio.Queue() def delete(self, rid): """Delete the run with the given RID. Multiple calls for the same RID are silently ignored. """ logger.debug("delete request for RID %d", rid) for pipeline in self._pipelines.values(): if rid in pipeline.pool.runs: pipeline.pool.runs[rid].status = RunStatus.deleting break self._queue.put_nowait(rid) async def join(self): await self._queue.join() async def _delete(self, rid): # By looking up the run by RID, we implicitly make sure to delete each run only # once. for pipeline in self._pipelines.values(): if rid in pipeline.pool.runs: logger.debug("deleting RID %d...", rid) await pipeline.pool.delete(rid) logger.debug("deletion of RID %d completed", rid) break async def _gc_pipelines(self): pipeline_names = list(self._pipelines.keys()) for name in pipeline_names: if not self._pipelines[name].pool.runs: logger.debug("garbage-collecting pipeline '%s'...", name) await self._pipelines[name].stop() del self._pipelines[name] logger.debug("garbage-collection of pipeline '%s' completed", name) async def _do(self): while True: rid = await self._queue.get() await self._delete(rid) await self._gc_pipelines() self._queue.task_done() class Scheduler: def __init__(self, ridc, worker_handlers, experiment_db, log_submissions): self.notifier = Notifier(dict()) self._pipelines = dict() self._worker_handlers = worker_handlers self._experiment_db = experiment_db self._terminated = False self._ridc = ridc self._deleter = Deleter(self._pipelines) self._log_submissions = log_submissions def start(self, *, loop=None): self._loop = loop self._deleter.start(loop=self._loop) async def stop(self): # NB: restart of a stopped scheduler is not supported self._terminated = True # prevent further runs from being created for pipeline in self._pipelines.values(): for rid in pipeline.pool.runs.keys(): self._deleter.delete(rid) await self._deleter.join() await self._deleter.stop() if self._pipelines: logger.warning("some pipelines were not garbage-collected") def submit(self, pipeline_name, expid, priority=0, due_date=None, flush=False): """Submits a new run. When called through an experiment, the default values of ``pipeline_name``, ``expid`` and ``priority`` correspond to those of the current run.""" # mutates expid to insert head repository revision if None, and # replaces relative file path with absolute one if self._terminated: return try: pipeline = self._pipelines[pipeline_name] except KeyError: logger.debug("creating pipeline '%s'", pipeline_name) pipeline = Pipeline(self._ridc, self._deleter, self._worker_handlers, self.notifier, self._experiment_db, self._log_submissions) self._pipelines[pipeline_name] = pipeline pipeline.start(loop=self._loop) return pipeline.pool.submit(expid, priority, due_date, flush, pipeline_name) def delete(self, rid): """Kills the run with the specified RID.""" self._deleter.delete(rid) def request_termination(self, rid): """Requests graceful termination of the run with the specified RID.""" for pipeline in self._pipelines.values(): if rid in pipeline.pool.runs: run = pipeline.pool.runs[rid] if run.status == RunStatus.running or run.status == RunStatus.paused: run.termination_requested = True else: self.delete(rid) break def get_status(self): """Returns a dictionary containing information about the runs currently tracked by the scheduler. Must not be modified.""" return self.notifier.raw_view def check_pause(self, rid): """Returns ``True`` if there is a condition that could make ``pause`` not return immediately (termination requested or higher priority run). The typical purpose of this function is to check from a kernel whether returning control to the host and pausing would have an effect, in order to avoid the cost of switching kernels in the common case where ``pause`` does nothing. This function does not have side effects, and does not have to be followed by a call to ``pause``. """ for pipeline in self._pipelines.values(): if rid in pipeline.pool.runs: run = pipeline.pool.runs[rid] if run.status != RunStatus.running: return False if run.termination_requested: return True prepared_runs = filter(lambda r: r.status == RunStatus.prepare_done, pipeline.pool.runs.values()) try: r = max(prepared_runs, key=lambda r: r.priority_key()) except ValueError: # prepared_runs is an empty sequence return False return r.priority_key() > run.priority_key() raise KeyError("RID not found") def check_termination(self, rid): """Returns ``True`` if termination is requested.""" for pipeline in self._pipelines.values(): if rid in pipeline.pool.runs: run = pipeline.pool.runs[rid] if run.termination_requested: return True return False