artiq/artiq/master/worker.py

323 lines
11 KiB
Python
Raw Normal View History

import sys
import os
import asyncio
2015-05-17 16:11:00 +08:00
import logging
import subprocess
import time
2014-10-25 16:31:34 +08:00
2019-11-10 15:55:17 +08:00
from sipyco import pipe_ipc, pyon
from sipyco.logging_tools import LogParser
from sipyco.packed_exceptions import current_exc_packed
from artiq.tools import asyncio_wait_or_cancel
2015-05-17 16:11:00 +08:00
logger = logging.getLogger(__name__)
class WorkerTimeout(Exception):
pass
class WorkerWatchdogTimeout(Exception):
pass
class WorkerError(Exception):
2014-12-31 17:41:22 +08:00
pass
class WorkerInternalException(Exception):
"""Exception raised inside the worker, information has been printed
through logging."""
pass
def log_worker_exception():
exc, _, _ = sys.exc_info()
if exc is WorkerInternalException:
logger.debug("worker exception details", exc_info=True)
else:
logger.error("worker exception details", exc_info=True)
class Worker:
def __init__(self, handlers=dict(), send_timeout=10.0):
self.handlers = handlers
self.send_timeout = send_timeout
2015-05-17 16:11:00 +08:00
self.rid = None
self.filename = None
2016-01-26 21:59:36 +08:00
self.ipc = None
self.watchdogs = dict() # wid -> expiration (using time.monotonic)
2015-05-17 16:11:00 +08:00
self.io_lock = asyncio.Lock()
self.closed = asyncio.Event()
def create_watchdog(self, t):
2015-03-12 02:06:46 +08:00
n_user_watchdogs = len(self.watchdogs)
if -1 in self.watchdogs:
n_user_watchdogs -= 1
avail = set(range(n_user_watchdogs + 1)) \
- set(self.watchdogs.keys())
wid = next(iter(avail))
self.watchdogs[wid] = time.monotonic() + t
return wid
def delete_watchdog(self, wid):
del self.watchdogs[wid]
def watchdog_time(self):
if self.watchdogs:
return min(self.watchdogs.values()) - time.monotonic()
else:
return None
def _get_log_source(self):
return "worker({},{})".format(self.rid, self.filename if self.filename is not None else "<none>")
2015-10-20 18:11:50 +08:00
async def _create_process(self, log_level):
if self.ipc is not None:
return # process already exists, recycle
2015-10-03 19:28:57 +08:00
await self.io_lock.acquire()
2015-05-17 16:11:00 +08:00
try:
if self.closed.is_set():
raise WorkerError("Attempting to create process after close")
2016-01-26 21:59:36 +08:00
self.ipc = pipe_ipc.AsyncioParentComm()
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
2016-01-26 21:59:36 +08:00
await self.ipc.create_subprocess(
2015-05-17 16:11:00 +08:00
sys.executable, "-m", "artiq.master.worker_impl",
self.ipc.get_address(), str(log_level),
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
env=env, start_new_session=True)
asyncio.ensure_future(
LogParser(self._get_log_source).stream_task(
self.ipc.process.stdout))
asyncio.ensure_future(
LogParser(self._get_log_source).stream_task(
self.ipc.process.stderr))
2015-05-17 16:11:00 +08:00
finally:
self.io_lock.release()
async def close(self, term_timeout=2.0):
"""Interrupts any I/O with the worker process and terminates the
worker process.
This method should always be called by the user to clean up, even if
build() or examine() raises an exception."""
2015-05-17 16:11:00 +08:00
self.closed.set()
2015-10-03 19:28:57 +08:00
await self.io_lock.acquire()
try:
2016-01-26 21:59:36 +08:00
if self.ipc is None:
# Note the %s - self.rid can be None or a user string
2015-05-17 16:11:00 +08:00
logger.debug("worker was not created (RID %s)", self.rid)
return
2016-01-26 21:59:36 +08:00
if self.ipc.process.returncode is not None:
logger.debug("worker already terminated (RID %s)", self.rid)
2016-01-26 21:59:36 +08:00
if self.ipc.process.returncode != 0:
2015-05-17 16:11:00 +08:00
logger.warning("worker finished with status code %d"
2016-01-26 21:59:36 +08:00
" (RID %s)", self.ipc.process.returncode,
2015-05-17 16:11:00 +08:00
self.rid)
return
try:
2016-02-01 16:52:31 +08:00
await self._send({"action": "terminate"}, cancellable=False)
await asyncio.wait_for(self.ipc.process.wait(), term_timeout)
logger.debug("worker exited on request (RID %s)", self.rid)
return
2015-05-17 16:11:00 +08:00
except:
2016-02-01 16:52:31 +08:00
logger.debug("worker failed to exit on request"
" (RID %s), ending the process", self.rid,
exc_info=True)
if os.name != "nt":
try:
2016-02-01 16:52:31 +08:00
self.ipc.process.terminate()
except ProcessLookupError:
pass
2016-02-01 16:52:31 +08:00
try:
2016-02-03 06:31:40 +08:00
await asyncio.wait_for(self.ipc.process.wait(),
term_timeout)
2016-02-01 16:52:31 +08:00
logger.debug("worker terminated (RID %s)", self.rid)
return
except asyncio.TimeoutError:
2016-02-03 06:31:40 +08:00
logger.warning(
"worker did not terminate (RID %s), killing", self.rid)
2016-02-01 16:52:31 +08:00
try:
self.ipc.process.kill()
except ProcessLookupError:
pass
2015-05-17 16:11:00 +08:00
try:
2016-01-26 21:59:36 +08:00
await asyncio.wait_for(self.ipc.process.wait(), term_timeout)
2016-02-01 16:52:31 +08:00
logger.debug("worker killed (RID %s)", self.rid)
return
2015-05-17 16:11:00 +08:00
except asyncio.TimeoutError:
2016-02-01 16:52:31 +08:00
logger.warning("worker refuses to die (RID %s)", self.rid)
2015-05-17 16:11:00 +08:00
finally:
self.io_lock.release()
2015-10-03 19:28:57 +08:00
async def _send(self, obj, cancellable=True):
2015-05-17 16:11:00 +08:00
assert self.io_lock.locked()
2014-10-25 16:31:34 +08:00
line = pyon.encode(obj)
2016-01-26 21:59:36 +08:00
self.ipc.write((line + "\n").encode())
ifs = [self.ipc.drain()]
2015-05-17 16:11:00 +08:00
if cancellable:
ifs.append(self.closed.wait())
2015-10-03 19:28:57 +08:00
fs = await asyncio_wait_or_cancel(
2015-07-15 05:43:08 +08:00
ifs, timeout=self.send_timeout,
2015-05-17 16:11:00 +08:00
return_when=asyncio.FIRST_COMPLETED)
if all(f.cancelled() for f in fs):
raise WorkerTimeout(
"Timeout sending data to worker (RID {})".format(self.rid))
2015-05-17 16:11:00 +08:00
for f in fs:
if not f.cancelled() and f.done():
f.result() # raise any exceptions
if cancellable and self.closed.is_set():
raise WorkerError(
"Data transmission to worker cancelled (RID {})".format(
self.rid))
2015-10-03 19:28:57 +08:00
async def _recv(self, timeout):
2015-05-17 16:11:00 +08:00
assert self.io_lock.locked()
2015-10-03 19:28:57 +08:00
fs = await asyncio_wait_or_cancel(
2016-01-26 21:59:36 +08:00
[self.ipc.readline(), self.closed.wait()],
2015-05-17 16:11:00 +08:00
timeout=timeout, return_when=asyncio.FIRST_COMPLETED)
if all(f.cancelled() for f in fs):
raise WorkerTimeout(
"Timeout receiving data from worker (RID {})".format(self.rid))
2015-05-17 16:11:00 +08:00
if self.closed.is_set():
raise WorkerError(
"Receiving data from worker cancelled (RID {})".format(
self.rid))
2015-05-17 16:11:00 +08:00
line = fs[0].result()
if not line:
raise WorkerError(
"Worker ended while attempting to receive data (RID {})".
format(self.rid))
try:
2014-10-25 16:31:34 +08:00
obj = pyon.decode(line.decode())
except:
raise WorkerError("Worker sent invalid PYON data (RID {})".format(
self.rid))
return obj
2015-10-03 19:28:57 +08:00
async def _handle_worker_requests(self):
while True:
2015-03-12 02:06:46 +08:00
try:
2015-10-03 19:28:57 +08:00
await self.io_lock.acquire()
2015-05-17 16:11:00 +08:00
try:
2015-10-03 19:28:57 +08:00
obj = await self._recv(self.watchdog_time())
2015-05-17 16:11:00 +08:00
finally:
self.io_lock.release()
2015-03-12 02:06:46 +08:00
except WorkerTimeout:
raise WorkerWatchdogTimeout
action = obj["action"]
if action == "completed":
2015-05-17 16:11:00 +08:00
return True
elif action == "pause":
return False
elif action == "exception":
raise WorkerInternalException
2015-10-28 17:35:57 +08:00
elif action == "create_watchdog":
func = self.create_watchdog
elif action == "delete_watchdog":
func = self.delete_watchdog
elif action == "register_experiment":
func = self.register_experiment
else:
func = self.handlers[action]
try:
if getattr(func, "_worker_pass_rid", False):
args = [self.rid] + list(obj["args"])
else:
args = obj["args"]
data = func(*args, **obj["kwargs"])
if asyncio.iscoroutine(data):
data = await data
reply = {"status": "ok", "data": data}
except Exception:
reply = {
"status": "failed",
"exception": current_exc_packed()
}
2015-10-03 19:28:57 +08:00
await self.io_lock.acquire()
2015-05-17 16:11:00 +08:00
try:
2015-10-03 19:28:57 +08:00
await self._send(reply)
2015-05-17 16:11:00 +08:00
finally:
self.io_lock.release()
2015-10-03 19:28:57 +08:00
async def _worker_action(self, obj, timeout=None):
2015-03-12 02:06:46 +08:00
if timeout is not None:
self.watchdogs[-1] = time.monotonic() + timeout
try:
2015-10-03 19:28:57 +08:00
await self.io_lock.acquire()
2015-05-17 16:11:00 +08:00
try:
2015-10-03 19:28:57 +08:00
await self._send(obj)
2015-05-17 16:11:00 +08:00
finally:
self.io_lock.release()
2015-03-12 02:06:46 +08:00
try:
2015-10-03 19:28:57 +08:00
completed = await self._handle_worker_requests()
2015-03-12 02:06:46 +08:00
except WorkerTimeout:
raise WorkerWatchdogTimeout
finally:
if timeout is not None:
del self.watchdogs[-1]
2015-05-17 16:11:00 +08:00
return completed
2015-03-12 02:06:46 +08:00
2016-02-03 06:31:40 +08:00
async def build(self, rid, pipeline_name, wd, expid, priority,
timeout=15.0):
2015-05-17 16:11:00 +08:00
self.rid = rid
if "file" in expid:
self.filename = os.path.basename(expid["file"])
2015-10-20 18:11:50 +08:00
await self._create_process(expid["log_level"])
2015-10-03 19:28:57 +08:00
await self._worker_action(
2015-07-09 19:18:12 +08:00
{"action": "build",
2015-05-17 16:11:00 +08:00
"rid": rid,
"pipeline_name": pipeline_name,
2015-08-07 15:51:56 +08:00
"wd": wd,
"expid": expid,
"priority": priority},
2015-07-15 05:43:08 +08:00
timeout)
2015-07-09 19:18:12 +08:00
2015-10-03 19:28:57 +08:00
async def prepare(self):
await self._worker_action({"action": "prepare"})
2015-10-03 19:28:57 +08:00
async def run(self):
completed = await self._worker_action({"action": "run"})
2015-05-17 16:11:00 +08:00
if not completed:
self.yield_time = time.monotonic()
return completed
async def resume(self, request_termination):
2015-05-17 16:11:00 +08:00
stop_duration = time.monotonic() - self.yield_time
for wid, expiry in self.watchdogs:
self.watchdogs[wid] += stop_duration
2015-10-03 19:28:57 +08:00
completed = await self._worker_action({"status": "ok",
"data": request_termination})
2015-05-17 16:11:00 +08:00
if not completed:
self.yield_time = time.monotonic()
return completed
2015-10-03 19:28:57 +08:00
async def analyze(self):
await self._worker_action({"action": "analyze"})
2015-03-12 02:06:46 +08:00
async def examine(self, rid, file, timeout=20.0):
self.rid = rid
self.filename = os.path.basename(file)
2015-10-20 18:11:50 +08:00
await self._create_process(logging.WARNING)
r = dict()
2016-02-03 06:31:40 +08:00
def register(class_name, name, arginfo, argument_ui,
scheduler_defaults):
r[class_name] = {
"name": name,
"arginfo": arginfo,
"argument_ui": argument_ui,
"scheduler_defaults": scheduler_defaults
}
self.register_experiment = register
await self._worker_action({"action": "examine", "file": file},
timeout)
del self.register_experiment
return r