forked from M-Labs/artiq
master: use a new worker process for each experiment
This commit is contained in:
parent
ec1d082730
commit
4c280d5fcc
|
@ -62,7 +62,7 @@ def main():
|
||||||
"scheduler_run_timed": scheduler.run_timed,
|
"scheduler_run_timed": scheduler.run_timed,
|
||||||
"scheduler_cancel_timed": scheduler.cancel_timed,
|
"scheduler_cancel_timed": scheduler.cancel_timed,
|
||||||
}
|
}
|
||||||
loop.run_until_complete(scheduler.start())
|
scheduler.start()
|
||||||
atexit.register(lambda: loop.run_until_complete(scheduler.stop()))
|
atexit.register(lambda: loop.run_until_complete(scheduler.stop()))
|
||||||
|
|
||||||
server_control = Server({
|
server_control = Server({
|
||||||
|
|
|
@ -25,17 +25,14 @@ class Scheduler:
|
||||||
trids -= set(self.timed.read.keys())
|
trids -= set(self.timed.read.keys())
|
||||||
return next(iter(trids))
|
return next(iter(trids))
|
||||||
|
|
||||||
@asyncio.coroutine
|
|
||||||
def start(self):
|
def start(self):
|
||||||
self.task = asyncio.Task(self._schedule())
|
self.task = asyncio.Task(self._schedule())
|
||||||
yield from self.worker.create_process()
|
|
||||||
|
|
||||||
@asyncio.coroutine
|
@asyncio.coroutine
|
||||||
def stop(self):
|
def stop(self):
|
||||||
self.task.cancel()
|
self.task.cancel()
|
||||||
yield from asyncio.wait([self.task])
|
yield from asyncio.wait([self.task])
|
||||||
del self.task
|
del self.task
|
||||||
yield from self.worker.end_process()
|
|
||||||
|
|
||||||
def run_queued(self, run_params):
|
def run_queued(self, run_params):
|
||||||
rid = self.new_rid()
|
rid = self.new_rid()
|
||||||
|
|
|
@ -7,11 +7,7 @@ import traceback
|
||||||
from artiq.protocols import pyon
|
from artiq.protocols import pyon
|
||||||
|
|
||||||
|
|
||||||
class WorkerFailed(Exception):
|
class WorkerError(Exception):
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class RunFailed(Exception):
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,11 +20,22 @@ class Worker:
|
||||||
self.term_timeout = term_timeout
|
self.term_timeout = term_timeout
|
||||||
|
|
||||||
@asyncio.coroutine
|
@asyncio.coroutine
|
||||||
def create_process(self):
|
def _create_process(self):
|
||||||
self.process = yield from asyncio.create_subprocess_exec(
|
self.process = yield from asyncio.create_subprocess_exec(
|
||||||
sys.executable, "-m", "artiq.master.worker_impl",
|
sys.executable, "-m", "artiq.master.worker_impl",
|
||||||
stdout=subprocess.PIPE, stdin=subprocess.PIPE)
|
stdout=subprocess.PIPE, stdin=subprocess.PIPE)
|
||||||
|
|
||||||
|
@asyncio.coroutine
|
||||||
|
def _end_process(self):
|
||||||
|
if self.process.returncode is not None:
|
||||||
|
return
|
||||||
|
self.process.send_signal(signal.SIGTERM)
|
||||||
|
try:
|
||||||
|
yield from asyncio.wait_for(
|
||||||
|
self.process.wait(), timeout=self.term_timeout)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
self.process.send_signal(signal.SIGKILL)
|
||||||
|
|
||||||
@asyncio.coroutine
|
@asyncio.coroutine
|
||||||
def _send(self, obj, timeout):
|
def _send(self, obj, timeout):
|
||||||
line = pyon.encode(obj)
|
line = pyon.encode(obj)
|
||||||
|
@ -39,9 +46,9 @@ class Worker:
|
||||||
if fut is not (): # FIXME: why does Python return this?
|
if fut is not (): # FIXME: why does Python return this?
|
||||||
yield from asyncio.wait_for(fut, timeout=timeout)
|
yield from asyncio.wait_for(fut, timeout=timeout)
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
raise WorkerFailed("Timeout sending data from worker")
|
raise WorkerError("Timeout sending data from worker")
|
||||||
except:
|
except:
|
||||||
raise WorkerFailed("Failed to send data to worker")
|
raise WorkerError("Failed to send data to worker")
|
||||||
|
|
||||||
@asyncio.coroutine
|
@asyncio.coroutine
|
||||||
def _recv(self, timeout):
|
def _recv(self, timeout):
|
||||||
|
@ -49,32 +56,33 @@ class Worker:
|
||||||
line = yield from asyncio.wait_for(
|
line = yield from asyncio.wait_for(
|
||||||
self.process.stdout.readline(), timeout=timeout)
|
self.process.stdout.readline(), timeout=timeout)
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
raise WorkerFailed("Timeout receiving data from worker")
|
raise WorkerError("Timeout receiving data from worker")
|
||||||
if not line:
|
if not line:
|
||||||
raise WorkerFailed(
|
return None
|
||||||
"Worker ended unexpectedly while trying to receive data")
|
|
||||||
try:
|
try:
|
||||||
obj = pyon.decode(line.decode())
|
obj = pyon.decode(line.decode())
|
||||||
except:
|
except:
|
||||||
raise WorkerFailed("Worker sent invalid PYON data")
|
raise WorkerError("Worker sent invalid PYON data")
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
@asyncio.coroutine
|
@asyncio.coroutine
|
||||||
def run(self, rid, run_params):
|
def run(self, rid, run_params):
|
||||||
obj = {"rid": rid, "run_params": run_params}
|
yield from self._create_process()
|
||||||
yield from self._send(obj, self.send_timeout)
|
|
||||||
obj = yield from self._recv(self.start_reply_timeout)
|
try:
|
||||||
if obj != "ack":
|
obj = {"rid": rid, "run_params": run_params}
|
||||||
raise WorkerFailed("Incorrect acknowledgement")
|
yield from self._send(obj, self.send_timeout)
|
||||||
while True:
|
obj = yield from self._recv(self.start_reply_timeout)
|
||||||
obj = yield from self._recv(None)
|
if obj != "ack":
|
||||||
action = obj["action"]
|
raise WorkerError("Incorrect acknowledgement")
|
||||||
if action == "report_completed":
|
while True:
|
||||||
if obj["status"] != "ok":
|
obj = yield from self._recv(None)
|
||||||
raise RunFailed(obj["message"])
|
if obj is None:
|
||||||
else:
|
if self.process.returncode != 0:
|
||||||
return
|
raise WorkerError("Worker finished with status code {}"
|
||||||
else:
|
.format(self.process.returncode))
|
||||||
|
break
|
||||||
|
action = obj["action"]
|
||||||
del obj["action"]
|
del obj["action"]
|
||||||
try:
|
try:
|
||||||
data = self.handlers[action](**obj)
|
data = self.handlers[action](**obj)
|
||||||
|
@ -83,14 +91,5 @@ class Worker:
|
||||||
reply = {"status": "failed",
|
reply = {"status": "failed",
|
||||||
"message": traceback.format_exc()}
|
"message": traceback.format_exc()}
|
||||||
yield from self._send(reply, self.send_timeout)
|
yield from self._send(reply, self.send_timeout)
|
||||||
|
finally:
|
||||||
@asyncio.coroutine
|
yield from self._end_process()
|
||||||
def end_process(self):
|
|
||||||
if self.process.returncode is not None:
|
|
||||||
return
|
|
||||||
self.process.send_signal(signal.SIGTERM)
|
|
||||||
try:
|
|
||||||
yield from asyncio.wait_for(
|
|
||||||
self.process.wait(), timeout=self.term_timeout)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
self.process.send_signal(signal.SIGKILL)
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
|
||||||
|
|
||||||
from artiq.protocols import pyon
|
from artiq.protocols import pyon
|
||||||
from artiq.tools import file_import
|
from artiq.tools import file_import
|
||||||
|
@ -80,20 +79,12 @@ def run(rid, run_params):
|
||||||
rdb = ResultDB(init_rt_results, update_rt_results)
|
rdb = ResultDB(init_rt_results, update_rt_results)
|
||||||
dbh = DBHub(ParentDDB, ParentPDB, rdb)
|
dbh = DBHub(ParentDDB, ParentPDB, rdb)
|
||||||
try:
|
try:
|
||||||
try:
|
exp_inst = exp(dbh,
|
||||||
exp_inst = exp(dbh,
|
scheduler=Scheduler,
|
||||||
scheduler=Scheduler,
|
run_params=run_params,
|
||||||
run_params=run_params,
|
**run_params["arguments"])
|
||||||
**run_params["arguments"])
|
exp_inst.run()
|
||||||
exp_inst.run()
|
exp_inst.analyze()
|
||||||
exp_inst.analyze()
|
|
||||||
except Exception:
|
|
||||||
put_object({"action": "report_completed",
|
|
||||||
"status": "failed",
|
|
||||||
"message": traceback.format_exc()})
|
|
||||||
else:
|
|
||||||
put_object({"action": "report_completed",
|
|
||||||
"status": "ok"})
|
|
||||||
finally:
|
finally:
|
||||||
dbh.close()
|
dbh.close()
|
||||||
|
|
||||||
|
@ -107,10 +98,10 @@ def run(rid, run_params):
|
||||||
def main():
|
def main():
|
||||||
sys.stdout = sys.stderr
|
sys.stdout = sys.stderr
|
||||||
|
|
||||||
while True:
|
obj = get_object()
|
||||||
obj = get_object()
|
put_object("ack")
|
||||||
put_object("ack")
|
run(obj["rid"], obj["run_params"])
|
||||||
run(obj["rid"], obj["run_params"])
|
put_object({"action": "report_completed"})
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
Loading…
Reference in New Issue