ctlmgr: support immediate controller retry

This commit is contained in:
Sebastien Bourdeauducq 2015-08-11 23:22:36 +08:00
parent 80805407bf
commit a6ab066c87
2 changed files with 72 additions and 43 deletions

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import asyncio import asyncio
import atexit
import argparse import argparse
import os import os
import logging import logging
@ -8,9 +9,9 @@ import shlex
import socket import socket
from artiq.protocols.sync_struct import Subscriber from artiq.protocols.sync_struct import Subscriber
from artiq.protocols.pc_rpc import AsyncioClient from artiq.protocols.pc_rpc import AsyncioClient, Server
from artiq.tools import verbosity_args, init_logger from artiq.tools import verbosity_args, init_logger
from artiq.tools import asyncio_process_wait_timeout from artiq.tools import TaskObject, asyncio_process_wait_timeout, Condition
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -29,8 +30,11 @@ def get_argparser():
"--retry-master", default=5.0, type=float, "--retry-master", default=5.0, type=float,
help="retry timer for reconnecting to master") help="retry timer for reconnecting to master")
parser.add_argument( parser.add_argument(
"--retry-command", default=5.0, type=float, "--bind", default="::1",
help="retry timer for restarting a controller command") help="hostname or IP address to bind to")
parser.add_argument(
"--bind-port", default=3249, type=int,
help="TCP port to listen to for control (default: %(default)d)")
return parser return parser
@ -48,6 +52,7 @@ class Controller:
self.term_timeout = ddb_entry.get("term_timeout", 30) self.term_timeout = ddb_entry.get("term_timeout", 30)
self.retry_timer_cur = self.retry_timer self.retry_timer_cur = self.retry_timer
self.retry_now = Condition()
self.process = None self.process = None
self.launch_task = asyncio.Task(self.launcher()) self.launch_task = asyncio.Task(self.launcher())
@ -109,8 +114,13 @@ class Controller:
logger.warning("Controller %s failed to start", self.name) logger.warning("Controller %s failed to start", self.name)
else: else:
logger.warning("Controller %s exited", self.name) logger.warning("Controller %s exited", self.name)
logger.warning("Restarting in %.1f seconds", self.retry_timer_cur) logger.warning("Restarting in %.1f seconds",
yield from asyncio.sleep(self.retry_timer_cur) self.retry_timer_cur)
try:
yield from asyncio.wait_for(self.retry_now.wait(),
self.retry_timer_cur)
except asyncio.TimeoutError:
pass
self.retry_timer_cur *= self.retry_timer_backoff self.retry_timer_cur *= self.retry_timer_backoff
except asyncio.CancelledError: except asyncio.CancelledError:
yield from self._terminate() yield from self._terminate()
@ -208,34 +218,47 @@ class ControllerDB:
return self.current_controllers return self.current_controllers
@asyncio.coroutine class ControllerManager(TaskObject):
def ctlmgr(server, port, retry_master): def __init__(self, server, port, retry_master):
controller_db = ControllerDB() self.server = server
try: self.port = port
subscriber = Subscriber("devices", controller_db.sync_struct_init) self.retry_master = retry_master
while True: self.controller_db = ControllerDB()
try:
def set_host_filter(): @asyncio.coroutine
s = subscriber.writer.get_extra_info("socket") def _do(self):
localhost = s.getsockname()[0] try:
controller_db.set_host_filter(localhost) subscriber = Subscriber("devices",
yield from subscriber.connect(server, port, set_host_filter) self.controller_db.sync_struct_init)
while True:
try: try:
yield from asyncio.wait_for(subscriber.receive_task, None) def set_host_filter():
finally: s = subscriber.writer.get_extra_info("socket")
yield from subscriber.close() localhost = s.getsockname()[0]
except (ConnectionAbortedError, ConnectionError, self.controller_db.set_host_filter(localhost)
ConnectionRefusedError, ConnectionResetError) as e: yield from subscriber.connect(self.server, self.port,
logger.warning("Connection to master failed (%s: %s)", set_host_filter)
e.__class__.__name__, str(e)) try:
else: yield from asyncio.wait_for(subscriber.receive_task, None)
logger.warning("Connection to master lost") finally:
logger.warning("Retrying in %.1f seconds", retry_master) yield from subscriber.close()
yield from asyncio.sleep(retry_master) except (ConnectionAbortedError, ConnectionError,
except asyncio.CancelledError: ConnectionRefusedError, ConnectionResetError) as e:
pass logger.warning("Connection to master failed (%s: %s)",
finally: e.__class__.__name__, str(e))
yield from controller_db.current_controllers.shutdown() else:
logger.warning("Connection to master lost")
logger.warning("Retrying in %.1f seconds", self.retry_master)
yield from asyncio.sleep(self.retry_master)
except asyncio.CancelledError:
pass
finally:
yield from self.controller_db.current_controllers.shutdown()
def retry_now(self, k):
"""If a controller is disabled and pending retry, perform that retry
now."""
self.controller_db.current_controllers.active[k].retry_now.notify()
def main(): def main():
@ -247,18 +270,22 @@ def main():
asyncio.set_event_loop(loop) asyncio.set_event_loop(loop)
else: else:
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
atexit.register(lambda: loop.close())
try: ctlmgr = ControllerManager(args.server, args.port, args.retry_master)
task = asyncio.Task(ctlmgr( ctlmgr.start()
args.server, args.port, args.retry_master)) atexit.register(lambda: loop.run_until_complete(ctlmgr.stop()))
try:
loop.run_forever() class CtlMgrRPC:
finally: retry_now = ctlmgr.retry_now
task.cancel()
loop.run_until_complete(asyncio.wait_for(task, None)) rpc_target = CtlMgrRPC()
rpc_server = Server({"ctlmgr": rpc_target}, builtin_terminate=True)
loop.run_until_complete(rpc_server.start(args.bind, args.bind_port))
atexit.register(lambda: loop.run_until_complete(rpc_server.stop()))
loop.run_until_complete(rpc_server.wait_terminate())
finally:
loop.close()
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -8,6 +8,8 @@ Default network ports
+--------------------------+--------------+ +--------------------------+--------------+
| Core device (mon/inj) | 3250 (UDP) | | Core device (mon/inj) | 3250 (UDP) |
+--------------------------+--------------+ +--------------------------+--------------+
| Controller manager | 3249 |
+--------------------------+--------------+
| Master (notifications) | 3250 | | Master (notifications) | 3250 |
+--------------------------+--------------+ +--------------------------+--------------+
| Master (control) | 3251 | | Master (control) | 3251 |