From f1fd42ea98c370c375fa58ff933080d2e42d6beb Mon Sep 17 00:00:00 2001 From: David Nadlinger Date: Fri, 22 Jan 2021 12:19:41 +0100 Subject: [PATCH] coredevice: Re-enable TCP keepalive This partially reverts commit b5e1bd3fa2169fe88d4d0cdd4eedeb95fa41fa7f, which had removed keepalive. This, however, led to experiments hanging forever if the core device had dropped the connection (e.g. to a kernel CPU panic, or the device being rebooted). The chosen keepalive settings are fairly conservative (with the 10 s timeout) to avoid any possible interaction with smoltcp's 3 s ARP try interval (see GitHub issue #1150), even though this should be a non-issue now due to the larger ARP cache. --- artiq/coredevice/comm.py | 28 ++++++++++++++++++++++++++++ artiq/coredevice/comm_kernel.py | 4 ++-- artiq/coredevice/comm_mgmt.py | 6 +++--- 3 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 artiq/coredevice/comm.py diff --git a/artiq/coredevice/comm.py b/artiq/coredevice/comm.py new file mode 100644 index 000000000..fb70a59b5 --- /dev/null +++ b/artiq/coredevice/comm.py @@ -0,0 +1,28 @@ +import sys +import socket +import logging + +logger = logging.getLogger(__name__) + + +def set_keepalive(sock, after_idle, interval, max_fails): + if sys.platform.startswith("linux"): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, after_idle) + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, interval) + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, max_fails) + elif sys.platform.startswith("win") or sys.platform.startswith("cygwin"): + # setting max_fails is not supported, typically ends up being 5 or 10 + # depending on Windows version + sock.ioctl(socket.SIO_KEEPALIVE_VALS, + (1, after_idle * 1000, interval * 1000)) + else: + logger.warning("TCP keepalive not supported on platform '%s', ignored", + sys.platform) + + +def initialize_connection(host, port): + sock = socket.create_connection((host, port)) + set_keepalive(sock, 10, 10, 3) + logger.debug("connected to %s:%d", host, port) + return sock diff --git a/artiq/coredevice/comm_kernel.py b/artiq/coredevice/comm_kernel.py index a188bc19d..eb34a3fdf 100644 --- a/artiq/coredevice/comm_kernel.py +++ b/artiq/coredevice/comm_kernel.py @@ -8,6 +8,7 @@ from fractions import Fraction from collections import namedtuple from artiq.coredevice import exceptions +from artiq.coredevice.comm import initialize_connection from artiq import __version__ as software_version @@ -184,8 +185,7 @@ class CommKernel: def open(self): if hasattr(self, "socket"): return - self.socket = socket.create_connection((self.host, self.port)) - logger.debug("connected to %s:%d", self.host, self.port) + self.socket = initialize_connection(self.host, self.port) self.socket.sendall(b"ARTIQ coredev\n") endian = self._read(1) if endian == b"e": diff --git a/artiq/coredevice/comm_mgmt.py b/artiq/coredevice/comm_mgmt.py index 2914977e0..79fde1a81 100644 --- a/artiq/coredevice/comm_mgmt.py +++ b/artiq/coredevice/comm_mgmt.py @@ -1,8 +1,9 @@ from enum import Enum import logging -import socket import struct +from artiq.coredevice.comm import initialize_connection + logger = logging.getLogger(__name__) @@ -60,8 +61,7 @@ class CommMgmt: def open(self): if hasattr(self, "socket"): return - self.socket = socket.create_connection((self.host, self.port)) - logger.debug("connected to %s:%d", self.host, self.port) + self.socket = initialize_connection(self.host, self.port) self.socket.sendall(b"ARTIQ management\n") endian = self._read(1) if endian == b"e":