From 4a6bea479af03bca5a3583977bbdebf4c7f6b14e Mon Sep 17 00:00:00 2001 From: Steve Fan <19037626d@connect.polyu.hk> Date: Sat, 4 Dec 2021 13:33:24 +0800 Subject: [PATCH] Host report for async error upon kernel termination (#1791) Closes #1644 --- RELEASE_NOTES.rst | 4 +++- artiq/coredevice/comm_kernel.py | 12 ++++++++++++ artiq/firmware/libproto_artiq/session_proto.rs | 14 ++++++++++---- artiq/firmware/runtime/rtio_mgt.rs | 9 +++++++++ artiq/firmware/runtime/session.rs | 8 ++++++-- 5 files changed, 40 insertions(+), 7 deletions(-) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index a1c807008..46cd999ab 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -31,6 +31,9 @@ Highlights: * The configuration entry ``rtio_clock`` supports multiple clocking settings, deprecating the usage of compile-time options. * DRTIO: added support for 100MHz clock. +* Previously detected RTIO async errors are reported to the host after each kernel terminates and a + warning is logged. The warning is additional to the one already printed in the core device log upon + detection of the error. Breaking changes: @@ -44,7 +47,6 @@ Breaking changes: * DRTIO: Changed message alignment from 32-bits to 64-bits. * The deprecated ``set_dataset(..., save=...)`` is no longer supported. 
- ARTIQ-6 ------- diff --git a/artiq/coredevice/comm_kernel.py b/artiq/coredevice/comm_kernel.py index cdb54a118..1b0111c49 100644 --- a/artiq/coredevice/comm_kernel.py +++ b/artiq/coredevice/comm_kernel.py @@ -621,6 +621,7 @@ class CommKernel: function = self._read_string() backtrace = [self._read_int32() for _ in range(self._read_int32())] + self._process_async_error() traceback = list(reversed(symbolizer(backtrace))) + \ [(filename, line, column, *demangler([function]), None)] @@ -635,6 +636,16 @@ class CommKernel: python_exn.artiq_core_exception = core_exn raise python_exn + def _process_async_error(self): + errors = self._read_int8() + if errors > 0: + map_name = lambda y, z: [f"{y}(s)"] if z else [] + errors = map_name("collision", errors & 2 ** 0) + \ + map_name("busy error", errors & 2 ** 1) + \ + map_name("sequence error", errors & 2 ** 2) + logger.warning(f"{(', '.join(errors[:-1]) + ' and ') if len(errors) > 1 else ''}{errors[-1]} " + f"reported during kernel execution") + def serve(self, embedding_map, symbolizer, demangler): while True: self._read_header() @@ -646,4 +657,5 @@ class CommKernel: raise exceptions.ClockFailure else: self._read_expect(Reply.KernelFinished) + self._process_async_error() return diff --git a/artiq/firmware/libproto_artiq/session_proto.rs b/artiq/firmware/libproto_artiq/session_proto.rs index 99412de10..0475a4489 100644 --- a/artiq/firmware/libproto_artiq/session_proto.rs +++ b/artiq/firmware/libproto_artiq/session_proto.rs @@ -90,7 +90,9 @@ pub enum Reply<'a> { LoadCompleted, LoadFailed(&'a str), - KernelFinished, + KernelFinished { + async_errors: u8 + }, KernelStartupFailed, KernelException { name: &'a str, @@ -100,7 +102,8 @@ pub enum Reply<'a> { line: u32, column: u32, function: &'a str, - backtrace: &'a [usize] + backtrace: &'a [usize], + async_errors: u8 }, RpcRequest { async: bool }, @@ -160,14 +163,16 @@ impl<'a> Reply<'a> { writer.write_string(reason)?; }, - Reply::KernelFinished => { + Reply::KernelFinished { 
async_errors } => { writer.write_u8(7)?; + writer.write_u8(async_errors)?; }, Reply::KernelStartupFailed => { writer.write_u8(8)?; }, Reply::KernelException { - name, message, param, file, line, column, function, backtrace + name, message, param, file, line, column, function, backtrace, + async_errors } => { writer.write_u8(9)?; writer.write_string(name)?; @@ -183,6 +188,7 @@ impl<'a> Reply<'a> { for &addr in backtrace { writer.write_u32(addr as u32)? } + writer.write_u8(async_errors)?; }, Reply::RpcRequest { async } => { diff --git a/artiq/firmware/runtime/rtio_mgt.rs b/artiq/firmware/runtime/rtio_mgt.rs index 825900b78..1a1d1660b 100644 --- a/artiq/firmware/runtime/rtio_mgt.rs +++ b/artiq/firmware/runtime/rtio_mgt.rs @@ -326,6 +326,14 @@ pub mod drtio { pub fn reset(_io: &Io, _aux_mutex: &Mutex) {} } +static mut SEEN_ASYNC_ERRORS: u8 = 0; + +pub unsafe fn get_async_errors() -> u8 { + let errors = SEEN_ASYNC_ERRORS; + SEEN_ASYNC_ERRORS = 0; + errors +} + fn async_error_thread(io: Io) { loop { unsafe { @@ -343,6 +351,7 @@ fn async_error_thread(io: Io) { error!("RTIO sequence error involving channel {}", csr::rtio_core::sequence_error_channel_read()); } + SEEN_ASYNC_ERRORS |= errors; csr::rtio_core::async_error_write(errors); } } diff --git a/artiq/firmware/runtime/session.rs b/artiq/firmware/runtime/session.rs index 7d0935667..260a1b385 100644 --- a/artiq/firmware/runtime/session.rs +++ b/artiq/firmware/runtime/session.rs @@ -9,6 +9,7 @@ use urc::Urc; use sched::{ThreadHandle, Io, Mutex, TcpListener, TcpStream, Error as SchedError}; use rtio_clocking; use rtio_dma::Manager as DmaManager; +use rtio_mgt::get_async_errors; use cache::Cache; use kern_hwreq; use board_artiq::drtio_routing; @@ -431,7 +432,9 @@ fn process_kern_message(io: &Io, aux_mutex: &Mutex, match stream { None => return Ok(true), Some(ref mut stream) => - host_write(stream, host::Reply::KernelFinished).map_err(|e| e.into()) + host_write(stream, host::Reply::KernelFinished { + async_errors: unsafe 
{ get_async_errors() } + }).map_err(|e| e.into()) } } &kern::RunException { @@ -458,7 +461,8 @@ fn process_kern_message(io: &Io, aux_mutex: &Mutex, line: line, column: column, function: function, - backtrace: backtrace + backtrace: backtrace, + async_errors: unsafe { get_async_errors() } }).map_err(|e| e.into()) } }