forked from M-Labs/artiq
1
0
Fork 0

Host report for async error upon kernel termination (#1791)

Closes #1644
This commit is contained in:
Steve Fan 2021-12-04 13:33:24 +08:00 committed by GitHub
parent 9bbf7eb485
commit 4a6bea479a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 40 additions and 7 deletions

View File

@ -31,6 +31,9 @@ Highlights:
* The configuration entry ``rtio_clock`` supports multiple clocking settings, deprecating the usage * The configuration entry ``rtio_clock`` supports multiple clocking settings, deprecating the usage
of compile-time options. of compile-time options.
* DRTIO: added support for 100MHz clock. * DRTIO: added support for 100MHz clock.
* Previously detected RTIO async errors are reported to the host after each kernel terminates, and a
warning is logged on the host. This warning is in addition to the one already printed in the core
device log when the error is detected.
Breaking changes: Breaking changes:
@ -44,7 +47,6 @@ Breaking changes:
* DRTIO: Changed message alignment from 32-bits to 64-bits. * DRTIO: Changed message alignment from 32-bits to 64-bits.
* The deprecated ``set_dataset(..., save=...)`` is no longer supported. * The deprecated ``set_dataset(..., save=...)`` is no longer supported.
ARTIQ-6 ARTIQ-6
------- -------

View File

@ -621,6 +621,7 @@ class CommKernel:
function = self._read_string() function = self._read_string()
backtrace = [self._read_int32() for _ in range(self._read_int32())] backtrace = [self._read_int32() for _ in range(self._read_int32())]
self._process_async_error()
traceback = list(reversed(symbolizer(backtrace))) + \ traceback = list(reversed(symbolizer(backtrace))) + \
[(filename, line, column, *demangler([function]), None)] [(filename, line, column, *demangler([function]), None)]
@ -635,6 +636,16 @@ class CommKernel:
python_exn.artiq_core_exception = core_exn python_exn.artiq_core_exception = core_exn
raise python_exn raise python_exn
def _process_async_error(self):
errors = self._read_int8()
if errors > 0:
map_name = lambda y, z: [f"{y}(s)"] if z else []
errors = map_name("collision", errors & 2 ** 0) + \
map_name("busy error", errors & 2 ** 1) + \
map_name("sequence error", errors & 2 ** 2)
logger.warning(f"{(', '.join(errors[:-1]) + ' and ') if len(errors) > 1 else ''}{errors[-1]} "
f"reported during kernel execution")
def serve(self, embedding_map, symbolizer, demangler): def serve(self, embedding_map, symbolizer, demangler):
while True: while True:
self._read_header() self._read_header()
@ -646,4 +657,5 @@ class CommKernel:
raise exceptions.ClockFailure raise exceptions.ClockFailure
else: else:
self._read_expect(Reply.KernelFinished) self._read_expect(Reply.KernelFinished)
self._process_async_error()
return return

View File

@ -90,7 +90,9 @@ pub enum Reply<'a> {
LoadCompleted, LoadCompleted,
LoadFailed(&'a str), LoadFailed(&'a str),
KernelFinished, KernelFinished {
async_errors: u8
},
KernelStartupFailed, KernelStartupFailed,
KernelException { KernelException {
name: &'a str, name: &'a str,
@ -100,7 +102,8 @@ pub enum Reply<'a> {
line: u32, line: u32,
column: u32, column: u32,
function: &'a str, function: &'a str,
backtrace: &'a [usize] backtrace: &'a [usize],
async_errors: u8
}, },
RpcRequest { async: bool }, RpcRequest { async: bool },
@ -160,14 +163,16 @@ impl<'a> Reply<'a> {
writer.write_string(reason)?; writer.write_string(reason)?;
}, },
Reply::KernelFinished => { Reply::KernelFinished { async_errors } => {
writer.write_u8(7)?; writer.write_u8(7)?;
writer.write_u8(async_errors)?;
}, },
Reply::KernelStartupFailed => { Reply::KernelStartupFailed => {
writer.write_u8(8)?; writer.write_u8(8)?;
}, },
Reply::KernelException { Reply::KernelException {
name, message, param, file, line, column, function, backtrace name, message, param, file, line, column, function, backtrace,
async_errors
} => { } => {
writer.write_u8(9)?; writer.write_u8(9)?;
writer.write_string(name)?; writer.write_string(name)?;
@ -183,6 +188,7 @@ impl<'a> Reply<'a> {
for &addr in backtrace { for &addr in backtrace {
writer.write_u32(addr as u32)? writer.write_u32(addr as u32)?
} }
writer.write_u8(async_errors)?;
}, },
Reply::RpcRequest { async } => { Reply::RpcRequest { async } => {

View File

@ -326,6 +326,14 @@ pub mod drtio {
pub fn reset(_io: &Io, _aux_mutex: &Mutex) {} pub fn reset(_io: &Io, _aux_mutex: &Mutex) {}
} }
/// RTIO async error bits recorded since they were last fetched.
///
/// `async_error_thread` stores the error bits it has just handled here, and
/// [`get_async_errors`] reads and clears them.
static mut SEEN_ASYNC_ERRORS: u8 = 0;

/// Returns the recorded async error bits and resets the record to zero.
///
/// The returned byte is what gets sent to the host in the `KernelFinished`
/// and `KernelException` replies; a value of zero means nothing was recorded
/// since the previous call.
///
/// # Safety
///
/// This reads and writes a `static mut` without synchronization. The caller
/// must guarantee that no other access to `SEEN_ASYNC_ERRORS` (including the
/// write in `async_error_thread`) can happen concurrently with this call.
pub unsafe fn get_async_errors() -> u8 {
    let errors = SEEN_ASYNC_ERRORS;
    SEEN_ASYNC_ERRORS = 0;
    errors
}
fn async_error_thread(io: Io) { fn async_error_thread(io: Io) {
loop { loop {
unsafe { unsafe {
@ -343,6 +351,7 @@ fn async_error_thread(io: Io) {
error!("RTIO sequence error involving channel {}", error!("RTIO sequence error involving channel {}",
csr::rtio_core::sequence_error_channel_read()); csr::rtio_core::sequence_error_channel_read());
} }
SEEN_ASYNC_ERRORS = errors;
csr::rtio_core::async_error_write(errors); csr::rtio_core::async_error_write(errors);
} }
} }

View File

@ -9,6 +9,7 @@ use urc::Urc;
use sched::{ThreadHandle, Io, Mutex, TcpListener, TcpStream, Error as SchedError}; use sched::{ThreadHandle, Io, Mutex, TcpListener, TcpStream, Error as SchedError};
use rtio_clocking; use rtio_clocking;
use rtio_dma::Manager as DmaManager; use rtio_dma::Manager as DmaManager;
use rtio_mgt::get_async_errors;
use cache::Cache; use cache::Cache;
use kern_hwreq; use kern_hwreq;
use board_artiq::drtio_routing; use board_artiq::drtio_routing;
@ -431,7 +432,9 @@ fn process_kern_message(io: &Io, aux_mutex: &Mutex,
match stream { match stream {
None => return Ok(true), None => return Ok(true),
Some(ref mut stream) => Some(ref mut stream) =>
host_write(stream, host::Reply::KernelFinished).map_err(|e| e.into()) host_write(stream, host::Reply::KernelFinished {
async_errors: unsafe { get_async_errors() }
}).map_err(|e| e.into())
} }
} }
&kern::RunException { &kern::RunException {
@ -458,7 +461,8 @@ fn process_kern_message(io: &Io, aux_mutex: &Mutex,
line: line, line: line,
column: column, column: column,
function: function, function: function,
backtrace: backtrace backtrace: backtrace,
async_errors: unsafe { get_async_errors() }
}).map_err(|e| e.into()) }).map_err(|e| e.into())
} }
} }