subkernels: support exceptions from subkernels

This commit is contained in:
mwojcik 2024-07-04 16:57:49 +08:00
parent 6057af56d9
commit b9722e95d7
8 changed files with 217 additions and 133 deletions

View File

@ -267,12 +267,14 @@ pub enum Packet {
exception_src: u8,
},
SubkernelExceptionRequest {
source: u8,
destination: u8,
},
SubkernelException {
destination: u8,
last: bool,
length: u16,
data: [u8; SAT_PAYLOAD_MAX_SIZE],
data: [u8; MASTER_PAYLOAD_MAX_SIZE],
},
SubkernelMessage {
source: u8,
@ -524,14 +526,17 @@ impl Packet {
exception_src: reader.read_u8()?,
},
0xc9 => Packet::SubkernelExceptionRequest {
source: reader.read_u8()?,
destination: reader.read_u8()?,
},
0xca => {
let destination = reader.read_u8()?;
let last = reader.read_bool()?;
let length = reader.read_u16()?;
let mut data: [u8; SAT_PAYLOAD_MAX_SIZE] = [0; SAT_PAYLOAD_MAX_SIZE];
let mut data: [u8; MASTER_PAYLOAD_MAX_SIZE] = [0; MASTER_PAYLOAD_MAX_SIZE];
reader.read_exact(&mut data[0..length as usize])?;
Packet::SubkernelException {
destination: destination,
last: last,
length: length,
data: data,
@ -896,12 +901,19 @@ impl Packet {
writer.write_bool(with_exception)?;
writer.write_u8(exception_src)?;
}
Packet::SubkernelExceptionRequest { destination } => {
Packet::SubkernelExceptionRequest { source, destination } => {
writer.write_u8(0xc9)?;
writer.write_u8(source)?;
writer.write_u8(destination)?;
}
Packet::SubkernelException { last, length, data } => {
Packet::SubkernelException {
destination,
last,
length,
data,
} => {
writer.write_u8(0xca)?;
writer.write_u8(destination)?;
writer.write_bool(last)?;
writer.write_u16(length)?;
writer.write_all(&data[0..length as usize])?;
@ -943,6 +955,8 @@ impl Packet {
Packet::SubkernelLoadRunReply { destination, .. } => Some(*destination),
Packet::SubkernelMessage { destination, .. } => Some(*destination),
Packet::SubkernelMessageAck { destination } => Some(*destination),
Packet::SubkernelExceptionRequest { destination, .. } => Some(*destination),
Packet::SubkernelException { destination, .. } => Some(*destination),
Packet::DmaPlaybackStatus { destination, .. } => Some(*destination),
Packet::SubkernelFinished { destination, .. } => Some(*destination),
_ => None,

View File

@ -23,8 +23,8 @@ pub enum SubkernelStatus {
Timeout,
IncorrectState,
CommLost,
OtherError,
Exception(Vec<u8>),
OtherError,
}
#[derive(Debug, Clone)]
@ -91,9 +91,7 @@ pub enum Message {
timeout: i64,
},
#[cfg(has_drtio)]
SubkernelAwaitFinishReply {
status: SubkernelStatus,
},
SubkernelAwaitFinishReply,
#[cfg(has_drtio)]
SubkernelMsgSend {
id: u32,
@ -110,9 +108,10 @@ pub enum Message {
},
#[cfg(has_drtio)]
SubkernelMsgRecvReply {
status: SubkernelStatus,
count: u8,
},
#[cfg(has_drtio)]
SubkernelError(SubkernelStatus),
}
static CHANNEL_0TO1: Mutex<Option<sync_channel::Sender<'static, Message>>> = Mutex::new(None);

View File

@ -36,27 +36,18 @@ pub extern "C" fn await_finish(id: u32, timeout: i64) {
});
}
match unsafe { KERNEL_CHANNEL_0TO1.as_mut().unwrap() }.recv() {
Message::SubkernelAwaitFinishReply {
status: SubkernelStatus::NoError,
} => (),
Message::SubkernelAwaitFinishReply {
status: SubkernelStatus::IncorrectState,
} => artiq_raise!("SubkernelError", "Subkernel not running"),
Message::SubkernelAwaitFinishReply {
status: SubkernelStatus::Timeout,
} => artiq_raise!("SubkernelError", "Subkernel timed out"),
Message::SubkernelAwaitFinishReply {
status: SubkernelStatus::CommLost,
} => artiq_raise!("SubkernelError", "Lost communication with satellite"),
Message::SubkernelAwaitFinishReply {
status: SubkernelStatus::OtherError,
} => artiq_raise!("SubkernelError", "An error occurred during subkernel operation"),
Message::SubkernelAwaitFinishReply {
status: SubkernelStatus::Exception(raw_exception),
} => {
// reconstruct the exception here and raise it
eh_artiq::raise_raw(&raw_exception)
Message::SubkernelAwaitFinishReply => (),
Message::SubkernelError(SubkernelStatus::IncorrectState) => {
artiq_raise!("SubkernelError", "Subkernel not running")
}
Message::SubkernelError(SubkernelStatus::Timeout) => artiq_raise!("SubkernelError", "Subkernel timed out"),
Message::SubkernelError(SubkernelStatus::CommLost) => {
artiq_raise!("SubkernelError", "Lost communication with satellite")
}
Message::SubkernelError(SubkernelStatus::OtherError) => {
artiq_raise!("SubkernelError", "An error occurred during subkernel operation")
}
Message::SubkernelError(SubkernelStatus::Exception(raw_exception)) => eh_artiq::raise_raw(&raw_exception),
_ => panic!("expected SubkernelAwaitFinishReply after SubkernelAwaitFinishRequest"),
}
}
@ -98,37 +89,22 @@ pub extern "C" fn await_message(id: i32, timeout: i64, tags: &CSlice<u8>, min: u
});
}
match unsafe { KERNEL_CHANNEL_0TO1.as_mut().unwrap() }.recv() {
Message::SubkernelMsgRecvReply {
status: SubkernelStatus::NoError,
count,
} => {
Message::SubkernelMsgRecvReply { count } => {
if min > count || count > max {
artiq_raise!("SubkernelError", "Received more or less arguments than required")
}
}
Message::SubkernelMsgRecvReply {
status: SubkernelStatus::IncorrectState,
..
} => artiq_raise!("SubkernelError", "Subkernel not running"),
Message::SubkernelMsgRecvReply {
status: SubkernelStatus::Timeout,
..
} => artiq_raise!("SubkernelError", "Subkernel timed out"),
Message::SubkernelMsgRecvReply {
status: SubkernelStatus::CommLost,
..
} => artiq_raise!("SubkernelError", "Lost communication with satellite"),
Message::SubkernelMsgRecvReply {
status: SubkernelStatus::OtherError,
..
} => artiq_raise!("SubkernelError", "An error occurred during subkernel operation"),
Message::SubkernelMsgRecvReply {
status: SubkernelStatus::Exception(raw_exception),
..
} => {
// reconstruct the raw exception here
eh_artiq::raise_raw(&raw_exception);
Message::SubkernelError(SubkernelStatus::IncorrectState) => {
artiq_raise!("SubkernelError", "Subkernel not running")
}
Message::SubkernelError(SubkernelStatus::Timeout) => artiq_raise!("SubkernelError", "Subkernel timed out"),
Message::SubkernelError(SubkernelStatus::CommLost) => {
artiq_raise!("SubkernelError", "Lost communication with satellite")
}
Message::SubkernelError(SubkernelStatus::OtherError) => {
artiq_raise!("SubkernelError", "An error occurred during subkernel operation")
}
Message::SubkernelError(SubkernelStatus::Exception(raw_exception)) => eh_artiq::raise_raw(&raw_exception),
_ => panic!("expected SubkernelMsgRecvReply after SubkernelMsgRecvRequest"),
}
// RpcRecvRequest should be called after this to receive message data

View File

@ -422,25 +422,23 @@ async fn handle_run_kernel(
#[cfg(has_drtio)]
kernel::Message::SubkernelAwaitFinishRequest { id, timeout } => {
let res = subkernel::await_finish(aux_mutex, routing_table, timer, id, timeout).await;
let status = match res {
let response = match res {
Ok(res) => {
if res.status == subkernel::FinishStatus::CommLost {
kernel::SubkernelStatus::CommLost
kernel::Message::SubkernelError(kernel::SubkernelStatus::CommLost)
} else if let Some(exception) = res.exception {
kernel::SubkernelStatus::Exception(exception)
kernel::Message::SubkernelError(kernel::SubkernelStatus::Exception(exception))
} else {
kernel::SubkernelStatus::NoError
kernel::Message::SubkernelAwaitFinishReply
}
}
Err(SubkernelError::Timeout) => kernel::SubkernelStatus::Timeout,
Err(SubkernelError::IncorrectState) => kernel::SubkernelStatus::IncorrectState,
Err(_) => kernel::SubkernelStatus::OtherError,
Err(SubkernelError::Timeout) => kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout),
Err(SubkernelError::IncorrectState) => {
kernel::Message::SubkernelError(kernel::SubkernelStatus::IncorrectState)
}
Err(_) => kernel::Message::SubkernelError(kernel::SubkernelStatus::OtherError),
};
control
.borrow_mut()
.tx
.async_send(kernel::Message::SubkernelAwaitFinishReply { status: status })
.await;
control.borrow_mut().tx.async_send(response).await;
}
#[cfg(has_drtio)]
kernel::Message::SubkernelMsgSend { id, destination, data } => {
@ -461,28 +459,23 @@ async fn handle_run_kernel(
#[cfg(has_drtio)]
kernel::Message::SubkernelMsgRecvRequest { id, timeout, tags } => {
let message_received = subkernel::message_await(id as u32, timeout, timer).await;
let (status, count) = match message_received {
Ok(ref message) => (kernel::SubkernelStatus::NoError, message.count),
Err(SubkernelError::Timeout) => (kernel::SubkernelStatus::Timeout, 0),
Err(SubkernelError::IncorrectState) => (kernel::SubkernelStatus::IncorrectState, 0),
Err(SubkernelError::CommLost) => (kernel::SubkernelStatus::CommLost, 0),
let response = match message_received {
Ok(ref message) => kernel::Message::SubkernelMsgRecvReply { count: message.count },
Err(SubkernelError::Timeout) => kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout),
Err(SubkernelError::IncorrectState) => {
kernel::Message::SubkernelError(kernel::SubkernelStatus::IncorrectState)
}
Err(SubkernelError::CommLost) => kernel::Message::SubkernelError(kernel::SubkernelStatus::CommLost),
Err(SubkernelError::SubkernelException) => {
// just retrieve the exception
let status = subkernel::await_finish(aux_mutex, routing_table, timer, id as u32, timeout)
.await
.unwrap();
(kernel::SubkernelStatus::Exception(status.exception.unwrap()), 0)
kernel::Message::SubkernelError(kernel::SubkernelStatus::Exception(status.exception.unwrap()))
}
Err(_) => (kernel::SubkernelStatus::OtherError, 0),
Err(_) => kernel::Message::SubkernelError(kernel::SubkernelStatus::OtherError),
};
control
.borrow_mut()
.tx
.async_send(kernel::Message::SubkernelMsgRecvReply {
status: status,
count: count,
})
.await;
control.borrow_mut().tx.async_send(response).await;
if let Ok(message) = message_received {
// receive code almost identical to RPC recv, except we are not reading from a stream
let mut reader = Cursor::new(message.data);
@ -514,7 +507,7 @@ async fn handle_run_kernel(
.async_send(kernel::Message::RpcRecvReply(Ok(0)))
.await;
i += 1;
if i < count {
if i < message.count {
current_tags = remaining_tags;
} else {
break;

View File

@ -129,6 +129,8 @@ pub mod drtio {
| Packet::SubkernelLoadRunReply { destination, .. }
| Packet::SubkernelMessage { destination, .. }
| Packet::SubkernelMessageAck { destination, .. }
| Packet::SubkernelException { destination, .. }
| Packet::SubkernelExceptionRequest { destination, .. }
| Packet::DmaPlaybackStatus { destination, .. }
| Packet::SubkernelFinished { destination, .. } => {
if destination == 0 {
@ -183,10 +185,7 @@ pub mod drtio {
async fn drain_buffer(linkno: u8, draining_time: Milliseconds, timer: GlobalTimer) {
let max_time = timer.get_time() + draining_time;
loop {
if timer.get_time() > max_time {
return;
}
while timer.get_time() < max_time {
let _ = drtioaux_async::recv(linkno).await;
}
}
@ -835,13 +834,19 @@ pub mod drtio {
linkno,
routing_table,
&Packet::SubkernelExceptionRequest {
source: 0,
destination: destination,
},
timer,
)
.await?;
match reply {
Packet::SubkernelException { last, length, data } => {
Packet::SubkernelException {
destination: 0,
last,
length,
data,
} => {
remote_data.extend(&data[0..length as usize]);
if last {
return Ok(remote_data);

View File

@ -895,6 +895,7 @@ fn process_aux_packet(
Ok(())
}
drtioaux::Packet::SubkernelExceptionRequest {
source,
destination: _destination,
} => {
forward!(
@ -907,17 +908,46 @@ fn process_aux_packet(
&packet,
timer
);
let mut data_slice: [u8; SAT_PAYLOAD_MAX_SIZE] = [0; SAT_PAYLOAD_MAX_SIZE];
let mut data_slice: [u8; MASTER_PAYLOAD_MAX_SIZE] = [0; MASTER_PAYLOAD_MAX_SIZE];
let meta = kernel_manager.exception_get_slice(&mut data_slice);
drtioaux::send(
0,
&drtioaux::Packet::SubkernelException {
router.send(
drtioaux::Packet::SubkernelException {
destination: source,
last: meta.status.is_last(),
length: meta.len,
data: data_slice,
},
_routing_table,
*rank,
*self_destination,
)
}
drtioaux::Packet::SubkernelException {
destination: _destination,
last,
length,
data,
} => {
forward!(
router,
_routing_table,
_destination,
*rank,
*self_destination,
_repeaters,
&packet,
timer
);
kernel_manager.received_exception(
&data[..length as usize],
last,
router,
_routing_table,
*rank,
*self_destination,
);
Ok(())
}
drtioaux::Packet::SubkernelMessage {
source,
destination: _destination,

View File

@ -4,7 +4,7 @@ use core::cmp::min;
#[cfg(has_drtio_routing)]
use libboard_artiq::pl::csr;
use libboard_artiq::{drtio_routing, drtioaux,
drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE, SAT_PAYLOAD_MAX_SIZE}};
drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE}};
pub struct SliceMeta {
pub destination: u8,
@ -57,7 +57,6 @@ impl Sliceable {
self.data.extend(data);
}
get_slice_fn!(get_slice_sat, SAT_PAYLOAD_MAX_SIZE);
get_slice_fn!(get_slice_master, MASTER_PAYLOAD_MAX_SIZE);
}

View File

@ -11,7 +11,7 @@ use io::{Cursor, ProtoWrite};
use ksupport::{eh_artiq, kernel, rpc};
use libboard_artiq::{drtio_routing::RoutingTable,
drtioaux,
drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE, SAT_PAYLOAD_MAX_SIZE},
drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE},
pl::csr};
use libboard_zynq::{time::Milliseconds, timer::GlobalTimer};
use libcortex_a9::sync_channel::Receiver;
@ -47,6 +47,9 @@ enum KernelState {
DmaAwait {
max_time: Milliseconds,
},
SubkernelRetrievingException {
destination: u8,
},
}
#[derive(Debug)]
@ -123,10 +126,11 @@ struct MessageManager {
struct Session {
id: u32,
kernel_state: KernelState,
last_exception: Option<Sliceable>,
last_exception: Option<Sliceable>, // exceptions raised locally
external_exception: Option<Vec<u8>>, // exceptions from sub-subkernels
messages: MessageManager,
source: u8, // which destination requested running the kernel
subkernels_finished: Vec<u32>,
subkernels_finished: Vec<(u32, Option<u8>)>,
}
impl Session {
@ -135,6 +139,7 @@ impl Session {
id: id,
kernel_state: KernelState::Absent,
last_exception: None,
external_exception: None,
messages: MessageManager::new(),
source: 0,
subkernels_finished: Vec::new(),
@ -410,9 +415,9 @@ impl<'a> Manager<'_> {
}
}
pub fn exception_get_slice(&mut self, data_slice: &mut [u8; SAT_PAYLOAD_MAX_SIZE]) -> SliceMeta {
pub fn exception_get_slice(&mut self, data_slice: &mut [u8; MASTER_PAYLOAD_MAX_SIZE]) -> SliceMeta {
match self.session.last_exception.as_mut() {
Some(exception) => exception.get_slice_sat(data_slice),
Some(exception) => exception.get_slice_master(data_slice),
None => SliceMeta {
destination: 0,
len: 0,
@ -540,7 +545,7 @@ impl<'a> Manager<'_> {
return;
}
match self.process_external_messages(timer) {
match self.process_external_messages(router, routing_table, rank, destination, timer) {
Ok(()) => (),
Err(Error::AwaitingMessage) => return, // kernel still waiting, do not process kernel messages
Err(Error::KernelException(exception)) => {
@ -596,6 +601,41 @@ impl<'a> Manager<'_> {
}
}
fn check_finished_kernels(
&mut self,
id: u32,
router: &mut Router,
routing_table: &RoutingTable,
rank: u8,
self_destination: u8,
) {
for (i, (status, exception_source)) in self.session.subkernels_finished.iter().enumerate() {
if *status == id {
if exception_source.is_none() {
self.control.tx.send(kernel::Message::SubkernelAwaitFinishReply);
self.session.kernel_state = KernelState::Running;
self.session.subkernels_finished.swap_remove(i);
} else {
let destination = exception_source.unwrap();
self.session.external_exception = Some(Vec::new());
self.session.kernel_state = KernelState::SubkernelRetrievingException {
destination: destination,
};
router.route(
drtioaux::Packet::SubkernelExceptionRequest {
source: self_destination,
destination: destination,
},
&routing_table,
rank,
self_destination,
);
}
break;
}
}
}
pub fn subkernel_load_run_reply(&mut self, succeeded: bool) {
if self.session.kernel_state == KernelState::SubkernelAwaitLoad {
self.control
@ -608,16 +648,46 @@ impl<'a> Manager<'_> {
}
pub fn remote_subkernel_finished(&mut self, id: u32, with_exception: bool, exception_source: u8) {
if with_exception {
self.kernel_stop();
self.last_finished = Some(SubkernelFinished {
source: self.session.source,
id: self.session.id,
with_exception: true,
exception_source: exception_source,
})
let exception_src = if with_exception { Some(exception_source) } else { None };
self.session.subkernels_finished.push((id, exception_src));
}
pub fn received_exception(
&mut self,
exception_data: &[u8],
last: bool,
router: &mut Router,
routing_table: &RoutingTable,
rank: u8,
self_destination: u8,
) {
if let KernelState::SubkernelRetrievingException { destination } = self.session.kernel_state {
self.session
.external_exception
.as_mut()
.unwrap()
.extend_from_slice(exception_data);
if last {
self.control
.tx
.send(kernel::Message::SubkernelError(kernel::SubkernelStatus::Exception(
self.session.external_exception.take().unwrap(),
)));
self.session.kernel_state = KernelState::Running;
} else {
/* fetch another slice */
router.route(
drtioaux::Packet::SubkernelExceptionRequest {
source: self_destination,
destination: destination,
},
routing_table,
rank,
self_destination,
);
}
} else {
self.session.subkernels_finished.push(id);
warn!("Received unsolicited exception data");
}
}
@ -780,28 +850,35 @@ impl<'a> Manager<'_> {
Ok(false)
}
fn process_external_messages(&mut self, timer: &GlobalTimer) -> Result<(), Error> {
fn process_external_messages(
&mut self,
router: &mut Router,
routing_table: &RoutingTable,
rank: u8,
self_destination: u8,
timer: &GlobalTimer,
) -> Result<(), Error> {
match &self.session.kernel_state {
KernelState::MsgAwait { max_time, id, tags } => {
if let Some(max_time) = *max_time {
if timer.get_time() > max_time {
self.control.tx.send(kernel::Message::SubkernelMsgRecvReply {
status: kernel::SubkernelStatus::Timeout,
count: 0,
});
self.control
.tx
.send(kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout));
self.session.kernel_state = KernelState::Running;
return Ok(());
}
}
if let Some(message) = self.session.messages.get_incoming(*id) {
self.control.tx.send(kernel::Message::SubkernelMsgRecvReply {
status: kernel::SubkernelStatus::NoError,
count: message.count,
});
self.control
.tx
.send(kernel::Message::SubkernelMsgRecvReply { count: message.count });
let tags = tags.clone();
self.session.kernel_state = KernelState::Running;
self.pass_message_to_kernel(&message, tags, timer)
} else {
let id = *id;
self.check_finished_kernels(id, router, routing_table, rank, self_destination);
Err(Error::AwaitingMessage)
}
}
@ -817,27 +894,18 @@ impl<'a> Manager<'_> {
KernelState::SubkernelAwaitFinish { max_time, id } => {
if let Some(max_time) = *max_time {
if timer.get_time() > max_time {
self.control.tx.send(kernel::Message::SubkernelAwaitFinishReply {
status: kernel::SubkernelStatus::Timeout,
});
self.control
.tx
.send(kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout));
self.session.kernel_state = KernelState::Running;
return Ok(());
}
}
let mut i = 0;
for status in &self.session.subkernels_finished {
if *status == *id {
self.control.tx.send(kernel::Message::SubkernelAwaitFinishReply {
status: kernel::SubkernelStatus::NoError,
});
self.session.kernel_state = KernelState::Running;
self.session.subkernels_finished.swap_remove(i);
break;
}
i += 1;
}
let id = *id;
self.check_finished_kernels(id, router, routing_table, rank, self_destination);
Ok(())
}
KernelState::SubkernelRetrievingException { .. } => Err(Error::AwaitingMessage),
KernelState::DmaAwait { max_time } | KernelState::DmaPendingAwait { max_time, .. } => {
if timer.get_time() > *max_time {
self.control.tx.send(kernel::Message::DmaAwaitRemoteReply {