From 63f4783687b85902e6c6181321df7dc067778f47 Mon Sep 17 00:00:00 2001 From: mwojcik Date: Thu, 4 Jul 2024 16:57:49 +0800 Subject: [PATCH] subkernels: support exceptions from subkernels --- src/libboard_artiq/src/drtioaux_proto.rs | 22 +++- src/libksupport/src/kernel/mod.rs | 9 +- src/libksupport/src/kernel/subkernel.rs | 68 ++++------- src/runtime/src/comms.rs | 49 ++++---- src/runtime/src/rtio_mgt.rs | 15 ++- src/satman/src/main.rs | 38 +++++- src/satman/src/routing.rs | 3 +- src/satman/src/subkernel.rs | 146 +++++++++++++++++------ 8 files changed, 217 insertions(+), 133 deletions(-) diff --git a/src/libboard_artiq/src/drtioaux_proto.rs b/src/libboard_artiq/src/drtioaux_proto.rs index cc649dda..62322603 100644 --- a/src/libboard_artiq/src/drtioaux_proto.rs +++ b/src/libboard_artiq/src/drtioaux_proto.rs @@ -267,12 +267,14 @@ pub enum Packet { exception_src: u8, }, SubkernelExceptionRequest { + source: u8, destination: u8, }, SubkernelException { + destination: u8, last: bool, length: u16, - data: [u8; SAT_PAYLOAD_MAX_SIZE], + data: [u8; MASTER_PAYLOAD_MAX_SIZE], }, SubkernelMessage { source: u8, @@ -524,14 +526,17 @@ impl Packet { exception_src: reader.read_u8()?, }, 0xc9 => Packet::SubkernelExceptionRequest { + source: reader.read_u8()?, destination: reader.read_u8()?, }, 0xca => { + let destination = reader.read_u8()?; let last = reader.read_bool()?; let length = reader.read_u16()?; - let mut data: [u8; SAT_PAYLOAD_MAX_SIZE] = [0; SAT_PAYLOAD_MAX_SIZE]; + let mut data: [u8; MASTER_PAYLOAD_MAX_SIZE] = [0; MASTER_PAYLOAD_MAX_SIZE]; reader.read_exact(&mut data[0..length as usize])?; Packet::SubkernelException { + destination: destination, last: last, length: length, data: data, @@ -896,12 +901,19 @@ impl Packet { writer.write_bool(with_exception)?; writer.write_u8(exception_src)?; } - Packet::SubkernelExceptionRequest { destination } => { + Packet::SubkernelExceptionRequest { source, destination } => { writer.write_u8(0xc9)?; + writer.write_u8(source)?; writer.write_u8(destination)?; } - Packet::SubkernelException { last, length, data } => { + Packet::SubkernelException { + destination, + last, + length, + data, + } => { writer.write_u8(0xca)?; + writer.write_u8(destination)?; writer.write_bool(last)?; writer.write_u16(length)?; writer.write_all(&data[0..length as usize])?; @@ -943,6 +955,8 @@ impl Packet { Packet::SubkernelLoadRunReply { destination, .. } => Some(*destination), Packet::SubkernelMessage { destination, .. } => Some(*destination), Packet::SubkernelMessageAck { destination } => Some(*destination), + Packet::SubkernelExceptionRequest { destination, .. } => Some(*destination), + Packet::SubkernelException { destination, .. } => Some(*destination), Packet::DmaPlaybackStatus { destination, .. } => Some(*destination), Packet::SubkernelFinished { destination, .. } => Some(*destination), _ => None, diff --git a/src/libksupport/src/kernel/mod.rs b/src/libksupport/src/kernel/mod.rs index 3f90e95b..8a8d48d9 100644 --- a/src/libksupport/src/kernel/mod.rs +++ b/src/libksupport/src/kernel/mod.rs @@ -23,8 +23,8 @@ pub enum SubkernelStatus { Timeout, IncorrectState, CommLost, - OtherError, Exception(Vec), + OtherError, } #[derive(Debug, Clone)] @@ -91,9 +91,7 @@ pub enum Message { timeout: i64, }, #[cfg(has_drtio)] - SubkernelAwaitFinishReply { - status: SubkernelStatus, - }, + SubkernelAwaitFinishReply, #[cfg(has_drtio)] SubkernelMsgSend { id: u32, @@ -110,9 +108,10 @@ pub enum Message { }, #[cfg(has_drtio)] SubkernelMsgRecvReply { - status: SubkernelStatus, count: u8, }, + #[cfg(has_drtio)] + SubkernelError(SubkernelStatus), } static CHANNEL_0TO1: Mutex>> = Mutex::new(None); diff --git a/src/libksupport/src/kernel/subkernel.rs b/src/libksupport/src/kernel/subkernel.rs index 8ab21660..66adb46c 100644 --- a/src/libksupport/src/kernel/subkernel.rs +++ b/src/libksupport/src/kernel/subkernel.rs @@ -36,27 +36,18 @@ pub extern "C" fn await_finish(id: u32, timeout: i64) { }); } match unsafe { KERNEL_CHANNEL_0TO1.as_mut().unwrap() }.recv() { - Message::SubkernelAwaitFinishReply { - status: SubkernelStatus::NoError, - } => (), - Message::SubkernelAwaitFinishReply { - status: SubkernelStatus::IncorrectState, - } => artiq_raise!("SubkernelError", "Subkernel not running"), - Message::SubkernelAwaitFinishReply { - status: SubkernelStatus::Timeout, - } => artiq_raise!("SubkernelError", "Subkernel timed out"), - Message::SubkernelAwaitFinishReply { - status: SubkernelStatus::CommLost, - } => artiq_raise!("SubkernelError", "Lost communication with satellite"), - Message::SubkernelAwaitFinishReply { - status: SubkernelStatus::OtherError, - } => artiq_raise!("SubkernelError", "An error occurred during subkernel operation"), - Message::SubkernelAwaitFinishReply { - status: SubkernelStatus::Exception(raw_exception), - } => { - // reconstruct the exception here and raise it - eh_artiq::raise_raw(&raw_exception) + Message::SubkernelAwaitFinishReply => (), + Message::SubkernelError(SubkernelStatus::IncorrectState) => { + artiq_raise!("SubkernelError", "Subkernel not running") } + Message::SubkernelError(SubkernelStatus::Timeout) => artiq_raise!("SubkernelError", "Subkernel timed out"), + Message::SubkernelError(SubkernelStatus::CommLost) => { + artiq_raise!("SubkernelError", "Lost communication with satellite") + } + Message::SubkernelError(SubkernelStatus::OtherError) => { + artiq_raise!("SubkernelError", "An error occurred during subkernel operation") + } + Message::SubkernelError(SubkernelStatus::Exception(raw_exception)) => eh_artiq::raise_raw(&raw_exception), _ => panic!("expected SubkernelAwaitFinishReply after SubkernelAwaitFinishRequest"), } } @@ -98,37 +89,22 @@ pub extern "C" fn await_message(id: i32, timeout: i64, tags: &CSlice, min: u }); } match unsafe { KERNEL_CHANNEL_0TO1.as_mut().unwrap() }.recv() { - Message::SubkernelMsgRecvReply { - status: SubkernelStatus::NoError, - count, - } => { + Message::SubkernelMsgRecvReply { count } => { if min > count || count > max { artiq_raise!("SubkernelError", "Received more or less arguments than required") } } - Message::SubkernelMsgRecvReply { - status: SubkernelStatus::IncorrectState, - .. - } => artiq_raise!("SubkernelError", "Subkernel not running"), - Message::SubkernelMsgRecvReply { - status: SubkernelStatus::Timeout, - .. - } => artiq_raise!("SubkernelError", "Subkernel timed out"), - Message::SubkernelMsgRecvReply { - status: SubkernelStatus::CommLost, - .. - } => artiq_raise!("SubkernelError", "Lost communication with satellite"), - Message::SubkernelMsgRecvReply { - status: SubkernelStatus::OtherError, - .. - } => artiq_raise!("SubkernelError", "An error occurred during subkernel operation"), - Message::SubkernelMsgRecvReply { - status: SubkernelStatus::Exception(raw_exception), - .. - } => { - // reconstruct the raw exception here - eh_artiq::raise_raw(&raw_exception); + Message::SubkernelError(SubkernelStatus::IncorrectState) => { + artiq_raise!("SubkernelError", "Subkernel not running") } + Message::SubkernelError(SubkernelStatus::Timeout) => artiq_raise!("SubkernelError", "Subkernel timed out"), + Message::SubkernelError(SubkernelStatus::CommLost) => { + artiq_raise!("SubkernelError", "Lost communication with satellite") + } + Message::SubkernelError(SubkernelStatus::OtherError) => { + artiq_raise!("SubkernelError", "An error occurred during subkernel operation") + } + Message::SubkernelError(SubkernelStatus::Exception(raw_exception)) => eh_artiq::raise_raw(&raw_exception), _ => panic!("expected SubkernelMsgRecvReply after SubkernelMsgRecvRequest"), } // RpcRecvRequest should be called after this to receive message data diff --git a/src/runtime/src/comms.rs b/src/runtime/src/comms.rs index 1453e3a2..e6abfb42 100644 --- a/src/runtime/src/comms.rs +++ b/src/runtime/src/comms.rs @@ -422,25 +422,23 @@ async fn handle_run_kernel( #[cfg(has_drtio)] kernel::Message::SubkernelAwaitFinishRequest { id, timeout } => { let res = subkernel::await_finish(aux_mutex, routing_table, timer, id, timeout).await; - let status = match res { + let response = match res { Ok(res) => { if res.status == subkernel::FinishStatus::CommLost { - kernel::SubkernelStatus::CommLost + kernel::Message::SubkernelError(kernel::SubkernelStatus::CommLost) } else if let Some(exception) = res.exception { - kernel::SubkernelStatus::Exception(exception) + kernel::Message::SubkernelError(kernel::SubkernelStatus::Exception(exception)) } else { - kernel::SubkernelStatus::NoError + kernel::Message::SubkernelAwaitFinishReply } } - Err(SubkernelError::Timeout) => kernel::SubkernelStatus::Timeout, - Err(SubkernelError::IncorrectState) => kernel::SubkernelStatus::IncorrectState, - Err(_) => kernel::SubkernelStatus::OtherError, + Err(SubkernelError::Timeout) => kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout), + Err(SubkernelError::IncorrectState) => { + kernel::Message::SubkernelError(kernel::SubkernelStatus::IncorrectState) + } + Err(_) => kernel::Message::SubkernelError(kernel::SubkernelStatus::OtherError), }; - control - .borrow_mut() - .tx - .async_send(kernel::Message::SubkernelAwaitFinishReply { status: status }) - .await; + control.borrow_mut().tx.async_send(response).await; } #[cfg(has_drtio)] kernel::Message::SubkernelMsgSend { id, destination, data } => { @@ -461,28 +459,23 @@ async fn handle_run_kernel( #[cfg(has_drtio)] kernel::Message::SubkernelMsgRecvRequest { id, timeout, tags } => { let message_received = subkernel::message_await(id as u32, timeout, timer).await; - let (status, count) = match message_received { - Ok(ref message) => (kernel::SubkernelStatus::NoError, message.count), - Err(SubkernelError::Timeout) => (kernel::SubkernelStatus::Timeout, 0), - Err(SubkernelError::IncorrectState) => (kernel::SubkernelStatus::IncorrectState, 0), - Err(SubkernelError::CommLost) => (kernel::SubkernelStatus::CommLost, 0), + let response = match message_received { + Ok(ref message) => kernel::Message::SubkernelMsgRecvReply { count: message.count }, + Err(SubkernelError::Timeout) => kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout), + Err(SubkernelError::IncorrectState) => { + kernel::Message::SubkernelError(kernel::SubkernelStatus::IncorrectState) + } + Err(SubkernelError::CommLost) => kernel::Message::SubkernelError(kernel::SubkernelStatus::CommLost), Err(SubkernelError::SubkernelException) => { // just retrieve the exception let status = subkernel::await_finish(aux_mutex, routing_table, timer, id as u32, timeout) .await .unwrap(); - (kernel::SubkernelStatus::Exception(status.exception.unwrap()), 0) + kernel::Message::SubkernelError(kernel::SubkernelStatus::Exception(status.exception.unwrap())) } - Err(_) => (kernel::SubkernelStatus::OtherError, 0), + Err(_) => kernel::Message::SubkernelError(kernel::SubkernelStatus::OtherError), }; - control - .borrow_mut() - .tx - .async_send(kernel::Message::SubkernelMsgRecvReply { - status: status, - count: count, - }) - .await; + control.borrow_mut().tx.async_send(response).await; if let Ok(message) = message_received { // receive code almost identical to RPC recv, except we are not reading from a stream let mut reader = Cursor::new(message.data); @@ -514,7 +507,7 @@ async fn handle_run_kernel( .async_send(kernel::Message::RpcRecvReply(Ok(0))) .await; i += 1; - if i < count { + if i < message.count { current_tags = remaining_tags; } else { break; diff --git a/src/runtime/src/rtio_mgt.rs b/src/runtime/src/rtio_mgt.rs index 8644e906..53a3b8bf 100644 --- a/src/runtime/src/rtio_mgt.rs +++ b/src/runtime/src/rtio_mgt.rs @@ -129,6 +129,8 @@ pub mod drtio { | Packet::SubkernelLoadRunReply { destination, .. } | Packet::SubkernelMessage { destination, .. } | Packet::SubkernelMessageAck { destination, .. } + | Packet::SubkernelException { destination, .. } + | Packet::SubkernelExceptionRequest { destination, .. } | Packet::DmaPlaybackStatus { destination, .. } | Packet::SubkernelFinished { destination, .. } => { if destination == 0 { @@ -183,10 +185,7 @@ pub mod drtio { async fn drain_buffer(linkno: u8, draining_time: Milliseconds, timer: GlobalTimer) { let max_time = timer.get_time() + draining_time; - loop { - if timer.get_time() > max_time { - return; - } + while timer.get_time() < max_time { let _ = drtioaux_async::recv(linkno).await; } } @@ -835,13 +834,19 @@ pub mod drtio { linkno, routing_table, &Packet::SubkernelExceptionRequest { + source: 0, destination: destination, }, timer, ) .await?; match reply { - Packet::SubkernelException { last, length, data } => { + Packet::SubkernelException { + destination: 0, + last, + length, + data, + } => { remote_data.extend(&data[0..length as usize]); if last { return Ok(remote_data); diff --git a/src/satman/src/main.rs b/src/satman/src/main.rs index 422b260b..aebd9d65 100644 --- a/src/satman/src/main.rs +++ b/src/satman/src/main.rs @@ -895,6 +895,7 @@ fn process_aux_packet( Ok(()) } drtioaux::Packet::SubkernelExceptionRequest { + source, destination: _destination, } => { forward!( @@ -907,17 +908,46 @@ fn process_aux_packet( &packet, timer ); - let mut data_slice: [u8; SAT_PAYLOAD_MAX_SIZE] = [0; SAT_PAYLOAD_MAX_SIZE]; + let mut data_slice: [u8; MASTER_PAYLOAD_MAX_SIZE] = [0; MASTER_PAYLOAD_MAX_SIZE]; let meta = kernel_manager.exception_get_slice(&mut data_slice); - drtioaux::send( - 0, - &drtioaux::Packet::SubkernelException { + router.send( + drtioaux::Packet::SubkernelException { + destination: source, last: meta.status.is_last(), length: meta.len, data: data_slice, }, + _routing_table, + *rank, + *self_destination, ) } + drtioaux::Packet::SubkernelException { + destination: _destination, + last, + length, + data, + } => { + forward!( + router, + _routing_table, + _destination, + *rank, + *self_destination, + _repeaters, + &packet, + timer + ); + kernel_manager.received_exception( + &data[..length as usize], + last, + router, + _routing_table, + *rank, + *self_destination, + ); + Ok(()) + } drtioaux::Packet::SubkernelMessage { source, destination: _destination, diff --git a/src/satman/src/routing.rs b/src/satman/src/routing.rs index 32764446..87d5f092 100644 --- a/src/satman/src/routing.rs +++ b/src/satman/src/routing.rs @@ -4,7 +4,7 @@ use core::cmp::min; #[cfg(has_drtio_routing)] use libboard_artiq::pl::csr; use libboard_artiq::{drtio_routing, drtioaux, - drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE, SAT_PAYLOAD_MAX_SIZE}}; + drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE}}; pub struct SliceMeta { pub destination: u8, @@ -57,7 +57,6 @@ impl Sliceable { self.data.extend(data); } - get_slice_fn!(get_slice_sat, SAT_PAYLOAD_MAX_SIZE); get_slice_fn!(get_slice_master, MASTER_PAYLOAD_MAX_SIZE); } diff --git a/src/satman/src/subkernel.rs b/src/satman/src/subkernel.rs index a7b56349..83d0c38f 100644 --- a/src/satman/src/subkernel.rs +++ b/src/satman/src/subkernel.rs @@ -11,7 +11,7 @@ use io::{Cursor, ProtoWrite}; use ksupport::{eh_artiq, kernel, rpc}; use libboard_artiq::{drtio_routing::RoutingTable, drtioaux, - drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE, SAT_PAYLOAD_MAX_SIZE}, + drtioaux_proto::{PayloadStatus, MASTER_PAYLOAD_MAX_SIZE}, pl::csr}; use libboard_zynq::{time::Milliseconds, timer::GlobalTimer}; use libcortex_a9::sync_channel::Receiver; @@ -47,6 +47,9 @@ enum KernelState { DmaAwait { max_time: Milliseconds, }, + SubkernelRetrievingException { + destination: u8, + }, } #[derive(Debug)] @@ -123,10 +126,11 @@ struct MessageManager { struct Session { id: u32, kernel_state: KernelState, - last_exception: Option, + last_exception: Option, // exceptions raised locally + external_exception: Option>, // exceptions from sub-subkernels messages: MessageManager, source: u8, // which destination requested running the kernel - subkernels_finished: Vec, + subkernels_finished: Vec<(u32, Option)>, } impl Session { @@ -135,6 +139,7 @@ impl Session { id: id, kernel_state: KernelState::Absent, last_exception: None, + external_exception: None, messages: MessageManager::new(), source: 0, subkernels_finished: Vec::new(), @@ -410,9 +415,9 @@ impl<'a> Manager<'_> { } } - pub fn exception_get_slice(&mut self, data_slice: &mut [u8; SAT_PAYLOAD_MAX_SIZE]) -> SliceMeta { + pub fn exception_get_slice(&mut self, data_slice: &mut [u8; MASTER_PAYLOAD_MAX_SIZE]) -> SliceMeta { match self.session.last_exception.as_mut() { - Some(exception) => exception.get_slice_sat(data_slice), + Some(exception) => exception.get_slice_master(data_slice), None => SliceMeta { destination: 0, len: 0, @@ -540,7 +545,7 @@ impl<'a> Manager<'_> { return; } - match self.process_external_messages(timer) { + match self.process_external_messages(router, routing_table, rank, destination, timer) { Ok(()) => (), Err(Error::AwaitingMessage) => return, // kernel still waiting, do not process kernel messages Err(Error::KernelException(exception)) => { @@ -596,6 +601,41 @@ impl<'a> Manager<'_> { } } + fn check_finished_kernels( + &mut self, + id: u32, + router: &mut Router, + routing_table: &RoutingTable, + rank: u8, + self_destination: u8, + ) { + for (i, (status, exception_source)) in self.session.subkernels_finished.iter().enumerate() { + if *status == id { + if exception_source.is_none() { + self.control.tx.send(kernel::Message::SubkernelAwaitFinishReply); + self.session.kernel_state = KernelState::Running; + self.session.subkernels_finished.swap_remove(i); + } else { + let destination = exception_source.unwrap(); + self.session.external_exception = Some(Vec::new()); + self.session.kernel_state = KernelState::SubkernelRetrievingException { + destination: destination, + }; + router.route( + drtioaux::Packet::SubkernelExceptionRequest { + source: self_destination, + destination: destination, + }, + &routing_table, + rank, + self_destination, + ); + } + break; + } + } + } + pub fn subkernel_load_run_reply(&mut self, succeeded: bool) { if self.session.kernel_state == KernelState::SubkernelAwaitLoad { self.control @@ -608,16 +648,46 @@ impl<'a> Manager<'_> { } pub fn remote_subkernel_finished(&mut self, id: u32, with_exception: bool, exception_source: u8) { - if with_exception { - self.kernel_stop(); - self.last_finished = Some(SubkernelFinished { - source: self.session.source, - id: self.session.id, - with_exception: true, - exception_source: exception_source, - }) + let exception_src = if with_exception { Some(exception_source) } else { None }; + self.session.subkernels_finished.push((id, exception_src)); + } + + pub fn received_exception( + &mut self, + exception_data: &[u8], + last: bool, + router: &mut Router, + routing_table: &RoutingTable, + rank: u8, + self_destination: u8, + ) { + if let KernelState::SubkernelRetrievingException { destination } = self.session.kernel_state { + self.session + .external_exception + .as_mut() + .unwrap() + .extend_from_slice(exception_data); + if last { + self.control + .tx + .send(kernel::Message::SubkernelError(kernel::SubkernelStatus::Exception( + self.session.external_exception.take().unwrap(), + ))); + self.session.kernel_state = KernelState::Running; + } else { + /* fetch another slice */ + router.route( + drtioaux::Packet::SubkernelExceptionRequest { + source: self_destination, + destination: destination, + }, + routing_table, + rank, + self_destination, + ); + } } else { - self.session.subkernels_finished.push(id); + warn!("Received unsolicited exception data"); } } @@ -780,28 +850,35 @@ impl<'a> Manager<'_> { Ok(false) } - fn process_external_messages(&mut self, timer: &GlobalTimer) -> Result<(), Error> { + fn process_external_messages( + &mut self, + router: &mut Router, + routing_table: &RoutingTable, + rank: u8, + self_destination: u8, + timer: &GlobalTimer, + ) -> Result<(), Error> { match &self.session.kernel_state { KernelState::MsgAwait { max_time, id, tags } => { if let Some(max_time) = *max_time { if timer.get_time() > max_time { - self.control.tx.send(kernel::Message::SubkernelMsgRecvReply { - status: kernel::SubkernelStatus::Timeout, - count: 0, - }); + self.control + .tx + .send(kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout)); self.session.kernel_state = KernelState::Running; return Ok(()); } } if let Some(message) = self.session.messages.get_incoming(*id) { - self.control.tx.send(kernel::Message::SubkernelMsgRecvReply { - status: kernel::SubkernelStatus::NoError, - count: message.count, - }); + self.control + .tx + .send(kernel::Message::SubkernelMsgRecvReply { count: message.count }); let tags = tags.clone(); self.session.kernel_state = KernelState::Running; self.pass_message_to_kernel(&message, tags, timer) } else { + let id = *id; + self.check_finished_kernels(id, router, routing_table, rank, self_destination); Err(Error::AwaitingMessage) } } @@ -817,27 +894,18 @@ impl<'a> Manager<'_> { KernelState::SubkernelAwaitFinish { max_time, id } => { if let Some(max_time) = *max_time { if timer.get_time() > max_time { - self.control.tx.send(kernel::Message::SubkernelAwaitFinishReply { - status: kernel::SubkernelStatus::Timeout, - }); + self.control + .tx + .send(kernel::Message::SubkernelError(kernel::SubkernelStatus::Timeout)); self.session.kernel_state = KernelState::Running; return Ok(()); } } - let mut i = 0; - for status in &self.session.subkernels_finished { - if *status == *id { - self.control.tx.send(kernel::Message::SubkernelAwaitFinishReply { - status: kernel::SubkernelStatus::NoError, - }); - self.session.kernel_state = KernelState::Running; - self.session.subkernels_finished.swap_remove(i); - break; - } - i += 1; - } + let id = *id; + self.check_finished_kernels(id, router, routing_table, rank, self_destination); Ok(()) } + KernelState::SubkernelRetrievingException { .. } => Err(Error::AwaitingMessage), KernelState::DmaAwait { max_time } | KernelState::DmaPendingAwait { max_time, .. } => { if timer.get_time() > *max_time { self.control.tx.send(kernel::Message::DmaAwaitRemoteReply {