figure out CSlice/&CSlice inconsistency #76

Closed
opened 2020-07-25 17:08:55 +08:00 by sb10q · 11 comments
Owner

DMA and cache functions use CSlice:

pub extern fn dma_record_start(name: CSlice<u8>)
pub extern fn get(key: CSlice<u8>) -> CSlice<'static, i32>

But the RPC functions need &CSlice (only on Zynq, and not in the original runtime):

pub extern fn rpc_send(service: u32, tag: &CSlice<u8>, data: *const *const ())

Why?

DMA and cache functions use ``CSlice``: ```rust pub extern fn dma_record_start(name: CSlice<u8>) pub extern fn get(key: CSlice<u8>) -> CSlice<'static, i32> ``` But the RPC functions need ``&CSlice`` (only on Zynq, and not in the original runtime): ```rust pub extern fn rpc_send(service: u32, tag: &CSlice<u8>, data: *const *const ()) ``` Why?
Contributor

I don't see why it would be passed by value on OR1k:

llsliceptr = ll.LiteralStructType([llptr, lli32]).as_pointer()
        elif name == "rpc_send":
            llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr])
        elif name == "rpc_send_async":
            llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr])

I don't see why it would be passed by value on OR1k: ```python llsliceptr = ll.LiteralStructType([llptr, lli32]).as_pointer() ``` ```python elif name == "rpc_send": llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr]) elif name == "rpc_send_async": llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr]) ```
Collaborator

Not sure whether that is what is going on here, but depending on the ABI, structs are implicitly passed on the stack, which will be compatible with references on some architectures.

Not sure whether that is what is going on here, but depending on the ABI, structs are implicitly passed on the stack, which will be compatible with references on some architectures.
Author
Owner

Should the compiler be changed so that RPC consistently uses CSlice everywhere then?

Should the compiler be changed so that RPC consistently uses ``CSlice`` everywhere then?
Author
Owner

Those patches work for Zynq but break OR1K:

diff --git a/artiq/compiler/transforms/llvm_ir_generator.py b/artiq/compiler/transforms/llvm_ir_generator.py
index 84dcac2d..2863ee3a 100644
--- a/artiq/compiler/transforms/llvm_ir_generator.py
+++ b/artiq/compiler/transforms/llvm_ir_generator.py
@@ -22,7 +22,6 @@ lldouble   = ll.DoubleType()
 llptr      = ll.IntType(8).as_pointer()
 llptrptr   = ll.IntType(8).as_pointer().as_pointer()
 llslice    = ll.LiteralStructType([llptr, lli32])
-llsliceptr = ll.LiteralStructType([llptr, lli32]).as_pointer()
 llmetadata = ll.MetaDataType()
 
 
@@ -375,9 +374,9 @@ class LLVMIRGenerator:
         elif name == "memcmp":
             llty = ll.FunctionType(lli32, [llptr, llptr, lli32])
         elif name == "rpc_send":
-            llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr])
+            llty = ll.FunctionType(llvoid, [lli32, llslice, llptrptr])
         elif name == "rpc_send_async":
-            llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr])
+            llty = ll.FunctionType(llvoid, [lli32, llslice, llptrptr])
         elif name == "rpc_recv":
             llty = ll.FunctionType(lli32, [llptr])
 
@@ -1366,9 +1365,6 @@ class LLVMIRGenerator:
         tag += self._rpc_tag(fun_type.ret, ret_error_handler)
 
         lltag = self.llconst_of_const(ir.Constant(tag, builtins.TStr()))
-        lltagptr = self.llbuilder.alloca(lltag.type)
-        self.llbuilder.store(lltag, lltagptr)
-
         llstackptr = self.llbuilder.call(self.llbuiltin("llvm.stacksave"), [],
                                          name="rpc.stack")
 
@@ -1390,10 +1386,10 @@ class LLVMIRGenerator:
 
         if fun_type.is_async:
             self.llbuilder.call(self.llbuiltin("rpc_send_async"),
-                                [llservice, lltagptr, llargs])
+                                [llservice, lltag, llargs])
         else:
             self.llbuilder.call(self.llbuiltin("rpc_send"),
-                                [llservice, lltagptr, llargs])
+                                [llservice, lltag, llargs])
 
         # Don't waste stack space on saved arguments.
         self.llbuilder.call(self.llbuiltin("llvm.stackrestore"), [llstackptr])
diff --git a/src/runtime/src/kernel/core1.rs b/src/runtime/src/kernel/core1.rs
index 50e8194..5cee05e 100644
--- a/src/runtime/src/kernel/core1.rs
+++ b/src/runtime/src/kernel/core1.rs
@@ -62,7 +62,7 @@ unsafe fn attribute_writeback(typeinfo: *const ()) {
                 attributes = attributes.offset(1);
 
                 if (*attribute).tag.len() > 0 {
-                    rpc_send_async(0, &(*attribute).tag, [
+                    rpc_send_async(0, (*attribute).tag, [
                         &object as *const _ as *const (),
                         &(*attribute).name as *const _ as *const (),
                         (object as usize + (*attribute).offset) as *const ()
diff --git a/src/runtime/src/kernel/rpc.rs b/src/runtime/src/kernel/rpc.rs
index e41c51e..141179d 100644
--- a/src/runtime/src/kernel/rpc.rs
+++ b/src/runtime/src/kernel/rpc.rs
@@ -10,18 +10,18 @@ use super::{
     Message,
 };
 
-fn rpc_send_common(is_async: bool, service: u32, tag: &CSlice<u8>, data: *const *const ()) {
+fn rpc_send_common(is_async: bool, service: u32, tag: CSlice<u8>, data: *const *const ()) {
     let mut core1_tx = KERNEL_CHANNEL_1TO0.lock();
     let mut buffer = Vec::<u8>::new();
     send_args(&mut buffer, service, tag.as_ref(), data).expect("RPC encoding failed");
     core1_tx.as_mut().unwrap().send(Message::RpcSend { is_async, data: Arc::new(buffer) });
 }
 
-pub extern fn rpc_send(service: u32, tag: &CSlice<u8>, data: *const *const ()) {
+pub extern fn rpc_send(service: u32, tag: CSlice<u8>, data: *const *const ()) {
     rpc_send_common(false, service, tag, data);
 }
 
-pub extern fn rpc_send_async(service: u32, tag: &CSlice<u8>, data: *const *const ()) {
+pub extern fn rpc_send_async(service: u32, tag: CSlice<u8>, data: *const *const ()) {
     rpc_send_common(true, service, tag, data);
 }
 
Those patches work for Zynq but break OR1K: ``` diff --git a/artiq/compiler/transforms/llvm_ir_generator.py b/artiq/compiler/transforms/llvm_ir_generator.py index 84dcac2d..2863ee3a 100644 --- a/artiq/compiler/transforms/llvm_ir_generator.py +++ b/artiq/compiler/transforms/llvm_ir_generator.py @@ -22,7 +22,6 @@ lldouble = ll.DoubleType() llptr = ll.IntType(8).as_pointer() llptrptr = ll.IntType(8).as_pointer().as_pointer() llslice = ll.LiteralStructType([llptr, lli32]) -llsliceptr = ll.LiteralStructType([llptr, lli32]).as_pointer() llmetadata = ll.MetaDataType() @@ -375,9 +374,9 @@ class LLVMIRGenerator: elif name == "memcmp": llty = ll.FunctionType(lli32, [llptr, llptr, lli32]) elif name == "rpc_send": - llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr]) + llty = ll.FunctionType(llvoid, [lli32, llslice, llptrptr]) elif name == "rpc_send_async": - llty = ll.FunctionType(llvoid, [lli32, llsliceptr, llptrptr]) + llty = ll.FunctionType(llvoid, [lli32, llslice, llptrptr]) elif name == "rpc_recv": llty = ll.FunctionType(lli32, [llptr]) @@ -1366,9 +1365,6 @@ class LLVMIRGenerator: tag += self._rpc_tag(fun_type.ret, ret_error_handler) lltag = self.llconst_of_const(ir.Constant(tag, builtins.TStr())) - lltagptr = self.llbuilder.alloca(lltag.type) - self.llbuilder.store(lltag, lltagptr) - llstackptr = self.llbuilder.call(self.llbuiltin("llvm.stacksave"), [], name="rpc.stack") @@ -1390,10 +1386,10 @@ class LLVMIRGenerator: if fun_type.is_async: self.llbuilder.call(self.llbuiltin("rpc_send_async"), - [llservice, lltagptr, llargs]) + [llservice, lltag, llargs]) else: self.llbuilder.call(self.llbuiltin("rpc_send"), - [llservice, lltagptr, llargs]) + [llservice, lltag, llargs]) # Don't waste stack space on saved arguments. self.llbuilder.call(self.llbuiltin("llvm.stackrestore"), [llstackptr]) ``` ``` diff --git a/src/runtime/src/kernel/core1.rs b/src/runtime/src/kernel/core1.rs index 50e8194..5cee05e 100644 --- a/src/runtime/src/kernel/core1.rs +++ b/src/runtime/src/kernel/core1.rs @@ -62,7 +62,7 @@ unsafe fn attribute_writeback(typeinfo: *const ()) { attributes = attributes.offset(1); if (*attribute).tag.len() > 0 { - rpc_send_async(0, &(*attribute).tag, [ + rpc_send_async(0, (*attribute).tag, [ &object as *const _ as *const (), &(*attribute).name as *const _ as *const (), (object as usize + (*attribute).offset) as *const () diff --git a/src/runtime/src/kernel/rpc.rs b/src/runtime/src/kernel/rpc.rs index e41c51e..141179d 100644 --- a/src/runtime/src/kernel/rpc.rs +++ b/src/runtime/src/kernel/rpc.rs @@ -10,18 +10,18 @@ use super::{ Message, }; -fn rpc_send_common(is_async: bool, service: u32, tag: &CSlice<u8>, data: *const *const ()) { +fn rpc_send_common(is_async: bool, service: u32, tag: CSlice<u8>, data: *const *const ()) { let mut core1_tx = KERNEL_CHANNEL_1TO0.lock(); let mut buffer = Vec::<u8>::new(); send_args(&mut buffer, service, tag.as_ref(), data).expect("RPC encoding failed"); core1_tx.as_mut().unwrap().send(Message::RpcSend { is_async, data: Arc::new(buffer) }); } -pub extern fn rpc_send(service: u32, tag: &CSlice<u8>, data: *const *const ()) { +pub extern fn rpc_send(service: u32, tag: CSlice<u8>, data: *const *const ()) { rpc_send_common(false, service, tag, data); } -pub extern fn rpc_send_async(service: u32, tag: &CSlice<u8>, data: *const *const ()) { +pub extern fn rpc_send_async(service: u32, tag: CSlice<u8>, data: *const *const ()) { rpc_send_common(true, service, tag, data); } ```
Author
Owner

This crashes on OR1K with the original compiler:

diff --git a/artiq/firmware/ksupport/lib.rs b/artiq/firmware/ksupport/lib.rs
index c6e747fd..2fa82297 100644
--- a/artiq/firmware/ksupport/lib.rs
+++ b/artiq/firmware/ksupport/lib.rs
@@ -120,7 +120,7 @@ pub extern fn send_to_rtio_log(text: CSlice<u8>) {
 }
 
 #[unwind(aborts)]
-extern fn rpc_send(service: u32, tag: CSlice<u8>, data: *const *const ()) {
+extern fn rpc_send(service: u32, tag: &CSlice<u8>, data: *const *const ()) {
     while !rpc_queue::empty() {}
     send(&RpcSend {
         async:   false,
@@ -131,7 +131,7 @@ extern fn rpc_send(service: u32, tag: CSlice<u8>, data: *const *const ()) {
 }
 
 #[unwind(aborts)]
-extern fn rpc_send_async(service: u32, tag: CSlice<u8>, data: *const *const ()) {
+extern fn rpc_send_async(service: u32, tag: &CSlice<u8>, data: *const *const ()) {
     while rpc_queue::full() {}
     rpc_queue::enqueue(|mut slice| {
         let length = {
@@ -469,7 +469,7 @@ unsafe fn attribute_writeback(typeinfo: *const ()) {
                 attributes = attributes.offset(1);
 
                 if (*attribute).tag.len() > 0 {
-                    rpc_send_async(0, (*attribute).tag, [
+                    rpc_send_async(0, &(*attribute).tag, [
                         &object as *const _ as *const (),
                         &(*attribute).name as *const _ as *const (),
                         (object as usize + (*attribute).offset) as *const ()

So it seems that the compiler's (not Rust) behavior is different between OR1K and ARM, and is arguably incorrect on OR1K.

This crashes on OR1K with the original compiler: ``` diff --git a/artiq/firmware/ksupport/lib.rs b/artiq/firmware/ksupport/lib.rs index c6e747fd..2fa82297 100644 --- a/artiq/firmware/ksupport/lib.rs +++ b/artiq/firmware/ksupport/lib.rs @@ -120,7 +120,7 @@ pub extern fn send_to_rtio_log(text: CSlice<u8>) { } #[unwind(aborts)] -extern fn rpc_send(service: u32, tag: CSlice<u8>, data: *const *const ()) { +extern fn rpc_send(service: u32, tag: &CSlice<u8>, data: *const *const ()) { while !rpc_queue::empty() {} send(&RpcSend { async: false, @@ -131,7 +131,7 @@ extern fn rpc_send(service: u32, tag: CSlice<u8>, data: *const *const ()) { } #[unwind(aborts)] -extern fn rpc_send_async(service: u32, tag: CSlice<u8>, data: *const *const ()) { +extern fn rpc_send_async(service: u32, tag: &CSlice<u8>, data: *const *const ()) { while rpc_queue::full() {} rpc_queue::enqueue(|mut slice| { let length = { @@ -469,7 +469,7 @@ unsafe fn attribute_writeback(typeinfo: *const ()) { attributes = attributes.offset(1); if (*attribute).tag.len() > 0 { - rpc_send_async(0, (*attribute).tag, [ + rpc_send_async(0, &(*attribute).tag, [ &object as *const _ as *const (), &(*attribute).name as *const _ as *const (), (object as usize + (*attribute).offset) as *const () ``` So it seems that the compiler's (not Rust) behavior is different between OR1K and ARM, and is arguably incorrect on OR1K.
Collaborator

I think the compiler's behaviour behind the scenes is the same – it always passes structs as a pointer to stack memory (_prepare_ffi_call). It is, however, incorrect on ARM/EABI, as (IIRC) composite types are opportunistically passed in registers there.

LLVM unfortunately doesn't provide support for ABI details out of the box, instead requiring each frontend to implement the necessary conversions.

I think the compiler's behaviour behind the scenes is the same – it always passes structs as a pointer to stack memory (`_prepare_ffi_call`). It is, however, incorrect on ARM/EABI, as (IIRC) composite types are opportunistically passed in registers there. LLVM unfortunately doesn't provide support for ABI details out of the box, instead requiring each frontend to implement the necessary conversions.
Author
Owner

@dpn But why would there be a difference between rpc_send and dma_record_start/get?

@dpn But why would there be a difference between ``rpc_send`` and ``dma_record_start``/``get``?
Collaborator

why would there be a difference between rpc_send and dma_record_start/get?

The former is actually declared as a pointer to a slice in the LLVM IR generator. As such, they are always drumroll passed as a pointer to a slice. On OR1k, this apparently matched the Rust (C) ABI for CSlice. On ARM, it doesn't, because the slice members are opportunistically passed in registers.

The argument to the latter is passed by pointer, but marked with the LLVM byval attribute, the meaning of which is annoyingly backend-specific, but it is used to capture cases where the ABI dictates that memory for storing aggregate arguments that are passed by value should be allocated by the caller in a special fashion (e.g. adjacent to other parameters) and discarded after the call.

The best (imho only) way to debug this would be to settle on a clear C ABI for each function, and make sure llvm_ir_generator emits exactly the same signatures as Clang (or I suppose rustc) does for each target we support. From experience (porting LDC to various platforms and OSes), it is nearly impossible to get this right by just reading the LLVM docs.

> why would there be a difference between rpc_send and dma_record_start/get? The former is actually declared as a pointer to a slice in the LLVM IR generator. As such, they are always *drumroll* passed as a pointer to a slice. On OR1k, this apparently matched the Rust (C) ABI for `CSlice`. On ARM, it doesn't, because the slice members are opportunistically passed in registers. The argument to the latter is passed by pointer, but marked with the LLVM `byval` attribute, the meaning of which is annoyingly backend-specific, but it is used to capture cases where the ABI dictates that memory for storing aggregate arguments that are passed by value should be allocated by the caller in a special fashion (e.g. adjacent to other parameters) and discarded after the call. The best (imho only) way to debug this would be to settle on a clear C ABI for each function, and make sure `llvm_ir_generator` emits exactly the same signatures as Clang (or I suppose `rustc`) does for each target we support. From experience (porting LDC to various platforms and OSes), it is nearly impossible to get this right by just reading the LLVM docs.
Author
Owner

pca006132: "Regarding this, we now use &CSlice consistently for lists. For names, we use CSlice."

pca006132: "Regarding this, we now use &CSlice consistently for lists. For names, we use CSlice."
Author
Owner

@pca006132 your comment seems true on Zynq, but on RISC-V we have &CSlice everywhere. Why?

@pca006132 your comment seems true on Zynq, but on RISC-V we have ``&CSlice`` everywhere. Why?
Contributor

Some platform ABI will require passing structs by register under certain condition (size of the composite type, available registers, etc.). This is done by the frontend and requires platform specific implementation. The way we are doing this is by trial and error to see if the output matches. A simpler way would be to use pointers for all FFI calls.

Some platform ABI will require passing structs by register under certain condition (size of the composite type, available registers, etc.). This is done by the frontend and requires platform specific implementation. The way we are doing this is by trial and error to see if the output matches. A simpler way would be to use pointers for all FFI calls.
sb10q closed this issue 2022-04-13 09:19:00 +08:00
Sign in to join this conversation.
No Milestone
No Assignees
4 Participants
Notifications
Due Date
The due date is invalid or out of range. Please use the format 'yyyy-mm-dd'.

No due date set.

Dependencies

No dependencies set.

Reference: M-Labs/artiq-zynq#76
No description provided.