Fix RPC of ndarrays from host to device #527

Merged
sb10q merged 2 commits from fix/ndarray-rpc into master 2024-08-29 16:26:58 +08:00
3 changed files with 333 additions and 92 deletions

62
Cargo.lock generated
View File

@ -117,9 +117,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.1.13" version = "1.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
dependencies = [ dependencies = [
"shlex", "shlex",
] ]
@ -161,7 +161,7 @@ dependencies = [
"heck 0.5.0", "heck 0.5.0",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
@ -310,9 +310,9 @@ dependencies = [
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.1.0" version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6"
[[package]] [[package]]
name = "fixedbitset" name = "fixedbitset"
@ -424,7 +424,7 @@ checksum = "4fa4d8d74483041a882adaa9a29f633253a66dde85055f0495c121620ac484b2"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
@ -510,9 +510,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.157" version = "0.2.158"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "374af5f94e54fa97cf75e945cce8a6b201e88a1a07e688b47dfd2a59c66dbd86" checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
[[package]] [[package]]
name = "libloading" name = "libloading"
@ -752,7 +752,7 @@ dependencies = [
"phf_shared 0.11.2", "phf_shared 0.11.2",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
@ -856,7 +856,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"pyo3-macros-backend", "pyo3-macros-backend",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
@ -869,14 +869,14 @@ dependencies = [
"proc-macro2", "proc-macro2",
"pyo3-build-config", "pyo3-build-config",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.36" version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
] ]
@ -942,9 +942,9 @@ dependencies = [
[[package]] [[package]]
name = "redox_users" name = "redox_users"
version = "0.4.5" version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
dependencies = [ dependencies = [
"getrandom", "getrandom",
"libredox", "libredox",
@ -989,9 +989,9 @@ dependencies = [
[[package]] [[package]]
name = "rustix" name = "rustix"
version = "0.38.34" version = "0.38.35"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f"
dependencies = [ dependencies = [
"bitflags", "bitflags",
"errno", "errno",
@ -1035,29 +1035,29 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.208" version = "1.0.209"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
dependencies = [ dependencies = [
"serde_derive", "serde_derive",
] ]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.208" version = "1.0.209"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
name = "serde_json" name = "serde_json"
version = "1.0.125" version = "1.0.127"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
dependencies = [ dependencies = [
"itoa", "itoa",
"memchr", "memchr",
@ -1147,7 +1147,7 @@ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"rustversion", "rustversion",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
@ -1163,9 +1163,9 @@ dependencies = [
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.75" version = "2.0.76"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -1232,7 +1232,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]
[[package]] [[package]]
@ -1310,9 +1310,9 @@ checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
[[package]] [[package]]
name = "unicode-xid" name = "unicode-xid"
version = "0.2.4" version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" checksum = "229730647fbc343e3a80e463c1db7f78f3855d3f3739bee0dda773c9a037c90a"
[[package]] [[package]]
name = "unicode_names2" name = "unicode_names2"
@ -1510,5 +1510,5 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.75", "syn 2.0.76",
] ]

View File

@ -2,7 +2,7 @@ use nac3core::{
codegen::{ codegen::{
classes::{ classes::{
ArrayLikeIndexer, ArrayLikeValue, ArraySliceValue, ListValue, NDArrayType, ArrayLikeIndexer, ArrayLikeValue, ArraySliceValue, ListValue, NDArrayType,
NDArrayValue, RangeValue, UntypedArrayLikeAccessor, NDArrayValue, ProxyType, ProxyValue, RangeValue, UntypedArrayLikeAccessor,
}, },
expr::{destructure_range, gen_call}, expr::{destructure_range, gen_call},
irrt::call_ndarray_calc_size, irrt::call_ndarray_calc_size,
@ -22,7 +22,7 @@ use inkwell::{
module::Linkage, module::Linkage,
types::{BasicType, IntType}, types::{BasicType, IntType},
values::{BasicValueEnum, PointerValue, StructValue}, values::{BasicValueEnum, PointerValue, StructValue},
AddressSpace, IntPredicate, AddressSpace, IntPredicate, OptimizationLevel,
}; };
use pyo3::{ use pyo3::{
@ -32,6 +32,7 @@ use pyo3::{
use crate::{symbol_resolver::InnerResolver, timeline::TimeFns}; use crate::{symbol_resolver::InnerResolver, timeline::TimeFns};
use inkwell::values::IntValue;
use itertools::Itertools; use itertools::Itertools;
use std::{ use std::{
collections::{hash_map::DefaultHasher, HashMap}, collections::{hash_map::DefaultHasher, HashMap},
@ -486,13 +487,10 @@ fn format_rpc_arg<'ctx>(
let buffer = ctx.builder.build_array_alloca(llvm_i8, buffer_size, "rpc.arg").unwrap(); let buffer = ctx.builder.build_array_alloca(llvm_i8, buffer_size, "rpc.arg").unwrap();
let buffer = ArraySliceValue::from_ptr_val(buffer, buffer_size, Some("rpc.arg")); let buffer = ArraySliceValue::from_ptr_val(buffer, buffer_size, Some("rpc.arg"));
let ppdata = generator.gen_var_alloc(ctx, llvm_arg_ty.element_type(), None).unwrap();
ctx.builder.build_store(ppdata, llvm_arg.data().base_ptr(ctx, generator)).unwrap();
call_memcpy_generic( call_memcpy_generic(
ctx, ctx,
buffer.base_ptr(ctx, generator), buffer.base_ptr(ctx, generator),
ppdata, llvm_arg.ptr_to_data(ctx),
llvm_pdata_sizeof, llvm_pdata_sizeof,
llvm_i1.const_zero(), llvm_i1.const_zero(),
); );
@ -528,6 +526,298 @@ fn format_rpc_arg<'ctx>(
arg_slot arg_slot
} }
/// Formats an RPC return value to conform to the expected format required by NAC3.
fn format_rpc_ret<'ctx>(
generator: &mut dyn CodeGenerator,
ctx: &mut CodeGenContext<'ctx, '_>,
ret_ty: Type,
) -> Option<BasicValueEnum<'ctx>> {
// -- receive value:
// T result = {
// void *ret_ptr = alloca(sizeof(T));
// void *ptr = ret_ptr;
// loop: int size = rpc_recv(ptr);
// // Non-zero: Provide `size` bytes of extra storage for variable-length data.
// if(size) { ptr = alloca(size); goto loop; }
// else *(T*)ret_ptr
// }
let llvm_i8 = ctx.ctx.i8_type();
let llvm_i32 = ctx.ctx.i32_type();
let llvm_i8_8 = ctx.ctx.struct_type(&[llvm_i8.array_type(8).into()], false);
let llvm_pi8 = llvm_i8.ptr_type(AddressSpace::default());
let rpc_recv = ctx.module.get_function("rpc_recv").unwrap_or_else(|| {
ctx.module.add_function("rpc_recv", llvm_i32.fn_type(&[llvm_pi8.into()], false), None)
});
if ctx.unifier.unioned(ret_ty, ctx.primitives.none) {
ctx.build_call_or_invoke(rpc_recv, &[llvm_pi8.const_null().into()], "rpc_recv");
return None;
}
let prehead_bb = ctx.builder.get_insert_block().unwrap();
let current_function = prehead_bb.get_parent().unwrap();
let head_bb = ctx.ctx.append_basic_block(current_function, "rpc.head");
let alloc_bb = ctx.ctx.append_basic_block(current_function, "rpc.continue");
let tail_bb = ctx.ctx.append_basic_block(current_function, "rpc.tail");
let llvm_ret_ty = ctx.get_llvm_abi_type(generator, ret_ty);
let result = match &*ctx.unifier.get_ty_immutable(ret_ty) {
TypeEnum::TObj { obj_id, .. } if *obj_id == PrimDef::NDArray.id() => {
let llvm_i1 = ctx.ctx.bool_type();
let llvm_usize = generator.get_size_type(ctx.ctx);
// Round `val` up to its modulo `power_of_two`
let round_up = |ctx: &mut CodeGenContext<'ctx, '_>,
val: IntValue<'ctx>,
power_of_two: IntValue<'ctx>| {
debug_assert_eq!(
val.get_type().get_bit_width(),
power_of_two.get_type().get_bit_width()
);
let llvm_val_t = val.get_type();
let max_rem = ctx
.builder
.build_int_sub(power_of_two, llvm_val_t.const_int(1, false), "")
.unwrap();
ctx.builder
.build_and(
ctx.builder.build_int_add(val, max_rem, "").unwrap(),
ctx.builder.build_not(max_rem, "").unwrap(),
"",
)
.unwrap()
};
// Setup types
let (elem_ty, ndims) = unpack_ndarray_var_tys(&mut ctx.unifier, ret_ty);
let llvm_elem_ty = ctx.get_llvm_type(generator, elem_ty);
let llvm_ret_ty = NDArrayType::new(generator, ctx.ctx, llvm_elem_ty);
// Allocate the resulting ndarray
// A condition after format_rpc_ret ensures this will not be popped this off.
let ndarray = llvm_ret_ty.new_value(generator, ctx, Some("rpc.result"));
// Setup ndims
let ndims =
if let TypeEnum::TLiteral { values, .. } = &*ctx.unifier.get_ty_immutable(ndims) {
assert_eq!(values.len(), 1);
u64::try_from(values[0].clone()).unwrap()
} else {
unreachable!();
};
// Set `ndarray.ndims`
ndarray.store_ndims(ctx, generator, llvm_usize.const_int(ndims, false));
// Allocate `ndarray.shape` [size_t; ndims]
ndarray.create_dim_sizes(ctx, llvm_usize, ndarray.load_ndims(ctx));
/*
ndarray now:
- .ndims: initialized
- .shape: allocated but uninitialized .shape
- .data: uninitialized
*/
let llvm_usize_sizeof = ctx
.builder
.build_int_truncate_or_bit_cast(llvm_usize.size_of(), llvm_usize, "")
.unwrap();
let llvm_pdata_sizeof = ctx
.builder
.build_int_truncate_or_bit_cast(
llvm_ret_ty.element_type().size_of().unwrap(),
llvm_usize,
"",
)
.unwrap();
let llvm_elem_sizeof = ctx
.builder
.build_int_truncate_or_bit_cast(llvm_elem_ty.size_of().unwrap(), llvm_usize, "")
.unwrap();
// Allocates a buffer for the initial RPC'ed object, which is guaranteed to be
// (4 + 4 * ndims) bytes with 8-byte alignment
let sizeof_dims =
ctx.builder.build_int_mul(ndarray.load_ndims(ctx), llvm_usize_sizeof, "").unwrap();
let unaligned_buffer_size =
ctx.builder.build_int_add(sizeof_dims, llvm_pdata_sizeof, "").unwrap();
let buffer_size = round_up(ctx, unaligned_buffer_size, llvm_usize.const_int(8, false));
let stackptr = call_stacksave(ctx, None);
// Just to be absolutely sure, alloca in [i8 x 8] slices to force 8-byte alignment
let buffer = ctx
.builder
.build_array_alloca(
llvm_i8_8,
ctx.builder
.build_int_unsigned_div(buffer_size, llvm_usize.const_int(8, false), "")
.unwrap(),
"rpc.buffer",
)
.unwrap();
let buffer = ctx
.builder
.build_bitcast(buffer, llvm_pi8, "")
.map(BasicValueEnum::into_pointer_value)
.unwrap();
let buffer = ArraySliceValue::from_ptr_val(buffer, buffer_size, None);
// The first call to `rpc_recv` reads the top-level ndarray object: [pdata, shape]
//
// The returned value is the number of bytes for `ndarray.data`.
let ndarray_nbytes = ctx
.build_call_or_invoke(
rpc_recv,
&[buffer.base_ptr(ctx, generator).into()], // Reads [usize; ndims]. NOTE: We are allocated [size_t; ndims].
Outdated
Review

We should delete/change my comment here. I think it is not [i32; ndims]. Also I messed up the sentence after NOTE:.

Tag::Array(it, num_dims) => {
    consume_value!(*mut (), |buffer| {
        // Deserialize length along each dimension and compute total number of
        // elements.
        let mut total_len: usize = 1;
        for _ in 0..num_dims {
            let len = reader.read_u32()? as usize;
            total_len *= len;
            consume_value!(usize, |ptr| *ptr = len ) // <----- not i32
        }

        // Allocate backing storage for elements; deserialize them.
        let elt_tag = it.clone().next().expect("truncated tag");
        *buffer = alloc(elt_tag.size() * total_len)?;
        recv_elements(reader, elt_tag, total_len, *buffer, alloc)
    })
}
We should delete/change my comment here. I think it is not `[i32; ndims]`. Also I messed up the sentence after `NOTE:`. ```rust Tag::Array(it, num_dims) => { consume_value!(*mut (), |buffer| { // Deserialize length along each dimension and compute total number of // elements. let mut total_len: usize = 1; for _ in 0..num_dims { let len = reader.read_u32()? as usize; total_len *= len; consume_value!(usize, |ptr| *ptr = len ) // <----- not i32 } // Allocate backing storage for elements; deserialize them. let elt_tag = it.clone().next().expect("truncated tag"); *buffer = alloc(elt_tag.size() * total_len)?; recv_elements(reader, elt_tag, total_len, *buffer, alloc) }) } ```

Updated.

Updated.
"rpc.size.next",
)
.map(BasicValueEnum::into_int_value)
.unwrap();
// debug_assert(ndarray_nbytes > 0)
if ctx.registry.llvm_options.opt_level == OptimizationLevel::None {
Outdated
Review

Unrelated: I think there are inconsistencies with how we insert assertion tests in NAC3.

Sometimes we use cfg!(debug_assertions) and sometimes we are testing against OptimizationLevel::None. It might be necessary to standardize this approach in the future, say with a purpose-defined function like CodeGenContext::should_add_assertions() -> bool.

Unrelated: I think there are inconsistencies with how we insert assertion tests in NAC3. Sometimes we use `cfg!(debug_assertions)` and sometimes we are testing against `OptimizationLevel::None`. It might be necessary to standardize this approach in the future, say with a purpose-defined function like `CodeGenContext::should_add_assertions() -> bool`.

IMO debug_assertions are for assertions that can be checked at compile-time (think function contract violations in Rust), whereas OptimizationNone are for assertions that can only be checked at runtime...

I should write some documentation on this.

IMO `debug_assertions` are for assertions that can be checked at compile-time (think function contract violations in Rust), whereas `OptimizationNone` are for assertions that can only be checked at runtime... I should write some documentation on this.
ctx.make_assert(
generator,
ctx.builder
.build_int_compare(
IntPredicate::UGT,
ndarray_nbytes,
ndarray_nbytes.get_type().const_zero(),
"",
)
.unwrap(),
"0:AssertionError",
"Unexpected RPC termination for ndarray - Expected data buffer next",
[None, None, None],
ctx.current_loc,
);
}
// Copy shape from the buffer to `ndarray.shape`.
let pbuffer_dims =
unsafe { buffer.ptr_offset_unchecked(ctx, generator, &llvm_pdata_sizeof, None) };
call_memcpy_generic(
ctx,
ndarray.dim_sizes().base_ptr(ctx, generator),
pbuffer_dims,
sizeof_dims,
llvm_i1.const_zero(),
);
// Restore stack from before allocation of buffer
call_stackrestore(ctx, stackptr);
// Allocate `ndarray.data`.
// `ndarray.shape` must be initialized beforehand in this implementation
// (for ndarray.create_data() to know how many elements to allocate)
let num_elements =
call_ndarray_calc_size(generator, ctx, &ndarray.dim_sizes(), (None, None));
Outdated
Review

Probably some super insignificant details: I used ndarray_nbytes / itemsize to derive the num_elements for ndarray.create_data because it is computationally less expensive. Doing it like this is a little bit less efficient but I am not sure if this is concerning.

Also, it is possible to simply do let ndarray_data = ctx.builder.build_alloca(ndarray_nbytes, i8_type) and just set ndarray->data = ndarray_data.

Unless LLVM just figures everything out and optimizes this.

Either way, probably not a concern for now.

Probably some super insignificant details: I used `ndarray_nbytes / itemsize` to derive the `num_elements` for `ndarray.create_data` because it is computationally less expensive. Doing it like this is a little bit less efficient but I am not sure if this is concerning. Also, it is possible to simply do `let ndarray_data = ctx.builder.build_alloca(ndarray_nbytes, i8_type)` and just set `ndarray->data = ndarray_data`. Unless LLVM just figures everything out and optimizes this. Either way, probably not a concern for now.

Yeah, but using alloca with type T guarantees that objects are properly aligned. I think let's get the correct implementation out there first, then we can think about how to optimize it further.

Yeah, but using `alloca` with type `T` guarantees that objects are properly aligned. I think let's get the correct implementation out there first, then we can think about how to optimize it further.
// debug_assert(nelems * sizeof(T) >= ndarray_nbytes)
if ctx.registry.llvm_options.opt_level == OptimizationLevel::None {
let sizeof_data =
ctx.builder.build_int_mul(num_elements, llvm_elem_sizeof, "").unwrap();
ctx.make_assert(
generator,
ctx.builder.build_int_compare(IntPredicate::UGE,
sizeof_data,
ndarray_nbytes,
"",
).unwrap(),
"0:AssertionError",
"Unexpected allocation size request for ndarray data - Expected up to {0} bytes, got {1} bytes",
[Some(sizeof_data), Some(ndarray_nbytes), None],
ctx.current_loc,
);
}
ndarray.create_data(ctx, llvm_elem_ty, num_elements);
let ndarray_data = ndarray.data().base_ptr(ctx, generator);
let ndarray_data_i8 =
ctx.builder.build_pointer_cast(ndarray_data, llvm_pi8, "").unwrap();
// NOTE: Currently on `prehead_bb`
ctx.builder.build_unconditional_branch(head_bb).unwrap();
// Inserting into `head_bb`. Do `rpc_recv` for `data` recursively.
ctx.builder.position_at_end(head_bb);
let phi = ctx.builder.build_phi(llvm_pi8, "rpc.ptr").unwrap();
phi.add_incoming(&[(&ndarray_data_i8, prehead_bb)]);
let alloc_size = ctx
.build_call_or_invoke(rpc_recv, &[phi.as_basic_value()], "rpc.size.next")
.map(BasicValueEnum::into_int_value)
.unwrap();
let is_done = ctx
.builder
.build_int_compare(IntPredicate::EQ, llvm_i32.const_zero(), alloc_size, "rpc.done")
.unwrap();
ctx.builder.build_conditional_branch(is_done, tail_bb, alloc_bb).unwrap();
ctx.builder.position_at_end(alloc_bb);
// Align the allocation to sizeof(T)
let alloc_size = round_up(ctx, alloc_size, llvm_elem_sizeof);
let alloc_ptr = ctx
.builder
Outdated
Review

Redundant pointer cast I think.

Redundant pointer cast I think.

Not redundant; First alloc_ptr is i8** (note that we are alloca-ing i8*), second one casts it back to i8*.

Not redundant; First `alloc_ptr` is `i8**` (note that we are `alloca`-ing `i8*`), second one casts it back to `i8*`.
Outdated
Review

Oh

Oh
Outdated
Review

Wait actually, why do we allocate [i8*; alloc_size] here? I thought alloc_size is the number of bytes to allocate on the kernel's end.

Wait actually, why do we allocate `[i8*; alloc_size]` here? I thought `alloc_size` is the number of *bytes* to allocate on the kernel's end.

Ah, it's for 4-byte alignment. And it's overallocating 4 times the buffer size as well...

Let me fix that.

Ah, it's for 4-byte alignment. And it's overallocating 4 times the buffer size as well... Let me fix that.

We now allocate alloc_size by allocating round_up(alloc_size, sizeof(T)) / sizeof(T) objects of T, which should also address any alignment issues.

We now allocate `alloc_size` by allocating `round_up(alloc_size, sizeof(T)) / sizeof(T)` objects of `T`, which should also address any alignment issues.
.build_array_alloca(
llvm_elem_ty,
ctx.builder.build_int_unsigned_div(alloc_size, llvm_elem_sizeof, "").unwrap(),
"rpc.alloc",
)
.unwrap();
let alloc_ptr =
ctx.builder.build_pointer_cast(alloc_ptr, llvm_pi8, "rpc.alloc.ptr").unwrap();
phi.add_incoming(&[(&alloc_ptr, alloc_bb)]);
ctx.builder.build_unconditional_branch(head_bb).unwrap();
ctx.builder.position_at_end(tail_bb);
ndarray.as_base_value().into()
}
_ => {
let slot = ctx.builder.build_alloca(llvm_ret_ty, "rpc.ret.slot").unwrap();
let slotgen = ctx.builder.build_bitcast(slot, llvm_pi8, "rpc.ret.ptr").unwrap();
ctx.builder.build_unconditional_branch(head_bb).unwrap();
ctx.builder.position_at_end(head_bb);
let phi = ctx.builder.build_phi(llvm_pi8, "rpc.ptr").unwrap();
phi.add_incoming(&[(&slotgen, prehead_bb)]);
let alloc_size = ctx
.build_call_or_invoke(rpc_recv, &[phi.as_basic_value()], "rpc.size.next")
.unwrap()
.into_int_value();
let is_done = ctx
.builder
.build_int_compare(IntPredicate::EQ, llvm_i32.const_zero(), alloc_size, "rpc.done")
.unwrap();
ctx.builder.build_conditional_branch(is_done, tail_bb, alloc_bb).unwrap();
ctx.builder.position_at_end(alloc_bb);
let alloc_ptr =
ctx.builder.build_array_alloca(llvm_pi8, alloc_size, "rpc.alloc").unwrap();
let alloc_ptr =
ctx.builder.build_bitcast(alloc_ptr, llvm_pi8, "rpc.alloc.ptr").unwrap();
phi.add_incoming(&[(&alloc_ptr, alloc_bb)]);
ctx.builder.build_unconditional_branch(head_bb).unwrap();
ctx.builder.position_at_end(tail_bb);
ctx.builder.build_load(slot, "rpc.result").unwrap()
}
};
Some(result)
}
fn rpc_codegen_callback_fn<'ctx>( fn rpc_codegen_callback_fn<'ctx>(
ctx: &mut CodeGenContext<'ctx, '_>, ctx: &mut CodeGenContext<'ctx, '_>,
obj: Option<(Type, ValueEnum<'ctx>)>, obj: Option<(Type, ValueEnum<'ctx>)>,
@ -663,63 +953,14 @@ fn rpc_codegen_callback_fn<'ctx>(
// reclaim stack space used by arguments // reclaim stack space used by arguments
call_stackrestore(ctx, stackptr); call_stackrestore(ctx, stackptr);
// -- receive value: let result = format_rpc_ret(generator, ctx, fun.0.ret);
// T result = {
// void *ret_ptr = alloca(sizeof(T));
// void *ptr = ret_ptr;
// loop: int size = rpc_recv(ptr);
// // Non-zero: Provide `size` bytes of extra storage for variable-length data.
// if(size) { ptr = alloca(size); goto loop; }
// else *(T*)ret_ptr
// }
let rpc_recv = ctx.module.get_function("rpc_recv").unwrap_or_else(|| {
ctx.module.add_function("rpc_recv", int32.fn_type(&[ptr_type.into()], false), None)
});
if ctx.unifier.unioned(fun.0.ret, ctx.primitives.none) { if !result.is_some_and(|res| res.get_type().is_pointer_type()) {
ctx.build_call_or_invoke(rpc_recv, &[ptr_type.const_null().into()], "rpc_recv"); // An RPC returning an NDArray would not touch here.
return Ok(None);
}
let prehead_bb = ctx.builder.get_insert_block().unwrap();
let current_function = prehead_bb.get_parent().unwrap();
let head_bb = ctx.ctx.append_basic_block(current_function, "rpc.head");
let alloc_bb = ctx.ctx.append_basic_block(current_function, "rpc.continue");
let tail_bb = ctx.ctx.append_basic_block(current_function, "rpc.tail");
let ret_ty = ctx.get_llvm_abi_type(generator, fun.0.ret);
let need_load = !ret_ty.is_pointer_type();
let slot = ctx.builder.build_alloca(ret_ty, "rpc.ret.slot").unwrap();
let slotgen = ctx.builder.build_bitcast(slot, ptr_type, "rpc.ret.ptr").unwrap();
ctx.builder.build_unconditional_branch(head_bb).unwrap();
ctx.builder.position_at_end(head_bb);
let phi = ctx.builder.build_phi(ptr_type, "rpc.ptr").unwrap();
phi.add_incoming(&[(&slotgen, prehead_bb)]);
let alloc_size = ctx
.build_call_or_invoke(rpc_recv, &[phi.as_basic_value()], "rpc.size.next")
.unwrap()
.into_int_value();
let is_done = ctx
.builder
.build_int_compare(inkwell::IntPredicate::EQ, int32.const_zero(), alloc_size, "rpc.done")
.unwrap();
ctx.builder.build_conditional_branch(is_done, tail_bb, alloc_bb).unwrap();
ctx.builder.position_at_end(alloc_bb);
let alloc_ptr = ctx.builder.build_array_alloca(ptr_type, alloc_size, "rpc.alloc").unwrap();
let alloc_ptr = ctx.builder.build_bitcast(alloc_ptr, ptr_type, "rpc.alloc.ptr").unwrap();
phi.add_incoming(&[(&alloc_ptr, alloc_bb)]);
ctx.builder.build_unconditional_branch(head_bb).unwrap();
ctx.builder.position_at_end(tail_bb);
let result = ctx.builder.build_load(slot, "rpc.result").unwrap();
if need_load {
call_stackrestore(ctx, stackptr); call_stackrestore(ctx, stackptr);
} }
Ok(Some(result))
Ok(result)
} }
pub fn attributes_writeback( pub fn attributes_writeback(

View File

@ -1404,7 +1404,7 @@ impl<'ctx> NDArrayValue<'ctx> {
/// Returns the double-indirection pointer to the `data` array, as if by calling `getelementptr` /// Returns the double-indirection pointer to the `data` array, as if by calling `getelementptr`
/// on the field. /// on the field.
fn ptr_to_data(&self, ctx: &CodeGenContext<'ctx, '_>) -> PointerValue<'ctx> { pub fn ptr_to_data(&self, ctx: &CodeGenContext<'ctx, '_>) -> PointerValue<'ctx> {
let llvm_i32 = ctx.ctx.i32_type(); let llvm_i32 = ctx.ctx.i32_type();
let var_name = self.name.map(|v| format!("{v}.data.addr")).unwrap_or_default(); let var_name = self.name.map(|v| format!("{v}.data.addr")).unwrap_or_default();