From f4c7940d3b13ec879c9fdc218812f71a65149123 Mon Sep 17 00:00:00 2001
From: Aaron Kutch
Date: Tue, 28 Jul 2020 13:09:18 -0500
Subject: [PATCH] Improve `__clzsi2` performance (#366)

---
 src/int/leading_zeros.rs               | 143 +++++++++++++++++++++++++
 src/int/mod.rs                         |  69 +-----------
 testcrate/Cargo.toml                   |   6 ++
 testcrate/tests/count_leading_zeros.rs |  23 ----
 testcrate/tests/leading_zeros.rs       |  54 ++++++++++
 5 files changed, 206 insertions(+), 89 deletions(-)
 create mode 100644 src/int/leading_zeros.rs
 delete mode 100644 testcrate/tests/count_leading_zeros.rs
 create mode 100644 testcrate/tests/leading_zeros.rs

diff --git a/src/int/leading_zeros.rs b/src/int/leading_zeros.rs
new file mode 100644
index 0000000..78556f0
--- /dev/null
+++ b/src/int/leading_zeros.rs
@@ -0,0 +1,143 @@
+// Note: these functions happen to produce the correct `usize::leading_zeros(0)` value
+// without an explicit zero check. Zero is probably common enough that it could warrant
+// adding a zero check at the beginning, but `__clzsi2` has a precondition that `x != 0`.
+// Compilers will insert the check for zero in cases where it is needed.
+
+/// Returns the number of leading binary zeros in `x`.
+pub fn usize_leading_zeros_default(x: usize) -> usize {
+    // The basic idea is to test if the higher bits of `x` are zero and bisect the number
+    // of leading zeros. It is possible for all branches of the bisection to use the same
+    // code path by conditionally shifting the higher parts down to let the next bisection
+    // step work on the higher or lower parts of `x`. Instead of starting with `z == 0`
+    // and adding to the number of zeros, it is slightly faster to start with
+    // `z == usize::MAX.count_ones()` and subtract from the potential number of zeros,
+    // because it simplifies the final bisection step.
+    let mut x = x;
+    // the number of potential leading zeros
+    let mut z = usize::MAX.count_ones() as usize;
+    // a temporary
+    let mut t: usize;
+    #[cfg(target_pointer_width = "64")]
+    {
+        t = x >> 32;
+        if t != 0 {
+            z -= 32;
+            x = t;
+        }
+    }
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    {
+        t = x >> 16;
+        if t != 0 {
+            z -= 16;
+            x = t;
+        }
+    }
+    t = x >> 8;
+    if t != 0 {
+        z -= 8;
+        x = t;
+    }
+    t = x >> 4;
+    if t != 0 {
+        z -= 4;
+        x = t;
+    }
+    t = x >> 2;
+    if t != 0 {
+        z -= 2;
+        x = t;
+    }
+    // the last two bisections are combined into one conditional
+    t = x >> 1;
+    if t != 0 {
+        z - 2
+    } else {
+        z - x
+    }
+
+    // We could potentially save a few cycles by using the LUT trick from
+    // "https://embeddedgurus.com/state-space/2014/09/
+    // fast-deterministic-and-portable-counting-leading-zeros/".
+    // However, 256 bytes for a LUT is too large for embedded use cases. We could remove
+    // the last 3 bisections and use this 16 byte LUT for the rest of the work:
+    //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
+    //z -= LUT[x] as usize;
+    //z
+    // However, it ends up generating about the same number of instructions. When benchmarked
+    // on x86_64, it is slightly faster to use the LUT, but this is probably because of OOO
+    // execution effects. Switching to a LUT and branching is risky for smaller cores.
+}
+
+// The above method does not compile well on RISC-V (because of the lack of predicated
+// instructions), producing code with many branches or using an excessively long
+// branchless solution. This method takes advantage of the set-if-less-than instruction on
+// RISC-V that allows `(x >= power-of-two) as usize` to be branchless.
+
+/// Returns the number of leading binary zeros in `x`.
+pub fn usize_leading_zeros_riscv(x: usize) -> usize {
+    let mut x = x;
+    // the number of potential leading zeros
+    let mut z = usize::MAX.count_ones() as usize;
+    // a temporary
+    let mut t: usize;
+
+    // RISC-V does not have a set-if-greater-than-or-equal instruction and
+    // `(x >= power-of-two) as usize` will get compiled into two instructions, but this is
+    // still the best method. A conditional set can only be turned into a single
+    // immediate instruction if `x` is compared with an immediate `imm` (that can fit into
+    // 12 bits) like `x < imm` but not `imm < x` (because the immediate is always on the
+    // right). If we try to save an instruction by using `x < imm` for each bisection, we
+    // have to shift `x` left and compare with powers of two approaching `usize::MAX + 1`,
+    // but the immediate will never fit into 12 bits and never save an instruction.
+    #[cfg(target_pointer_width = "64")]
+    {
+        // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise
+        // `t` is set to 0.
+        t = ((x >= (1 << 32)) as usize) << 5;
+        // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the
+        // next step to process.
+        x >>= t;
+        // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential
+        // leading zeros.
+        z -= t;
+    }
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    {
+        t = ((x >= (1 << 16)) as usize) << 4;
+        x >>= t;
+        z -= t;
+    }
+    t = ((x >= (1 << 8)) as usize) << 3;
+    x >>= t;
+    z -= t;
+    t = ((x >= (1 << 4)) as usize) << 2;
+    x >>= t;
+    z -= t;
+    t = ((x >= (1 << 2)) as usize) << 1;
+    x >>= t;
+    z -= t;
+    t = (x >= (1 << 1)) as usize;
+    x >>= t;
+    z -= t;
+    // All bits except the LSB are guaranteed to be zero for this final bisection step.
+    // If `x != 0`, then `x == 1` and one potential zero is subtracted from `z`.
+    z - x
+}
+
+intrinsics! {
+    #[maybe_use_optimized_c_shim]
+    #[cfg(any(
+        target_pointer_width = "16",
+        target_pointer_width = "32",
+        target_pointer_width = "64"
+    ))]
+    /// Returns the number of leading binary zeros in `x`.
+    pub extern "C" fn __clzsi2(x: usize) -> usize {
+        if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+            usize_leading_zeros_riscv(x)
+        } else {
+            usize_leading_zeros_default(x)
+        }
+    }
+}
diff --git a/src/int/mod.rs b/src/int/mod.rs
index d73bf6d..8a469d9 100644
--- a/src/int/mod.rs
+++ b/src/int/mod.rs
@@ -13,11 +13,14 @@ macro_rules! os_ty {
 }
 
 pub mod addsub;
+pub mod leading_zeros;
 pub mod mul;
 pub mod sdiv;
 pub mod shift;
 pub mod udiv;
 
+pub use self::leading_zeros::__clzsi2;
+
 /// Trait for some basic operations on integers
 pub(crate) trait Int:
     Copy
@@ -300,69 +303,3 @@ macro_rules! impl_wide_int {
 
 impl_wide_int!(u32, u64, 32);
 impl_wide_int!(u64, u128, 64);
-
-intrinsics! {
-    #[maybe_use_optimized_c_shim]
-    #[cfg(any(
-        target_pointer_width = "16",
-        target_pointer_width = "32",
-        target_pointer_width = "64"
-    ))]
-    pub extern "C" fn __clzsi2(x: usize) -> usize {
-        // TODO: const this? Would require const-if
-        // Note(Lokathor): the `intrinsics!` macro can't process mut inputs
-        let mut x = x;
-        let mut y: usize;
-        let mut n: usize = {
-            #[cfg(target_pointer_width = "64")]
-            {
-                64
-            }
-            #[cfg(target_pointer_width = "32")]
-            {
-                32
-            }
-            #[cfg(target_pointer_width = "16")]
-            {
-                16
-            }
-        };
-        #[cfg(target_pointer_width = "64")]
-        {
-            y = x >> 32;
-            if y != 0 {
-                n -= 32;
-                x = y;
-            }
-        }
-        #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
-        {
-            y = x >> 16;
-            if y != 0 {
-                n -= 16;
-                x = y;
-            }
-        }
-        y = x >> 8;
-        if y != 0 {
-            n -= 8;
-            x = y;
-        }
-        y = x >> 4;
-        if y != 0 {
-            n -= 4;
-            x = y;
-        }
-        y = x >> 2;
-        if y != 0 {
-            n -= 2;
-            x = y;
-        }
-        y = x >> 1;
-        if y != 0 {
-            n - 2
-        } else {
-            n - x
-        }
-    }
-}
diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml
index 3b99b57..61282af 100644
--- a/testcrate/Cargo.toml
+++ b/testcrate/Cargo.toml
@@ -11,6 +11,12 @@ doctest = false
 [build-dependencies]
 rand = "0.7"
 
+[dev-dependencies]
+# For fuzzing tests we want a deterministic seedable RNG. We also eliminate potential
+# problems with system RNGs on the variety of platforms this crate is tested on.
+# `xoshiro128**` is used for its quality, size, and speed at generating `u32` shift amounts.
+rand_xoshiro = "0.4"
+
 [dependencies.compiler_builtins]
 path = ".."
 default-features = false
diff --git a/testcrate/tests/count_leading_zeros.rs b/testcrate/tests/count_leading_zeros.rs
deleted file mode 100644
index 022b2d8..0000000
--- a/testcrate/tests/count_leading_zeros.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-extern crate compiler_builtins;
-
-use compiler_builtins::int::__clzsi2;
-
-#[test]
-fn __clzsi2_test() {
-    let mut i: usize = core::usize::MAX;
-    // Check all values above 0
-    while i > 0 {
-        assert_eq!(__clzsi2(i) as u32, i.leading_zeros());
-        i >>= 1;
-    }
-    // check 0 also
-    i = 0;
-    assert_eq!(__clzsi2(i) as u32, i.leading_zeros());
-    // double check for bit patterns that aren't just solid 1s
-    i = 1;
-    for _ in 0..63 {
-        assert_eq!(__clzsi2(i) as u32, i.leading_zeros());
-        i <<= 2;
-        i += 1;
-    }
-}
diff --git a/testcrate/tests/leading_zeros.rs b/testcrate/tests/leading_zeros.rs
new file mode 100644
index 0000000..b857d9e
--- /dev/null
+++ b/testcrate/tests/leading_zeros.rs
@@ -0,0 +1,54 @@
+use rand_xoshiro::rand_core::{RngCore, SeedableRng};
+use rand_xoshiro::Xoshiro128StarStar;
+
+use compiler_builtins::int::__clzsi2;
+use compiler_builtins::int::leading_zeros::{
+    usize_leading_zeros_default, usize_leading_zeros_riscv,
+};
+
+#[test]
+fn __clzsi2_test() {
+    // Binary fuzzer. We cannot just send a random number directly to `__clzsi2()`, because we need
+    // large sequences of zeros to test. This XORs, ANDs, and ORs random-length strings of 1s to
+    // `x`. ORs ensure sequences of ones, ANDs ensure sequences of zeros, and XORs are not often
+    // destructive but add entropy.
+    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
+    let mut x = 0usize;
+    // creates a mask for indexing the bits of the type
+    let bit_indexing_mask = usize::MAX.count_ones() - 1;
+    // 10000 iterations is enough to make sure edge cases like single set bits are tested and to go
+    // through many paths.
+    for _ in 0..10_000 {
+        let r0 = bit_indexing_mask & rng.next_u32();
+        // random length of ones
+        let ones: usize = !0 >> r0;
+        let r1 = bit_indexing_mask & rng.next_u32();
+        // random circular shift
+        let mask = ones.rotate_left(r1);
+        match rng.next_u32() % 4 {
+            0 => x |= mask,
+            1 => x &= mask,
+            // both 2 and 3 to make XORs as common as ORs and ANDs combined
+            _ => x ^= mask,
+        }
+        let lz = x.leading_zeros() as usize;
+        let lz0 = __clzsi2(x);
+        let lz1 = usize_leading_zeros_default(x);
+        let lz2 = usize_leading_zeros_riscv(x);
+        if lz0 != lz {
+            panic!("__clzsi2({}): expected: {}, found: {}", x, lz, lz0);
+        }
+        if lz1 != lz {
+            panic!(
+                "usize_leading_zeros_default({}): expected: {}, found: {}",
+                x, lz, lz1
+            );
+        }
+        if lz2 != lz {
+            panic!(
+                "usize_leading_zeros_riscv({}): expected: {}, found: {}",
+                x, lz, lz2
+            );
+        }
+    }
+}
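
For reference, the 16-byte LUT variant that the comments in `usize_leading_zeros_default` only sketch could be fleshed out roughly as below. This is a minimal illustration, not part of the patch: the function name `usize_leading_zeros_lut` is invented here, the bisection prelude is copied from the default implementation, and only the last three bisection steps are replaced by a lookup of how many significant bits remain in the final 4-bit value.

// Hypothetical sketch (not in the patch): the 16-byte LUT alternative described in the
// comments of `usize_leading_zeros_default`.
pub fn usize_leading_zeros_lut(x: usize) -> usize {
    let mut x = x;
    // the number of potential leading zeros
    let mut z = usize::MAX.count_ones() as usize;
    // a temporary
    let mut t: usize;
    #[cfg(target_pointer_width = "64")]
    {
        t = x >> 32;
        if t != 0 {
            z -= 32;
            x = t;
        }
    }
    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
    {
        t = x >> 16;
        if t != 0 {
            z -= 16;
            x = t;
        }
    }
    t = x >> 8;
    if t != 0 {
        z -= 8;
        x = t;
    }
    t = x >> 4;
    if t != 0 {
        z -= 4;
        x = t;
    }
    // `x` now fits in 4 bits; `LUT[x]` is the number of significant bits of that value,
    // so this also yields the correct result for `x == 0`.
    const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
    z - LUT[x] as usize
}

As the patch comments note, this trades three shift-and-compare steps for an array access, which measured slightly faster on x86_64 but is riskier on small in-order cores.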