diff --git a/Cargo.toml b/Cargo.toml
index b4fbb0b..5a9f53a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,7 +32,7 @@ test = false
 core = { version = "1.0.0", optional = true, package = 'rustc-std-workspace-core' }
 
 [build-dependencies]
-cc = { optional = true, version = "1.0" }
+cc = { version = "1.0" }
 
 [dev-dependencies]
 panic-handler = { path = 'crates/panic-handler' }
@@ -42,7 +42,7 @@ default = ["compiler-builtins"]
 
 # Enable compilation of C code in compiler-rt, filling in some more optimized
 # implementations and also filling in unimplemented intrinsics
-c = ["cc"]
+# c = ["cc"]
 
 # Flag this library as the unstable compiler-builtins lib
 compiler-builtins = []
diff --git a/README.md b/README.md
index a20d038..c427337 100644
--- a/README.md
+++ b/README.md
@@ -1,398 +1,9 @@
 # `compiler-builtins`
 
+Fork of [compiler-builtins](https://github.com/rust-lang/compiler-builtins)
+for the Xilinx Zynq, with a fast memcpy adapted from newlib. We have to fork
+because upstream compiler-builtins uses its own slow memcpy implementation
+for the compiler intrinsics regardless of whether you provide your own
+memcpy; see [#253](https://github.com/rust-lang/compiler-builtins/issues/253).
 
-> Porting `compiler-rt` intrinsics to Rust
+The memcpy is implemented in ARM assembly with NEON optimizations.
 
-See [rust-lang/rust#35437][0].
-
-[0]: https://github.com/rust-lang/rust/issues/35437
-
-## When and how to use this crate?
-
-If you are working with a target that doesn't have binary releases of std
-available via rustup (this probably means you are building the core crate
-yourself) and need compiler-rt intrinsics (i.e. you are probably getting linker
-errors when building an executable: `undefined reference to __aeabi_memcpy`),
-you can use this crate to get those intrinsics and solve the linker errors. To
-do that, add this crate somewhere in the dependency graph of the crate you are
-building:
-
-``` toml
-# Cargo.toml
-[dependencies]
-compiler_builtins = { git = "https://github.com/rust-lang/compiler-builtins" }
-```
-
-``` rust
-extern crate compiler_builtins;
-
-// ...
-```
-
-If you still get an "undefined reference to $INTRINSIC" error after that change,
-that means that we haven't ported `$INTRINSIC` to Rust yet! Please open [an
-issue] with the name of the intrinsic and the LLVM triple (e.g.
-thumbv7m-none-eabi) of the target you are using. That way we can prioritize
-porting that particular intrinsic.
-
-If you've got a C compiler available for your target then while we implement
-this intrinsic you can temporarily enable a fallback to the actual compiler-rt
-implementation as well for unimplemented intrinsics:
-
-```toml
-[dependencies.compiler_builtins]
-git = "https://github.com/rust-lang/compiler-builtins"
-features = ["c"]
-```
-
-[an issue]: https://github.com/rust-lang/compiler-builtins/issues
-
-## Contributing
-
-1. Pick one or more intrinsics from the [pending list](#progress).
-2. Fork this repository.
-3. Port the intrinsic(s) and their corresponding [unit tests][1] from their
-   [C implementation][2] to Rust.
-4. Implement a [test generator][3] to compare the behavior of the ported intrinsic(s)
-   with their implementation on the testing host. Note that randomized compiler-builtin tests
-   should be run using `cargo test --features gen-tests`.
-4. Send a Pull Request (PR).
-5. Once the PR passes our extensive [testing infrastructure][4], we'll merge it!
-6. 
Celebrate :tada: - -[1]: https://github.com/rust-lang/compiler-rt/tree/8598065bd965d9713bfafb6c1e766d63a7b17b89/test/builtins/Unit -[2]: https://github.com/rust-lang/compiler-rt/tree/8598065bd965d9713bfafb6c1e766d63a7b17b89/lib/builtins -[3]: https://github.com/rust-lang/compiler-builtins/blob/0ba07e49264a54cb5bbd4856fcea083bb3fbec15/build.rs#L180-L265 -[4]: https://travis-ci.org/rust-lang/compiler-builtins - -### Porting Reminders - -1. [Rust][5a] and [C][5b] have slightly different operator precedence. C evaluates comparisons (`== !=`) before bitwise operations (`& | ^`), while Rust evaluates the other way. -2. C assumes wrapping operations everywhere. Rust panics on overflow when in debug mode. Consider using the [Wrapping][6] type or the explicit [wrapping_*][7] functions where applicable. -3. Note [C implicit casts][8], especially integer promotion. Rust is much more explicit about casting, so be sure that any cast which affects the output is ported to the Rust implementation. -4. Rust has [many functions][9] for integer or floating point manipulation in the standard library. Consider using one of these functions rather than porting a new one. - -[5a]: https://doc.rust-lang.org/reference/expressions.html#expression-precedence -[5b]: http://en.cppreference.com/w/c/language/operator_precedence -[6]: https://doc.rust-lang.org/core/num/struct.Wrapping.html -[7]: https://doc.rust-lang.org/std/primitive.i32.html#method.wrapping_add -[8]: http://en.cppreference.com/w/cpp/language/implicit_conversion -[9]: https://doc.rust-lang.org/std/primitive.i32.html - -## Progress - -- [x] adddf3.c -- [x] addsf3.c -- [x] arm/adddf3vfp.S -- [x] arm/addsf3vfp.S -- [x] arm/aeabi_dcmp.S -- [x] arm/aeabi_fcmp.S -- [x] arm/aeabi_idivmod.S -- [x] arm/aeabi_ldivmod.S -- [x] arm/aeabi_memcpy.S -- [x] arm/aeabi_memmove.S -- [x] arm/aeabi_memset.S -- [x] arm/aeabi_uidivmod.S -- [x] arm/aeabi_uldivmod.S -- [x] arm/divdf3vfp.S -- [ ] arm/divmodsi4.S (generic version is done) -- [x] arm/divsf3vfp.S -- [ ] arm/divsi3.S (generic version is done) -- [x] arm/eqdf2vfp.S -- [x] arm/eqsf2vfp.S -- [x] arm/extendsfdf2vfp.S -- [ ] arm/fixdfsivfp.S -- [ ] arm/fixsfsivfp.S -- [ ] arm/fixunsdfsivfp.S -- [ ] arm/fixunssfsivfp.S -- [ ] arm/floatsidfvfp.S -- [ ] arm/floatsisfvfp.S -- [ ] arm/floatunssidfvfp.S -- [ ] arm/floatunssisfvfp.S -- [x] arm/gedf2vfp.S -- [x] arm/gesf2vfp.S -- [x] arm/gtdf2vfp.S -- [x] arm/gtsf2vfp.S -- [x] arm/ledf2vfp.S -- [x] arm/lesf2vfp.S -- [x] arm/ltdf2vfp.S -- [x] arm/ltsf2vfp.S -- [ ] arm/modsi3.S (generic version is done) -- [x] arm/muldf3vfp.S -- [x] arm/mulsf3vfp.S -- [x] arm/nedf2vfp.S -- [ ] arm/negdf2vfp.S -- [ ] arm/negsf2vfp.S -- [x] arm/nesf2vfp.S -- [x] arm/softfloat-alias.list -- [x] arm/subdf3vfp.S -- [x] arm/subsf3vfp.S -- [ ] arm/truncdfsf2vfp.S -- [ ] arm/udivmodsi4.S (generic version is done) -- [ ] arm/udivsi3.S (generic version is done) -- [ ] arm/umodsi3.S (generic version is done) -- [ ] arm/unorddf2vfp.S -- [ ] arm/unordsf2vfp.S -- [x] ashldi3.c -- [x] ashrdi3.c -- [x] comparedf2.c -- [x] comparesf2.c -- [x] divdf3.c -- [x] divdi3.c -- [x] divmoddi4.c -- [x] divmodsi4.c -- [x] divsf3.c -- [x] divsi3.c -- [ ] extendhfsf2.c -- [x] extendsfdf2.c -- [x] fixdfdi.c -- [x] fixdfsi.c -- [x] fixsfdi.c -- [x] fixsfsi.c -- [x] fixunsdfdi.c -- [x] fixunsdfsi.c -- [x] fixunssfdi.c -- [x] fixunssfsi.c -- [x] floatdidf.c -- [x] floatdisf.c -- [x] floatsidf.c -- [x] floatsisf.c -- [x] floatundidf.c -- [x] floatundisf.c -- [x] floatunsidf.c -- [x] floatunsisf.c -- [ ] i386/ashldi3.S -- [ ] 
i386/ashrdi3.S -- [x] i386/chkstk.S -- [x] i386/chkstk2.S -- [ ] i386/divdi3.S -- [ ] i386/lshrdi3.S -- [ ] i386/moddi3.S -- [ ] i386/muldi3.S -- [ ] i386/udivdi3.S -- [ ] i386/umoddi3.S -- [x] lshrdi3.c -- [x] moddi3.c -- [x] modsi3.c -- [x] muldf3.c -- [x] muldi3.c -- [x] mulodi4.c -- [x] mulosi4.c -- [x] mulsf3.c -- [x] powidf2.c -- [x] powisf2.c -- [x] subdf3.c -- [x] subsf3.c -- [ ] truncdfhf2.c -- [ ] truncdfsf2.c -- [ ] truncsfhf2.c -- [x] udivdi3.c -- [x] udivmoddi4.c -- [x] udivmodsi4.c -- [x] udivsi3.c -- [x] umoddi3.c -- [x] umodsi3.c -- [x] x86_64/chkstk.S -- [x] x86_64/chkstk2.S - -These builtins are needed to support 128-bit integers, which are in the process of being added to Rust. - -- [x] ashlti3.c -- [x] ashrti3.c -- [x] divti3.c -- [x] fixdfti.c -- [x] fixsfti.c -- [x] fixunsdfti.c -- [x] fixunssfti.c -- [x] floattidf.c -- [x] floattisf.c -- [x] floatuntidf.c -- [x] floatuntisf.c -- [x] lshrti3.c -- [x] modti3.c -- [x] muloti4.c -- [x] multi3.c -- [x] udivmodti4.c -- [x] udivti3.c -- [x] umodti3.c - -## Unimplemented functions - -These builtins involve floating-point types ("`f128`", "`f80`" and complex numbers) that are not supported by Rust. - -- ~~addtf3.c~~ -- ~~comparetf2.c~~ -- ~~divdc3.c~~ -- ~~divsc3.c~~ -- ~~divtc3.c~~ -- ~~divtf3.c~~ -- ~~divxc3.c~~ -- ~~extenddftf2.c~~ -- ~~extendsftf2.c~~ -- ~~fixtfdi.c~~ -- ~~fixtfsi.c~~ -- ~~fixtfti.c~~ -- ~~fixunstfdi.c~~ -- ~~fixunstfsi.c~~ -- ~~fixunstfti.c~~ -- ~~fixunsxfdi.c~~ -- ~~fixunsxfsi.c~~ -- ~~fixunsxfti.c~~ -- ~~fixxfdi.c~~ -- ~~fixxfti.c~~ -- ~~floatditf.c~~ -- ~~floatdixf.c~~ -- ~~floatsitf.c~~ -- ~~floattixf.c~~ -- ~~floatunditf.c~~ -- ~~floatundixf.c~~ -- ~~floatunsitf.c~~ -- ~~floatuntixf.c~~ -- ~~i386/floatdixf.S~~ -- ~~i386/floatundixf.S~~ -- ~~muldc3.c~~ -- ~~mulsc3.c~~ -- ~~multc3.c~~ -- ~~multf3.c~~ -- ~~mulxc3.c~~ -- ~~powitf2.c~~ -- ~~powixf2.c~~ -- ~~ppc/divtc3.c~~ -- ~~ppc/fixtfdi.c~~ -- ~~ppc/fixunstfdi.c~~ -- ~~ppc/floatditf.c~~ -- ~~ppc/floatunditf.c~~ -- ~~ppc/gcc_qadd.c~~ -- ~~ppc/gcc_qdiv.c~~ -- ~~ppc/gcc_qmul.c~~ -- ~~ppc/gcc_qsub.c~~ -- ~~ppc/multc3.c~~ -- ~~subtf3.c~~ -- ~~trunctfdf2.c~~ -- ~~trunctfsf2.c~~ -- ~~x86_64/floatdixf.c~~ -- ~~x86_64/floatundixf.S~~ - -These builtins are never called by LLVM. - -- ~~absvdi2.c~~ -- ~~absvsi2.c~~ -- ~~absvti2.c~~ -- ~~addvdi3.c~~ -- ~~addvsi3.c~~ -- ~~addvti3.c~~ -- ~~arm/aeabi_cdcmp.S~~ -- ~~arm/aeabi_cdcmpeq_check_nan.c~~ -- ~~arm/aeabi_cfcmp.S~~ -- ~~arm/aeabi_cfcmpeq_check_nan.c~~ -- ~~arm/aeabi_div0.c~~ -- ~~arm/aeabi_drsub.c~~ -- ~~arm/aeabi_frsub.c~~ -- ~~arm/aeabi_memcmp.S~~ -- ~~arm/bswapdi2.S~~ -- ~~arm/bswapsi2.S~~ -- ~~arm/clzdi2.S~~ -- ~~arm/clzsi2.S~~ -- ~~arm/comparesf2.S~~ -- ~~arm/restore_vfp_d8_d15_regs.S~~ -- ~~arm/save_vfp_d8_d15_regs.S~~ -- ~~arm/switch16.S~~ -- ~~arm/switch32.S~~ -- ~~arm/switch8.S~~ -- ~~arm/switchu8.S~~ -- ~~clzdi2.c~~ -- ~~clzsi2.c~~ -- ~~clzti2.c~~ -- ~~cmpdi2.c~~ -- ~~cmpti2.c~~ -- ~~ctzdi2.c~~ -- ~~ctzsi2.c~~ -- ~~ctzti2.c~~ -- ~~ffsdi2.c~~ - this is [called by gcc][jemalloc-fail] though! 
-- ~~ffsti2.c~~ -- ~~mulvdi3.c~~ -- ~~mulvsi3.c~~ -- ~~mulvti3.c~~ -- ~~negdf2.c~~ -- ~~negdi2.c~~ -- ~~negsf2.c~~ -- ~~negti2.c~~ -- ~~negvdi2.c~~ -- ~~negvsi2.c~~ -- ~~negvti2.c~~ -- ~~paritydi2.c~~ -- ~~paritysi2.c~~ -- ~~parityti2.c~~ -- ~~popcountdi2.c~~ -- ~~popcountsi2.c~~ -- ~~popcountti2.c~~ -- ~~ppc/restFP.S~~ -- ~~ppc/saveFP.S~~ -- ~~subvdi3.c~~ -- ~~subvsi3.c~~ -- ~~subvti3.c~~ -- ~~ucmpdi2.c~~ -- ~~ucmpti2.c~~ -- ~~udivmodti4.c~~ - -[jemalloc-fail]: https://travis-ci.org/rust-lang/rust/jobs/249772758 - -Rust only exposes atomic types on platforms that support them, and therefore does not need to fall back to software implementations. - -- ~~arm/sync_fetch_and_add_4.S~~ -- ~~arm/sync_fetch_and_add_8.S~~ -- ~~arm/sync_fetch_and_and_4.S~~ -- ~~arm/sync_fetch_and_and_8.S~~ -- ~~arm/sync_fetch_and_max_4.S~~ -- ~~arm/sync_fetch_and_max_8.S~~ -- ~~arm/sync_fetch_and_min_4.S~~ -- ~~arm/sync_fetch_and_min_8.S~~ -- ~~arm/sync_fetch_and_nand_4.S~~ -- ~~arm/sync_fetch_and_nand_8.S~~ -- ~~arm/sync_fetch_and_or_4.S~~ -- ~~arm/sync_fetch_and_or_8.S~~ -- ~~arm/sync_fetch_and_sub_4.S~~ -- ~~arm/sync_fetch_and_sub_8.S~~ -- ~~arm/sync_fetch_and_umax_4.S~~ -- ~~arm/sync_fetch_and_umax_8.S~~ -- ~~arm/sync_fetch_and_umin_4.S~~ -- ~~arm/sync_fetch_and_umin_8.S~~ -- ~~arm/sync_fetch_and_xor_4.S~~ -- ~~arm/sync_fetch_and_xor_8.S~~ -- ~~arm/sync_synchronize.S~~ -- ~~atomic.c~~ -- ~~atomic_flag_clear.c~~ -- ~~atomic_flag_clear_explicit.c~~ -- ~~atomic_flag_test_and_set.c~~ -- ~~atomic_flag_test_and_set_explicit.c~~ -- ~~atomic_signal_fence.c~~ -- ~~atomic_thread_fence.c~~ - -Miscellaneous functionality that is not used by Rust. - -- ~~apple_versioning.c~~ -- ~~clear_cache.c~~ -- ~~emutls.c~~ -- ~~enable_execute_stack.c~~ -- ~~eprintf.c~~ -- ~~gcc_personality_v0.c~~ -- ~~trampoline_setup.c~~ - -Floating-point implementations of builtins that are only called from soft-float code. It would be better to simply use the generic soft-float versions in this case. - -- ~~i386/floatdidf.S~~ -- ~~i386/floatdisf.S~~ -- ~~i386/floatundidf.S~~ -- ~~i386/floatundisf.S~~ -- ~~x86_64/floatundidf.S~~ -- ~~x86_64/floatundisf.S~~ -- ~~x86_64/floatdidf.c~~ -- ~~x86_64/floatdisf.c~~ - -## License - -The compiler-builtins crate is dual licensed under both the University of -Illinois "BSD-Like" license and the MIT license. As a user of this code you may -choose to use it under either license. As a contributor, you agree to allow -your code to be used under both. - -Full text of the relevant licenses is in LICENSE.TXT. diff --git a/asm/memcpy.S b/asm/memcpy.S new file mode 100644 index 0000000..cd7962e --- /dev/null +++ b/asm/memcpy.S @@ -0,0 +1,619 @@ +/* Copyright (c) 2013, Linaro Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Linaro Limited nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This memcpy routine is optimised for Cortex-A15 cores and takes advantage + of VFP or NEON when built with the appropriate flags. + + Assumptions: + + ARMv6 (ARMv7-a if using Neon) + ARM state + Unaligned accesses + LDRD/STRD support unaligned word accesses + + If compiled with GCC, this file should be enclosed within following + pre-processing check: + if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) + + */ + .syntax unified + /* This implementation requires ARM state. */ + .arm + +#ifdef __ARM_NEON__ + + .fpu neon + .arch armv7-a +# define FRAME_SIZE 4 +# define USE_VFP +# define USE_NEON + +#elif !defined (__SOFTFP__) + + .arch armv6 + .fpu vfpv2 +# define FRAME_SIZE 32 +# define USE_VFP + +#else + .arch armv6 +# define FRAME_SIZE 32 + +#endif + +/* Old versions of GAS incorrectly implement the NEON align semantics. */ +#ifdef BROKEN_ASM_NEON_ALIGN +#define ALIGN(addr, align) addr,:align +#else +#define ALIGN(addr, align) addr:align +#endif + +#define PC_OFFSET 8 /* PC pipeline compensation. */ +#define INSN_SIZE 4 + +/* Call parameters. */ +#define dstin r0 +#define src r1 +#define count r2 + +/* Locals. */ +#define tmp1 r3 +#define dst ip +#define tmp2 r10 + +#ifndef USE_NEON +/* For bulk copies using GP registers. */ +#define A_l r2 /* Call-clobbered. */ +#define A_h r3 /* Call-clobbered. */ +#define B_l r4 +#define B_h r5 +#define C_l r6 +#define C_h r7 +#define D_l r8 +#define D_h r9 +#endif + +/* Number of lines ahead to pre-fetch data. If you change this the code + below will need adjustment to compensate. 
*/ + +#define prefetch_lines 5 + +#ifdef USE_VFP + .macro cpy_line_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm + + .macro cpy_tail_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm +#endif + + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm + +def_fn memcpy p2align=6 + + mov dst, dstin /* Preserve dstin, we need to return it. */ + cmp count, #64 + bge .Lcpy_not_short + /* Deal with small copies quickly by dropping straight into the + exit block. */ + +.Ltail63unaligned: +#ifdef USE_NEON + and tmp1, count, #0x38 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + vld1.8 {d0}, [src]! /* 14 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 12 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 10 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 8 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 6 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 4 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 2 words to go. */ + vst1.8 {d0}, [dst]! + + tst count, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 +#else + /* Copy up to 15 full words of data. May not be aligned. */ + /* Cannot use VFP for unaligned data. */ + and tmp1, count, #0x3c + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) + /* Jump directly into the sequence below at the correct offset. */ + add pc, pc, tmp1, lsl #1 + + ldr tmp1, [src, #-60] /* 15 words to go. */ + str tmp1, [dst, #-60] + + ldr tmp1, [src, #-56] /* 14 words to go. */ + str tmp1, [dst, #-56] + ldr tmp1, [src, #-52] + str tmp1, [dst, #-52] + + ldr tmp1, [src, #-48] /* 12 words to go. */ + str tmp1, [dst, #-48] + ldr tmp1, [src, #-44] + str tmp1, [dst, #-44] + + ldr tmp1, [src, #-40] /* 10 words to go. */ + str tmp1, [dst, #-40] + ldr tmp1, [src, #-36] + str tmp1, [dst, #-36] + + ldr tmp1, [src, #-32] /* 8 words to go. */ + str tmp1, [dst, #-32] + ldr tmp1, [src, #-28] + str tmp1, [dst, #-28] + + ldr tmp1, [src, #-24] /* 6 words to go. */ + str tmp1, [dst, #-24] + ldr tmp1, [src, #-20] + str tmp1, [dst, #-20] + + ldr tmp1, [src, #-16] /* 4 words to go. */ + str tmp1, [dst, #-16] + ldr tmp1, [src, #-12] + str tmp1, [dst, #-12] + + ldr tmp1, [src, #-8] /* 2 words to go. */ + str tmp1, [dst, #-8] + ldr tmp1, [src, #-4] + str tmp1, [dst, #-4] +#endif + + lsls count, count, #31 + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. */ + strhcs tmp1, [dst], #2 + strbne src, [dst] + bx lr + +.Lcpy_not_short: + /* At least 64 bytes to copy, but don't know the alignment yet. 
*/ + str tmp2, [sp, #-FRAME_SIZE]! + and tmp2, src, #7 + and tmp1, dst, #7 + cmp tmp1, tmp2 + bne .Lcpy_notaligned + +#ifdef USE_VFP + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show + that the FP pipeline is much better at streaming loads and + stores. This is outside the critical loop. */ + vmov.f32 s0, s0 +#endif + + /* SRC and DST have the same mutual 32-bit alignment, but we may + still need to pre-copy some bytes to get to natural alignment. + We bring DST into full 64-bit alignment. */ + lsls tmp2, dst, #29 + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 + +1: + subs tmp2, count, #64 /* Use tmp2 for count. */ + blt .Ltail63aligned + + cmp tmp2, #512 + bge .Lcpy_body_long + +.Lcpy_body_medium: /* Count in tmp2. */ +#ifdef USE_VFP +1: + vldr d0, [src, #0] + subs tmp2, tmp2, #64 + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] + add src, src, #64 + vstr d1, [dst, #56] + add dst, dst, #64 + bge 1b + tst tmp2, #0x3f + beq .Ldone + +.Ltail63aligned: /* Count in tmp2. */ + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + + vldr d0, [src, #-56] /* 14 words to go. */ + vstr d0, [dst, #-56] + vldr d0, [src, #-48] /* 12 words to go. */ + vstr d0, [dst, #-48] + vldr d0, [src, #-40] /* 10 words to go. */ + vstr d0, [dst, #-40] + vldr d0, [src, #-32] /* 8 words to go. */ + vstr d0, [dst, #-32] + vldr d0, [src, #-24] /* 6 words to go. */ + vstr d0, [dst, #-24] + vldr d0, [src, #-16] /* 4 words to go. */ + vstr d0, [dst, #-16] + vldr d0, [src, #-8] /* 2 words to go. */ + vstr d0, [dst, #-8] +#else + sub src, src, #8 + sub dst, dst, #8 +1: + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! + subs tmp2, tmp2, #64 + bge 1b + tst tmp2, #0x3f + bne 1f + ldr tmp2,[sp], #FRAME_SIZE + bx lr +1: + add src, src, #8 + add dst, dst, #8 + +.Ltail63aligned: /* Count in tmp2. */ + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but + we know that the src and dest are 32-bit aligned so we can use + LDRD/STRD to improve efficiency. */ + /* TMP2 is now negative, but we don't care about that. The bottom + six bits still tell us how many bytes are left to copy. */ + + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ + strd A_l, A_h, [dst, #-56] + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ + strd A_l, A_h, [dst, #-48] + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ + strd A_l, A_h, [dst, #-40] + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ + strd A_l, A_h, [dst, #-32] + ldrd A_l, A_h, [src, #-24] /* 6 words to go. 
*/ + strd A_l, A_h, [dst, #-24] + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ + strd A_l, A_h, [dst, #-16] + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ + strd A_l, A_h, [dst, #-8] + +#endif + tst tmp2, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] + +.Ldone: + ldr tmp2, [sp], #FRAME_SIZE + bx lr + +.Lcpy_body_long: /* Count in tmp2. */ + + /* Long copy. We know that there's at least (prefetch_lines * 64) + bytes to go. */ +#ifdef USE_VFP + /* Don't use PLD. Instead, read some data in advance of the current + copy position into a register. This should act like a PLD + operation but we won't have to repeat the transfer. */ + + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] + add src, src, #32 + + subs tmp2, tmp2, #prefetch_lines * 64 * 2 + blt 2f +1: + cpy_line_vfp d3, 0 + cpy_line_vfp d4, 64 + cpy_line_vfp d5, 128 + add dst, dst, #3 * 64 + add src, src, #3 * 64 + cpy_line_vfp d6, 0 + cpy_line_vfp d7, 64 + add dst, dst, #2 * 64 + add src, src, #2 * 64 + subs tmp2, tmp2, #prefetch_lines * 64 + bge 1b + +2: + cpy_tail_vfp d3, 0 + cpy_tail_vfp d4, 64 + cpy_tail_vfp d5, 128 + add src, src, #3 * 64 + add dst, dst, #3 * 64 + cpy_tail_vfp d6, 0 + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] + add src, src, #96 + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] + add dst, dst, #128 + add tmp2, tmp2, #prefetch_lines * 64 + b .Lcpy_body_medium +#else + /* Long copy. Use an SMS style loop to maximize the I/O + bandwidth of the core. We don't have enough spare registers + to synthesise prefetching, so use PLD operations. */ + /* Pre-bias src and dst. */ + sub src, src, #8 + sub dst, dst, #8 + pld [src, #8] + pld [src, #72] + subs tmp2, tmp2, #64 + pld [src, #136] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [sp, #24] + pld [src, #200] + ldrd D_l, D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] + bcs 2b + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #40 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + tst tmp2, #0x3f + bne .Ltail63aligned + ldr tmp2, [sp], #FRAME_SIZE + bx lr +#endif + +.Lcpy_notaligned: + pld [src] + pld [src, #64] + /* There's at least 64 bytes to copy, but there is no mutual + alignment. */ + /* Bring DST to 64-bit alignment. 
*/
+        lsls    tmp2, dst, #29
+        pld     [src, #(2 * 64)]
+        beq     1f
+        rsbs    tmp2, tmp2, #0
+        sub     count, count, tmp2, lsr #29
+        ldrmi   tmp1, [src], #4
+        strmi   tmp1, [dst], #4
+        lsls    tmp2, tmp2, #2
+        ldrbne  tmp1, [src], #1
+        ldrhcs  tmp2, [src], #2
+        strbne  tmp1, [dst], #1
+        strhcs  tmp2, [dst], #2
+1:
+        pld     [src, #(3 * 64)]
+        subs    count, count, #64
+        ldrmi   tmp2, [sp], #FRAME_SIZE
+        bmi     .Ltail63unaligned
+        pld     [src, #(4 * 64)]
+
+#ifdef USE_NEON
+        vld1.8  {d0-d3}, [src]!
+        vld1.8  {d4-d7}, [src]!
+        subs    count, count, #64
+        bmi     2f
+1:
+        pld     [src, #(4 * 64)]
+        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
+        vld1.8  {d0-d3}, [src]!
+        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
+        vld1.8  {d4-d7}, [src]!
+        subs    count, count, #64
+        bpl     1b
+2:
+        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
+        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
+        ands    count, count, #0x3f
+#else
+        /* Use an SMS style loop to maximize the I/O bandwidth. */
+        sub     src, src, #4
+        sub     dst, dst, #8
+        subs    tmp2, count, #64        /* Use tmp2 for count. */
+        ldr     A_l, [src, #4]
+        ldr     A_h, [src, #8]
+        strd    B_l, B_h, [sp, #8]
+        ldr     B_l, [src, #12]
+        ldr     B_h, [src, #16]
+        strd    C_l, C_h, [sp, #16]
+        ldr     C_l, [src, #20]
+        ldr     C_h, [src, #24]
+        strd    D_l, D_h, [sp, #24]
+        ldr     D_l, [src, #28]
+        ldr     D_h, [src, #32]!
+        b       1f
+        .p2align  6
+2:
+        pld     [src, #(5 * 64) - (32 - 4)]
+        strd    A_l, A_h, [dst, #40]
+        ldr     A_l, [src, #36]
+        ldr     A_h, [src, #40]
+        strd    B_l, B_h, [dst, #48]
+        ldr     B_l, [src, #44]
+        ldr     B_h, [src, #48]
+        strd    C_l, C_h, [dst, #56]
+        ldr     C_l, [src, #52]
+        ldr     C_h, [src, #56]
+        strd    D_l, D_h, [dst, #64]!
+        ldr     D_l, [src, #60]
+        ldr     D_h, [src, #64]!
+        subs    tmp2, tmp2, #64
+1:
+        strd    A_l, A_h, [dst, #8]
+        ldr     A_l, [src, #4]
+        ldr     A_h, [src, #8]
+        strd    B_l, B_h, [dst, #16]
+        ldr     B_l, [src, #12]
+        ldr     B_h, [src, #16]
+        strd    C_l, C_h, [dst, #24]
+        ldr     C_l, [src, #20]
+        ldr     C_h, [src, #24]
+        strd    D_l, D_h, [dst, #32]
+        ldr     D_l, [src, #28]
+        ldr     D_h, [src, #32]
+        bcs     2b
+
+        /* Save the remaining bytes and restore the callee-saved regs. */
+        strd    A_l, A_h, [dst, #40]
+        add     src, src, #36
+        strd    B_l, B_h, [dst, #48]
+        ldrd    B_l, B_h, [sp, #8]
+        strd    C_l, C_h, [dst, #56]
+        ldrd    C_l, C_h, [sp, #16]
+        strd    D_l, D_h, [dst, #64]
+        ldrd    D_l, D_h, [sp, #24]
+        add     dst, dst, #72
+        ands    count, tmp2, #0x3f
+#endif
+        ldr     tmp2, [sp], #FRAME_SIZE
+        bne     .Ltail63unaligned
+        bx      lr
+
+        .size   memcpy, . - memcpy
diff --git a/build.rs b/build.rs
index f948edb..e7c3d49 100644
--- a/build.rs
+++ b/build.rs
@@ -73,6 +73,33 @@ fn main() {
     if llvm_target[0] == "armv4t" || llvm_target[0] == "armv5te" {
         println!("cargo:rustc-cfg=kernel_user_helpers")
    }
+
+    compile_memcpy();
+}
+
+fn compile_memcpy() {
+    extern crate cc;
+    use std::path::Path;
+
+    // Build the assembly memcpy with clang. This fork only targets the Zynq
+    // (armv7-a with NEON), so the target triple is hard-coded here.
+    let cfg = &mut cc::Build::new();
+    cfg.compiler("clang");
+    cfg.no_default_flags(true);
+    cfg.warnings(false);
+    cfg.flag("--target=armv7-none-eabihf");
+
+    let sources = vec![
+        "memcpy.S",
+    ];
+
+    let root = Path::new("./asm");
+    for src in sources {
+        println!("cargo:rerun-if-changed={}", root.join(src).display());
+        cfg.file(root.join(src));
+    }
+
+    cfg.compile("memcpy");
+}
 
 #[cfg(feature = "c")]
diff --git a/src/arm.rs b/src/arm.rs
index 190bba7..28e5506 100644
--- a/src/arm.rs
+++ b/src/arm.rs
@@ -139,28 +139,24 @@ pub unsafe fn __aeabi_ldivmod() {
 }
 
 // FIXME: The `*4` and `*8` variants should be defined as aliases.
-
-#[cfg(not(target_os = "ios"))]
-#[cfg_attr(not(feature = "mangled-names"), no_mangle)]
-#[cfg_attr(thumb, linkage = "weak")]
-pub unsafe extern "aapcs" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) {
-    ::mem::memcpy(dest, src, n);
+extern "C" {
+    // Optimized NEON memcpy from asm/memcpy.S; like C memcpy it returns `dest`.
+    fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8;
 }
 
 #[cfg(not(target_os = "ios"))]
 #[cfg_attr(not(feature = "mangled-names"), no_mangle)]
 #[cfg_attr(thumb, linkage = "weak")]
-pub unsafe extern "aapcs" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, mut n: usize) {
-    // We are guaranteed 4-alignment, so accessing at u32 is okay.
-    let mut dest = dest as *mut u32;
-    let mut src = src as *mut u32;
+pub unsafe extern "aapcs" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) {
+    memcpy(dest, src, n);
+}
 
-    while n >= 4 {
-        *dest = *src;
-        dest = dest.offset(1);
-        src = src.offset(1);
-        n -= 4;
-    }
+#[cfg(not(target_os = "ios"))]
+#[cfg_attr(not(feature = "mangled-names"), no_mangle)]
+#[cfg_attr(thumb, linkage = "weak")]
+pub unsafe extern "aapcs" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize) {
+    // We are guaranteed 4-byte alignment here, but the assembly memcpy handles
+    // unaligned buffers itself, so simply forward to `__aeabi_memcpy`.
 
     __aeabi_memcpy(dest as *mut u8, src as *const u8, n);
 }
diff --git a/src/mem.rs b/src/mem.rs
index 24552ed..69abd05 100644
--- a/src/mem.rs
+++ b/src/mem.rs
@@ -9,16 +9,6 @@ use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div};
 use core::mem;
 use core::ops::{BitOr, Shl};
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *dest.offset(i as isize) = *src.offset(i as isize);
-        i += 1;
-    }
-    dest
-}
-
 #[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
 pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
     if src < dest as *const u8 {
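
---

Reviewer note: the patch above reroutes `__aeabi_memcpy` (and, via forwarding, the `*4`/`*8` variants) to the assembly `memcpy`. The sketch below is not part of the patch; it is a hypothetical host-side check that drives an `extern "C"` memcpy through the same signature and compares it against a safe Rust copy. On a host build the symbol resolves to the platform libc rather than asm/memcpy.S, so this only validates the interface and harness, not the NEON code itself.

```rust
// Hypothetical host-side sanity check (not part of this patch). On the Zynq
// build, `memcpy` would resolve to asm/memcpy.S; here it is the host libc.
extern "C" {
    fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8;
}

fn main() {
    let src: Vec<u8> = (0u8..=255).collect();
    // Sizes chosen to straddle the 64-byte fast-path threshold in the assembly.
    for &n in &[0usize, 1, 3, 4, 63, 64, 65, 200, 256] {
        let mut fast = vec![0u8; src.len()];
        let mut reference = vec![0u8; src.len()];
        unsafe {
            memcpy(fast.as_mut_ptr(), src.as_ptr(), n);
        }
        reference[..n].copy_from_slice(&src[..n]);
        // Comparing the full buffers also catches writes past `n`.
        assert_eq!(fast, reference, "mismatch for n = {}", n);
    }
    println!("memcpy forwarding interface OK");
}
```

On the actual target the interesting cases are the misaligned ones; offsetting the source and destination pointers by 1..7 bytes before calling would exercise the `.Lcpy_notaligned` path in asm/memcpy.S.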