Added memcpy with neon optimization for zynq.
parent 2635ae9a6c
commit c21d3aeec3
Cargo.toml
@@ -32,7 +32,7 @@ test = false
 core = { version = "1.0.0", optional = true, package = 'rustc-std-workspace-core' }
 
 [build-dependencies]
-cc = { optional = true, version = "1.0" }
+cc = { version = "1.0" }
 
 [dev-dependencies]
 panic-handler = { path = 'crates/panic-handler' }
@@ -42,7 +42,7 @@ default = ["compiler-builtins"]
 
 # Enable compilation of C code in compiler-rt, filling in some more optimized
 # implementations and also filling in unimplemented intrinsics
-c = ["cc"]
+# c = ["cc"]
 
 # Flag this library as the unstable compiler-builtins lib
 compiler-builtins = []
README.md
@@ -1,398 +1,9 @@
# `compiler-builtins`

Fork of [compiler-builtins](https://github.com/rust-lang/compiler-builtins) from
Rust for the Zynq, with a fast memcpy function adapted from newlib. We have to
fork it because compiler-builtins would use its slow memcpy implementation for
the compiler intrinsics regardless of whether you provide your own memcpy.
See [#253](https://github.com/rust-lang/compiler-builtins/issues/253).

> Porting `compiler-rt` intrinsics to Rust

The memcpy function is written in assembly with NEON optimization.

See [rust-lang/rust#35437][0].

[0]: https://github.com/rust-lang/rust/issues/35437
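As a rough illustration (not part of the README itself), downstream code like the sketch below is what ends up in the optimized routine: the compiler lowers bulk copies to a memcpy intrinsic (`__aeabi_memcpy` on ARM EABI), which this fork now forwards to the NEON assembly implementation. The function name is made up for the example.

```rust
// Minimal sketch: a bulk copy in downstream code is lowered by the compiler to
// a memcpy call (`__aeabi_memcpy` on ARM EABI), which this fork backs with the
// NEON-optimized assembly routine instead of a byte-by-byte loop.
#![no_std]

use core::ptr;

pub unsafe fn copy_block(dst: *mut u8, src: *const u8, len: usize) {
    ptr::copy_nonoverlapping(src, dst, len);
}
```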
## When and how to use this crate?

If you are working with a target that doesn't have binary releases of std
available via rustup (this probably means you are building the core crate
yourself) and need compiler-rt intrinsics (i.e. you are probably getting linker
errors when building an executable: `undefined reference to __aeabi_memcpy`),
you can use this crate to get those intrinsics and solve the linker errors. To
do that, add this crate somewhere in the dependency graph of the crate you are
building:

``` toml
# Cargo.toml
[dependencies]
compiler_builtins = { git = "https://github.com/rust-lang/compiler-builtins" }
```

``` rust
extern crate compiler_builtins;

// ...
```

If you still get an "undefined reference to $INTRINSIC" error after that change,
that means that we haven't ported `$INTRINSIC` to Rust yet! Please open [an
issue] with the name of the intrinsic and the LLVM triple (e.g.
thumbv7m-none-eabi) of the target you are using. That way we can prioritize
porting that particular intrinsic.

If you've got a C compiler available for your target, then while we implement
this intrinsic you can temporarily enable a fallback to the actual compiler-rt
implementation for unimplemented intrinsics:

```toml
[dependencies.compiler_builtins]
git = "https://github.com/rust-lang/compiler-builtins"
features = ["c"]
```

[an issue]: https://github.com/rust-lang/compiler-builtins/issues
|
||||
|
||||
## Contributing

1. Pick one or more intrinsics from the [pending list](#progress).
2. Fork this repository.
3. Port the intrinsic(s) and their corresponding [unit tests][1] from their
   [C implementation][2] to Rust.
4. Implement a [test generator][3] to compare the behavior of the ported intrinsic(s)
   with their implementation on the testing host. Note that randomized compiler-builtins
   tests should be run using `cargo test --features gen-tests`.
5. Send a Pull Request (PR).
6. Once the PR passes our extensive [testing infrastructure][4], we'll merge it!
7. Celebrate :tada:

[1]: https://github.com/rust-lang/compiler-rt/tree/8598065bd965d9713bfafb6c1e766d63a7b17b89/test/builtins/Unit
[2]: https://github.com/rust-lang/compiler-rt/tree/8598065bd965d9713bfafb6c1e766d63a7b17b89/lib/builtins
[3]: https://github.com/rust-lang/compiler-builtins/blob/0ba07e49264a54cb5bbd4856fcea083bb3fbec15/build.rs#L180-L265
[4]: https://travis-ci.org/rust-lang/compiler-builtins
|
||||
|
||||
### Porting Reminders

1. [Rust][5a] and [C][5b] have slightly different operator precedence. C evaluates comparisons (`==`, `!=`) before bitwise operations (`&`, `|`, `^`), while Rust evaluates the bitwise operations first (see the sketch after this list).
2. C assumes wrapping operations everywhere. Rust panics on overflow when in debug mode. Consider using the [Wrapping][6] type or the explicit [wrapping_*][7] functions where applicable; this is also illustrated below.
3. Note [C implicit casts][8], especially integer promotion. Rust is much more explicit about casting, so be sure that any cast which affects the output is ported to the Rust implementation.
4. Rust has [many functions][9] for integer and floating-point manipulation in the standard library. Consider using one of these functions rather than porting a new one.

[5a]: https://doc.rust-lang.org/reference/expressions.html#expression-precedence
[5b]: http://en.cppreference.com/w/c/language/operator_precedence
[6]: https://doc.rust-lang.org/core/num/struct.Wrapping.html
[7]: https://doc.rust-lang.org/std/primitive.i32.html#method.wrapping_add
[8]: http://en.cppreference.com/w/cpp/language/implicit_conversion
[9]: https://doc.rust-lang.org/std/primitive.i32.html
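The following sketch (not from the original README) illustrates reminders 1 and 2; the values are arbitrary.

```rust
use std::num::Wrapping;

fn main() {
    // Reminder 1: in Rust `&` binds tighter than `==`, so this is `(x & 1) == 0`.
    // The same expression in C would parse as `x & (1 == 0)`.
    let x: u32 = 6;
    assert!(x & 1 == 0);

    // Reminder 2: C assumes wrapping arithmetic; Rust panics on overflow in
    // debug builds, so wrap explicitly when porting.
    let a = Wrapping(u32::max_value());
    assert_eq!((a + Wrapping(1)).0, 0);
    assert_eq!(u32::max_value().wrapping_add(1), 0);
}
```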
|
||||
|
||||
## Progress
|
||||
|
||||
- [x] adddf3.c
|
||||
- [x] addsf3.c
|
||||
- [x] arm/adddf3vfp.S
|
||||
- [x] arm/addsf3vfp.S
|
||||
- [x] arm/aeabi_dcmp.S
|
||||
- [x] arm/aeabi_fcmp.S
|
||||
- [x] arm/aeabi_idivmod.S
|
||||
- [x] arm/aeabi_ldivmod.S
|
||||
- [x] arm/aeabi_memcpy.S
|
||||
- [x] arm/aeabi_memmove.S
|
||||
- [x] arm/aeabi_memset.S
|
||||
- [x] arm/aeabi_uidivmod.S
|
||||
- [x] arm/aeabi_uldivmod.S
|
||||
- [x] arm/divdf3vfp.S
|
||||
- [ ] arm/divmodsi4.S (generic version is done)
|
||||
- [x] arm/divsf3vfp.S
|
||||
- [ ] arm/divsi3.S (generic version is done)
|
||||
- [x] arm/eqdf2vfp.S
|
||||
- [x] arm/eqsf2vfp.S
|
||||
- [x] arm/extendsfdf2vfp.S
|
||||
- [ ] arm/fixdfsivfp.S
|
||||
- [ ] arm/fixsfsivfp.S
|
||||
- [ ] arm/fixunsdfsivfp.S
|
||||
- [ ] arm/fixunssfsivfp.S
|
||||
- [ ] arm/floatsidfvfp.S
|
||||
- [ ] arm/floatsisfvfp.S
|
||||
- [ ] arm/floatunssidfvfp.S
|
||||
- [ ] arm/floatunssisfvfp.S
|
||||
- [x] arm/gedf2vfp.S
|
||||
- [x] arm/gesf2vfp.S
|
||||
- [x] arm/gtdf2vfp.S
|
||||
- [x] arm/gtsf2vfp.S
|
||||
- [x] arm/ledf2vfp.S
|
||||
- [x] arm/lesf2vfp.S
|
||||
- [x] arm/ltdf2vfp.S
|
||||
- [x] arm/ltsf2vfp.S
|
||||
- [ ] arm/modsi3.S (generic version is done)
|
||||
- [x] arm/muldf3vfp.S
|
||||
- [x] arm/mulsf3vfp.S
|
||||
- [x] arm/nedf2vfp.S
|
||||
- [ ] arm/negdf2vfp.S
|
||||
- [ ] arm/negsf2vfp.S
|
||||
- [x] arm/nesf2vfp.S
|
||||
- [x] arm/softfloat-alias.list
|
||||
- [x] arm/subdf3vfp.S
|
||||
- [x] arm/subsf3vfp.S
|
||||
- [ ] arm/truncdfsf2vfp.S
|
||||
- [ ] arm/udivmodsi4.S (generic version is done)
|
||||
- [ ] arm/udivsi3.S (generic version is done)
|
||||
- [ ] arm/umodsi3.S (generic version is done)
|
||||
- [ ] arm/unorddf2vfp.S
|
||||
- [ ] arm/unordsf2vfp.S
|
||||
- [x] ashldi3.c
|
||||
- [x] ashrdi3.c
|
||||
- [x] comparedf2.c
|
||||
- [x] comparesf2.c
|
||||
- [x] divdf3.c
|
||||
- [x] divdi3.c
|
||||
- [x] divmoddi4.c
|
||||
- [x] divmodsi4.c
|
||||
- [x] divsf3.c
|
||||
- [x] divsi3.c
|
||||
- [ ] extendhfsf2.c
|
||||
- [x] extendsfdf2.c
|
||||
- [x] fixdfdi.c
|
||||
- [x] fixdfsi.c
|
||||
- [x] fixsfdi.c
|
||||
- [x] fixsfsi.c
|
||||
- [x] fixunsdfdi.c
|
||||
- [x] fixunsdfsi.c
|
||||
- [x] fixunssfdi.c
|
||||
- [x] fixunssfsi.c
|
||||
- [x] floatdidf.c
|
||||
- [x] floatdisf.c
|
||||
- [x] floatsidf.c
|
||||
- [x] floatsisf.c
|
||||
- [x] floatundidf.c
|
||||
- [x] floatundisf.c
|
||||
- [x] floatunsidf.c
|
||||
- [x] floatunsisf.c
|
||||
- [ ] i386/ashldi3.S
|
||||
- [ ] i386/ashrdi3.S
|
||||
- [x] i386/chkstk.S
|
||||
- [x] i386/chkstk2.S
|
||||
- [ ] i386/divdi3.S
|
||||
- [ ] i386/lshrdi3.S
|
||||
- [ ] i386/moddi3.S
|
||||
- [ ] i386/muldi3.S
|
||||
- [ ] i386/udivdi3.S
|
||||
- [ ] i386/umoddi3.S
|
||||
- [x] lshrdi3.c
|
||||
- [x] moddi3.c
|
||||
- [x] modsi3.c
|
||||
- [x] muldf3.c
|
||||
- [x] muldi3.c
|
||||
- [x] mulodi4.c
|
||||
- [x] mulosi4.c
|
||||
- [x] mulsf3.c
|
||||
- [x] powidf2.c
|
||||
- [x] powisf2.c
|
||||
- [x] subdf3.c
|
||||
- [x] subsf3.c
|
||||
- [ ] truncdfhf2.c
|
||||
- [ ] truncdfsf2.c
|
||||
- [ ] truncsfhf2.c
|
||||
- [x] udivdi3.c
|
||||
- [x] udivmoddi4.c
|
||||
- [x] udivmodsi4.c
|
||||
- [x] udivsi3.c
|
||||
- [x] umoddi3.c
|
||||
- [x] umodsi3.c
|
||||
- [x] x86_64/chkstk.S
|
||||
- [x] x86_64/chkstk2.S
|
||||
|
||||
These builtins are needed to support 128-bit integers, which are in the process of being added to Rust; the short example after this list shows the kind of code that pulls them in.
|
||||
|
||||
- [x] ashlti3.c
|
||||
- [x] ashrti3.c
|
||||
- [x] divti3.c
|
||||
- [x] fixdfti.c
|
||||
- [x] fixsfti.c
|
||||
- [x] fixunsdfti.c
|
||||
- [x] fixunssfti.c
|
||||
- [x] floattidf.c
|
||||
- [x] floattisf.c
|
||||
- [x] floatuntidf.c
|
||||
- [x] floatuntisf.c
|
||||
- [x] lshrti3.c
|
||||
- [x] modti3.c
|
||||
- [x] muloti4.c
|
||||
- [x] multi3.c
|
||||
- [x] udivmodti4.c
|
||||
- [x] udivti3.c
|
||||
- [x] umodti3.c
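As a rough illustration (not from the original README), 128-bit arithmetic such as the following is lowered to these builtins on targets without native 128-bit support, e.g. `__multi3` for the multiplication and `__udivti3` for the division.

```rust
// Illustrative only: each u128 operation below becomes a call to one of the
// intrinsics listed above when the hardware has no native 128-bit support.
pub fn scale(a: u128, b: u128) -> u128 {
    (a * b) / 3
}
```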
|
||||
|
||||
## Unimplemented functions
|
||||
|
||||
These builtins involve floating-point types ("`f128`", "`f80`" and complex numbers) that are not supported by Rust.
|
||||
|
||||
- ~~addtf3.c~~
|
||||
- ~~comparetf2.c~~
|
||||
- ~~divdc3.c~~
|
||||
- ~~divsc3.c~~
|
||||
- ~~divtc3.c~~
|
||||
- ~~divtf3.c~~
|
||||
- ~~divxc3.c~~
|
||||
- ~~extenddftf2.c~~
|
||||
- ~~extendsftf2.c~~
|
||||
- ~~fixtfdi.c~~
|
||||
- ~~fixtfsi.c~~
|
||||
- ~~fixtfti.c~~
|
||||
- ~~fixunstfdi.c~~
|
||||
- ~~fixunstfsi.c~~
|
||||
- ~~fixunstfti.c~~
|
||||
- ~~fixunsxfdi.c~~
|
||||
- ~~fixunsxfsi.c~~
|
||||
- ~~fixunsxfti.c~~
|
||||
- ~~fixxfdi.c~~
|
||||
- ~~fixxfti.c~~
|
||||
- ~~floatditf.c~~
|
||||
- ~~floatdixf.c~~
|
||||
- ~~floatsitf.c~~
|
||||
- ~~floattixf.c~~
|
||||
- ~~floatunditf.c~~
|
||||
- ~~floatundixf.c~~
|
||||
- ~~floatunsitf.c~~
|
||||
- ~~floatuntixf.c~~
|
||||
- ~~i386/floatdixf.S~~
|
||||
- ~~i386/floatundixf.S~~
|
||||
- ~~muldc3.c~~
|
||||
- ~~mulsc3.c~~
|
||||
- ~~multc3.c~~
|
||||
- ~~multf3.c~~
|
||||
- ~~mulxc3.c~~
|
||||
- ~~powitf2.c~~
|
||||
- ~~powixf2.c~~
|
||||
- ~~ppc/divtc3.c~~
|
||||
- ~~ppc/fixtfdi.c~~
|
||||
- ~~ppc/fixunstfdi.c~~
|
||||
- ~~ppc/floatditf.c~~
|
||||
- ~~ppc/floatunditf.c~~
|
||||
- ~~ppc/gcc_qadd.c~~
|
||||
- ~~ppc/gcc_qdiv.c~~
|
||||
- ~~ppc/gcc_qmul.c~~
|
||||
- ~~ppc/gcc_qsub.c~~
|
||||
- ~~ppc/multc3.c~~
|
||||
- ~~subtf3.c~~
|
||||
- ~~trunctfdf2.c~~
|
||||
- ~~trunctfsf2.c~~
|
||||
- ~~x86_64/floatdixf.c~~
|
||||
- ~~x86_64/floatundixf.S~~
|
||||
|
||||
These builtins are never called by LLVM.
|
||||
|
||||
- ~~absvdi2.c~~
|
||||
- ~~absvsi2.c~~
|
||||
- ~~absvti2.c~~
|
||||
- ~~addvdi3.c~~
|
||||
- ~~addvsi3.c~~
|
||||
- ~~addvti3.c~~
|
||||
- ~~arm/aeabi_cdcmp.S~~
|
||||
- ~~arm/aeabi_cdcmpeq_check_nan.c~~
|
||||
- ~~arm/aeabi_cfcmp.S~~
|
||||
- ~~arm/aeabi_cfcmpeq_check_nan.c~~
|
||||
- ~~arm/aeabi_div0.c~~
|
||||
- ~~arm/aeabi_drsub.c~~
|
||||
- ~~arm/aeabi_frsub.c~~
|
||||
- ~~arm/aeabi_memcmp.S~~
|
||||
- ~~arm/bswapdi2.S~~
|
||||
- ~~arm/bswapsi2.S~~
|
||||
- ~~arm/clzdi2.S~~
|
||||
- ~~arm/clzsi2.S~~
|
||||
- ~~arm/comparesf2.S~~
|
||||
- ~~arm/restore_vfp_d8_d15_regs.S~~
|
||||
- ~~arm/save_vfp_d8_d15_regs.S~~
|
||||
- ~~arm/switch16.S~~
|
||||
- ~~arm/switch32.S~~
|
||||
- ~~arm/switch8.S~~
|
||||
- ~~arm/switchu8.S~~
|
||||
- ~~clzdi2.c~~
|
||||
- ~~clzsi2.c~~
|
||||
- ~~clzti2.c~~
|
||||
- ~~cmpdi2.c~~
|
||||
- ~~cmpti2.c~~
|
||||
- ~~ctzdi2.c~~
|
||||
- ~~ctzsi2.c~~
|
||||
- ~~ctzti2.c~~
|
||||
- ~~ffsdi2.c~~ - this is [called by gcc][jemalloc-fail] though!
|
||||
- ~~ffsti2.c~~
|
||||
- ~~mulvdi3.c~~
|
||||
- ~~mulvsi3.c~~
|
||||
- ~~mulvti3.c~~
|
||||
- ~~negdf2.c~~
|
||||
- ~~negdi2.c~~
|
||||
- ~~negsf2.c~~
|
||||
- ~~negti2.c~~
|
||||
- ~~negvdi2.c~~
|
||||
- ~~negvsi2.c~~
|
||||
- ~~negvti2.c~~
|
||||
- ~~paritydi2.c~~
|
||||
- ~~paritysi2.c~~
|
||||
- ~~parityti2.c~~
|
||||
- ~~popcountdi2.c~~
|
||||
- ~~popcountsi2.c~~
|
||||
- ~~popcountti2.c~~
|
||||
- ~~ppc/restFP.S~~
|
||||
- ~~ppc/saveFP.S~~
|
||||
- ~~subvdi3.c~~
|
||||
- ~~subvsi3.c~~
|
||||
- ~~subvti3.c~~
|
||||
- ~~ucmpdi2.c~~
|
||||
- ~~ucmpti2.c~~
|
||||
- ~~udivmodti4.c~~
|
||||
|
||||
[jemalloc-fail]: https://travis-ci.org/rust-lang/rust/jobs/249772758
|
||||
|
||||
Rust only exposes atomic types on platforms that support them, and therefore does not need to fall back to software implementations.
|
||||
|
||||
- ~~arm/sync_fetch_and_add_4.S~~
|
||||
- ~~arm/sync_fetch_and_add_8.S~~
|
||||
- ~~arm/sync_fetch_and_and_4.S~~
|
||||
- ~~arm/sync_fetch_and_and_8.S~~
|
||||
- ~~arm/sync_fetch_and_max_4.S~~
|
||||
- ~~arm/sync_fetch_and_max_8.S~~
|
||||
- ~~arm/sync_fetch_and_min_4.S~~
|
||||
- ~~arm/sync_fetch_and_min_8.S~~
|
||||
- ~~arm/sync_fetch_and_nand_4.S~~
|
||||
- ~~arm/sync_fetch_and_nand_8.S~~
|
||||
- ~~arm/sync_fetch_and_or_4.S~~
|
||||
- ~~arm/sync_fetch_and_or_8.S~~
|
||||
- ~~arm/sync_fetch_and_sub_4.S~~
|
||||
- ~~arm/sync_fetch_and_sub_8.S~~
|
||||
- ~~arm/sync_fetch_and_umax_4.S~~
|
||||
- ~~arm/sync_fetch_and_umax_8.S~~
|
||||
- ~~arm/sync_fetch_and_umin_4.S~~
|
||||
- ~~arm/sync_fetch_and_umin_8.S~~
|
||||
- ~~arm/sync_fetch_and_xor_4.S~~
|
||||
- ~~arm/sync_fetch_and_xor_8.S~~
|
||||
- ~~arm/sync_synchronize.S~~
|
||||
- ~~atomic.c~~
|
||||
- ~~atomic_flag_clear.c~~
|
||||
- ~~atomic_flag_clear_explicit.c~~
|
||||
- ~~atomic_flag_test_and_set.c~~
|
||||
- ~~atomic_flag_test_and_set_explicit.c~~
|
||||
- ~~atomic_signal_fence.c~~
|
||||
- ~~atomic_thread_fence.c~~
|
||||
|
||||
Miscellaneous functionality that is not used by Rust.
|
||||
|
||||
- ~~apple_versioning.c~~
|
||||
- ~~clear_cache.c~~
|
||||
- ~~emutls.c~~
|
||||
- ~~enable_execute_stack.c~~
|
||||
- ~~eprintf.c~~
|
||||
- ~~gcc_personality_v0.c~~
|
||||
- ~~trampoline_setup.c~~
|
||||
|
||||
Floating-point implementations of builtins that are only called from soft-float code. It would be better to simply use the generic soft-float versions in this case.
|
||||
|
||||
- ~~i386/floatdidf.S~~
|
||||
- ~~i386/floatdisf.S~~
|
||||
- ~~i386/floatundidf.S~~
|
||||
- ~~i386/floatundisf.S~~
|
||||
- ~~x86_64/floatundidf.S~~
|
||||
- ~~x86_64/floatundisf.S~~
|
||||
- ~~x86_64/floatdidf.c~~
|
||||
- ~~x86_64/floatdisf.c~~
|
||||
|
||||
## License
|
||||
|
||||
The compiler-builtins crate is dual licensed under both the University of
|
||||
Illinois "BSD-Like" license and the MIT license. As a user of this code you may
|
||||
choose to use it under either license. As a contributor, you agree to allow
|
||||
your code to be used under both.
|
||||
|
||||
Full text of the relevant licenses is in LICENSE.TXT.
|
||||
|
asm/memcpy.S (new file)
@@ -0,0 +1,619 @@
|
|||
/* Copyright (c) 2013, Linaro Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Linaro Limited nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
This memcpy routine is optimised for Cortex-A15 cores and takes advantage
|
||||
of VFP or NEON when built with the appropriate flags.
|
||||
|
||||
Assumptions:
|
||||
|
||||
ARMv6 (ARMv7-a if using Neon)
|
||||
ARM state
|
||||
Unaligned accesses
|
||||
LDRD/STRD support unaligned word accesses
|
||||
|
||||
If compiled with GCC, this file should be enclosed within following
|
||||
pre-processing check:
|
||||
if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)
|
||||
|
||||
*/
|
||||
.syntax unified
|
||||
/* This implementation requires ARM state. */
|
||||
.arm
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
|
||||
.fpu neon
|
||||
.arch armv7-a
|
||||
# define FRAME_SIZE 4
|
||||
# define USE_VFP
|
||||
# define USE_NEON
|
||||
|
||||
#elif !defined (__SOFTFP__)
|
||||
|
||||
.arch armv6
|
||||
.fpu vfpv2
|
||||
# define FRAME_SIZE 32
|
||||
# define USE_VFP
|
||||
|
||||
#else
|
||||
.arch armv6
|
||||
# define FRAME_SIZE 32
|
||||
|
||||
#endif
|
||||
|
||||
/* Old versions of GAS incorrectly implement the NEON align semantics. */
|
||||
#ifdef BROKEN_ASM_NEON_ALIGN
|
||||
#define ALIGN(addr, align) addr,:align
|
||||
#else
|
||||
#define ALIGN(addr, align) addr:align
|
||||
#endif
|
||||
|
||||
#define PC_OFFSET 8 /* PC pipeline compensation. */
|
||||
#define INSN_SIZE 4
|
||||
|
||||
/* Call parameters. */
|
||||
#define dstin r0
|
||||
#define src r1
|
||||
#define count r2
|
||||
|
||||
/* Locals. */
|
||||
#define tmp1 r3
|
||||
#define dst ip
|
||||
#define tmp2 r10
|
||||
|
||||
#ifndef USE_NEON
|
||||
/* For bulk copies using GP registers. */
|
||||
#define A_l r2 /* Call-clobbered. */
|
||||
#define A_h r3 /* Call-clobbered. */
|
||||
#define B_l r4
|
||||
#define B_h r5
|
||||
#define C_l r6
|
||||
#define C_h r7
|
||||
#define D_l r8
|
||||
#define D_h r9
|
||||
#endif
|
||||
|
||||
/* Number of lines ahead to pre-fetch data. If you change this the code
|
||||
below will need adjustment to compensate. */
|
||||
|
||||
#define prefetch_lines 5
|
||||
|
||||
#ifdef USE_VFP
|
||||
.macro cpy_line_vfp vreg, base
|
||||
vstr \vreg, [dst, #\base]
|
||||
vldr \vreg, [src, #\base]
|
||||
vstr d0, [dst, #\base + 8]
|
||||
vldr d0, [src, #\base + 8]
|
||||
vstr d1, [dst, #\base + 16]
|
||||
vldr d1, [src, #\base + 16]
|
||||
vstr d2, [dst, #\base + 24]
|
||||
vldr d2, [src, #\base + 24]
|
||||
vstr \vreg, [dst, #\base + 32]
|
||||
vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
|
||||
vstr d0, [dst, #\base + 40]
|
||||
vldr d0, [src, #\base + 40]
|
||||
vstr d1, [dst, #\base + 48]
|
||||
vldr d1, [src, #\base + 48]
|
||||
vstr d2, [dst, #\base + 56]
|
||||
vldr d2, [src, #\base + 56]
|
||||
.endm
|
||||
|
||||
.macro cpy_tail_vfp vreg, base
|
||||
vstr \vreg, [dst, #\base]
|
||||
vldr \vreg, [src, #\base]
|
||||
vstr d0, [dst, #\base + 8]
|
||||
vldr d0, [src, #\base + 8]
|
||||
vstr d1, [dst, #\base + 16]
|
||||
vldr d1, [src, #\base + 16]
|
||||
vstr d2, [dst, #\base + 24]
|
||||
vldr d2, [src, #\base + 24]
|
||||
vstr \vreg, [dst, #\base + 32]
|
||||
vstr d0, [dst, #\base + 40]
|
||||
vldr d0, [src, #\base + 40]
|
||||
vstr d1, [dst, #\base + 48]
|
||||
vldr d1, [src, #\base + 48]
|
||||
vstr d2, [dst, #\base + 56]
|
||||
vldr d2, [src, #\base + 56]
|
||||
.endm
|
||||
#endif
|
||||
|
||||
.macro def_fn f p2align=0
|
||||
.text
|
||||
.p2align \p2align
|
||||
.global \f
|
||||
.type \f, %function
|
||||
\f:
|
||||
.endm
|
||||
|
||||
def_fn memcpy p2align=6
|
||||
|
||||
mov dst, dstin /* Preserve dstin, we need to return it. */
|
||||
cmp count, #64
|
||||
bge .Lcpy_not_short
|
||||
/* Deal with small copies quickly by dropping straight into the
|
||||
exit block. */
|
||||
|
||||
.Ltail63unaligned:
|
||||
#ifdef USE_NEON
|
||||
and tmp1, count, #0x38
|
||||
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
|
||||
add pc, pc, tmp1
|
||||
vld1.8 {d0}, [src]! /* 14 words to go. */
|
||||
vst1.8 {d0}, [dst]!
|
||||
vld1.8 {d0}, [src]! /* 12 words to go. */
|
||||
vst1.8 {d0}, [dst]!
|
||||
vld1.8 {d0}, [src]! /* 10 words to go. */
|
||||
vst1.8 {d0}, [dst]!
|
||||
vld1.8 {d0}, [src]! /* 8 words to go. */
|
||||
vst1.8 {d0}, [dst]!
|
||||
vld1.8 {d0}, [src]! /* 6 words to go. */
|
||||
vst1.8 {d0}, [dst]!
|
||||
vld1.8 {d0}, [src]! /* 4 words to go. */
|
||||
vst1.8 {d0}, [dst]!
|
||||
vld1.8 {d0}, [src]! /* 2 words to go. */
|
||||
vst1.8 {d0}, [dst]!
|
||||
|
||||
tst count, #4
|
||||
ldrne tmp1, [src], #4
|
||||
strne tmp1, [dst], #4
|
||||
#else
|
||||
/* Copy up to 15 full words of data. May not be aligned. */
|
||||
/* Cannot use VFP for unaligned data. */
|
||||
and tmp1, count, #0x3c
|
||||
add dst, dst, tmp1
|
||||
add src, src, tmp1
|
||||
rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
|
||||
/* Jump directly into the sequence below at the correct offset. */
|
||||
add pc, pc, tmp1, lsl #1
|
||||
|
||||
ldr tmp1, [src, #-60] /* 15 words to go. */
|
||||
str tmp1, [dst, #-60]
|
||||
|
||||
ldr tmp1, [src, #-56] /* 14 words to go. */
|
||||
str tmp1, [dst, #-56]
|
||||
ldr tmp1, [src, #-52]
|
||||
str tmp1, [dst, #-52]
|
||||
|
||||
ldr tmp1, [src, #-48] /* 12 words to go. */
|
||||
str tmp1, [dst, #-48]
|
||||
ldr tmp1, [src, #-44]
|
||||
str tmp1, [dst, #-44]
|
||||
|
||||
ldr tmp1, [src, #-40] /* 10 words to go. */
|
||||
str tmp1, [dst, #-40]
|
||||
ldr tmp1, [src, #-36]
|
||||
str tmp1, [dst, #-36]
|
||||
|
||||
ldr tmp1, [src, #-32] /* 8 words to go. */
|
||||
str tmp1, [dst, #-32]
|
||||
ldr tmp1, [src, #-28]
|
||||
str tmp1, [dst, #-28]
|
||||
|
||||
ldr tmp1, [src, #-24] /* 6 words to go. */
|
||||
str tmp1, [dst, #-24]
|
||||
ldr tmp1, [src, #-20]
|
||||
str tmp1, [dst, #-20]
|
||||
|
||||
ldr tmp1, [src, #-16] /* 4 words to go. */
|
||||
str tmp1, [dst, #-16]
|
||||
ldr tmp1, [src, #-12]
|
||||
str tmp1, [dst, #-12]
|
||||
|
||||
ldr tmp1, [src, #-8] /* 2 words to go. */
|
||||
str tmp1, [dst, #-8]
|
||||
ldr tmp1, [src, #-4]
|
||||
str tmp1, [dst, #-4]
|
||||
#endif
|
||||
|
||||
lsls count, count, #31
|
||||
ldrhcs tmp1, [src], #2
|
||||
ldrbne src, [src] /* Src is dead, use as a scratch. */
|
||||
strhcs tmp1, [dst], #2
|
||||
strbne src, [dst]
|
||||
bx lr
|
||||
|
||||
.Lcpy_not_short:
|
||||
/* At least 64 bytes to copy, but don't know the alignment yet. */
|
||||
str tmp2, [sp, #-FRAME_SIZE]!
|
||||
and tmp2, src, #7
|
||||
and tmp1, dst, #7
|
||||
cmp tmp1, tmp2
|
||||
bne .Lcpy_notaligned
|
||||
|
||||
#ifdef USE_VFP
|
||||
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
|
||||
that the FP pipeline is much better at streaming loads and
|
||||
stores. This is outside the critical loop. */
|
||||
vmov.f32 s0, s0
|
||||
#endif
|
||||
|
||||
/* SRC and DST have the same mutual 32-bit alignment, but we may
|
||||
still need to pre-copy some bytes to get to natural alignment.
|
||||
We bring DST into full 64-bit alignment. */
|
||||
lsls tmp2, dst, #29
|
||||
beq 1f
|
||||
rsbs tmp2, tmp2, #0
|
||||
sub count, count, tmp2, lsr #29
|
||||
ldrmi tmp1, [src], #4
|
||||
strmi tmp1, [dst], #4
|
||||
lsls tmp2, tmp2, #2
|
||||
ldrhcs tmp1, [src], #2
|
||||
ldrbne tmp2, [src], #1
|
||||
strhcs tmp1, [dst], #2
|
||||
strbne tmp2, [dst], #1
|
||||
|
||||
1:
|
||||
subs tmp2, count, #64 /* Use tmp2 for count. */
|
||||
blt .Ltail63aligned
|
||||
|
||||
cmp tmp2, #512
|
||||
bge .Lcpy_body_long
|
||||
|
||||
.Lcpy_body_medium: /* Count in tmp2. */
|
||||
#ifdef USE_VFP
|
||||
1:
|
||||
vldr d0, [src, #0]
|
||||
subs tmp2, tmp2, #64
|
||||
vldr d1, [src, #8]
|
||||
vstr d0, [dst, #0]
|
||||
vldr d0, [src, #16]
|
||||
vstr d1, [dst, #8]
|
||||
vldr d1, [src, #24]
|
||||
vstr d0, [dst, #16]
|
||||
vldr d0, [src, #32]
|
||||
vstr d1, [dst, #24]
|
||||
vldr d1, [src, #40]
|
||||
vstr d0, [dst, #32]
|
||||
vldr d0, [src, #48]
|
||||
vstr d1, [dst, #40]
|
||||
vldr d1, [src, #56]
|
||||
vstr d0, [dst, #48]
|
||||
add src, src, #64
|
||||
vstr d1, [dst, #56]
|
||||
add dst, dst, #64
|
||||
bge 1b
|
||||
tst tmp2, #0x3f
|
||||
beq .Ldone
|
||||
|
||||
.Ltail63aligned: /* Count in tmp2. */
|
||||
and tmp1, tmp2, #0x38
|
||||
add dst, dst, tmp1
|
||||
add src, src, tmp1
|
||||
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
|
||||
add pc, pc, tmp1
|
||||
|
||||
vldr d0, [src, #-56] /* 14 words to go. */
|
||||
vstr d0, [dst, #-56]
|
||||
vldr d0, [src, #-48] /* 12 words to go. */
|
||||
vstr d0, [dst, #-48]
|
||||
vldr d0, [src, #-40] /* 10 words to go. */
|
||||
vstr d0, [dst, #-40]
|
||||
vldr d0, [src, #-32] /* 8 words to go. */
|
||||
vstr d0, [dst, #-32]
|
||||
vldr d0, [src, #-24] /* 6 words to go. */
|
||||
vstr d0, [dst, #-24]
|
||||
vldr d0, [src, #-16] /* 4 words to go. */
|
||||
vstr d0, [dst, #-16]
|
||||
vldr d0, [src, #-8] /* 2 words to go. */
|
||||
vstr d0, [dst, #-8]
|
||||
#else
|
||||
sub src, src, #8
|
||||
sub dst, dst, #8
|
||||
1:
|
||||
ldrd A_l, A_h, [src, #8]
|
||||
strd A_l, A_h, [dst, #8]
|
||||
ldrd A_l, A_h, [src, #16]
|
||||
strd A_l, A_h, [dst, #16]
|
||||
ldrd A_l, A_h, [src, #24]
|
||||
strd A_l, A_h, [dst, #24]
|
||||
ldrd A_l, A_h, [src, #32]
|
||||
strd A_l, A_h, [dst, #32]
|
||||
ldrd A_l, A_h, [src, #40]
|
||||
strd A_l, A_h, [dst, #40]
|
||||
ldrd A_l, A_h, [src, #48]
|
||||
strd A_l, A_h, [dst, #48]
|
||||
ldrd A_l, A_h, [src, #56]
|
||||
strd A_l, A_h, [dst, #56]
|
||||
ldrd A_l, A_h, [src, #64]!
|
||||
strd A_l, A_h, [dst, #64]!
|
||||
subs tmp2, tmp2, #64
|
||||
bge 1b
|
||||
tst tmp2, #0x3f
|
||||
bne 1f
|
||||
ldr tmp2,[sp], #FRAME_SIZE
|
||||
bx lr
|
||||
1:
|
||||
add src, src, #8
|
||||
add dst, dst, #8
|
||||
|
||||
.Ltail63aligned: /* Count in tmp2. */
|
||||
/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
|
||||
we know that the src and dest are 32-bit aligned so we can use
|
||||
LDRD/STRD to improve efficiency. */
|
||||
/* TMP2 is now negative, but we don't care about that. The bottom
|
||||
six bits still tell us how many bytes are left to copy. */
|
||||
|
||||
and tmp1, tmp2, #0x38
|
||||
add dst, dst, tmp1
|
||||
add src, src, tmp1
|
||||
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
|
||||
add pc, pc, tmp1
|
||||
ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
|
||||
strd A_l, A_h, [dst, #-56]
|
||||
ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
|
||||
strd A_l, A_h, [dst, #-48]
|
||||
ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
|
||||
strd A_l, A_h, [dst, #-40]
|
||||
ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
|
||||
strd A_l, A_h, [dst, #-32]
|
||||
ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
|
||||
strd A_l, A_h, [dst, #-24]
|
||||
ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
|
||||
strd A_l, A_h, [dst, #-16]
|
||||
ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
|
||||
strd A_l, A_h, [dst, #-8]
|
||||
|
||||
#endif
|
||||
tst tmp2, #4
|
||||
ldrne tmp1, [src], #4
|
||||
strne tmp1, [dst], #4
|
||||
lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
|
||||
ldrhcs tmp1, [src], #2
|
||||
ldrbne tmp2, [src]
|
||||
strhcs tmp1, [dst], #2
|
||||
strbne tmp2, [dst]
|
||||
|
||||
.Ldone:
|
||||
ldr tmp2, [sp], #FRAME_SIZE
|
||||
bx lr
|
||||
|
||||
.Lcpy_body_long: /* Count in tmp2. */
|
||||
|
||||
/* Long copy. We know that there's at least (prefetch_lines * 64)
|
||||
bytes to go. */
|
||||
#ifdef USE_VFP
|
||||
/* Don't use PLD. Instead, read some data in advance of the current
|
||||
copy position into a register. This should act like a PLD
|
||||
operation but we won't have to repeat the transfer. */
|
||||
|
||||
vldr d3, [src, #0]
|
||||
vldr d4, [src, #64]
|
||||
vldr d5, [src, #128]
|
||||
vldr d6, [src, #192]
|
||||
vldr d7, [src, #256]
|
||||
|
||||
vldr d0, [src, #8]
|
||||
vldr d1, [src, #16]
|
||||
vldr d2, [src, #24]
|
||||
add src, src, #32
|
||||
|
||||
subs tmp2, tmp2, #prefetch_lines * 64 * 2
|
||||
blt 2f
|
||||
1:
|
||||
cpy_line_vfp d3, 0
|
||||
cpy_line_vfp d4, 64
|
||||
cpy_line_vfp d5, 128
|
||||
add dst, dst, #3 * 64
|
||||
add src, src, #3 * 64
|
||||
cpy_line_vfp d6, 0
|
||||
cpy_line_vfp d7, 64
|
||||
add dst, dst, #2 * 64
|
||||
add src, src, #2 * 64
|
||||
subs tmp2, tmp2, #prefetch_lines * 64
|
||||
bge 1b
|
||||
|
||||
2:
|
||||
cpy_tail_vfp d3, 0
|
||||
cpy_tail_vfp d4, 64
|
||||
cpy_tail_vfp d5, 128
|
||||
add src, src, #3 * 64
|
||||
add dst, dst, #3 * 64
|
||||
cpy_tail_vfp d6, 0
|
||||
vstr d7, [dst, #64]
|
||||
vldr d7, [src, #64]
|
||||
vstr d0, [dst, #64 + 8]
|
||||
vldr d0, [src, #64 + 8]
|
||||
vstr d1, [dst, #64 + 16]
|
||||
vldr d1, [src, #64 + 16]
|
||||
vstr d2, [dst, #64 + 24]
|
||||
vldr d2, [src, #64 + 24]
|
||||
vstr d7, [dst, #64 + 32]
|
||||
add src, src, #96
|
||||
vstr d0, [dst, #64 + 40]
|
||||
vstr d1, [dst, #64 + 48]
|
||||
vstr d2, [dst, #64 + 56]
|
||||
add dst, dst, #128
|
||||
add tmp2, tmp2, #prefetch_lines * 64
|
||||
b .Lcpy_body_medium
|
||||
#else
|
||||
/* Long copy. Use an SMS style loop to maximize the I/O
|
||||
bandwidth of the core. We don't have enough spare registers
|
||||
to synthesise prefetching, so use PLD operations. */
|
||||
/* Pre-bias src and dst. */
|
||||
sub src, src, #8
|
||||
sub dst, dst, #8
|
||||
pld [src, #8]
|
||||
pld [src, #72]
|
||||
subs tmp2, tmp2, #64
|
||||
pld [src, #136]
|
||||
ldrd A_l, A_h, [src, #8]
|
||||
strd B_l, B_h, [sp, #8]
|
||||
ldrd B_l, B_h, [src, #16]
|
||||
strd C_l, C_h, [sp, #16]
|
||||
ldrd C_l, C_h, [src, #24]
|
||||
strd D_l, D_h, [sp, #24]
|
||||
pld [src, #200]
|
||||
ldrd D_l, D_h, [src, #32]!
|
||||
b 1f
|
||||
.p2align 6
|
||||
2:
|
||||
pld [src, #232]
|
||||
strd A_l, A_h, [dst, #40]
|
||||
ldrd A_l, A_h, [src, #40]
|
||||
strd B_l, B_h, [dst, #48]
|
||||
ldrd B_l, B_h, [src, #48]
|
||||
strd C_l, C_h, [dst, #56]
|
||||
ldrd C_l, C_h, [src, #56]
|
||||
strd D_l, D_h, [dst, #64]!
|
||||
ldrd D_l, D_h, [src, #64]!
|
||||
subs tmp2, tmp2, #64
|
||||
1:
|
||||
strd A_l, A_h, [dst, #8]
|
||||
ldrd A_l, A_h, [src, #8]
|
||||
strd B_l, B_h, [dst, #16]
|
||||
ldrd B_l, B_h, [src, #16]
|
||||
strd C_l, C_h, [dst, #24]
|
||||
ldrd C_l, C_h, [src, #24]
|
||||
strd D_l, D_h, [dst, #32]
|
||||
ldrd D_l, D_h, [src, #32]
|
||||
bcs 2b
|
||||
/* Save the remaining bytes and restore the callee-saved regs. */
|
||||
strd A_l, A_h, [dst, #40]
|
||||
add src, src, #40
|
||||
strd B_l, B_h, [dst, #48]
|
||||
ldrd B_l, B_h, [sp, #8]
|
||||
strd C_l, C_h, [dst, #56]
|
||||
ldrd C_l, C_h, [sp, #16]
|
||||
strd D_l, D_h, [dst, #64]
|
||||
ldrd D_l, D_h, [sp, #24]
|
||||
add dst, dst, #72
|
||||
tst tmp2, #0x3f
|
||||
bne .Ltail63aligned
|
||||
ldr tmp2, [sp], #FRAME_SIZE
|
||||
bx lr
|
||||
#endif
|
||||
|
||||
.Lcpy_notaligned:
|
||||
pld [src]
|
||||
pld [src, #64]
|
||||
/* There's at least 64 bytes to copy, but there is no mutual
|
||||
alignment. */
|
||||
/* Bring DST to 64-bit alignment. */
|
||||
lsls tmp2, dst, #29
|
||||
pld [src, #(2 * 64)]
|
||||
beq 1f
|
||||
rsbs tmp2, tmp2, #0
|
||||
sub count, count, tmp2, lsr #29
|
||||
ldrmi tmp1, [src], #4
|
||||
strmi tmp1, [dst], #4
|
||||
lsls tmp2, tmp2, #2
|
||||
ldrbne tmp1, [src], #1
|
||||
ldrhcs tmp2, [src], #2
|
||||
strbne tmp1, [dst], #1
|
||||
strhcs tmp2, [dst], #2
|
||||
1:
|
||||
pld [src, #(3 * 64)]
|
||||
subs count, count, #64
|
||||
ldrmi tmp2, [sp], #FRAME_SIZE
|
||||
bmi .Ltail63unaligned
|
||||
pld [src, #(4 * 64)]
|
||||
|
||||
#ifdef USE_NEON
|
||||
vld1.8 {d0-d3}, [src]!
|
||||
vld1.8 {d4-d7}, [src]!
|
||||
subs count, count, #64
|
||||
bmi 2f
|
||||
1:
|
||||
pld [src, #(4 * 64)]
|
||||
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
|
||||
vld1.8 {d0-d3}, [src]!
|
||||
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
|
||||
vld1.8 {d4-d7}, [src]!
|
||||
subs count, count, #64
|
||||
bpl 1b
|
||||
2:
|
||||
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
|
||||
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
|
||||
ands count, count, #0x3f
|
||||
#else
|
||||
/* Use an SMS style loop to maximize the I/O bandwidth. */
|
||||
sub src, src, #4
|
||||
sub dst, dst, #8
|
||||
subs tmp2, count, #64 /* Use tmp2 for count. */
|
||||
ldr A_l, [src, #4]
|
||||
ldr A_h, [src, #8]
|
||||
strd B_l, B_h, [sp, #8]
|
||||
ldr B_l, [src, #12]
|
||||
ldr B_h, [src, #16]
|
||||
strd C_l, C_h, [sp, #16]
|
||||
ldr C_l, [src, #20]
|
||||
ldr C_h, [src, #24]
|
||||
strd D_l, D_h, [sp, #24]
|
||||
ldr D_l, [src, #28]
|
||||
ldr D_h, [src, #32]!
|
||||
b 1f
|
||||
.p2align 6
|
||||
2:
|
||||
pld [src, #(5 * 64) - (32 - 4)]
|
||||
strd A_l, A_h, [dst, #40]
|
||||
ldr A_l, [src, #36]
|
||||
ldr A_h, [src, #40]
|
||||
strd B_l, B_h, [dst, #48]
|
||||
ldr B_l, [src, #44]
|
||||
ldr B_h, [src, #48]
|
||||
strd C_l, C_h, [dst, #56]
|
||||
ldr C_l, [src, #52]
|
||||
ldr C_h, [src, #56]
|
||||
strd D_l, D_h, [dst, #64]!
|
||||
ldr D_l, [src, #60]
|
||||
ldr D_h, [src, #64]!
|
||||
subs tmp2, tmp2, #64
|
||||
1:
|
||||
strd A_l, A_h, [dst, #8]
|
||||
ldr A_l, [src, #4]
|
||||
ldr A_h, [src, #8]
|
||||
strd B_l, B_h, [dst, #16]
|
||||
ldr B_l, [src, #12]
|
||||
ldr B_h, [src, #16]
|
||||
strd C_l, C_h, [dst, #24]
|
||||
ldr C_l, [src, #20]
|
||||
ldr C_h, [src, #24]
|
||||
strd D_l, D_h, [dst, #32]
|
||||
ldr D_l, [src, #28]
|
||||
ldr D_h, [src, #32]
|
||||
bcs 2b
|
||||
|
||||
/* Save the remaining bytes and restore the callee-saved regs. */
|
||||
strd A_l, A_h, [dst, #40]
|
||||
add src, src, #36
|
||||
strd B_l, B_h, [dst, #48]
|
||||
ldrd B_l, B_h, [sp, #8]
|
||||
strd C_l, C_h, [dst, #56]
|
||||
ldrd C_l, C_h, [sp, #16]
|
||||
strd D_l, D_h, [dst, #64]
|
||||
ldrd D_l, D_h, [sp, #24]
|
||||
add dst, dst, #72
|
||||
ands count, tmp2, #0x3f
|
||||
#endif
|
||||
ldr tmp2, [sp], #FRAME_SIZE
|
||||
bne .Ltail63unaligned
|
||||
bx lr
|
||||
|
||||
.size memcpy, . - memcpy
build.rs
@@ -73,6 +73,31 @@ fn main() {
     if llvm_target[0] == "armv4t" || llvm_target[0] == "armv5te" {
         println!("cargo:rustc-cfg=kernel_user_helpers")
     }
+
+    compile_memcpy();
 }
+
+fn compile_memcpy() {
+    use std::path::Path;
+    extern crate cc;
+
+    let cfg = &mut cc::Build::new();
+    cfg.compiler("clang");
+    cfg.no_default_flags(true);
+    cfg.warnings(false);
+    cfg.flag("--target=armv7-none-eabihf");
+
+    let sources = vec![
+        "memcpy.S",
+    ];
+
+    let root = Path::new("./asm");
+    for src in sources {
+        println!("cargo:rerun-if-changed={}", src);
+        cfg.file(root.join(src));
+    }
+
+    cfg.compile("memcpy");
+}
 
 #[cfg(feature = "c")]
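A possible refinement, not part of this commit: gate the assembly build on the target triple so that host builds (for example, running the crate's tests on x86_64) do not require clang. This is only a sketch; the `TARGET` check, the `armv7` prefix match, and the chained builder style are assumptions rather than the committed code.

```rust
// build.rs sketch, assuming the asm/memcpy.S layout introduced above.
extern crate cc;

use std::env;
use std::path::Path;

fn main() {
    // Hypothetical guard: only assemble the NEON memcpy for ARMv7 targets.
    let target = env::var("TARGET").unwrap_or_default();
    if !target.starts_with("armv7") {
        return;
    }

    println!("cargo:rerun-if-changed=asm/memcpy.S");
    cc::Build::new()
        .compiler("clang")
        .no_default_flags(true)
        .warnings(false)
        .flag("--target=armv7-none-eabihf")
        .file(Path::new("asm").join("memcpy.S"))
        .compile("memcpy");
}
```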
src/arm.rs
@@ -139,28 +139,32 @@ pub unsafe fn __aeabi_ldivmod() {
 }
 
 // FIXME: The `*4` and `*8` variants should be defined as aliases.
 
-#[cfg(not(target_os = "ios"))]
-#[cfg_attr(not(feature = "mangled-names"), no_mangle)]
-#[cfg_attr(thumb, linkage = "weak")]
-pub unsafe extern "aapcs" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) {
-    ::mem::memcpy(dest, src, n);
+extern "C" {
+    // optimized memcpy using assembly
+    fn memcpy(dest: *mut u8, src: *const u8, n: usize);
 }
 
 #[cfg(not(target_os = "ios"))]
 #[cfg_attr(not(feature = "mangled-names"), no_mangle)]
 #[cfg_attr(thumb, linkage = "weak")]
-pub unsafe extern "aapcs" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, mut n: usize) {
-    // We are guaranteed 4-alignment, so accessing at u32 is okay.
-    let mut dest = dest as *mut u32;
-    let mut src = src as *mut u32;
+pub unsafe extern "aapcs" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) {
+    memcpy(dest, src, n);
+}
 
-    while n >= 4 {
-        *dest = *src;
-        dest = dest.offset(1);
-        src = src.offset(1);
-        n -= 4;
-    }
+#[cfg(not(target_os = "ios"))]
+#[cfg_attr(not(feature = "mangled-names"), no_mangle)]
+#[cfg_attr(thumb, linkage = "weak")]
+pub unsafe extern "aapcs" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize) {
+    // We are guaranteed 4-alignment, so accessing at u32 is okay.
+    // let mut dest = dest as *mut u32;
+    // let mut src = src as *mut u32;
+
+    // while n >= 4 {
+    //     *dest = *src;
+    //     dest = dest.offset(1);
+    //     src = src.offset(1);
+    //     n -= 4;
+    // }
 
     __aeabi_memcpy(dest as *mut u8, src as *const u8, n);
 }
src/mem.rs
@@ -9,16 +9,6 @@ use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}
 use core::mem;
 use core::ops::{BitOr, Shl};
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *dest.offset(i as isize) = *src.offset(i as isize);
-        i += 1;
-    }
-    dest
-}
-
 #[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
 pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
     if src < dest as *const u8 {