diff --git a/README.md b/README.md
index 32f85cd..73a25e9 100644
--- a/README.md
+++ b/README.md
@@ -37,8 +37,8 @@ See [rust-lang/rust#35437][0].
 - [x] arm/aeabi_memcpy.S
 - [x] arm/aeabi_memmove.S
 - [x] arm/aeabi_memset.S
-- [ ] arm/aeabi_uidivmod.S
-- [ ] arm/aeabi_uldivmod.S
+- [x] arm/aeabi_uidivmod.S
+- [x] arm/aeabi_uldivmod.S
 - [ ] arm/divdf3vfp.S
 - [ ] arm/divmodsi4.S
 - [ ] arm/divsf3vfp.S
@@ -78,41 +78,30 @@ See [rust-lang/rust#35437][0].
 - [ ] arm/umodsi3.S
 - [ ] arm/unorddf2vfp.S
 - [ ] arm/unordsf2vfp.S
-- [ ] ashldi3.c
-- [ ] ashlti3.c
-- [ ] ashrdi3.c
-- [ ] ashrti3.c
+- [x] ashldi3.c
+- [x] ashrdi3.c
 - [ ] divdf3.c
 - [ ] divdi3.c
 - [ ] divsf3.c
 - [ ] divsi3.c
-- [ ] divti3.c
 - [ ] extendhfsf2.c
 - [ ] extendsfdf2.c
 - [ ] fixdfdi.c
 - [ ] fixdfsi.c
-- [ ] fixdfti.c
 - [ ] fixsfdi.c
 - [ ] fixsfsi.c
-- [ ] fixsfti.c
 - [ ] fixunsdfdi.c
 - [ ] fixunsdfsi.c
-- [ ] fixunsdfti.c
 - [ ] fixunssfdi.c
 - [ ] fixunssfsi.c
-- [ ] fixunssfti.c
 - [ ] floatdidf.c
 - [ ] floatdisf.c
 - [ ] floatsidf.c
 - [ ] floatsisf.c
-- [ ] floattidf.c
-- [ ] floattisf.c
 - [ ] floatundidf.c
 - [ ] floatundisf.c
 - [ ] floatunsidf.c
 - [ ] floatunsisf.c
-- [ ] floatuntidf.c
-- [ ] floatuntisf.c
 - [ ] i386/ashldi3.S
 - [ ] i386/ashrdi3.S
 - [ ] i386/chkstk.S
@@ -123,18 +112,14 @@ See [rust-lang/rust#35437][0].
 - [ ] i386/muldi3.S
 - [ ] i386/udivdi3.S
 - [ ] i386/umoddi3.S
-- [ ] lshrdi3.c
-- [ ] lshrti3.c
+- [x] lshrdi3.c
 - [ ] moddi3.c
 - [ ] modsi3.c
-- [ ] modti3.c
 - [ ] muldf3.c
-- [ ] muldi3.c
-- [ ] mulodi4.c
-- [ ] mulosi4.c
-- [ ] muloti4.c
+- [x] muldi3.c
+- [x] mulodi4.c
+- [x] mulosi4.c
 - [ ] mulsf3.c
-- [ ] multi3.c
 - [ ] powidf2.c
 - [ ] powisf2.c
 - [ ] subdf3.c
@@ -142,17 +127,36 @@ See [rust-lang/rust#35437][0].
 - [ ] truncdfhf2.c
 - [ ] truncdfsf2.c
 - [ ] truncsfhf2.c
-- [ ] udivdi3.c
+- [x] udivdi3.c
 - [x] udivmoddi4.c
 - [x] udivmodsi4.c
-- [ ] udivsi3.c
-- [ ] udivti3.c
-- [ ] umoddi3.c
-- [ ] umodsi3.c
-- [ ] umodti3.c
+- [x] udivsi3.c
+- [x] umoddi3.c
+- [x] umodsi3.c
 - [ ] x86_64/chkstk.S
 - [ ] x86_64/chkstk2.S
 
+These builtins are needed to support 128-bit integers, which are in the process of being added to Rust.
+
+- [ ] ashlti3.c
+- [ ] ashrti3.c
+- [ ] divti3.c
+- [ ] fixdfti.c
+- [ ] fixsfti.c
+- [ ] fixunsdfti.c
+- [ ] fixunssfti.c
+- [ ] floattidf.c
+- [ ] floattisf.c
+- [ ] floatuntidf.c
+- [ ] floatuntisf.c
+- [ ] lshrti3.c
+- [ ] modti3.c
+- [ ] muloti4.c
+- [ ] multi3.c
+- [ ] udivmodti4.c
+- [ ] udivti3.c
+- [ ] umodti3.c
+
 ## Unimplemented functions
 
 These builtins involve floating-point types ("`f128`", "`f80`" and complex numbers) that are not supported by Rust.
diff --git a/src/arm.rs b/src/arm.rs
index 51206ed..0759b55 100644
--- a/src/arm.rs
+++ b/src/arm.rs
@@ -2,33 +2,28 @@ use core::intrinsics;
 
 // NOTE This function and the one below are implemented using assembly because they using a custom
 // calling convention which can't be implemented using a normal Rust function
-// TODO use `global_asm!`
 #[naked]
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_uidivmod() {
-    asm!("push    { lr }
-          sub     sp, sp, #4
-          mov     r2, sp
-          bl      __udivmodsi4
-          ldr     r1, [sp]
-          add     sp, sp, #4
-          pop     { pc }");
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe fn __aeabi_uidivmod() { // asm: reserves a 4-byte stack slot and passes its address in r2 to `__udivmodsi4`
+    asm!("push {lr}
+          sub sp, sp, #4
+          mov r2, sp
+          bl __udivmodsi4
+          ldr r1, [sp], #4
+          pop {pc}"); // the slot is loaded into r1 (post-incrementing sp) before returning via `pop {pc}`
     intrinsics::unreachable();
 }
 
-// TODO use `global_asm!`
 #[naked]
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_uldivmod() {
-    asm!("push	{r11, lr}
-          sub	sp, sp, #16
-          add	r12, sp, #8
-          str	r12, [sp]
-          bl	__udivmoddi4
-          ldr	r2, [sp, #8]
-          ldr	r3, [sp, #12]
-          add	sp, sp, #16
-          pop	{r11, pc}");
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe fn __aeabi_uldivmod() { // asm: stores at [sp] a pointer to an 8-byte stack slot at sp+8, then calls `__udivmoddi4`
+    asm!("push {lr}
+          sub r12, sp, #12
+          str r12, [sp, #-20]!
+          bl __udivmoddi4
+          ldrd r2, r3, [sp, #8]
+          add sp, sp, #20
+          pop {pc}"); // the slot is loaded into r2:r3, then the 20 bytes are released and the function returns
     intrinsics::unreachable();
 }
 
@@ -40,55 +35,55 @@ extern "C" {
 
 // FIXME: The `*4` and `*8` variants should be defined as aliases.
 
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) {
     memcpy(dest, src, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize) {
     memcpy(dest, src, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memcpy8(dest: *mut u8, src: *const u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memcpy8(dest: *mut u8, src: *const u8, n: usize) {
     memcpy(dest, src, n);
 }
 
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memmove(dest: *mut u8, src: *const u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memmove(dest: *mut u8, src: *const u8, n: usize) {
     memmove(dest, src, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memmove4(dest: *mut u8, src: *const u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memmove4(dest: *mut u8, src: *const u8, n: usize) {
     memmove(dest, src, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memmove8(dest: *mut u8, src: *const u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memmove8(dest: *mut u8, src: *const u8, n: usize) {
     memmove(dest, src, n);
 }
 
 // Note the different argument order
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memset(dest: *mut u8, n: usize, c: i32) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memset(dest: *mut u8, n: usize, c: i32) {
     memset(dest, c, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memset4(dest: *mut u8, n: usize, c: i32) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memset4(dest: *mut u8, n: usize, c: i32) {
     memset(dest, c, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memset8(dest: *mut u8, n: usize, c: i32) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memset8(dest: *mut u8, n: usize, c: i32) {
     memset(dest, c, n);
 }
 
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memclr(dest: *mut u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memclr(dest: *mut u8, n: usize) {
     memset(dest, 0, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memclr4(dest: *mut u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memclr4(dest: *mut u8, n: usize) {
     memset(dest, 0, n);
 }
-#[no_mangle]
-pub unsafe extern "aapcs" fn __aeabi_memclr8(dest: *mut u8, n: usize) {
+#[cfg_attr(not(test), no_mangle)]
+pub unsafe extern "C" fn __aeabi_memclr8(dest: *mut u8, n: usize) {
     memset(dest, 0, n);
 }
diff --git a/src/div.rs b/src/div.rs
deleted file mode 100644
index 44d59b7..0000000
--- a/src/div.rs
+++ /dev/null
@@ -1,271 +0,0 @@
-use {Int, LargeInt, U64};
-
-/// Returns `n / d`
-#[no_mangle]
-pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
-    let u32_bits = u32::bits() as u32;
-
-    // Special cases
-    if d == 0 {
-        panic!("Division by zero");
-    }
-
-    if n == 0 {
-        return 0;
-    }
-
-    let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros());
-
-    // d > n
-    if sr > u32_bits - 1 {
-        return 0;
-    }
-
-    // d == 1
-    if sr == u32_bits - 1 {
-        return n;
-    }
-
-    sr = sr + 1;
-
-    // 1 <= sr <= u32_bits - 1
-    let mut q = n << (u32_bits - sr);
-    let mut r = n >> sr;
-
-    let mut carry = 0;
-    for _ in 0..sr {
-        // r:q = ((r:q) << 1) | carry
-        r = (r << 1) | (q >> (u32_bits - 1));
-        q = (q << 1) | carry;
-
-        // carry = 0;
-        // if r > d {
-        //     r -= d;
-        //     carry = 1;
-        // }
-
-        let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32_bits - 1);
-        carry = (s & 1) as u32;
-        r -= d & s as u32;
-    }
-
-    (q << 1) | carry
-}
-
-/// Returns `n / d` and sets `*rem = n % d`
-#[no_mangle]
-pub extern "C" fn __udivmodsi4(a: u32, b: u32, rem: Option<&mut u32>) -> u32 {
-    let d = __udivsi3(a, b);
-    if let Some(rem) = rem {
-        *rem = a - (d * b);
-    }
-    return d;
-}
-
-/// Returns `n / d` and sets `*rem = n % d`
-#[no_mangle]
-pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
-    let u32_bits = u32::bits() as u32;
-    let u64_bits = u64::bits() as u32;
-
-    // NOTE X is unknown, K != 0
-    if n.high() == 0 {
-        if d.high() == 0 {
-            // 0 X
-            // ---
-            // 0 X
-
-            if let Some(rem) = rem {
-                *rem = u64::from(n.low() % d.low());
-            }
-            return u64::from(n.low() / d.low());
-        } else
-        // d.high() != 0
-        {
-            // 0 X
-            // ---
-            // K X
-
-            if let Some(rem) = rem {
-                *rem = u64::from(n.low());
-            }
-            return 0;
-        };
-    }
-
-    let mut sr;
-    let mut q = U64 { low: 0, high: 0 };
-    let mut r = U64 { low: 0, high: 0 };
-
-    // n.high() != 0
-    if d.low() == 0 {
-        if d.high() == 0 {
-            // K X
-            // ---
-            // 0 0
-
-            panic!("Division by zero");
-        }
-
-        // d.high() != 0
-        if n.low() == 0 {
-            // K 0
-            // ---
-            // K 0
-
-            if let Some(rem) = rem {
-                *rem = U64 {
-                    low: 0,
-                    high: n.high() % d.high(),
-                }[..];
-            }
-            return u64::from(n.high() / d.high());
-        }
-
-        // n.low() != 0
-        // K K
-        // ---
-        // K 0
-
-        if d.high().is_power_of_two() {
-            if let Some(rem) = rem {
-                *rem = U64 {
-                    low: n.low(),
-                    high: n.high() & (d.high() - 1),
-                }[..];
-            }
-
-            return u64::from(n.high() >> d.high().trailing_zeros());
-        }
-
-        sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-        // D > N
-        if sr > u32_bits - 2 {
-            if let Some(rem) = rem {
-                *rem = n;
-            }
-            return 0;
-        }
-
-        sr = sr + 1;
-
-        // 1 <= sr <= u32_bits - 1
-        // q = n << (u64_bits - sr);
-        q.low = 0;
-        q.high = n.low() << (u32_bits - sr);
-        // r = n >> sr
-        r.high = n.high() >> sr;
-        r.low = (n.high() << (u32_bits - sr)) | (n.low() >> sr);
-    } else
-    // d.low() != 0
-    {
-        if d.high() == 0 {
-            // K X
-            // ---
-            // 0 K
-            if d.low().is_power_of_two() {
-                if let Some(rem) = rem {
-                    *rem = u64::from(n.low() & (d.low() - 1));
-                }
-
-                if d.low() == 1 {
-                    return n;
-                } else {
-                    let sr = d.low().trailing_zeros();
-                    return U64 {
-                        low: (n.high() << (u32_bits - sr)) | (n.low() >> sr),
-                        high: n.high() >> sr,
-                    }[..];
-                };
-            }
-
-            sr = 1 + u32_bits + d.low().leading_zeros() - n.high().leading_zeros();
-
-            // 2 <= sr <= u64_bits - 1
-            // q = n << (u64_bits - sr)
-            // r = n >> sr;
-            if sr == u32_bits {
-                q.low = 0;
-                q.high = n.low();
-                r.high = 0;
-                r.low = n.high();
-            } else if sr < u32_bits
-            // 2 <= sr <= u32_bits - 1
-            {
-                q.low = 0;
-                q.high = n.low() << (u32_bits - sr);
-                r.high = n.high() >> sr;
-                r.low = (n.high() << (u32_bits - sr)) | (n.low() >> sr);
-            } else
-            // u32_bits + 1 <= sr <= u64_bits - 1
-            {
-                q.low = n.low() << (u64_bits - sr);
-                q.high = (n.high() << (u64_bits - sr)) | (n.low() >> (sr - u32_bits));
-                r.high = 0;
-                r.low = n.high() >> (sr - u32_bits);
-            }
-
-        } else
-        // d.high() != 0
-        {
-            // K X
-            // ---
-            // K K
-
-            sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-            // D > N
-            if sr > u32_bits - 1 {
-                if let Some(rem) = rem {
-                    *rem = n;
-                    return 0;
-                }
-            }
-
-            sr += 1;
-
-            // 1 <= sr <= u32_bits
-            // q = n << (u64_bits - sr)
-            q.low = 0;
-            if sr == u32_bits {
-                q.high = n.low();
-                r.high = 0;
-                r.low = n.high();
-            } else {
-                q.high = n.low() << (u32_bits - sr);
-                r.high = n.high() >> sr;
-                r.low = (n.high() << (u32_bits - sr)) | (n.low() >> sr);
-            }
-        }
-    }
-
-    // Not a special case
-    // q and r are initialized with
-    // q = n << (u64_bits - sr)
-    // r = n >> sr
-    // 1 <= sr <= u64_bits - 1
-    let mut carry = 0;
-
-    for _ in 0..sr {
-        // r:q = ((r:q) << 1) | carry
-        r[..] = (r[..] << 1) | (q[..] >> 63);
-        q[..] = (q[..] << 1) | carry as u64;
-
-        // carry = 0
-        // if r >= d {
-        //     r -= d;
-        //     carry = 1;
-        // }
-
-        let s = (d.wrapping_sub(r[..]).wrapping_sub(1)) as i64 >> (u64_bits - 1);
-        carry = (s & 1) as u32;
-        r[..] -= d & s as u64;
-    }
-
-    q[..] = (q[..] << 1) | carry as u64;
-    if let Some(rem) = rem {
-        *rem = r[..];
-    }
-    q[..]
-}
diff --git a/src/lib.rs b/src/lib.rs
index 196f37c..6d5580b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,13 +1,14 @@
 #![allow(unused_features)]
-#![cfg_attr(not(test), no_std)]
+#![no_std]
 #![feature(asm)]
 #![feature(core_intrinsics)]
 #![feature(naked_functions)]
 // TODO(rust-lang/rust#35021) uncomment when that PR lands
 // #![feature(rustc_builtins)]
 
-#[cfg(test)]
-extern crate core;
+// We disable #[no_mangle] for tests so that we can verify the test results
+// against the native compiler-rt implementations of the builtins.
+
 #[cfg(test)]
 #[macro_use]
 extern crate quickcheck;
@@ -15,36 +16,33 @@ extern crate quickcheck;
 #[cfg(target_arch = "arm")]
 pub mod arm;
 
-pub mod div;
-
-#[cfg(test)]
-mod test;
-
-use core::ops::{Index, IndexMut, RangeFull};
+pub mod udiv;
+pub mod mul;
+pub mod shift;
 
 /// Trait for some basic operations on integers
 trait Int {
-    fn bits() -> usize;
+    fn bits() -> u32;
 }
 
 // TODO: Once i128/u128 support lands, we'll want to add impls for those as well
 impl Int for u32 {
-    fn bits() -> usize {
+    fn bits() -> u32 {
         32
     }
 }
 impl Int for i32 {
-    fn bits() -> usize {
+    fn bits() -> u32 {
         32
     }
 }
 impl Int for u64 {
-    fn bits() -> usize {
+    fn bits() -> u32 {
         64
     }
 }
 impl Int for i64 {
-    fn bits() -> usize {
+    fn bits() -> u32 {
         64
     }
 }
@@ -88,33 +86,3 @@ impl LargeInt for i64 {
         low as i64 | ((high as i64) << 32)
     }
 }
-
-/// Union-like access to the 32-bit words that make an `u64`: `x.low` and `x.high`. The whole `u64`
-/// can be accessed via the expression `x[..]`, which can be used in lvalue or rvalue position.
-#[cfg(target_endian = "little")]
-#[repr(C)]
-struct U64 {
-    low: u32,
-    high: u32,
-}
-
-#[cfg(target_endian = "big")]
-#[repr(C)]
-struct U64 {
-    high: u32,
-    low: u32,
-}
-
-impl Index<RangeFull> for U64 {
-    type Output = u64;
-
-    fn index(&self, _: RangeFull) -> &u64 {
-        unsafe { &*(self as *const _ as *const u64) }
-    }
-}
-
-impl IndexMut<RangeFull> for U64 {
-    fn index_mut(&mut self, _: RangeFull) -> &mut u64 {
-        unsafe { &mut *(self as *const _ as *mut u64) }
-    }
-}
diff --git a/src/mul.rs b/src/mul.rs
new file mode 100644
index 0000000..a4350b5
--- /dev/null
+++ b/src/mul.rs
@@ -0,0 +1,99 @@
+use {Int, LargeInt};
+
+macro_rules! mul {
+    ($intrinsic:ident: $ty:ty) => {
+        /// Returns `a * b`, wrapping on overflow (only the low `$ty::bits()` bits are kept)
+        #[cfg_attr(not(test), no_mangle)]
+        pub extern fn $intrinsic(a: $ty, b: $ty) -> $ty {
+            let half_bits = <$ty>::bits() / 4; // a quarter of `$ty`, i.e. half of one `low()`/`high()` word
+            let lower_mask = !0 >> half_bits;
+            let mut low = (a.low() & lower_mask) * (b.low() & lower_mask);
+            let mut t = low >> half_bits;
+            low &= lower_mask;
+            t += (a.low() >> half_bits) * (b.low() & lower_mask);
+            low += (t & lower_mask) << half_bits;
+            let mut high = t >> half_bits;
+            t = low >> half_bits;
+            low &= lower_mask;
+            t += (b.low() >> half_bits) * (a.low() & lower_mask);
+            low += (t & lower_mask) << half_bits;
+            high += t >> half_bits;
+            high += (a.low() >> half_bits) * (b.low() >> half_bits); // exact so far; the cross terms added next must wrap
+            high = high.wrapping_add(a.high().wrapping_mul(b.low())).wrapping_add(a.low().wrapping_mul(b.high()));
+            <$ty>::from_parts(low, high)
+        }
+    }
+}
+
+macro_rules! mulo {
+    ($intrinsic:ident: $ty:ty) => {
+        /// Returns the wrapping product `a * b`; sets `*overflow` to 1 if it overflowed, else to 0
+        #[cfg_attr(not(test), no_mangle)]
+        pub extern fn $intrinsic(a: $ty, b: $ty, overflow: &mut i32) -> $ty {
+            *overflow = 0;
+            let result = a.wrapping_mul(b);
+            if a == <$ty>::min_value() { // MIN * b is representable only for b == 0 or b == 1
+                if b != 0 && b != 1 {
+                    *overflow = 1;
+                }
+                return result;
+            }
+            if b == <$ty>::min_value() { // symmetric case with the operands swapped
+                if a != 0 && a != 1 {
+                    *overflow = 1;
+                }
+                return result;
+            }
+
+            let sa = a >> (<$ty>::bits() - 1); // sign mask of `a`: -1 if negative, 0 otherwise (signed `$ty`)
+            let abs_a = (a ^ sa) - sa; // |a|; cannot overflow because a != MIN here
+            let sb = b >> (<$ty>::bits() - 1);
+            let abs_b = (b ^ sb) - sb;
+            if abs_a < 2 || abs_b < 2 { // a factor of 0, 1 or -1 cannot overflow once MIN is excluded
+                return result;
+            }
+            if sa == sb { // same sign: the exact product is positive
+                if abs_a > <$ty>::max_value() / abs_b {
+                    *overflow = 1;
+                }
+            } else { // opposite signs: the exact product is negative
+                if abs_a > <$ty>::min_value() / -abs_b {
+                    *overflow = 1;
+                }
+            }
+            result
+        }
+    }
+}
+
+mul!(__muldi3: u64); // compiler-rt's 64-bit multiply builtin (muldi3.c) is named `__muldi3`
+mulo!(__mulosi4: i32);
+mulo!(__mulodi4: i64);
+
+#[cfg(test)]
+mod tests {
+    quickcheck! {
+        fn muldi(a: u64, b: u64) -> bool {
+            let r = super::__muldi3(a, b);
+            r == a.wrapping_mul(b)
+        }
+
+        fn mulosi(a: i32, b: i32) -> bool {
+            let mut overflow = 2; // sentinel: the builtin must overwrite it with 0 or 1
+            let r = super::__mulosi4(a, b, &mut overflow);
+            if overflow != 0 && overflow != 1 {
+                return false;
+            }
+            (r, overflow != 0) == a.overflowing_mul(b)
+        }
+
+        fn mulodi(a: i64, b: i64) -> bool {
+            let mut overflow = 2; // sentinel: the builtin must overwrite it with 0 or 1
+            let r = super::__mulodi4(a, b, &mut overflow);
+            if overflow != 0 && overflow != 1 {
+                return false;
+            }
+            (r, overflow != 0) == a.overflowing_mul(b)
+        }
+    }
+}
diff --git a/src/shift.rs b/src/shift.rs
new file mode 100644
index 0000000..909d6f4
--- /dev/null
+++ b/src/shift.rs
@@ -0,0 +1,93 @@
+use {Int, LargeInt};
+
+macro_rules! ashl {
+    ($intrinsic:ident: $ty:ty) => {
+        /// Shifts `a` left by `b` bits; the caller must ensure `b < $ty::bits()`
+        #[cfg_attr(not(test), no_mangle)]
+        pub extern fn $intrinsic(a: $ty, b: u32) -> $ty {
+            let word = <$ty>::bits() / 2;
+            if b == 0 {
+                return a;
+            }
+            if b & word == 0 {
+                return <$ty>::from_parts(a.low() << b, (a.high() << b) | (a.low() >> (word - b)));
+            }
+            <$ty>::from_parts(0, a.low() << (b - word)) // whole low word moves into the high word
+        }
+    }
+}
+
+macro_rules! ashr {
+    ($intrinsic:ident: $ty:ty) => {
+        /// Returns arithmetic (sign-propagating) `a >> b`, requires `b < $ty::bits()`
+        #[cfg_attr(not(test), no_mangle)]
+        pub extern fn $intrinsic(a: $ty, b: u32) -> $ty {
+            let half_bits = <$ty>::bits() / 2;
+            if b & half_bits != 0 { // b >= half_bits (given the precondition): the low word is taken from the old high word
+                <$ty>::from_parts((a.high() >> (b - half_bits)) as <$ty as LargeInt>::LowHalf,
+                                  a.high() >> (half_bits - 1)) // presumably sign-fills the high word (assumes `high()` is signed — TODO confirm)
+            } else if b == 0 {
+                a
+            } else {
+                let high_unsigned = a.high() as <$ty as LargeInt>::LowHalf; // cast to the low-half type so it can be OR-ed into the low word
+                <$ty>::from_parts((high_unsigned << (half_bits - b)) | (a.low() >> b),
+                                  a.high() >> b)
+            }
+        }
+    }
+}
+
+macro_rules! lshr {
+    ($intrinsic:ident: $ty:ty) => {
+        /// Shifts `a` right by `b` bits, filling with zeros; the caller must ensure `b < $ty::bits()`
+        #[cfg_attr(not(test), no_mangle)]
+        pub extern fn $intrinsic(a: $ty, b: u32) -> $ty {
+            let word = <$ty>::bits() / 2;
+            if b == 0 {
+                return a;
+            }
+            if b & word == 0 {
+                return <$ty>::from_parts((a.high() << (word - b)) | (a.low() >> b), a.high() >> b);
+            }
+            <$ty>::from_parts(a.high() >> (b - word), 0) // whole high word moves into the low word
+        }
+    }
+}
+
+ashl!(__ashldi3: u64); // 64-bit left shift
+ashr!(__ashrdi3: i64); // 64-bit arithmetic right shift
+lshr!(__lshrdi3: u64); // 64-bit logical right shift
+
+#[cfg(test)]
+mod tests {
+    use quickcheck::TestResult;
+
+    quickcheck! {
+        fn ashldi(a: u64, b: u32) -> TestResult {
+            if b >= 64 { // outside the builtin's documented precondition `b < 64`
+                TestResult::discard()
+            } else {
+                let r = super::__ashldi3(a, b);
+                TestResult::from_bool(r == a << b) // compare against the native shift
+            }
+        }
+
+        fn ashrdi(a: i64, b: u32) -> TestResult {
+            if b >= 64 { // outside the builtin's documented precondition `b < 64`
+                TestResult::discard()
+            } else {
+                let r = super::__ashrdi3(a, b);
+                TestResult::from_bool(r == a >> b) // `>>` on `i64` is the arithmetic shift
+            }
+        }
+
+        fn lshrdi(a: u64, b: u32) -> TestResult {
+            if b >= 64 { // outside the builtin's documented precondition `b < 64`
+                TestResult::discard()
+            } else {
+                let r = super::__lshrdi3(a, b);
+                TestResult::from_bool(r == a >> b) // `>>` on `u64` is the logical shift
+            }
+        }
+    }
+}
diff --git a/src/test.rs b/src/test.rs
deleted file mode 100644
index 96e0c12..0000000
--- a/src/test.rs
+++ /dev/null
@@ -1,32 +0,0 @@
-use std::panic;
-
-use quickcheck::TestResult;
-
-quickcheck! {
-    fn udivmoddi4(n: (u32, u32), d: (u32, u32)) -> TestResult {
-        let n = ::U64 { low: n.0, high: n.1 }[..];
-        let d = ::U64 { low: d.0, high: d.1 }[..];
-
-        if d == 0 {
-            TestResult::discard()
-        } else {
-            let mut r = 0;
-            let q = ::div::__udivmoddi4(n, d, Some(&mut r));
-
-            TestResult::from_bool(q * d + r == n)
-        }
-    }
-}
-
-quickcheck! {
-    fn udivmodsi4(n: u32, d: u32) -> TestResult {
-        if d == 0 {
-            TestResult::discard()
-        } else {
-            let mut r = 0;
-            let q = ::div::__udivmodsi4(n, d, Some(&mut r));
-
-            TestResult::from_bool(q * d + r == n)
-        }
-    }
-}
diff --git a/src/udiv.rs b/src/udiv.rs
new file mode 100644
index 0000000..95a550f
--- /dev/null
+++ b/src/udiv.rs
@@ -0,0 +1,290 @@
+use core::mem;
+use {Int, LargeInt};
+
+/// Returns `n / d` (unsigned 32-bit division); panics when `d == 0`
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
+    // Special cases
+    if d == 0 {
+        panic!("Division by zero");
+    }
+
+    if n == 0 {
+        return 0;
+    }
+
+    let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros()); // wraps to a huge value when `d` has more significant bits than `n`
+
+    // d > n
+    if sr > u32::bits() - 1 {
+        return 0;
+    }
+
+    // d == 1
+    if sr == u32::bits() - 1 {
+        return n;
+    }
+
+    sr += 1;
+
+    // 1 <= sr <= u32::bits() - 1
+    let mut q = n << (u32::bits() - sr);
+    let mut r = n >> sr;
+
+    let mut carry = 0;
+    for _ in 0..sr {
+        // r:q = ((r:q) << 1) | carry
+        r = (r << 1) | (q >> (u32::bits() - 1));
+        q = (q << 1) | carry;
+
+        // carry = 0;
+        // if r >= d {
+        //     r -= d;
+        //     carry = 1;
+        // }
+
+        let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32::bits() - 1); // branch-free mask for the pseudo-code above: all ones if r >= d, else zero
+        carry = (s & 1) as u32;
+        r -= d & s as u32;
+    }
+
+    (q << 1) | carry
+}
+
+/// Returns the remainder `n % d`, derived from the quotient computed by `__udivsi3`
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 {
+    n - d * __udivsi3(n, d)
+}
+
+/// Computes the quotient `n / d`; when `rem` is `Some`, also stores `n % d` through it
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 {
+    let quotient = __udivsi3(n, d);
+    if let Some(out) = rem {
+        *out = n - quotient * d;
+    }
+    quotient
+}
+
+/// Returns `n / d` (unsigned 64-bit division), delegating to `__udivmoddi4`
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 {
+    __udivmoddi4(n, d, None) // `None`: the remainder is not requested
+}
+
+/// Returns `a % b` (unsigned 64-bit remainder), delegating to `__udivmoddi4`
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn __umoddi3(a: u64, b: u64) -> u64 {
+    let mut rem = 0; // zero-initialised: producing an uninitialised integer is undefined behaviour
+    __udivmoddi4(a, b, Some(&mut rem));
+    rem
+}
+
+/// Returns `n / d` and, when `rem` is `Some`, sets `*rem = n % d`; panics when `d == 0`
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 {
+    // NOTE X is unknown, K != 0
+    if n.high() == 0 {
+        if d.high() == 0 {
+            // 0 X
+            // ---
+            // 0 X
+            if let Some(rem) = rem {
+                *rem = u64::from(n.low() % d.low());
+            }
+            return u64::from(n.low() / d.low());
+        } else {
+            // 0 X
+            // ---
+            // K X
+            if let Some(rem) = rem {
+                *rem = n;
+            }
+            return 0;
+        };
+    }
+
+    let mut sr;
+    let mut q;
+    let mut r;
+
+    if d.low() == 0 {
+        if d.high() == 0 {
+            // K X
+            // ---
+            // 0 0
+            panic!("Division by zero");
+        }
+
+        if n.low() == 0 {
+            // K 0
+            // ---
+            // K 0
+            if let Some(rem) = rem {
+                *rem = u64::from_parts(0, n.high() % d.high());
+            }
+            return u64::from(n.high() / d.high());
+        }
+
+        // K K
+        // ---
+        // K 0
+
+        if d.high().is_power_of_two() {
+            if let Some(rem) = rem {
+                *rem = u64::from_parts(n.low(), n.high() & (d.high() - 1));
+            }
+            return u64::from(n.high() >> d.high().trailing_zeros());
+        }
+
+        sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
+
+        // D > N
+        if sr > u32::bits() - 2 {
+            if let Some(rem) = rem {
+                *rem = n;
+            }
+            return 0;
+        }
+
+        sr += 1;
+
+        // 1 <= sr <= u32::bits() - 1
+        q = n << (u64::bits() - sr);
+        r = n >> sr;
+    } else {
+        if d.high() == 0 {
+            // K X
+            // ---
+            // 0 K
+            if d.low().is_power_of_two() {
+                if let Some(rem) = rem {
+                    *rem = u64::from(n.low() & (d.low() - 1));
+                }
+
+                if d.low() == 1 {
+                    return n;
+                } else {
+                    let sr = d.low().trailing_zeros();
+                    return n >> sr;
+                };
+            }
+
+            sr = 1 + u32::bits() + d.low().leading_zeros() - n.high().leading_zeros();
+
+            // 2 <= sr <= u64::bits() - 1
+            q = n << (u64::bits() - sr);
+            r = n >> sr;
+        } else {
+            // K X
+            // ---
+            // K K
+            sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
+
+            // D > N: the quotient is 0 whether or not the caller asked for the remainder
+            if sr > u32::bits() - 1 {
+                if let Some(rem) = rem {
+                    *rem = n;
+                }
+                return 0;
+            }
+
+            sr += 1;
+
+            // 1 <= sr <= u32::bits()
+            q = n << (u64::bits() - sr);
+            r = n >> sr;
+        }
+    }
+
+    // Not a special case
+    // q and r are initialized with
+    // q = n << (u64::bits() - sr)
+    // r = n >> sr
+    // 1 <= sr <= u64::bits() - 1
+    let mut carry = 0;
+
+    for _ in 0..sr {
+        // r:q = ((r:q) << 1) | carry
+        r = (r << 1) | (q >> (u64::bits() - 1));
+        q = (q << 1) | carry as u64;
+
+        // carry = 0
+        // if r >= d {
+        //     r -= d;
+        //     carry = 1;
+        // }
+        let s = (d.wrapping_sub(r).wrapping_sub(1)) as i64 >> (u64::bits() - 1); // branch-free mask: all ones if r >= d, else zero
+        carry = (s & 1) as u32;
+        r -= d & s as u64;
+    }
+
+    if let Some(rem) = rem {
+        *rem = r;
+    }
+    (q << 1) | carry as u64
+}
+
+#[cfg(test)]
+mod tests {
+    use quickcheck::TestResult;
+
+    quickcheck!{
+        fn udivdi3(n: u64, d: u64) -> TestResult {
+            if d == 0 { // the builtins panic on a zero divisor, so such inputs are discarded
+                TestResult::discard()
+            } else {
+                let q = super::__udivdi3(n, d);
+                TestResult::from_bool(q == n / d) // compare against the native operator
+            }
+        }
+
+        fn umoddi3(n: u64, d: u64) -> TestResult {
+            if d == 0 {
+                TestResult::discard()
+            } else {
+                let r = super::__umoddi3(n, d);
+                TestResult::from_bool(r == n % d)
+            }
+        }
+
+        fn udivmoddi4(n: u64, d: u64) -> TestResult {
+            if d == 0 {
+                TestResult::discard()
+            } else {
+                let mut r = 0; // receives the remainder through the `rem` out-parameter
+                let q = super::__udivmoddi4(n, d, Some(&mut r));
+                TestResult::from_bool(q == n / d && r == n % d)
+            }
+        }
+
+        fn udivsi3(n: u32, d: u32) -> TestResult {
+            if d == 0 {
+                TestResult::discard()
+            } else {
+                let q = super::__udivsi3(n, d);
+                TestResult::from_bool(q == n / d)
+            }
+        }
+
+        fn umodsi3(n: u32, d: u32) -> TestResult {
+            if d == 0 {
+                TestResult::discard()
+            } else {
+                let r = super::__umodsi3(n, d);
+                TestResult::from_bool(r == n % d)
+            }
+        }
+
+        fn udivmodsi4(n: u32, d: u32) -> TestResult {
+            if d == 0 {
+                TestResult::discard()
+            } else {
+                let mut r = 0; // receives the remainder through the `rem` out-parameter
+                let q = super::__udivmodsi4(n, d, Some(&mut r));
+                TestResult::from_bool(q == n / d && r == n % d)
+            }
+        }
+    }
+}