diff --git a/src/arch/x86.rs b/src/arch/x86.rs index f677c34..3e257bf 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -3,15 +3,32 @@ // whitequark // See the LICENSE file included in this distribution. -//! To understand the code in this file, keep in mind this fact: -//! * i686 SysV C ABI requires the stack to be aligned at function entry, -//! so that `%esp+4` is a multiple of 16. Aligned operands are a requirement -//! of SIMD instructions, and making this the responsibility of the caller -//! avoids having to maintain a frame pointer, which is necessary when -//! a function has to realign the stack from an unknown state. -//! * i686 SysV C ABI passes the first argument on the stack. This is -//! unfortunate, because unlike every other architecture we can't reuse -//! `swap` for the initial call, and so we use a trampoline. +// To understand the machine code in this file, keep in mind these facts: +// * i686 SysV C ABI requires the stack to be aligned at function entry, +// so that `%esp+4` is a multiple of 16. Aligned operands are a requirement +// of SIMD instructions, and making this the responsibility of the caller +// avoids having to maintain a frame pointer, which is necessary when +// a function has to realign the stack from an unknown state. +// * i686 SysV C ABI passes the first argument on the stack. This is +// unfortunate, because unlike every other architecture we can't reuse +// `swap` for the initial call, and so we use a trampoline. +// +// To understand the DWARF CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. +// * Logically, DWARF CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .cfi_offset pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .cfi_def_* pseudoinstruction changes the CFA value similarly. +// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +use core::intrinsics; use stack::Stack; #[derive(Debug)] @@ -19,17 +36,50 @@ pub struct StackPointer(*mut usize); pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer { #[naked] - unsafe extern "C" fn trampoline() -> ! { + unsafe extern "C" fn init_trampoline_1() -> ! { asm!( r#" - # Pop function. - popl %ebx + # gdb has a hardcoded check that rejects backtraces where frame addresses + # do not monotonically decrease. It is turned off if the function is called + # "__morestack" and that is hardcoded. So, to make gdb backtraces match + # the actual unwinder behavior, we call ourselves "__morestack" and mark + # the symbol as local; it shouldn't interfere with anything. + __morestack: + .local __morestack + + # Set up the first part of our DWARF CFI linking stacks together. 
+ # When unwinding the frame corresponding to this function, a DWARF unwinder + # will use %ebx as the next call frame address, restore return address + # from CFA-4 and restore %ebp from CFA-8. This mirrors what the second half + # of `swap_trampoline` does. + .cfi_def_cfa %ebx, 0 + .cfi_offset %ebp, -8 + # Call the next trampoline. + call ${0:c} + + .Lend: + .size __morestack, .Lend-__morestack + "# + : : "s" (init_trampoline_2 as usize) : "memory" : "volatile"); + intrinsics::unreachable() + } + + #[naked] + unsafe extern "C" fn init_trampoline_2() -> ! { + asm!( + r#" + # Set up the second part of our DWARF CFI. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will restore %ebx (and thus CFA of the first trampoline) from the stack slot. + .cfi_offset %ebx, 4 # Push argument. + .cfi_def_cfa_offset 8 pushl %eax - # Call it. - call *%ebx - "# ::: "memory" : "volatile"); - ::core::intrinsics::unreachable() + # Call the provided function. + call *8(%esp) + "# + : : : "memory" : "volatile"); + intrinsics::unreachable() } unsafe fn push(sp: &mut StackPointer, val: usize) { @@ -38,49 +88,62 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP } let mut sp = StackPointer(stack.top() as *mut usize); - push(&mut sp, 0); // alignment - push(&mut sp, 0); // alignment - push(&mut sp, 0); // alignment + push(&mut sp, 0xdead0cfa); // CFA slot push(&mut sp, f as usize); // function - push(&mut sp, trampoline as usize); + push(&mut sp, init_trampoline_1 as usize); + push(&mut sp, 0xdeadbbbb); // saved %ebp sp } #[inline(always)] -pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { +pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer, + new_stack: &Stack) -> usize { + // Address of the topmost CFA stack slot. + let new_cfa = (new_stack.top() as *mut usize).offset(-1); + + #[naked] + unsafe extern "C" fn swap_trampoline() -> ! { + asm!( + r#" + # Save frame pointer explicitly; the unwinder uses it to find CFA of + # the caller, and so it has to have the correct value immediately after + # the call instruction that invoked the trampoline. + pushl %ebp + + # Remember stack pointer of the old context, in case %edx==%esi. + movl %esp, %ebx + # Load stack pointer of the new context. + movl (%edx), %esp + # Save stack pointer of the old context. + movl %ebx, (%esi) + + # Restore frame pointer of the new context. + popl %ebp + + # Return into the new context. Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). + popl %ebx + jmpl *%ebx + "# + : : : "memory" : "volatile"); + intrinsics::unreachable(); + } + let ret: usize; asm!( r#" - # Save frame pointer explicitly; LLVM doesn't spill it even if it is - # marked as clobbered. - pushl %ebp + # Link the call stacks together. + movl %esp, (%edi) # Push instruction pointer of the old context and switch to # the new context. - call 1f - # Restore frame pointer. - popl %ebp - # Continue executing old context. - jmp 2f - - 1: - # Remember stack pointer of the old context, in case %rdx==%rsi. - movl %esp, %ebx - # Load stack pointer of the new context. - movl (%edx), %esp - # Save stack pointer of the old context. - movl %ebx, (%esi) - - # Pop instruction pointer of the new context (placed onto stack by - # the call above) and jump there; don't use `ret` to avoid return - # address mispredictions (~8ns on Ivy Bridge). 
- popl %ebx - jmpl *%ebx - 2: + call ${1:c} "# : "={eax}" (ret) - : "{eax}" (arg) + : "s" (swap_trampoline as usize) + "{eax}" (arg) "{esi}" (old_sp) "{edx}" (new_sp) + "{edi}" (new_cfa) : "eax", "ebx", "ecx", "edx", "esi", "edi", //"ebp", "esp", "mmx0", "mmx1", "mmx2", "mmx3", "mmx4", "mmx5", "mmx6", "mmx7", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/src/arch/x86_64.rs b/src/arch/x86_64.rs index 6afccb1..c62d00f 100644 --- a/src/arch/x86_64.rs +++ b/src/arch/x86_64.rs @@ -3,54 +3,115 @@ // whitequark // See the LICENSE file included in this distribution. -//! To understand the code in this file, keep in mind these two facts: -//! * x86_64 SysV C ABI has a "red zone": 128 bytes under the top of the stack -//! that is defined to be unmolested by signal handlers, interrupts, etc. -//! Leaf functions can use the red zone without adjusting rsp or rbp. -//! * x86_64 SysV C ABI requires the stack to be aligned at function entry, -//! so that (%rsp+8) is a multiple of 16. Aligned operands are a requirement -//! of SIMD instructions, and making this the responsibility of the caller -//! avoids having to maintain a frame pointer, which is necessary when -//! a function has to realign the stack from an unknown state. -//! * x86_64 SysV C ABI passes the first argument in %rdi. We also use %rdi -//! to pass a value while swapping context; this is an arbitrary choice -//! (we clobber all registers and could use any of them) but this allows us -//! to reuse the swap function to perform the initial call. - +// To understand the code in this file, keep in mind these two facts: +// * x86_64 SysV C ABI has a "red zone": 128 bytes under the top of the stack +// that is defined to be unmolested by signal handlers, interrupts, etc. +// Leaf functions can use the red zone without adjusting rsp or rbp. +// * x86_64 SysV C ABI requires the stack to be aligned at function entry, +// so that (%rsp+8) is a multiple of 16. Aligned operands are a requirement +// of SIMD instructions, and making this the responsibility of the caller +// avoids having to maintain a frame pointer, which is necessary when +// a function has to realign the stack from an unknown state. +// * x86_64 SysV C ABI passes the first argument in %rdi. We also use %rdi +// to pass a value while swapping context; this is an arbitrary choice +// (we clobber all registers and could use any of them) but this allows us +// to reuse the swap function to perform the initial call. +// +// To understand the DWARF CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. +// * Logically, DWARF CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .cfi_offset pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .cfi_def_* pseudoinstruction changes the CFA value similarly. 
+// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +use core::intrinsics; use stack::Stack; #[derive(Debug)] pub struct StackPointer(*mut usize); pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer { + #[naked] + unsafe extern "C" fn init_trampoline_1() -> ! { + asm!( + r#" + # gdb has a hardcoded check that rejects backtraces where frame addresses + # do not monotonically decrease. It is turned off if the function is called + # "__morestack" and that is hardcoded. So, to make gdb backtraces match + # the actual unwinder behavior, we call ourselves "__morestack" and mark + # the symbol as local; it shouldn't interfere with anything. + __morestack: + .local __morestack + + # Set up the first part of our DWARF CFI linking stacks together. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will use %rbx as the next call frame address, restore return address + # from CFA-8 and restore %rbp from CFA-16. This mirrors what the second half + # of `swap_trampoline` does. + .cfi_def_cfa %rbx, 0 + .cfi_offset %rbp, -16 + # Call the next trampoline. + call ${0:c} + + .Lend: + .size __morestack, .Lend-__morestack + "# + : : "s" (init_trampoline_2 as usize) : "memory" : "volatile"); + intrinsics::unreachable() + } + + #[naked] + unsafe extern "C" fn init_trampoline_2() -> ! { + asm!( + r#" + # Set up the second part of our DWARF CFI. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will restore %rbx (and thus CFA of the first trampoline) from the stack slot. + .cfi_offset %rbx, 16 + # Call the provided function. + call *8(%rsp) + "# + : : : "memory" : "volatile"); + intrinsics::unreachable() + } + unsafe fn push(sp: &mut StackPointer, val: usize) { sp.0 = sp.0.offset(-1); *sp.0 = val } let mut sp = StackPointer(stack.top() as *mut usize); - push(&mut sp, 0); // alignment - push(&mut sp, f as usize); + push(&mut sp, 0xdeaddeaddead0cfa); // CFA slot + push(&mut sp, 0 as usize); // alignment + push(&mut sp, f as usize); // function + push(&mut sp, init_trampoline_1 as usize); + push(&mut sp, 0xdeaddeaddeadbbbb); // saved %rbp sp } #[inline(always)] -pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { - macro_rules! swap_body { - () => { - r#" - # Save frame pointer explicitly; LLVM doesn't spill it even if it is - # marked as clobbered. - pushq %rbp - # Push instruction pointer of the old context and switch to - # the new context. - call 1f - # Restore frame pointer. - popq %rbp - # Continue executing old context. - jmp 2f +pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer, + new_stack: &Stack) -> usize { + // Address of the topmost CFA stack slot. + let new_cfa = (new_stack.top() as *mut usize).offset(-1); + + #[naked] + unsafe extern "C" fn swap_trampoline() -> ! { + asm!( + r#" + # Save frame pointer explicitly; the unwinder uses it to find CFA of + # the caller, and so it has to have the correct value immediately after + # the call instruction that invoked the trampoline. + pushq %rbp - 1: # Remember stack pointer of the old context, in case %rdx==%rsi. movq %rsp, %rbx # Load stack pointer of the new context. @@ -58,66 +119,45 @@ pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) # Save stack pointer of the old context. 
movq %rbx, (%rsi) - # Pop instruction pointer of the new context (placed onto stack by - # the call above) and jump there; don't use `ret` to avoid return - # address mispredictions (~8ns on Ivy Bridge). + # Restore frame pointer of the new context. + popq %rbp + + # Return into the new context. Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). popq %rbx jmpq *%rbx - 2: "# - } + : : : "memory" : "volatile"); + intrinsics::unreachable(); } - #[cfg(not(windows))] - #[inline(always)] - unsafe fn swap_impl(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { - let ret: usize; - asm!(swap_body!() - : "={rdi}" (ret) - : "{rdi}" (arg) - "{rsi}" (old_sp) - "{rdx}" (new_sp) - : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"rbp", "rsp", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", - "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31" - "cc", "fpsr", "flags", "memory" - // Ideally, we would set the LLVM "noredzone" attribute on this function - // (and it would be propagated to the call site). Unfortunately, rustc - // provides no such functionality. Fortunately, by a lucky coincidence, - // the "alignstack" LLVM inline assembly option does exactly the same - // thing on x86_64. - : "volatile", "alignstack"); - ret - } - - - #[cfg(windows)] - #[inline(always)] - unsafe fn swap_impl(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { - let ret: usize; - asm!(swap_body!() - : "={rcx}" (ret) - : "{rcx}" (arg) - "{rsi}" (old_sp) - "{rdx}" (new_sp) - : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"rbp", "rsp", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", - "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31" - "cc", "fpsr", "flags", "memory" - // Ideally, we would set the LLVM "noredzone" attribute on this function - // (and it would be propagated to the call site). Unfortunately, rustc - // provides no such functionality. Fortunately, by a lucky coincidence, - // the "alignstack" LLVM inline assembly option does exactly the same - // thing on x86_64. - : "volatile", "alignstack"); - ret - } - - swap_impl(arg, old_sp, new_sp) + let ret: usize; + asm!( + r#" + # Link the call stacks together. + movq %rsp, (%rcx) + # Push instruction pointer of the old context and switch to + # the new context. + call ${1:c} + "# + : "={rdi}" (ret) + : "s" (swap_trampoline as usize) + "{rdi}" (arg) + "{rsi}" (old_sp) + "{rdx}" (new_sp) + "{rcx}" (new_cfa) + : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"rbp", "rsp", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", + "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31" + "cc", "fpsr", "flags", "memory" + // Ideally, we would set the LLVM "noredzone" attribute on this function + // (and it would be propagated to the call site). 
Unfortunately, rustc + // provides no such functionality. Fortunately, by a lucky coincidence, + // the "alignstack" LLVM inline assembly option does exactly the same + // thing on x86_64. + : "volatile", "alignstack"); + ret } diff --git a/src/context.rs b/src/context.rs index 6c78241..a936a5c 100644 --- a/src/context.rs +++ b/src/context.rs @@ -49,6 +49,6 @@ impl Context where OldStack: stack::Stack { new_ctx: *const Context, arg: usize) -> usize where NewStack: stack::Stack { - arch::swap(arg, &mut (*old_ctx).stack_ptr, &(*new_ctx).stack_ptr) + arch::swap(arg, &mut (*old_ctx).stack_ptr, &(*new_ctx).stack_ptr, &(*new_ctx).stack) } } diff --git a/src/lib.rs b/src/lib.rs index 101745d..5dd83af 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,8 @@ // Copyright (c) edef // See the LICENSE file included in this distribution. #![feature(asm)] -#![cfg_attr(target_arch = "x86", feature(naked_functions, core_intrinsics))] +#![cfg_attr(target_arch = "x86", feature(naked_functions, core_intrinsics))] +#![cfg_attr(target_arch = "x86_64", feature(naked_functions, core_intrinsics))] #![no_std] //! libfringe is a library implementing lightweight context switches, diff --git a/src/os/mod.rs b/src/os/mod.rs index dd29536..05160a4 100644 --- a/src/os/mod.rs +++ b/src/os/mod.rs @@ -50,12 +50,14 @@ impl Stack { } impl stack::Stack for Stack { + #[inline(always)] fn top(&self) -> *mut u8 { unsafe { self.ptr.offset(self.len as isize) } } + #[inline(always)] fn limit(&self) -> *mut u8 { unsafe { self.ptr.offset(sys::page_size() as isize) diff --git a/tests/panic.rs b/tests/panic.rs new file mode 100644 index 0000000..80c21cc --- /dev/null +++ b/tests/panic.rs @@ -0,0 +1,45 @@ +// This file is part of libfringe, a low-level green threading library. +// Copyright (c) whitequark +// See the LICENSE file included in this distribution. +#![feature(thread_local)] +extern crate fringe; + +use fringe::Context; + +#[thread_local] +static mut ctx_slot: *mut Context = 0 as *mut Context<_>; + +unsafe extern "C" fn do_panic(arg: usize) -> ! { + match arg { + 0 => panic!("arg=0"), + 1 => { + Context::swap(ctx_slot, ctx_slot, 0); + panic!("arg=1"); + } + _ => unreachable!() + } +} + +#[test] +#[should_panic="arg=0"] +fn panic_after_start() { + unsafe { + let stack = fringe::OsStack::new(4 << 20).unwrap(); + let mut ctx = Context::new(stack, do_panic); + + Context::swap(&mut ctx, &ctx, 0); + } +} + +#[test] +#[should_panic="arg=1"] +fn panic_after_swap() { + unsafe { + let stack = fringe::OsStack::new(4 << 20).unwrap(); + let mut ctx = Context::new(stack, do_panic); + ctx_slot = &mut ctx; + + Context::swap(&mut ctx, &ctx, 1); + Context::swap(&mut ctx, &ctx, 0); + } +}
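For reference, the DWARF CFI notes added at the top of src/arch/x86.rs and src/arch/x86_64.rs describe the CFA, .cfi_def_cfa* and .cfi_offset in the abstract. The sketch below (plain GAS, AT&T syntax; a standalone illustration that is not part of the patch, with an invented label `ordinary_prologue`) shows how the same directives annotate a conventional frame-pointer prologue on x86_64:

    ordinary_prologue:
        .cfi_startproc
        # At entry the default rule applies: CFA = %rsp + 8, and the return
        # address pushed by `call` sits at CFA - 8.
        pushq   %rbp
        .cfi_def_cfa_offset 16        # %rsp went down by 8, so CFA = %rsp + 16 now
        .cfi_offset %rbp, -16         # the caller's %rbp is saved at CFA - 16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp    # compute the CFA from %rbp instead of %rsp
        # ... body; %rsp may move freely without invalidating the CFA rule ...
        popq    %rbp
        .cfi_def_cfa %rsp, 8          # back to the entry-state rule
        ret
        .cfi_endproc

Each directive updates one column of the CFI table for every instruction address that follows it, which is exactly how the init trampolines describe their frames; the difference is that their CFA deliberately points into the stack of the context that called `swap`.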
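The frame that `init` prepares on a fresh stack is also easier to follow as a picture. Below is a sketch of the x86_64 layout, highest address first, written as data directives for concreteness (illustrative only, not part of the patch; `f` and `init_trampoline_1` stand for the values the patch pushes):

        # stack.top() points just past this first slot
        .quad 0xdeaddeaddead0cfa    # CFA slot; swap() overwrites it with the old %rsp
        .quad 0                     # alignment padding
        .quad f                     # function passed to init(), called by init_trampoline_2
        .quad init_trampoline_1     # "return address" popped and jumped to by swap_trampoline
        .quad 0xdeaddeaddeadbbbb    # dummy saved %rbp, popped by swap_trampoline
        # init() returns a StackPointer that points at the dummy %rbp slot

When the new context is entered for the first time, `swap` stores the old stack pointer into the CFA slot, `swap_trampoline` pops the dummy %rbp and jumps to `init_trampoline_1`, and the trampolines' CFI lets an unwinder (or gdb, thanks to the `__morestack` trick) walk straight from `f` back onto the caller's stack.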