Clean up and simplify the implementation of generators

2016-09-07 13:56:32 +01:00 · 2016-09-07 13:56:32 +01:00 · fff625767c
parent f7f209c1eb
commit fff625767c
9 changed files with 274 additions and 310 deletions
--- a/benches/generator.rs
+++ b/benches/generator.rs
@@ -17,5 +17,5 @@ fn generate(b: &mut test::Bencher) {
    loop { input = yielder.suspend(input) }
  });

-  b.iter(|| test::black_box(identity.resume(test::black_box(0))));
+  b.iter(|| for _ in 0..10 { test::black_box(identity.resume(test::black_box(0))); });
 }
--- a/src/arch/mod.rs
+++ b/src/arch/mod.rs
@@ -13,3 +13,114 @@ pub use self::imp::*;
 #[cfg_attr(target_arch = "x86_64", path = "x86_64.rs")]
 #[cfg_attr(target_arch = "or1k",   path = "or1k.rs")]
 mod imp;
+
+#[cfg(test)]
+mod tests {
+  extern crate test;
+  extern crate simd;
+
+  use arch::{self, StackPointer};
+  use ::OsStack;
+
+  #[test]
+  fn context() {
+    unsafe extern "C" fn adder(arg: usize, stack_ptr: StackPointer) -> ! {
+      println!("it's alive! arg: {}", arg);
+      let (arg, stack_ptr) = arch::swap(arg + 1, stack_ptr, None);
+      println!("still alive! arg: {}", arg);
+      arch::swap(arg + 1, stack_ptr, None);
+      panic!("i should be dead");
+    }
+
+    unsafe {
+      let stack = OsStack::new(4 << 20).unwrap();
+      let stack_ptr = arch::init(&stack, adder);
+
+      let (ret, stack_ptr) = arch::swap(10, stack_ptr, Some(&stack));
+      assert_eq!(ret, 11);
+      let (ret, _) = arch::swap(50, stack_ptr, Some(&stack));
+      assert_eq!(ret, 51);
+    }
+  }
+
+  #[test]
+  fn context_simd() {
+    unsafe extern "C" fn permuter(arg: usize, stack_ptr: StackPointer) -> ! {
+      // This will crash if the stack is not aligned properly.
+      let x = simd::i32x4::splat(arg as i32);
+      let y = x * x;
+      println!("simd result: {:?}", y);
+      let (_, stack_ptr) = arch::swap(0, stack_ptr, None);
+      // And try again after a context switch.
+      let x = simd::i32x4::splat(arg as i32);
+      let y = x * x;
+      println!("simd result: {:?}", y);
+      arch::swap(0, stack_ptr, None);
+      panic!("i should be dead");
+    }
+
+    unsafe {
+      let stack = OsStack::new(4 << 20).unwrap();
+      let stack_ptr = arch::init(&stack, permuter);
+
+      let (_, stack_ptr) = arch::swap(10, stack_ptr, Some(&stack));
+      arch::swap(20, stack_ptr, Some(&stack));
+    }
+  }
+
+  unsafe extern "C" fn do_panic(arg: usize, stack_ptr: StackPointer) -> ! {
+    match arg {
+      0 => panic!("arg=0"),
+      1 => {
+        arch::swap(0, stack_ptr, None);
+        panic!("arg=1");
+      }
+      _ => unreachable!()
+    }
+  }
+
+  #[test]
+  #[should_panic="arg=0"]
+  fn panic_after_start() {
+    unsafe {
+      let stack = OsStack::new(4 << 20).unwrap();
+      let stack_ptr = arch::init(&stack, do_panic);
+
+      arch::swap(0, stack_ptr, Some(&stack));
+    }
+  }
+
+  #[test]
+  #[should_panic="arg=1"]
+  fn panic_after_swap() {
+    unsafe {
+      let stack = OsStack::new(4 << 20).unwrap();
+      let stack_ptr = arch::init(&stack, do_panic);
+
+      let (_, stack_ptr) = arch::swap(1, stack_ptr, Some(&stack));
+      arch::swap(0, stack_ptr, Some(&stack));
+    }
+  }
+
+  #[bench]
+  fn swap(b: &mut test::Bencher) {
+    unsafe extern "C" fn loopback(mut arg: usize, mut stack_ptr: StackPointer) -> ! {
+      // This deliberately does not ignore arg, to measure the time it takes
+      // to move the return value between registers.
+      loop {
+        let data = arch::swap(arg, stack_ptr, None);
+        arg = data.0;
+        stack_ptr = data.1;
+      }
+    }
+
+    unsafe {
+      let stack = OsStack::new(4 << 20).unwrap();
+      let mut stack_ptr = arch::init(&stack, loopback);
+
+      b.iter(|| for _ in 0..10 {
+        stack_ptr = arch::swap(0, stack_ptr, Some(&stack)).1;
+      });
+    }
+  }
+}
--- a/src/arch/or1k.rs
+++ b/src/arch/or1k.rs
@@ -14,7 +14,8 @@
 // * OR1K C ABI passes the first argument in r3. We also use r3 to pass a value
 //   while swapping context; this is an arbitrary choice
 //   (we clobber all registers and could use any of them) but this allows us
-//   to reuse the swap function to perform the initial call.
+//   to reuse the swap function to perform the initial call. We do the same
+//   thing with r4 to pass the stack pointer to the new context.
 //
 // To understand the DWARF CFI code in this file, keep in mind these facts:
 // * CFI is "call frame information"; a set of instructions to a debugger or
@@ -47,7 +48,7 @@ pub const STACK_ALIGNMENT: usize = 4;
 #[derive(Debug, Clone, Copy)]
 pub struct StackPointer(*mut usize);

-pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer {
+pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer {
  #[naked]
  unsafe extern "C" fn trampoline_1() {
    asm!(
@@ -96,6 +97,12 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
        .cfi_offset r2, -4
        .cfi_offset r9, -8

+        # This nop is here so that the return address of the swap trampoline
+        # doesn't point to the start of the symbol. A return address at the very
+        # start of a symbol confuses gdb's backtraces, making gdb think the parent
+        # function is trampoline_1 instead of trampoline_2.
+        nop
+
        # Call the provided function.
        l.lwz   r4, 8(r1)
        l.jalr  r4
@@ -130,18 +137,24 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
  // parent call frame.
  let frame = sp;
  push(&mut sp, frame.0 as usize);          // Pointer to parent call frame
-  push(&mut sp, trampoline_2 as usize); // Entry point
+  push(&mut sp, trampoline_2 as usize + 4); // Entry point, skip initial nop

-  // The call frame for swap::trampoline is actually in the red zone and not
-  // below the stack pointer.
+  // The last two values are read by the swap trampoline and are actually in the
+  // red zone and not below the stack pointer.
  frame
 }

 #[inline(always)]
-pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
-                   new_stack: &Stack) -> usize {
+pub unsafe fn swap(arg: usize, new_sp: StackPointer,
+                   new_stack: Option<&Stack>) -> (usize, StackPointer) {
  // Address of the topmost CFA stack slot.
-  let new_cfa = (new_stack.base() as *mut usize).offset(-2);
+  let mut dummy: usize = mem::uninitialized();
+  let new_cfa = if let Some(new_stack) = new_stack {
+    (new_stack.base() as *mut usize).offset(-2)
+  } else {
+    // Just pass a dummy pointer if we aren't linking the stack
+    &mut dummy
+  };

  #[naked]
  unsafe extern "C" fn trampoline() {
@@ -160,17 +173,13 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
        l.addi  r7, r1, -8
        l.sw    0(r6), r7

-        # Switch to the new stack for unwinding purposes. The old stack may no
-        # longer be valid now that we have modified the link.
-        .cfi_def_cfa_register r5
-
-        # Save stack pointer of the old context.
-        l.sw    0(r4), r1
+        # Pass the stack pointer of the old context to the new one.
+        l.or    r4, r0, r1
        # Load stack pointer of the new context.
        l.or    r1, r0, r5
-        .cfi_def_cfa_register r1

        # Restore frame pointer and link register of the new context.
+        # Load frame and instruction pointers of the new context.
        l.lwz   r2, -4(r1)
        l.lwz   r9, -8(r1)

@@ -182,23 +191,24 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
  }

  let ret: usize;
+  let ret_sp: *mut usize;
  asm!(
    r#"
      # Call the trampoline to switch to the new context.
-      l.jal   ${1}
+      l.jal   ${2}
      l.nop
    "#
    : "={r3}" (ret)
+      "={r4}" (ret_sp)
    : "s" (trampoline as usize)
      "{r3}" (arg)
-      "{r4}" (old_sp)
      "{r5}" (new_sp.0)
      "{r6}" (new_cfa)
-    :/*"r0", "r1",  "r2",  "r3",*/"r4",  "r5",  "r6",  "r7",
+    :/*"r0", "r1",  "r2",  "r3",  "r4",*/"r5",  "r6",  "r7",
      "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
      "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
      "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
      "cc", "memory"
    : "volatile");
-  ret
+  (ret, StackPointer(ret_sp))
 }
--- a/src/arch/x86.rs
+++ b/src/arch/x86.rs
@@ -41,6 +41,7 @@
 // * The 1st init trampoline tells the unwinder to restore %ebp and its return
 //   address from the stack frame at %ebp (in the parent stack), thus continuing
 //   unwinding at the swap call site instead of falling off the end of context stack.
+use core::mem;
 use stack::Stack;

 pub const STACK_ALIGNMENT: usize = 16;
@@ -48,7 +49,7 @@ pub const STACK_ALIGNMENT: usize = 16;
 #[derive(Debug, Clone, Copy)]
 pub struct StackPointer(*mut usize);

-pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer {
+pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer {
  #[cfg(not(target_vendor = "apple"))]
  #[naked]
  unsafe extern "C" fn trampoline_1() {
@@ -69,8 +70,8 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
        # will use %ebp+8 as the next call frame address, restore return address
        # from CFA-4 and restore %ebp from CFA-8. This mirrors what the second half
        # of `swap_trampoline` does.
-        .cfi_def_cfa ebp, 8
-        .cfi_offset ebp, -8
+        .cfi_def_cfa %ebp, 8
+        .cfi_offset %ebp, -8

        # This nop is here so that the initial swap doesn't return to the start
        # of the trampoline, which confuses the unwinder since it will look for
@@ -97,8 +98,8 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
      # Identical to the above, except avoids .local/.size that aren't available on Mach-O.
      __morestack:
      .private_extern __morestack
-        .cfi_def_cfa ebp, 8
-        .cfi_offset ebp, -8
+        .cfi_def_cfa %ebp, 8
+        .cfi_offset %ebp, -8
        nop
        nop
      "#
@@ -114,13 +115,20 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
        # will restore %ebp (and thus CFA of the first trampoline) from the stack slot.
        # This stack slot is updated every time swap() is called to point to the bottom
        # of the stack of the context switch just switched from.
-        .cfi_def_cfa ebp, 8
-        .cfi_offset ebp, -8
+        .cfi_def_cfa %ebp, 8
+        .cfi_offset %ebp, -8

-        # Push argument.
-        pushl   %eax
+        # This nop is here so that the return address of the swap trampoline
+        # doesn't point to the start of the symbol. A return address at the very
+        # start of a symbol confuses gdb's backtraces, making gdb think the parent
+        # function is trampoline_1 instead of trampoline_2.
+        nop
+
+        # Push arguments.
+        pushl   %esi
+        pushl   %edi
        # Call the provided function.
-        call    *12(%esp)
+        calll  *16(%esp)
      "#
      : : : : "volatile")
  }
@@ -140,6 +148,9 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
  // such as perf or dtrace.
  let mut sp = StackPointer(stack.base() as *mut usize);

+  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
+  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
+  push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned
  push(&mut sp, f as usize); // Function that trampoline_2 should call

  // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline
@@ -150,17 +161,23 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
  // Call frame for swap::trampoline. We set up the %ebp value to point to the
  // parent call frame.
  let frame = sp;
-  push(&mut sp, trampoline_2 as usize); // Entry point
+  push(&mut sp, trampoline_2 as usize + 1); // Entry point, skip initial nop
  push(&mut sp, frame.0 as usize);          // Pointer to parent call frame

  sp
 }

 #[inline(always)]
-pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
-                   new_stack: &Stack) -> usize {
+pub unsafe fn swap(arg: usize, new_sp: StackPointer,
+                   new_stack: Option<&Stack>) -> (usize, StackPointer) {
  // Address of the topmost CFA stack slot.
-  let new_cfa = (new_stack.base() as *mut usize).offset(-3);
+  let mut dummy: usize = mem::uninitialized();
+  let new_cfa = if let Some(new_stack) = new_stack {
+    (new_stack.base() as *mut usize).offset(-6)
+  } else {
+    // Just pass a dummy pointer if we aren't linking the stack
+    &mut dummy
+  };

  #[naked]
  unsafe extern "C" fn trampoline() {
@@ -171,54 +188,50 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
        # the call instruction that invoked the trampoline.
        pushl   %ebp
        .cfi_adjust_cfa_offset 4
-        .cfi_rel_offset ebp, 0
+        .cfi_rel_offset %ebp, 0

        # Link the call stacks together by writing the current stack bottom
        # address to the CFA slot in the new stack.
-        movl    %esp, (%edi)
+        movl    %esp, (%ecx)

-        # Switch to the new stack for unwinding purposes. The old stack may no
-        # longer be valid now that we have modified the link.
-        .cfi_def_cfa_register edx
-
-        # Save stack pointer of the old context.
-        movl    %esp, (%esi)
+        # Pass the stack pointer of the old context to the new one.
+        movl    %esp, %esi
        # Load stack pointer of the new context.
        movl    %edx, %esp
-        .cfi_def_cfa_register esp

        # Restore frame pointer of the new context.
        popl    %ebp
        .cfi_adjust_cfa_offset -4
-        .cfi_restore ebp
+        .cfi_restore %ebp

        # Return into the new context. Use `pop` and `jmp` instead of a `ret`
        # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge).
-        popl    %ecx
+        popl    %eax
        .cfi_adjust_cfa_offset -4
-        .cfi_register eip, ecx
-        jmpl    *%ecx
+        .cfi_register %eip, %eax
+        jmpl    *%eax
      "#
      : : : : "volatile")
  }

  let ret: usize;
+  let ret_sp: *mut usize;
  asm!(
    r#"
      # Push instruction pointer of the old context and switch to
      # the new context.
-      call    ${1:c}
+      call    ${2:c}
    "#
-    : "={eax}" (ret)
+    : "={edi}" (ret)
+      "={esi}" (ret_sp)
    : "s" (trampoline as usize)
-      "{eax}" (arg)
-      "{esi}" (old_sp)
+      "{edi}" (arg)
      "{edx}" (new_sp.0)
-      "{edi}" (new_cfa)
-    :/*"eax",*/"ebx", "ecx",  "edx",  "esi",  "edi",/*"ebp",  "esp",*/
+      "{ecx}" (new_cfa)
+    : "eax", "ebx", "ecx",  "edx", /*"esi",  "edi", "ebp",  "esp",*/
      "mm0",  "mm1",  "mm2",  "mm3",  "mm4",  "mm5",  "mm6",  "mm7",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "cc", "dirflag", "fpsr", "flags", "memory"
    : "volatile");
-  ret
+  (ret, StackPointer(ret_sp))
 }
--- a/src/arch/x86_64.rs
+++ b/src/arch/x86_64.rs
@@ -19,7 +19,8 @@
 // * x86_64 SysV C ABI passes the first argument in %rdi. We also use %rdi
 //   to pass a value while swapping context; this is an arbitrary choice
 //   (we clobber all registers and could use any of them) but this allows us
-//   to reuse the swap function to perform the initial call.
+//   to reuse the swap function to perform the initial call. We do the same
+//   thing with %rsi to pass the stack pointer to the new context.
 //
 // To understand the DWARF CFI code in this file, keep in mind these facts:
 // * CFI is "call frame information"; a set of instructions to a debugger or
@@ -45,6 +46,7 @@
 // * The 1st init trampoline tells the unwinder to restore %rbp and its return
 //   address from the stack frame at %rbp (in the parent stack), thus continuing
 //   unwinding at the swap call site instead of falling off the end of context stack.
+use core::mem;
 use stack::Stack;

 pub const STACK_ALIGNMENT: usize = 16;
@@ -52,7 +54,7 @@ pub const STACK_ALIGNMENT: usize = 16;
 #[derive(Debug, Clone, Copy)]
 pub struct StackPointer(*mut usize);

-pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer {
+pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize, StackPointer) -> !) -> StackPointer {
  #[cfg(not(target_vendor = "apple"))]
  #[naked]
  unsafe extern "C" fn trampoline_1() {
@@ -73,8 +75,8 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
        # will use %rbp+16 as the next call frame address, restore return address
        # from CFA-8 and restore %rbp from CFA-16. This mirrors what the second half
        # of `swap_trampoline` does.
-        .cfi_def_cfa rbp, 16
-        .cfi_offset rbp, -16
+        .cfi_def_cfa %rbp, 16
+        .cfi_offset %rbp, -16

        # This nop is here so that the initial swap doesn't return to the start
        # of the trampoline, which confuses the unwinder since it will look for
@@ -101,8 +103,8 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
      # Identical to the above, except avoids .local/.size that aren't available on Mach-O.
      __morestack:
      .private_extern __morestack
-        .cfi_def_cfa rbp, 16
-        .cfi_offset rbp, -16
+        .cfi_def_cfa %rbp, 16
+        .cfi_offset %rbp, -16
        nop
        nop
      "#
@@ -118,8 +120,14 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
        # will restore %rbp (and thus CFA of the first trampoline) from the stack slot.
        # This stack slot is updated every time swap() is called to point to the bottom
        # of the stack of the context switch just switched from.
-        .cfi_def_cfa rbp, 16
-        .cfi_offset rbp, -16
+        .cfi_def_cfa %rbp, 16
+        .cfi_offset %rbp, -16
+
+        # This nop is here so that the return address of the swap trampoline
+        # doesn't point to the start of the symbol. A return address at the very
+        # start of a symbol confuses gdb's backtraces, making gdb think the parent
+        # function is trampoline_1 instead of trampoline_2.
+        nop

        # Call the provided function.
        call    *16(%rsp)
@@ -153,17 +161,23 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
  // Call frame for swap::trampoline. We set up the %rbp value to point to the
  // parent call frame.
  let frame = sp;
-  push(&mut sp, trampoline_2 as usize); // Entry point
+  push(&mut sp, trampoline_2 as usize + 1); // Entry point, skip initial nop
  push(&mut sp, frame.0 as usize);          // Pointer to parent call frame

  sp
 }

 #[inline(always)]
-pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
-                   new_stack: &Stack) -> usize {
+pub unsafe fn swap(arg: usize, new_sp: StackPointer,
+                   new_stack: Option<&Stack>) -> (usize, StackPointer) {
  // Address of the topmost CFA stack slot.
-  let new_cfa = (new_stack.base() as *mut usize).offset(-4);
+  let mut dummy: usize = mem::uninitialized();
+  let new_cfa = if let Some(new_stack) = new_stack {
+    (new_stack.base() as *mut usize).offset(-4)
+  } else {
+    // Just pass a dummy pointer if we aren't linking the stack
+    &mut dummy
+  };

  #[naked]
  unsafe extern "C" fn trampoline() {
@@ -174,51 +188,47 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
        # the call instruction that invoked the trampoline.
        pushq   %rbp
        .cfi_adjust_cfa_offset 8
-        .cfi_rel_offset rbp, 0
+        .cfi_rel_offset %rbp, 0

        # Link the call stacks together by writing the current stack bottom
        # address to the CFA slot in the new stack.
        movq    %rsp, (%rcx)

-        # Switch to the new stack for unwinding purposes. The old stack may no
-        # longer be valid now that we have modified the link.
-        .cfi_def_cfa_register rdx
-
-        # Save stack pointer of the old context.
-        movq    %rsp, (%rsi)
+        # Pass the stack pointer of the old context to the new one.
+        movq    %rsp, %rsi
        # Load stack pointer of the new context.
        movq    %rdx, %rsp
-        .cfi_def_cfa_register rsp

        # Restore frame pointer of the new context.
        popq    %rbp
        .cfi_adjust_cfa_offset -8
-        .cfi_restore rbp
+        .cfi_restore %rbp

        # Return into the new context. Use `pop` and `jmp` instead of a `ret`
        # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge).
        popq    %rax
        .cfi_adjust_cfa_offset -8
-        .cfi_register rip, rax
+        .cfi_register %rip, %rax
        jmpq    *%rax
      "#
      : : : : "volatile")
  }

  let ret: usize;
+  let ret_sp: *mut usize;
  asm!(
    r#"
      # Push instruction pointer of the old context and switch to
      # the new context.
-      call    ${1:c}
+      call    ${2:c}
    "#
    : "={rdi}" (ret)
+      "={rsi}" (ret_sp)
    : "s" (trampoline as usize)
      "{rdi}" (arg)
-      "{rsi}" (old_sp)
      "{rdx}" (new_sp.0)
      "{rcx}" (new_cfa)
-    : "rax",   "rbx",   "rcx",   "rdx",   "rsi", /*"rdi",   "rbp",   "rsp",*/
+    : "rax",   "rbx",   "rcx",   "rdx", /*"rsi",   "rdi",   "rbp",   "rsp",*/
      "r8",    "r9",    "r10",   "r11",   "r12",   "r13",   "r14",   "r15",
      "mm0",   "mm1",   "mm2",   "mm3",   "mm4",   "mm5",   "mm6",   "mm7",
      "xmm0",  "xmm1",  "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
@@ -232,5 +242,5 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
      // the "alignstack" LLVM inline assembly option does exactly the same
      // thing on x86_64.
    : "volatile", "alignstack");
-  ret
+  (ret, StackPointer(ret_sp))
 }
--- a/src/context.rs
+++ b/src/context.rs
@@ -1,172 +0,0 @@
-// This file is part of libfringe, a low-level green threading library.
-// Copyright (c) edef <edef@edef.eu>,
-//               whitequark <whitequark@whitequark.org>
-// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
-// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
-// http://opensource.org/licenses/MIT>, at your option. This file may not be
-// copied, modified, or distributed except according to those terms.
-use stack;
-use debug;
-use arch;
-
-/// Context holds a suspended thread of execution along with a stack.
-///
-/// It can be swapped into and out of with the swap method,
-/// and once you're done with it, you can get the stack back through unwrap.
-///
-/// Every operation is unsafe, because no guarantees can be made about
-/// the state of the context.
-#[derive(Debug)]
-pub struct Context<Stack: stack::Stack> {
-  stack:     Stack,
-  stack_id:  debug::StackId,
-  stack_ptr: arch::StackPointer
-}
-
-unsafe impl<Stack> Send for Context<Stack>
-  where Stack: stack::Stack + Send {}
-
-impl<Stack> Context<Stack> where Stack: stack::Stack {
-  /// Creates a new Context. When it is swapped into, it will call
-  /// `f(arg)`, where `arg` is the argument passed to `swap`.
-  pub unsafe fn new(stack: Stack, f: unsafe extern "C" fn(usize) -> !) -> Context<Stack> {
-    let stack_id  = debug::StackId::register(&stack);
-    let stack_ptr = arch::init(&stack, f);
-    Context {
-      stack:     stack,
-      stack_id:  stack_id,
-      stack_ptr: stack_ptr
-    }
-  }
-
-  /// Unwraps the context, returning the stack it contained.
-  pub unsafe fn unwrap(self) -> Stack {
-    self.stack
-  }
-}
-
-impl<OldStack> Context<OldStack> where OldStack: stack::Stack {
-  /// Switches to `in_ctx`, saving the current thread of execution to `out_ctx`.
-  #[inline(always)]
-  pub unsafe fn swap<NewStack>(old_ctx: *mut Context<OldStack>,
-                               new_ctx: *const Context<NewStack>,
-                               arg: usize) -> usize
-      where NewStack: stack::Stack {
-    arch::swap(arg, &mut (*old_ctx).stack_ptr, (*new_ctx).stack_ptr, &(*new_ctx).stack)
-  }
-}
-
-#[cfg(test)]
-mod test {
-  extern crate test;
-  extern crate simd;
-
-  use std::ptr;
-  use super::Context;
-  use ::OsStack;
-
-  #[thread_local]
-  static mut ctx_slot: *mut Context<OsStack> = ptr::null_mut();
-
-  #[test]
-  fn context() {
-    unsafe extern "C" fn adder(arg: usize) -> ! {
-      println!("it's alive! arg: {}", arg);
-      let arg = Context::swap(ctx_slot, ctx_slot, arg + 1);
-      println!("still alive! arg: {}", arg);
-      Context::swap(ctx_slot, ctx_slot, arg + 1);
-      panic!("i should be dead");
-    }
-
-    unsafe {
-      let stack = OsStack::new(4 << 20).unwrap();
-      let mut ctx = Context::new(stack, adder);
-      ctx_slot = &mut ctx;
-
-      let ret = Context::swap(ctx_slot, ctx_slot, 10);
-      assert_eq!(ret, 11);
-      let ret = Context::swap(ctx_slot, ctx_slot, 50);
-      assert_eq!(ret, 51);
-    }
-  }
-
-  #[test]
-  fn context_simd() {
-    unsafe extern "C" fn permuter(arg: usize) -> ! {
-      // This will crash if the stack is not aligned properly.
-      let x = simd::i32x4::splat(arg as i32);
-      let y = x * x;
-      println!("simd result: {:?}", y);
-      Context::swap(ctx_slot, ctx_slot, 0);
-      // And try again after a context switch.
-      let x = simd::i32x4::splat(arg as i32);
-      let y = x * x;
-      println!("simd result: {:?}", y);
-      Context::swap(ctx_slot, ctx_slot, 0);
-      panic!("i should be dead");
-    }
-
-    unsafe {
-      let stack = OsStack::new(4 << 20).unwrap();
-      let mut ctx = Context::new(stack, permuter);
-      ctx_slot = &mut ctx;
-
-      Context::swap(ctx_slot, ctx_slot, 10);
-      Context::swap(ctx_slot, ctx_slot, 20);
-    }
-  }
-
-  unsafe extern "C" fn do_panic(arg: usize) -> ! {
-    match arg {
-      0 => panic!("arg=0"),
-      1 => {
-        Context::swap(ctx_slot, ctx_slot, 0);
-        panic!("arg=1");
-      }
-      _ => unreachable!()
-    }
-  }
-
-  #[test]
-  #[should_panic="arg=0"]
-  fn panic_after_start() {
-    unsafe {
-      let stack = OsStack::new(4 << 20).unwrap();
-      let mut ctx = Context::new(stack, do_panic);
-
-      Context::swap(&mut ctx, &ctx, 0);
-    }
-  }
-
-  #[test]
-  #[should_panic="arg=1"]
-  fn panic_after_swap() {
-    unsafe {
-      let stack = OsStack::new(4 << 20).unwrap();
-      let mut ctx = Context::new(stack, do_panic);
-      ctx_slot = &mut ctx;
-
-      Context::swap(&mut ctx, &ctx, 1);
-      Context::swap(&mut ctx, &ctx, 0);
-    }
-  }
-
-  #[bench]
-  fn swap(b: &mut test::Bencher) {
-    unsafe extern "C" fn loopback(mut arg: usize) -> ! {
-      // This deliberately does not ignore arg, to measure the time it takes
-      // to move the return value between registers.
-      let ctx_ptr = ctx_slot;
-      loop { arg = Context::swap(ctx_ptr, ctx_ptr, arg) }
-    }
-
-    unsafe {
-      let stack = OsStack::new(4 << 20).unwrap();
-      let mut ctx = Context::new(stack, loopback);
-      ctx_slot = &mut ctx;
-
-      let ctx_ptr = &mut ctx;
-      b.iter(|| Context::swap(ctx_ptr, ctx_ptr, 0));
-    }
-  }
-}
--- a/src/generator.rs
+++ b/src/generator.rs
@@ -16,7 +16,8 @@ use core::{ptr, mem};
 use core::cell::Cell;

 use stack;
-use context::Context;
+use debug;
+use arch::{self, StackPointer};

 #[derive(Debug, Clone, Copy)]
 pub enum State {
@@ -81,7 +82,9 @@ pub enum State {
 #[derive(Debug)]
 pub struct Generator<Input: Send, Output: Send, Stack: stack::Stack> {
  state:     State,
-  context: Context<Stack>,
+  stack:     Stack,
+  stack_id:  debug::StackId,
+  stack_ptr: arch::StackPointer,
  phantom:   (PhantomData<*const Input>, PhantomData<*const Output>)
 }

@@ -92,7 +95,7 @@ impl<Input, Output, Stack> Generator<Input, Output, Stack>
  /// See also the [contract](../trait.GuardedStack.html) that needs to be fulfilled by `stack`.
  pub fn new<F>(stack: Stack, f: F) -> Generator<Input, Output, Stack>
      where Stack: stack::GuardedStack,
-            F: FnOnce(&mut Yielder<Input, Output, Stack>, Input) + Send {
+            F: FnOnce(&mut Yielder<Input, Output>, Input) + Send {
    unsafe { Generator::unsafe_new(stack, f) }
  }

@@ -104,35 +107,36 @@ impl<Input, Output, Stack> Generator<Input, Output, Stack>
  ///
  /// See also the [contract](../trait.Stack.html) that needs to be fulfilled by `stack`.
  pub unsafe fn unsafe_new<F>(stack: Stack, f: F) -> Generator<Input, Output, Stack>
-      where F: FnOnce(&mut Yielder<Input, Output, Stack>, Input) + Send {
-    unsafe extern "C" fn generator_wrapper<Input, Output, Stack, F>(env: usize) -> !
+      where F: FnOnce(&mut Yielder<Input, Output>, Input) + Send {
+    unsafe extern "C" fn generator_wrapper<Input, Output, Stack, F>(env: usize, stack_ptr: StackPointer) -> !
        where Input: Send, Output: Send, Stack: stack::Stack,
-              F: FnOnce(&mut Yielder<Input, Output, Stack>, Input) {
+              F: FnOnce(&mut Yielder<Input, Output>, Input) {
      // Retrieve our environment from the callee and return control to it.
-      let (mut yielder, f) = ptr::read(env as *mut (Yielder<Input, Output, Stack>, F));
-      let data = Context::swap(yielder.context.get(), yielder.context.get(), 0);
+      let f = ptr::read(env as *const F);
+      let (data, stack_ptr) = arch::swap(0, stack_ptr, None);
      // See the second half of Yielder::suspend_bare.
-      let (new_context, input) = ptr::read(data as *mut (*mut Context<Stack>, Input));
-      yielder.context.set(new_context as *mut Context<Stack>);
+      let input = ptr::read(data as *const Input);
      // Run the body of the generator.
+      let mut yielder = Yielder::new(stack_ptr);
      f(&mut yielder, input);
      // Past this point, the generator has dropped everything it has held.
      loop { yielder.suspend_bare(None); }
    }

-    let mut generator = Generator {
-      state:   State::Runnable,
-      context: Context::new(stack, generator_wrapper::<Input, Output, Stack, F>),
-      phantom: (PhantomData, PhantomData)
-    };
+    let stack_id  = debug::StackId::register(&stack);
+    let stack_ptr = arch::init(&stack, generator_wrapper::<Input, Output, Stack, F>);

    // Transfer environment to the callee.
-    let mut env = (Yielder::new(&mut generator.context), f);
-    Context::swap(&mut generator.context, &generator.context,
-                  &mut env as *mut (Yielder<Input, Output, Stack>, F) as usize);
-    mem::forget(env);
+    let stack_ptr = arch::swap(&f as *const F as usize, stack_ptr, Some(&stack)).1;
+    mem::forget(f);

-    generator
+    Generator {
+      state:     State::Runnable,
+      stack:     stack,
+      stack_id:  stack_id,
+      stack_ptr: stack_ptr,
+      phantom:   (PhantomData, PhantomData)
+    }
  }

  /// Resumes the generator and return the next value it yields.
@@ -148,13 +152,10 @@ impl<Input, Output, Stack> Generator<Input, Output, Stack>

        // Switch to the generator function, and retrieve the yielded value.
        let val = unsafe {
-          let mut data_in = (&mut self.context as *mut Context<Stack>, input);
-          let data_out =
-            ptr::read(Context::swap(&mut self.context, &self.context,
-                                    &mut data_in as *mut (*mut Context<Stack>, Input)  as usize)
-                      as *mut Option<Output>);
-          mem::forget(data_in);
-          data_out
+          let (data_out, stack_ptr) = arch::swap(&input as *const Input as usize, self.stack_ptr, Some(&self.stack));
+          self.stack_ptr = stack_ptr;
+          mem::forget(input);
+          ptr::read(data_out as *const Option<Output>)
        };

        // Unless the generator function has returned, it can be switched to again, so
@@ -177,7 +178,7 @@ impl<Input, Output, Stack> Generator<Input, Output, Stack>
  pub fn unwrap(self) -> Stack {
    match self.state {
      State::Runnable    => panic!("Argh! Bastard! Don't touch that!"),
-      State::Unavailable => unsafe { self.context.unwrap() }
+      State::Unavailable => self.stack
    }
  }
 }
@@ -185,35 +186,27 @@ impl<Input, Output, Stack> Generator<Input, Output, Stack>
 /// Yielder is an interface provided to every generator through which it
 /// returns a value.
 #[derive(Debug)]
-pub struct Yielder<Input: Send, Output: Send, Stack: stack::Stack> {
-  context: Cell<*mut Context<Stack>>,
+pub struct Yielder<Input: Send, Output: Send> {
+  stack_ptr: Cell<StackPointer>,
  phantom: (PhantomData<*const Input>, PhantomData<*const Output>)
 }

-impl<Input, Output, Stack> Yielder<Input, Output, Stack>
-    where Input: Send, Output: Send, Stack: stack::Stack {
-  fn new(context: *mut Context<Stack>) -> Yielder<Input, Output, Stack> {
+impl<Input, Output> Yielder<Input, Output>
+    where Input: Send, Output: Send {
+  fn new(stack_ptr: StackPointer) -> Yielder<Input, Output> {
    Yielder {
-      context: Cell::new(context),
+      stack_ptr: Cell::new(stack_ptr),
      phantom: (PhantomData, PhantomData)
    }
  }

  #[inline(always)]
-  fn suspend_bare(&self, mut val: Option<Output>) -> Input {
+  fn suspend_bare(&self, val: Option<Output>) -> Input {
    unsafe {
-      let data = Context::swap(self.context.get(), self.context.get(),
-                               &mut val as *mut Option<Output> as usize);
+      let (data, stack_ptr) = arch::swap(&val as *const Option<Output> as usize, self.stack_ptr.get(), None);
+      self.stack_ptr.set(stack_ptr);
      mem::forget(val);
-      let (new_context, input) = ptr::read(data as *mut (*mut Context<Stack>, Input));
-      // The generator can be moved (and with it, the context).
-      // This changes the address of the context.
-      // Thus, we update it after each swap.
-      self.context.set(new_context);
-      // However, between this point and the next time we enter suspend_bare
-      // the generator cannot be moved, as a &mut Generator is necessary
-      // to resume the generator function.
-      input
+      ptr::read(data as *const Input)
    }
  }

--- a/src/lib.rs
+++ b/src/lib.rs
@@ -6,7 +6,7 @@
 // copied, modified, or distributed except according to those terms.
 #![feature(asm, naked_functions, cfg_target_vendor)]
 #![cfg_attr(feature = "alloc", feature(alloc, heap_api))]
-#![cfg_attr(test, feature(test, thread_local, const_fn))]
+#![cfg_attr(test, feature(test))]
 #![no_std]

 //! libfringe is a library implementing safe, lightweight context switches,
@@ -51,7 +51,6 @@ pub const STACK_ALIGNMENT: usize = arch::STACK_ALIGNMENT;

 mod debug;

-mod context;
 mod stack;
 mod slice_stack;
 pub mod generator;
--- a/tests/generator.rs
+++ b/tests/generator.rs
@@ -7,10 +7,10 @@
 // copied, modified, or distributed except according to those terms.
 extern crate fringe;

-use fringe::{Stack, SliceStack, OwnedStack, OsStack};
+use fringe::{SliceStack, OwnedStack, OsStack};
 use fringe::generator::{Generator, Yielder};

-fn add_one_fn<S: Stack>(yielder: &mut Yielder<i32, i32, S>, mut input: i32) {
+fn add_one_fn(yielder: &mut Yielder<i32, i32>, mut input: i32) {
  loop {
    if input == 0 { break }
    input = yielder.suspend(input + 1)