diff --git a/link.x b/link.x
index 3a0c6ca9..88b32364 100644
--- a/link.x
+++ b/link.x
@@ -1,5 +1,6 @@
 ENTRY(_boot_cores);
 
+/* Size of the stack for core 0, in bytes */
 STACK_SIZE = 0x8000;
 
 /* Provide some defaults */
diff --git a/src/cortex_a9/asm.rs b/src/cortex_a9/asm.rs
index 022530ef..7dd416e4 100644
--- a/src/cortex_a9/asm.rs
+++ b/src/cortex_a9/asm.rs
@@ -10,6 +10,12 @@ pub fn wfe() {
     unsafe { asm!("wfe" :::: "volatile") }
 }
 
+/// Send Event
+#[inline]
+pub fn sev() {
+    unsafe { asm!("sev" :::: "volatile") }
+}
+
 /// Data Memory Barrier
 #[inline]
 pub fn dmb() {
@@ -27,3 +33,4 @@ pub fn dsb() {
 pub fn isb() {
     unsafe { asm!("isb" :::: "volatile") }
 }
+
diff --git a/src/cortex_a9/mmu.rs b/src/cortex_a9/mmu.rs
index 435a8445..1a235a18 100644
--- a/src/cortex_a9/mmu.rs
+++ b/src/cortex_a9/mmu.rs
@@ -124,7 +124,8 @@ impl L1Table {
             tex: 0b101,
             domain: 0b1111,
             exec: true,
-            cacheable: false,
+            // TODO: temporarily turn on the cache here for SMP testing
+            cacheable: false,
             bufferable: true,
         });
         /* (DDR cacheable) */
diff --git a/src/cortex_a9/regs.rs b/src/cortex_a9/regs.rs
index 647db472..72396e6b 100644
--- a/src/cortex_a9/regs.rs
+++ b/src/cortex_a9/regs.rs
@@ -115,6 +115,45 @@ register_bit!(sctlr,
     /// Thumb Exception Enable
     te, 30);
 
+impl crate::regs::RegisterRW for SCTLR {
+    fn modify<F: FnOnce(Self::R, Self::W) -> Self::W>(&mut self, f: F) {
+        // TODO: this may fail for .nmfi and, in non-secure state,
+        // also for RR (bit 14)
+        let inner = self.read().inner;
+        let inner_w = f(
+            sctlr::Read { inner },
+            sctlr::Write { inner }
+        );
+        self.write(inner_w);
+    }
+}
+
+/// Auxiliary Control Register
+pub struct ACTLR;
+wrap_reg!(actlr);
+def_reg_r!(ACTLR, actlr::Read, "mrc p15, 0, $0, c1, c0, 1");
+def_reg_w!(ACTLR, actlr::Write, "mcr p15, 0, $0, c1, c0, 1");
+register_bit!(actlr, parity_on, 9);
+register_bit!(actlr, alloc_one_way, 8);
+register_bit!(actlr, excl, 7);
+// SMP bit
+register_bit!(actlr, smp, 6);
+register_bit!(actlr, write_full_line_of_zeros, 3);
+register_bit!(actlr, l1_prefetch_enable, 2);
+// Cache/TLB maintenance broadcast
+register_bit!(actlr, fw, 0);
+
+impl crate::regs::RegisterRW for ACTLR {
+    fn modify<F: FnOnce(Self::R, Self::W) -> Self::W>(&mut self, f: F) {
+        let inner = self.read().inner;
+        let inner_w = f(
+            actlr::Read { inner },
+            actlr::Write { inner }
+        );
+        self.write(inner_w);
+    }
+}
+
 /// Domain Access Control Register
 pub struct DACR;
 def_reg_r!(DACR, u32, "mrc p15, 0, $0, c3, c0, 0");
@@ -163,9 +202,51 @@ pub fn bpiall() {
 
 /// Invalidate D-Cache
 #[inline(always)]
-pub fn dccisw() {
+pub fn dcisw(setway: u32) {
     // TODO: $0 is r11 at what value?
     unsafe {
-        asm!("mcr p15, 0, $0, c7, c5, 6" :: "r" (0) :: "volatile");
+        // steinb: the following is incorrect
+        //asm!("mcr p15, 0, $0, c7, c5, 6" :: "r" (0) :: "volatile");
+
+        // acc. to the ARM Architecture Reference Manual, Figure B3-32;
+        // also see the example code (for DCCISW, but DCISW is
+        // analogous), "Example code for cache maintenance operations",
+        // on pages B2-1286 and B2-1287
+        asm!("mcr p15, 0, $0, c7, c6, 2" :: "r" (setway) :: "volatile");
     }
 }
+
+/// A made-up "instruction": invalidate all of the L1 D-Cache
+#[inline(always)]
+pub fn dciall() {
+    // the cache associativity could be read from a register, but it is
+    // always 4 in the L1 data cache of a Cortex-A9
+    let ways = 4;
+    let bit_pos_of_way = 30; // 32 - log2(ways)
+
+    // the number of cache sets could be read from a register, but it is
+    // always 256 for the cores in the Zynq-7000; in general, 128 or 512
+    // are also possible
+    let sets = 256;
+    let bit_pos_of_set = 5; // for a line size of 8 words = 2^5 bytes
+
+    // select the L1 data cache
+    unsafe {
+        asm!("mcr p15, 2, $0, c0, c0, 0" :: "r" (0) :: "volatile");
+    }
+
+    // invalidate the entire D-Cache by iterating over every set and way
+    for set in 0..sets {
+        for way in 0..ways {
+            dcisw((set << bit_pos_of_set) | (way << bit_pos_of_way));
+        }
+    }
+}
+
+/// Clean cache line by virtual address to the point of coherency (DCCMVAC)
+#[inline]
+pub fn dccmvac(addr: u32) {
+    unsafe {
+        asm!("mcr p15, 0, $0, c7, c10, 1" :: "r" (addr) :: "volatile");
+    }
+}
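For illustration, a worked example of the set/way encoding that dciall() above passes to dcisw(), assuming the geometry stated in its comments (4 ways, 256 sets, 32-byte lines). The helper name dcisw_setway is made up for this sketch and is not part of the patch.

// way index -> bits [31:30], set index -> bits [12:5], cache level (L1) -> bits [3:1] = 0
fn dcisw_setway(set: u32, way: u32) -> u32 {
    (way << 30) | (set << 5)
}

// e.g. dcisw_setway(5, 3) == 0xC000_00A0, and the last line of the last way is
// dcisw_setway(255, 3) == 0xC000_1FE0 -- the same values produced by
// (set << bit_pos_of_set) | (way << bit_pos_of_way) inside dciall().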
diff --git a/src/mailbox.rs b/src/mailbox.rs
new file mode 100644
index 00000000..1459a2db
--- /dev/null
+++ b/src/mailbox.rs
@@ -0,0 +1,131 @@
+use crate::cortex_a9::asm;
+use core::ptr::{read_volatile, write_volatile};
+
+/*
+    One-way mailbox:
+
+    All transmissions must originate from one core only,
+    and all receives from the other core only.
+
+    Example transmission (to be executed on core 0):
+    {
+        while !MAILBOX_FROM_CORE0.acknowledged() {}
+        println!("ready to send");
+        MAILBOX_FROM_CORE0.send(&data as *const _ as usize);
+        println!("sent");
+        while !MAILBOX_FROM_CORE0.acknowledged() {}
+        println!("got receipt (acknowledgement)");
+    }
+
+    Example reception (to be executed on core 1):
+    {
+        println!("waiting for data");
+        while !MAILBOX_FROM_CORE0.available() {}
+        let data = MAILBOX_FROM_CORE0.receive();
+        println!("data received");
+        MAILBOX_FROM_CORE0.acknowledge(data);
+    }
+
+    Note that unsafe { ... } blocks must be used around most of these
+    calls; they have been omitted from the examples for clarity.
+*/
+
+pub struct OneWayMailbox {
+    // pointer (data to be transferred): write-only for the sending core,
+    // readable and clearable (to 0) for the receiving core
+    pointer: usize,
+
+    // helper variable (last pointer value received) for the receiving
+    // core
+    echo: usize,
+}
+
+pub static mut MAILBOX_FROM_CORE0: OneWayMailbox = OneWayMailbox::new();
+pub static mut MAILBOX_FROM_CORE1: OneWayMailbox = OneWayMailbox::new();
+
+impl OneWayMailbox {
+    // instantiate a one-way mailbox with no undelivered message
+    pub const fn new() -> OneWayMailbox {
+        OneWayMailbox { pointer: 0, echo: 0 }
+    }
+
+    // recreate the pristine condition; may only be called while producer
+    // and consumer are stopped (e.g. when starting core 1 from core 0)
+    pub fn reset_discard(&mut self) {
+        unsafe {
+            write_volatile(&mut self.pointer, 0);
+            write_volatile(&mut self.echo, 0);
+        }
+    }
+
+    // send a pointer from one core to be received by the other core
+    pub fn send(&mut self, ptr: usize) -> usize {
+        assert!(ptr != 0); // ptr may not be the NULL-like flag
+        asm::dmb(); // ensure the data at (ptr) has been fully written
+        unsafe {
+            write_volatile(&mut self.pointer, ptr);
+        }
+        ptr
+    }
+
+    // receive a pointer from the other core, or 0 if none is present
+    pub fn receive(&self) -> usize {
+        let ptr = unsafe {
+            read_volatile(&self.pointer)
+        };
+        // memory barrier needed to guarantee that the data at (ptr)
+        // has been fully written before the caller of this function
+        // accesses it
+        asm::dmb();
+        ptr
+    }
+
+    // return true if and only if the next self.receive() will return
+    // actual data rather than 0
+    pub fn available(&self) -> bool {
+        let ptr = unsafe {
+            asm::dmb();
+            read_volatile(&self.pointer)
+        };
+        ptr != 0
+    }
+
+    // acknowledge receipt of the data to the sender (i.e. release it)
+    pub fn acknowledge(&mut self, ptr: usize) {
+        // ensure that the data we release is the data last sent
+        assert_eq!(ptr, unsafe {
+            read_volatile(&self.pointer)
+        });
+        // first possibility for the "release" flag:
+        // pointer and echo are equal
+        unsafe {
+            write_volatile(&mut self.echo, ptr);
+        }
+        asm::dmb(); // write to self.echo before self.pointer
+        // second possibility for the "release" flag:
+        // a NULL-like pointer
+        unsafe {
+            write_volatile(&mut self.pointer, 0);
+        }
+        asm::dmb();
+        // reset echo
+        unsafe {
+            write_volatile(&mut self.echo, 0);
+        }
+    }
+
+    // has the data been acknowledged?
+    pub fn acknowledged(&self) -> bool {
+        let ptr = unsafe {
+            read_volatile(&self.pointer)
+        };
+        // read self.pointer before self.echo, not after
+        asm::dmb();
+        let echo = unsafe {
+            read_volatile(&self.echo)
+        };
+        (ptr == 0) || (ptr == echo)
+    }
+}
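As a usage illustration of the module above: main.rs (next section) only exercises the core 1 -> core 0 direction, so here is a minimal sketch of the core 0 -> core 1 direction described in the module comment. MAILBOX_FROM_CORE0 and its methods are the ones defined in src/mailbox.rs (assuming the same import as main.rs uses); the [u32; 4] job buffer and the two helper functions are hypothetical.

// on core 0: publish a buffer, then wait until core 1 has released it
fn send_job_from_core0(job: &[u32; 4]) {
    unsafe {
        while !MAILBOX_FROM_CORE0.acknowledged() {} // previous message released?
        MAILBOX_FROM_CORE0.send(job as *const _ as usize);
        while !MAILBOX_FROM_CORE0.acknowledged() {} // wait for the receipt
    }
}

// on core 1: block until a buffer arrives, copy it out, then release it
fn receive_job_on_core1() -> [u32; 4] {
    unsafe {
        while !MAILBOX_FROM_CORE0.available() {}
        let ptr = MAILBOX_FROM_CORE0.receive();
        let job = *(ptr as *const [u32; 4]); // copy before acknowledging
        MAILBOX_FROM_CORE0.acknowledge(ptr);
        job
    }
}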
diff --git a/src/main.rs b/src/main.rs
index a204a4d2..6ff368aa 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -9,30 +9,40 @@
 #![allow(dead_code)]
 
 use core::mem::{uninitialized, transmute};
+use core::ptr::write_volatile;
 use r0::zero_bss;
 use compiler_builtins as _;
 use smoltcp::wire::{EthernetAddress, IpAddress, IpCidr};
 use smoltcp::iface::{NeighborCache, EthernetInterfaceBuilder, EthernetInterface};
 use smoltcp::time::Instant;
 use smoltcp::socket::SocketSet;
+use crate::mailbox::{MAILBOX_FROM_CORE0, MAILBOX_FROM_CORE1};
 
 mod regs;
 mod cortex_a9;
 mod clocks;
+mod mailbox;
+mod mpcore;
 mod slcr;
 mod uart;
 mod stdio;
 mod eth;
 
-use crate::regs::{RegisterR, RegisterW};
+use crate::regs::{RegisterR, RegisterW, RegisterRW};
 use crate::cortex_a9::{asm, regs::*, mmu};
 
 extern "C" {
     static mut __bss_start: u32;
     static mut __bss_end: u32;
-    static mut __stack_start: u32;
+    static mut __stack_start: u32;   // refers to the stack for core 0
+    static mut __stack1_start: u32;  // refers to the stack for core 1
 }
 
+// program address, as a u32, to execute after core 1 has been set up
+static mut START_ADDR_CORE1: u32 = 0;
+// initial stack pointer for starting core 1
+static mut INITIAL_SP_CORE1: u32 = 0; // must initially be zero (used as a flag)
+
 #[link_section = ".text.boot"]
 #[no_mangle]
 #[naked]
@@ -41,13 +51,24 @@
 pub unsafe extern "C" fn _boot_cores() -> ! {
     match MPIDR.read() & CORE_MASK {
         0 => {
+            // executing on core 0
            SP.write(&mut __stack_start as *mut _ as u32);
            boot_core0();
         }
-        _ => loop {
-            // if not core0, infinitely wait for events
-            asm::wfe();
-        },
+        _ => {
+            // executing on core 1 (as there are only cores 0 and 1)
+            while INITIAL_SP_CORE1 == 0 {
+                // NOTE: this wfe and its loop can be removed as long
+                // as the regular boot loader remains in place
+                // (i.e. this program is not written into ROM)
+                asm::wfe();
+            }
+
+            // the following requires a stack (at least later, for the
+            // function that sets up the MMU)
+            SP.write(INITIAL_SP_CORE1);
+            boot_core1();
+        }
     }
 }
 
@@ -55,16 +76,59 @@
 #[inline(never)]
 unsafe fn boot_core0() -> ! {
     l1_cache_init();
+
+    // Invalidate SCU, for all cores
+    mpcore::RegisterBlock::new().scu_invalidate.write(0xffff);
+
     zero_bss(&mut __bss_start, &mut __bss_end);
 
     let mmu_table = mmu::L1Table::get()
         .setup_flat_layout();
     mmu::with_mmu(mmu_table, || {
+        // start the SCU
+        mpcore::RegisterBlock::new().scu_control.modify(
+            |_, w| w.enable(true)
+        );
+        // enable SMP (needed for correct SCU operation)
+        ACTLR.modify(|_, w|
+            w.smp(true) // SMP mode
+             .fw(true)  // cache and TLB maintenance broadcast on
+        );
+        asm::dmb();
+        asm::dsb();
         main();
         panic!("return from main");
     });
 }
 
+#[naked]
+#[inline(never)]
+unsafe fn boot_core1() -> ! {
+    l1_cache_init();
+
+    // Invalidate SCU, for core 1 only
+    mpcore::RegisterBlock::new().scu_invalidate.write(0x00f0);
+
+    // use the MMU L1 table already set up by core 0
+    let mmu_table = mmu::L1Table::get();
+    mmu::with_mmu(mmu_table, || {
+        // enable SMP (needed for correct SCU operation)
+        ACTLR.modify(|_, w|
+            w.smp(true) // SMP mode
+             .fw(true)  // cache and TLB maintenance broadcast on
+        );
+
+        asm::dmb();
+        asm::dsb();
+
+        // now that the MMU is active and uses the same table as the
+        // other core, we can branch to any normal memory location
+        // where the code may reside
+        asm!("bx r1" :: "{r1}"(START_ADDR_CORE1) :: "volatile");
+        unreachable!();
+    });
+}
+
 fn l1_cache_init() {
     // Invalidate TLBs
     tlbiall();
@@ -73,13 +137,118 @@
     // Invalidate Branch Predictor Array
     bpiall();
     // Invalidate D-Cache
-    dccisw();
+    //
+    // Note: use dcisw rather than dccisw to only invalidate, rather
+    // than also clean (which could write values back into the
+    // underlying L2 cache or memory!)
+    //
+    // use the "made-up instruction" (see its definition) dciall()
+    dciall();
+
+    asm::dsb();
+    asm::isb();
 }
 
+fn stop_core1() {
+    slcr::RegisterBlock::unlocked(|slcr| {
+        slcr.a9_cpu_rst_ctrl.modify(|_, w| {
+            w.a9_rst1(true)
+        });
+        slcr.a9_cpu_rst_ctrl.modify(|_, w| {
+            w.a9_clkstop1(true)
+        });
+        slcr.a9_cpu_rst_ctrl.modify(|_, w| {
+            w.a9_rst1(false)
+        });
+    });
+}
+
+// Execute f on core 1, using the given stack. These semantics are
+// inherently unsafe, as the stack needs to live longer than Rust's
+// borrow rules can guarantee... hence this function is marked unsafe
+// to remind the caller to take special care (many of the operations
+// performed here would require `unsafe` blocks anyway).
+unsafe fn run_on_core1(f: fn() -> !, stack: &mut [u32]) {
+    // reset and stop core 1 (this is safe to repeat if the caller
+    // has already done so)
+    stop_core1();
+
+    // ensure any mailbox access finishes before the mailbox reset
+    asm::dmb();
+    // reset the mailboxes, discarding any messages
+    MAILBOX_FROM_CORE0.reset_discard();
+    MAILBOX_FROM_CORE1.reset_discard();
+    // determine the address of f and save it as the start address for core 1
+    write_volatile(
+        &mut START_ADDR_CORE1,
+        f as *const () as u32
+    );
+    write_volatile(
+        &mut INITIAL_SP_CORE1,
+        &mut stack[stack.len() - 1] as *const _ as u32
+    );
+    // ensure the above is written to the cache before it is cleaned
+    asm::dmb();
+    // TODO: Is the following necessary, considering that the SCU
+    // should take care of coherency of all (normal) memory?
+    //
+    // clean the cache lines containing START_ADDR_CORE1 and
+    // INITIAL_SP_CORE1
+    dccmvac(&START_ADDR_CORE1 as *const _ as u32);
+    dccmvac(&INITIAL_SP_CORE1 as *const _ as u32);
+
+    // clean the cache lines containing the mailboxes
+    dccmvac(&MAILBOX_FROM_CORE0 as *const _ as u32);
+    dccmvac(&MAILBOX_FROM_CORE1 as *const _ as u32);
+
+    // restart core 1
+    slcr::RegisterBlock::unlocked(|slcr| {
+        slcr.a9_cpu_rst_ctrl.modify(|_, w| {
+            w.a9_rst1(false)
+        });
+        slcr.a9_cpu_rst_ctrl.modify(|_, w| {
+            w.a9_clkstop1(false)
+        });
+    });
+}
+
+fn main_core1() -> ! {
+    let mut data: [u32; 2] = [42, 42];
+    loop {
+        // effectively perform something similar to `println!("from
+        // core 1");` by passing a message to core 0 and having core 0
+        // print it via the println! macro
+        unsafe {
+            MAILBOX_FROM_CORE1.send(&data as *const _ as usize);
+            while !MAILBOX_FROM_CORE1.acknowledged() {}
+        }
+
+        // change the data to make it more interesting
+        data[1] += 1;
+    }
+}
+
+fn main_core1_program2() -> ! {
+    let mut data: [u32; 2] = [4200, 4200];
+    loop {
+        unsafe {
+            MAILBOX_FROM_CORE1.send(&data as *const _ as usize);
+            while !MAILBOX_FROM_CORE1.acknowledged() {}
+        }
+        // change the data to make it more interesting
+        data[0] -= 1;
+        data[1] += 1;
+    }
+}
+
+// reserve some memory as the stack for core 1
+static mut STACK_CORE1: [u32; 256] = [0; 256];
+
 const HWADDR: [u8; 6] = [0, 0x23, 0xde, 0xea, 0xbe, 0xef];
 
 fn main() {
     println!("Main.");
+    println!("Core 0 SP: 0x{:X}", SP.read());
     let clocks = clocks::CpuClocks::get();
     println!("Clocks: {:?}", clocks);
     println!("CPU speeds: {}/{}/{}/{} MHz",
@@ -92,6 +261,52 @@
     println!("Eth on");
     eth.reset_phy();
+
+    // start executing main_core1() on core 1
+    unsafe {
+        run_on_core1(main_core1, &mut STACK_CORE1[..]);
+    }
+    println!("Started main_core1() on core 1");
+    for _ in 0..5 {
+        // wait for data
+        while unsafe { !MAILBOX_FROM_CORE1.available() } {}
+        // receive the data
+        let data_ptr = unsafe { MAILBOX_FROM_CORE1.receive() };
+        println!(
+            "Received via mailbox from core 1: data {} and {} at address 0x{:X}",
+            unsafe { (*(data_ptr as *const [u32; 2]))[0] },
+            unsafe { (*(data_ptr as *const [u32; 2]))[1] },
+            data_ptr
+        );
+        unsafe {
+            MAILBOX_FROM_CORE1.acknowledge(data_ptr);
+        }
+    }
+    stop_core1();
+    println!("Stopped core 1.");
+
+    // start executing main_core1_program2() on core 1
+    unsafe {
+        run_on_core1(main_core1_program2, &mut STACK_CORE1[..]);
+    }
+    println!("Started main_core1_program2() on core 1");
+    for _ in 0..5 {
+        // wait for data
+        while unsafe { !MAILBOX_FROM_CORE1.available() } {}
+        // receive the data
+        let data_ptr = unsafe { MAILBOX_FROM_CORE1.receive() };
+        println!(
+            "Received via mailbox from core 1: data {} and {} at address 0x{:X}",
+            unsafe { (*(data_ptr as *const [u32; 2]))[0] },
+            unsafe { (*(data_ptr as *const [u32; 2]))[1] },
+            data_ptr
+        );
+        unsafe {
+            MAILBOX_FROM_CORE1.acknowledge(data_ptr);
+        }
+    }
+    stop_core1();
+    println!("Stopped core 1.");
 
     const RX_LEN: usize = 1;
     let mut rx_descs: [eth::rx::DescEntry; RX_LEN] = unsafe { uninitialized() };
     let mut rx_buffers = [[0u8; eth::MTU]; RX_LEN];
diff --git a/src/mpcore.rs b/src/mpcore.rs
new file mode 100644
index 00000000..36e503ce
--- /dev/null
+++ b/src/mpcore.rs
@@ -0,0 +1,29 @@
+//! Register definitions for the Application Processing Unit (mpcore)
+
+use volatile_register::{RO, RW, WO};
+use crate::{register, register_at, register_bit};
+
+#[repr(C)]
+pub struct RegisterBlock {
+    pub scu_control: ScuControl,
+    pub scu_config: RO<u32>,
+    pub scu_cpu_power: RW<u32>,
+    pub scu_invalidate: WO<u32>,
+    reserved0: [u32; 12],
+    pub filter_start: RW<u32>,
+    pub filter_end: RW<u32>,
+    reserved1: [u32; 2],
+    pub scu_access_control: RW<u32>,
+    pub scu_non_secure_access_control: RW<u32>,
+    // there is plenty more (unimplemented)
+}
+register_at!(RegisterBlock, 0xF8F00000, new);
+
+register!(scu_control, ScuControl, RW, u32);
+register_bit!(scu_control, ic_standby_enable, 6);
+register_bit!(scu_control, scu_standby_enable, 5);
+register_bit!(scu_control, force_to_port0_enable, 4);
+register_bit!(scu_control, scu_speculative_linefill_enable, 3);
+register_bit!(scu_control, scu_rams_parity_enable, 2);
+register_bit!(scu_control, address_filtering_enable, 1);
+register_bit!(scu_control, enable, 0);
diff --git a/src/slcr.rs b/src/slcr.rs
index df714ec2..69aba445 100644
--- a/src/slcr.rs
+++ b/src/slcr.rs
@@ -90,7 +90,7 @@ pub struct RegisterBlock {
     pub ocm_rst_ctrl: RW<u32>,
     reserved4: [u32; 1],
     pub fpga_rst_ctrl: RW<u32>,
-    pub a9_cpu_rst_ctrl: RW<u32>,
+    pub a9_cpu_rst_ctrl: A9CpuRstCtrl,
     reserved5: [u32; 1],
     pub rs_awdt_ctrl: RW<u32>,
     reserved6: [u32; 2],
@@ -365,6 +365,13 @@ impl UartRstCtrl {
 register!(pss_rst_ctrl, PssRstCtrl, RW, u32);
 register_bit!(pss_rst_ctrl, soft_rst, 1);
 
+register!(a9_cpu_rst_ctrl, A9CpuRstCtrl, RW, u32);
+register_bit!(a9_cpu_rst_ctrl, peri_rst, 8);
+register_bit!(a9_cpu_rst_ctrl, a9_clkstop1, 5);
+register_bit!(a9_cpu_rst_ctrl, a9_clkstop0, 4);
+register_bit!(a9_cpu_rst_ctrl, a9_rst1, 1);
+register_bit!(a9_cpu_rst_ctrl, a9_rst0, 0);
+
 /// Used for MioPin*.io_type
 #[repr(u8)]
 pub enum IoBufferType {