From 31c6c79204bade21c8450a321f00d5975fd3ce4b Mon Sep 17 00:00:00 2001 From: whitequark Date: Wed, 16 May 2018 21:50:55 +0000 Subject: [PATCH] firmware: implement an edge profiler. It doesn't work extremely well, and the results can be somewhat confusing, and it needs a libunwind patch, but it's the best I could do and it seems still useful. --- artiq/coredevice/profiler.py | 40 ++++++++------- artiq/firmware/ksupport/glue.c | 6 +++ artiq/firmware/libboard_misoc/or1k/vectors.S | 39 ++++++++++++++ artiq/firmware/libunwind_backtrace/lib.rs | 29 +++++++++-- artiq/firmware/runtime/main.rs | 3 ++ artiq/firmware/runtime/profiler.rs | 53 ++++++++++++++++---- artiq/frontend/artiq_coremgmt.py | 11 ++-- 7 files changed, 145 insertions(+), 36 deletions(-) diff --git a/artiq/coredevice/profiler.py b/artiq/coredevice/profiler.py index 5bbbe1996..5ed431915 100644 --- a/artiq/coredevice/profiler.py +++ b/artiq/coredevice/profiler.py @@ -3,11 +3,15 @@ import subprocess class Symbolizer: - def __init__(self, binary, triple): - self._addr2line = subprocess.Popen([ + def __init__(self, binary, triple, demangle=True): + cmdline = [ triple + "-addr2line", "--exe=" + binary, - "--addresses", "--demangle=rust", "--functions", "--inlines" - ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True) + "--addresses", "--functions", "--inlines" + ] + if demangle: + cmdline.append("--demangle=rust") + self._addr2line = subprocess.Popen(cmdline, stdin=subprocess.PIPE, stdout=subprocess.PIPE, + universal_newlines=True) def symbolize(self, addr): self._addr2line.stdin.write("0x{:08x}\n0\n".format(addr)) @@ -26,17 +30,17 @@ class Symbolizer: file, line = self._addr2line.stdout.readline().rstrip().split(":") - result.append((function, file, line)) + result.append((function, file, line, addr)) class CallgrindWriter: - def __init__(self, output, binary, triple, compression=True): + def __init__(self, output, binary, triple, compression=True, demangle=True): self._output = output self._binary = binary self._current = defaultdict(lambda: None) self._ids = defaultdict(lambda: {}) self._compression = compression - self._symbolizer = Symbolizer(binary, triple) + self._symbolizer = Symbolizer(binary, triple, demangle=demangle) def _write(self, fmt, *args, **kwargs): self._output.write(fmt.format(*args, **kwargs)) @@ -69,18 +73,20 @@ class CallgrindWriter: self._spec("cob", self._binary) def hit(self, addr, count): - for function, file, line in self._symbolizer.symbolize(addr): - self._spec("fn", function) + for function, file, line, addr in self._symbolizer.symbolize(addr): self._spec("fl", file) + self._spec("fn", function) self._write("0x{:08x} {} {}", addr, line, count) def edge(self, caller, callee, count): - function, file, line = next(self._symbolizer.symbolize(callee)) - self._spec("cfn", function) - self._spec("cfl", file) - self._write("calls={} 0x{:08x} {}", count, callee, line) + edges = self._symbolizer.symbolize(callee) + self._symbolizer.symbolize(caller) + for (callee, caller) in zip(edges, edges[1:]): + function, file, line, addr = callee + self._spec("cfl", file) + self._spec("cfn", function) + self._write("calls={} 0x{:08x} {}", count, addr, line) - function, file, line = next(self._symbolizer.symbolize(caller)) - self._spec("fn", function) - self._spec("fl", file) - self._write("0x{:08x} {} {}", caller, line, count) + function, file, line, addr = caller + self._spec("fl", file) + self._spec("fn", function) + self._write("0x{:08x} {} {}", addr, line, count) diff --git a/artiq/firmware/ksupport/glue.c b/artiq/firmware/ksupport/glue.c index 4e4ce318d..3828a6aea 100644 --- a/artiq/firmware/ksupport/glue.c +++ b/artiq/firmware/ksupport/glue.c @@ -21,6 +21,12 @@ void send_to_rtio_log(long long int timestamp, struct slice data); FILE *stderr; +/* called by libunwind */ +char *getenv(const char *var) +{ + return NULL; +} + /* called by libunwind */ int fprintf(FILE *stream, const char *fmt, ...) { diff --git a/artiq/firmware/libboard_misoc/or1k/vectors.S b/artiq/firmware/libboard_misoc/or1k/vectors.S index a92f97a69..708aed9df 100644 --- a/artiq/firmware/libboard_misoc/or1k/vectors.S +++ b/artiq/firmware/libboard_misoc/or1k/vectors.S @@ -171,35 +171,70 @@ _crt0: l.nop _exception_handler: + .cfi_startproc + .cfi_return_column 32 + .cfi_signal_frame + .cfi_def_cfa_offset EXCEPTION_STACK_SIZE l.sw 0x00(r1), r2 + .cfi_offset 2, 0x00-EXCEPTION_STACK_SIZE l.sw 0x04(r1), r3 + .cfi_offset 3, 0x04-EXCEPTION_STACK_SIZE l.sw 0x08(r1), r4 + .cfi_offset 4, 0x08-EXCEPTION_STACK_SIZE l.sw 0x0c(r1), r5 + .cfi_offset 5, 0x0c-EXCEPTION_STACK_SIZE l.sw 0x10(r1), r6 + .cfi_offset 6, 0x10-EXCEPTION_STACK_SIZE l.sw 0x14(r1), r7 + .cfi_offset 7, 0x14-EXCEPTION_STACK_SIZE l.sw 0x18(r1), r8 + .cfi_offset 8, 0x18-EXCEPTION_STACK_SIZE + /* r9 saved in HANDLE_EXCEPTION */ + .cfi_offset 9, 0x1c-EXCEPTION_STACK_SIZE l.sw 0x20(r1), r10 + .cfi_offset 10, 0x20-EXCEPTION_STACK_SIZE l.sw 0x24(r1), r11 + .cfi_offset 11, 0x24-EXCEPTION_STACK_SIZE l.sw 0x28(r1), r12 + .cfi_offset 12, 0x28-EXCEPTION_STACK_SIZE l.sw 0x2c(r1), r13 + .cfi_offset 13, 0x2c-EXCEPTION_STACK_SIZE l.sw 0x30(r1), r14 + .cfi_offset 14, 0x30-EXCEPTION_STACK_SIZE l.sw 0x34(r1), r15 + .cfi_offset 15, 0x34-EXCEPTION_STACK_SIZE l.sw 0x38(r1), r16 + .cfi_offset 16, 0x38-EXCEPTION_STACK_SIZE l.sw 0x3c(r1), r17 + .cfi_offset 17, 0x3c-EXCEPTION_STACK_SIZE l.sw 0x40(r1), r18 + .cfi_offset 18, 0x40-EXCEPTION_STACK_SIZE l.sw 0x44(r1), r19 + .cfi_offset 19, 0x44-EXCEPTION_STACK_SIZE l.sw 0x48(r1), r20 + .cfi_offset 20, 0x48-EXCEPTION_STACK_SIZE l.sw 0x4c(r1), r21 + .cfi_offset 21, 0x4c-EXCEPTION_STACK_SIZE l.sw 0x50(r1), r22 + .cfi_offset 22, 0x50-EXCEPTION_STACK_SIZE l.sw 0x54(r1), r23 + .cfi_offset 23, 0x54-EXCEPTION_STACK_SIZE l.sw 0x58(r1), r24 + .cfi_offset 24, 0x58-EXCEPTION_STACK_SIZE l.sw 0x5c(r1), r25 + .cfi_offset 25, 0x5c-EXCEPTION_STACK_SIZE l.sw 0x60(r1), r26 + .cfi_offset 26, 0x60-EXCEPTION_STACK_SIZE l.sw 0x64(r1), r27 + .cfi_offset 27, 0x64-EXCEPTION_STACK_SIZE l.sw 0x68(r1), r28 + .cfi_offset 28, 0x68-EXCEPTION_STACK_SIZE l.sw 0x6c(r1), r29 + .cfi_offset 29, 0x6c-EXCEPTION_STACK_SIZE l.sw 0x70(r1), r30 + .cfi_offset 30, 0x70-EXCEPTION_STACK_SIZE l.sw 0x74(r1), r31 + .cfi_offset 31, 0x74-EXCEPTION_STACK_SIZE /* Save return address */ l.or r14, r0, r9 @@ -210,6 +245,9 @@ _exception_handler: l.or r4, r0, r1 /* Extract exception PC */ l.mfspr r5, r0, SPR_EPCR_BASE + /* Tell exception PC to the unwinder */ + l.sw 0x78(r1), r5 + .cfi_offset 32, 0x78-EXCEPTION_STACK_SIZE /* Extract exception effective address */ l.mfspr r6, r0, SPR_EEAR_BASE /* Extract exception SR */ @@ -252,6 +290,7 @@ _exception_handler: l.lwz r31, 0x74(r1) l.jr r9 l.nop + .cfi_endproc .global _cache_init _cache_init: diff --git a/artiq/firmware/libunwind_backtrace/lib.rs b/artiq/firmware/libunwind_backtrace/lib.rs index 7e84e5e44..d1c71cf45 100644 --- a/artiq/firmware/libunwind_backtrace/lib.rs +++ b/artiq/firmware/libunwind_backtrace/lib.rs @@ -5,24 +5,43 @@ extern crate unwind; extern crate libc; use unwind as uw; -use libc::c_void; +use libc::{c_void, c_int}; -pub fn backtrace(mut f: F) -> Result<(), uw::_Unwind_Reason_Code> +const UW_REG_SP: c_int = -2; + +pub fn backtrace(f: F) -> Result<(), uw::_Unwind_Reason_Code> where F: FnMut(usize) -> () { + struct TraceContext { + step_fn: F, + prev_sp: uw::_Unwind_Word + } + extern fn trace(context: *mut uw::_Unwind_Context, arg: *mut c_void) -> uw::_Unwind_Reason_Code where F: FnMut(usize) -> () { unsafe { - let step_fn = &mut *(arg as *mut F); - step_fn(uw::_Unwind_GetIP(context)); + let trace_context = &mut *(arg as *mut TraceContext); + + // Detect the root of a libfringe thread + let cur_sp = uw::_Unwind_GetGR(context, UW_REG_SP); + if cur_sp == trace_context.prev_sp { + return uw::_URC_END_OF_STACK + } else { + trace_context.prev_sp = cur_sp; + } + + // GetIP gives us the return address, i.e. the address after the delay slot, + // but we're interested in the call instruction. + (trace_context.step_fn)(uw::_Unwind_GetIP(context) - 2 * 4); uw::_URC_NO_REASON } } unsafe { - match uw::_Unwind_Backtrace(trace::, &mut f as *mut _ as *mut c_void) { + let mut trace_context = TraceContext { step_fn: f, prev_sp: 0 }; + match uw::_Unwind_Backtrace(trace::, &mut trace_context as *mut _ as *mut c_void) { uw::_URC_NO_REASON => Ok(()), err => Err(err) } diff --git a/artiq/firmware/runtime/main.rs b/artiq/firmware/runtime/main.rs index 2f075b573..087f69167 100644 --- a/artiq/firmware/runtime/main.rs +++ b/artiq/firmware/runtime/main.rs @@ -57,6 +57,7 @@ mod moninj; mod analyzer; fn startup() { + irq::set_mask(0); irq::set_ie(true); clock::init(); info!("ARTIQ runtime starting..."); @@ -305,6 +306,8 @@ pub extern fn abort() { #[lang = "panic_fmt"] pub extern fn panic_fmt(args: core::fmt::Arguments, file: &'static str, line: u32, column: u32) -> ! { + irq::set_ie(false); + println!("panic at {}:{}:{}: {}", file, line, column, args); println!("backtrace for software version {}:", diff --git a/artiq/firmware/runtime/profiler.rs b/artiq/firmware/runtime/profiler.rs index e3c47e1a0..9765100cc 100644 --- a/artiq/firmware/runtime/profiler.rs +++ b/artiq/firmware/runtime/profiler.rs @@ -10,8 +10,8 @@ use managed::ManagedMap; pub struct Address(NonZeroUsize); impl Address { - pub fn new(raw: usize) -> Option
{ - NonZeroUsize::new(raw).map(Address) + pub fn new(raw: usize) -> Address { + Address(NonZeroUsize::new(raw).expect("null address")) } pub fn as_raw(&self) -> usize { @@ -50,6 +50,10 @@ impl Profile { self.edges.capacity() * edge_size } + pub fn has_edges(&self) -> bool { + self.edges.is_empty() + } + pub fn hits<'a>(&'a mut self) -> ManagedMap<'a, Address, u32> { ManagedMap::Borrowed(&mut self.hits[..]) } @@ -88,6 +92,7 @@ impl Profile { #[cfg(has_timer1)] mod imp { + use unwind_backtrace::backtrace; use board_misoc::{csr, irq}; use super::{Address, Profile}; @@ -192,22 +197,46 @@ mod imp { } } + // Skip frames: ::profiler::sample, ::exception, exception vector. + const SKIP_FRAMES: i32 = 3; + #[inline(always)] // make the top of backtrace predictable fn record(profile: &mut Profile, pc: usize) -> Result<(), ()> { - let callee = Address::new(pc).expect("null code address"); - profile.record_hit(callee)?; + let mut result = Ok(()); + let mut frame = -SKIP_FRAMES; - // TODO: record edges + // If we have storage for edges, use the DWARF unwinder. + // Otherwise, don't bother and use a much faster path that just looks at EPCR. + // Also, acquiring a meaningful backtrace requires libunwind + // with the https://reviews.llvm.org/D46971 patch applied. + if profile.has_edges() { + let mut prev_pc = 0; + let _ = backtrace(|pc| { + if frame == 0 { + result = result.and_then(|()| + profile.record_hit(Address::new(pc))); + prev_pc = pc; + } else if frame > 0 { + result = result.and_then(|()| + profile.record_edge(Address::new(pc), + Address::new(prev_pc))); + } + prev_pc = pc; + frame += 1; + }); + } - Ok(()) + // If we couldn't get anything useful out of a backtrace, at least + // record a hit at the exception PC. + if frame <= 0 { + result = profile.record_hit(Address::new(pc)); + } + + result } #[inline(never)] // see above pub fn sample(pc: usize) { - unsafe { - csr::timer1::ev_pending_write(1); - } - let result = { let mut profile = Lock::take().expect("cannot lock"); record(profile.as_mut().expect("profiler not running"), pc) @@ -216,6 +245,10 @@ mod imp { if result.is_err() { warn!("out of space"); stop(); + } else { + unsafe { + csr::timer1::ev_pending_write(1); + } } } } diff --git a/artiq/frontend/artiq_coremgmt.py b/artiq/frontend/artiq_coremgmt.py index d611b711c..0e9553c9d 100644 --- a/artiq/frontend/artiq_coremgmt.py +++ b/artiq/frontend/artiq_coremgmt.py @@ -98,8 +98,8 @@ def get_argparser(): help="sampling interval, in microseconds") p_start.add_argument("--hits-size", metavar="ENTRIES", type=int, default=8192, help="hit buffer size") - p_start.add_argument("--edges-size", metavar="ENTRIES", type=int, default=0, - help="edge buffer size (edge profiling not implemented)") + p_start.add_argument("--edges-size", metavar="ENTRIES", type=int, default=8192, + help="edge buffer size") p_stop = subparsers.add_parser("stop", help="stop profiling") @@ -113,6 +113,9 @@ def get_argparser(): p_save.add_argument("--no-compression", dest="compression", default=True, action="store_false", help="disable profile compression") + p_save.add_argument("--no-demangle", + dest="demangle", default=True, action="store_false", + help="disable symbol demangling") # misc debug t_debug = tools.add_parser("debug", @@ -178,8 +181,8 @@ def main(): mgmt.stop_profiler() elif args.action == "save": hits, edges = mgmt.get_profile() - writer = CallgrindWriter(args.output, args.firmware, - "or1k-linux", args.compression) + writer = CallgrindWriter(args.output, args.firmware, "or1k-linux", + args.compression, args.demangle) writer.header() for addr, count in hits.items(): writer.hit(addr, count)