From 432c6b99e226b077ab77d96226b8aeb182627ea1 Mon Sep 17 00:00:00 2001 From: Sebastien Bourdeauducq Date: Mon, 27 Mar 2017 17:53:07 +0800 Subject: [PATCH] master: still save results when analyze fails. Closes #684 --- RELEASE_NOTES.rst | 1 + artiq/master/scheduler.py | 12 ++++++---- artiq/master/worker_impl.py | 48 ++++++++++++++++++++++--------------- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 82a8dd502..045e7a9f4 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -28,6 +28,7 @@ Release notes and ``Comm`` has been renamed ``CommKernel``. * The "collision" and "busy" RTIO errors are reported through the log instead of raising exceptions. +* Results are still saved when ``analyze`` raises an exception. 2.2 diff --git a/artiq/master/scheduler.py b/artiq/master/scheduler.py index 53d06cac9..9eeeeaff4 100644 --- a/artiq/master/scheduler.py +++ b/artiq/master/scheduler.py @@ -305,14 +305,16 @@ class AnalyzeStage(TaskObject): run.status = RunStatus.analyzing try: await run.analyze() + except: + logger.error("got worker exception in analyze stage of RID %d." + " Results will still be saved.", run.rid) + log_worker_exception() + try: await run.write_results() except: - logger.error("got worker exception in analyze stage, " - "deleting RID %d", run.rid) + logger.error("failed to write results of RID %d.", run.rid) log_worker_exception() - self.delete_cb(run.rid) - else: - self.delete_cb(run.rid) + self.delete_cb(run.rid) class Pipeline: diff --git a/artiq/master/worker_impl.py b/artiq/master/worker_impl.py index 1e7155a30..8bfa306ca 100644 --- a/artiq/master/worker_impl.py +++ b/artiq/master/worker_impl.py @@ -196,6 +196,25 @@ def setup_diagnostics(experiment_file, repository_path): artiq.coredevice.core._DiagnosticEngine.render_diagnostic = \ render_diagnostic +def put_exception_report(): + _, exc, _ = sys.exc_info() + # When we get CompileError, a more suitable diagnostic has already + # been printed. + if not isinstance(exc, CompileError): + short_exc_info = type(exc).__name__ + exc_str = str(exc) + if exc_str: + short_exc_info += ": " + exc_str.splitlines()[0] + lines = ["Terminating with exception ("+short_exc_info+")\n"] + if hasattr(exc, "artiq_core_exception"): + lines.append(str(exc.artiq_core_exception)) + if hasattr(exc, "parent_traceback"): + lines += exc.parent_traceback + lines += traceback.format_exception_only(type(exc), exc) + logging.error("".join(lines).rstrip(), + exc_info=not hasattr(exc, "parent_traceback")) + put_object({"action": "exception"}) + def main(): global ipc @@ -251,8 +270,14 @@ def main(): exp_inst.run() put_object({"action": "completed"}) elif action == "analyze": - exp_inst.analyze() - put_object({"action": "completed"}) + try: + exp_inst.analyze() + except: + # make analyze failure non-fatal, as we may still want to + # write results afterwards + put_exception_report() + else: + put_object({"action": "completed"}) elif action == "write_results": filename = "{:09}-{}.h5".format(rid, exp.__name__) with h5py.File(filename, "w") as f: @@ -267,23 +292,8 @@ def main(): put_object({"action": "completed"}) elif action == "terminate": break - except Exception as exc: - # When we get CompileError, a more suitable diagnostic has already - # been printed. - if not isinstance(exc, CompileError): - short_exc_info = type(exc).__name__ - exc_str = str(exc) - if exc_str: - short_exc_info += ": " + exc_str.splitlines()[0] - lines = ["Terminating with exception ("+short_exc_info+")\n"] - if hasattr(exc, "artiq_core_exception"): - lines.append(str(exc.artiq_core_exception)) - if hasattr(exc, "parent_traceback"): - lines += exc.parent_traceback - lines += traceback.format_exception_only(type(exc), exc) - logging.error("".join(lines).rstrip(), - exc_info=not hasattr(exc, "parent_traceback")) - put_object({"action": "exception"}) + except: + put_exception_report() finally: device_mgr.close_devices() ipc.close()