master: Always write results to HDF5 once run stage is reached

Previously, long-running experiments carried a significant risk of
losing experimental results, as any stray exception while run()ing the
experiment – for instance, due to an infrequent network glitch or a
hardware reliability issue – would cause no HDF5 file to be written at
all. This was especially troublesome because long experiments suffer
from a higher probability of unanticipated failures, while at the same
time being more costly to re-take in terms of wall-clock time.

Unanticipated, uncaught exceptions like these were enough of an issue
that several Oxford codebases had come up with their own half-baked
mitigation strategies, ranging from swallowing all exceptions in run()
by convention, to always broadcasting all results to uniquely named
datasets so that the partial results could be recovered and written to
HDF5 by manually run recovery experiments.
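
For illustration, the first of those conventions might look something
like the following. This is a made-up sketch (SwallowingExperiment and
acquire_data are hypothetical names), not code from any of the
codebases referred to above:

    import logging

    from artiq.experiment import EnvExperiment


    class SwallowingExperiment(EnvExperiment):
        # Hypothetical example of the "swallow all exceptions in run()"
        # convention: run() traps every error so the worker survives to the
        # stage where the HDF5 file used to be written (pre-commit behaviour).
        def build(self):
            self.setattr_device("core")

        def run(self):
            try:
                self.acquire_data()  # hypothetical long-running acquisition
            except Exception:
                # Keep going so that partial results still make it into HDF5.
                logging.getLogger(__name__).exception(
                    "run() failed; keeping partial results")

        def acquire_data(self):
            raise NotImplementedError  # stands in for the real experiment logic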

This commit addresses the problem at its source, changing the worker
behaviour such that an HDF5 file is always written once the run stage
has been reached, even if run() or analyze() fails.
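
In outline, the new behaviour has the following shape. This is a
simplified, self-contained sketch rather than the verbatim
worker_impl.py code shown in the diff below; exp_inst, write_results
and put_completed stand in for the worker's real objects:

    def handle_run_and_analyze(exp_inst, write_results, put_completed):
        # Sketch of the post-commit control flow: results are written on
        # run() failure, and always after the analyze stage, so an HDF5 file
        # exists once the run stage has been reached.
        try:
            exp_inst.run()
        except:
            # Only write results here on failure; on success, wait for the
            # end of the analyze stage so analysis products are included too.
            write_results()
            raise
        put_completed()

        try:
            exp_inst.analyze()
        finally:
            # Runs whether analyze() returns normally or raises.
            write_results()
        put_completed()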
pull/1464/head
David Nadlinger 2020-06-15 00:37:07 +01:00
parent d87042597a
commit 7955b63b00
4 changed files with 22 additions and 28 deletions

View File

@@ -16,6 +16,7 @@ Highlights:
 * `ad9910`: The maximum amplitude scale factor is now `0x3fff` (was `0x3ffe`
   before).
 * Applets now restart if they are running and a ccb call changes their spec
+* Experiment results are now always saved to HDF5, even if run() fails.

 Breaking changes:

View File

@@ -115,7 +115,6 @@ class Run:
     run = _mk_worker_method("run")
     resume = _mk_worker_method("resume")
     analyze = _mk_worker_method("analyze")
-    write_results = _mk_worker_method("write_results")


 class RunPool:
@@ -309,13 +308,8 @@ class AnalyzeStage(TaskObject):
         try:
             await run.analyze()
         except:
-            logger.error("got worker exception in analyze stage of RID %d."
-                         " Results will still be saved.", run.rid)
-            log_worker_exception()
-        try:
-            await run.write_results()
-        except:
-            logger.error("failed to write results of RID %d.", run.rid)
+            logger.error("got worker exception in analyze stage of RID %d.",
+                         run.rid)
             log_worker_exception()
         self.delete_cb(run.rid)

View File

@@ -293,10 +293,6 @@ class Worker:
     async def analyze(self):
         await self._worker_action({"action": "analyze"})

-    async def write_results(self, timeout=15.0):
-        await self._worker_action({"action": "write_results"},
-                                  timeout)
-
     async def examine(self, rid, file, timeout=20.0):
         self.rid = rid
         self.filename = os.path.basename(file)

View File

@@ -252,6 +252,16 @@ def main():
     exp_inst = None
     repository_path = None

+    def write_results():
+        filename = "{:09}-{}.h5".format(rid, exp.__name__)
+        with h5py.File(filename, "w") as f:
+            dataset_mgr.write_hdf5(f)
+            f["artiq_version"] = artiq_version
+            f["rid"] = rid
+            f["start_time"] = start_time
+            f["run_time"] = run_time
+            f["expid"] = pyon.encode(expid)
+
     device_mgr = DeviceManager(ParentDeviceDB,
                                virtual_devices={"scheduler": Scheduler(),
                                                 "ccb": CCB()})
@@ -292,27 +302,20 @@ def main():
                 put_completed()
             elif action == "run":
                 run_time = time.time()
-                exp_inst.run()
+                try:
+                    exp_inst.run()
+                except:
+                    # Only write results in run() on failure; on success wait
+                    # for end of analyze stage.
+                    write_results()
+                    raise
                 put_completed()
             elif action == "analyze":
                 try:
                     exp_inst.analyze()
-                except:
-                    # make analyze failure non-fatal, as we may still want to
-                    # write results afterwards
-                    put_exception_report()
-                else:
-                    put_completed()
-            elif action == "write_results":
-                filename = "{:09}-{}.h5".format(rid, exp.__name__)
-                with h5py.File(filename, "w") as f:
-                    dataset_mgr.write_hdf5(f)
-                    f["artiq_version"] = artiq_version
-                    f["rid"] = rid
-                    f["start_time"] = start_time
-                    f["run_time"] = run_time
-                    f["expid"] = pyon.encode(expid)
+                finally:
+                    write_results()
                 put_completed()
             elif action == "examine":
                 examine(ExamineDeviceMgr, ExamineDatasetMgr, obj["file"])
                 put_completed()
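
For reference, a results file produced by write_results() can be
inspected with plain h5py. A minimal sketch, assuming a file named
000000001-MyExperiment.h5 exists in the working directory; the filename
pattern and the top-level keys follow the write_results() code in the
diff above, while the "datasets" group name is an assumption about the
DatasetManager HDF5 layout:

    import h5py

    # Minimal sketch: read back a results file written by write_results().
    # "000000001-MyExperiment.h5" is a placeholder following the
    # "{:09}-{}.h5".format(rid, exp.__name__) pattern above.
    with h5py.File("000000001-MyExperiment.h5", "r") as f:
        print("rid:", f["rid"][()])
        print("artiq_version:", f["artiq_version"][()])
        print("expid (PYON-encoded):", f["expid"][()])
        # Datasets recorded via dataset_mgr.write_hdf5(f) are expected under
        # a "datasets" group (assumption about the DatasetManager layout).
        if "datasets" in f:
            f["datasets"].visit(print)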