forked from M-Labs/artiq
master/scheduler: Fix priority/due date precedence order when waiting to prepare
See test case – previously, the highest-priority pending run would be used to calculate the timeout, rather than the earliest one. This probably managed to go undetected for that long as any unrelated changes to the pipeline (e.g. new submissions, or experiments pausing) would also cause _get_run() to be re-evaluated.
This commit is contained in:
parent
7955b63b00
commit
966ed5d013
|
@ -87,17 +87,12 @@ class Run:
|
||||||
self._notifier[self.rid]["status"] = self._status.name
|
self._notifier[self.rid]["status"] = self._status.name
|
||||||
self._state_changed.notify()
|
self._state_changed.notify()
|
||||||
|
|
||||||
# The run with the largest priority_key is to be scheduled first
|
def priority_key(self):
|
||||||
def priority_key(self, now=None):
|
"""Return a comparable value that defines a run priority order.
|
||||||
if self.due_date is None:
|
|
||||||
due_date_k = 0
|
Applies only to runs the due date of which has already elapsed.
|
||||||
else:
|
"""
|
||||||
due_date_k = -self.due_date
|
return (self.priority, -(self.due_date or 0), -self.rid)
|
||||||
if now is not None and self.due_date is not None:
|
|
||||||
runnable = int(now > self.due_date)
|
|
||||||
else:
|
|
||||||
runnable = 1
|
|
||||||
return (runnable, self.priority, due_date_k, -self.rid)
|
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
# called through pool
|
# called through pool
|
||||||
|
@ -161,36 +156,37 @@ class PrepareStage(TaskObject):
|
||||||
self.delete_cb = delete_cb
|
self.delete_cb = delete_cb
|
||||||
|
|
||||||
def _get_run(self):
|
def _get_run(self):
|
||||||
"""If a run should get prepared now, return it.
|
"""If a run should get prepared now, return it. Otherwise, return a
|
||||||
Otherwise, return a float representing the time before the next timed
|
float giving the time until the next check, or None if no time-based
|
||||||
run becomes due, or None if there is no such run."""
|
check is required.
|
||||||
|
|
||||||
|
The latter can be the case if there are no due-date runs, or none
|
||||||
|
of them are going to become next-in-line before further pool state
|
||||||
|
changes (which will also cause a re-evaluation).
|
||||||
|
"""
|
||||||
|
pending_runs = list(
|
||||||
|
filter(lambda r: r.status == RunStatus.pending,
|
||||||
|
self.pool.runs.values()))
|
||||||
|
|
||||||
now = time()
|
now = time()
|
||||||
pending_runs = filter(lambda r: r.status == RunStatus.pending,
|
def is_runnable(r):
|
||||||
self.pool.runs.values())
|
return (r.due_date or 0) < now
|
||||||
try:
|
|
||||||
candidate = max(pending_runs, key=lambda r: r.priority_key(now))
|
|
||||||
except ValueError:
|
|
||||||
# pending_runs is an empty sequence
|
|
||||||
return None
|
|
||||||
|
|
||||||
prepared_runs = filter(lambda r: r.status == RunStatus.prepare_done,
|
prepared_max = max((r.priority_key() for r in self.pool.runs.values()
|
||||||
self.pool.runs.values())
|
if r.status == RunStatus.prepare_done),
|
||||||
try:
|
default=None)
|
||||||
top_prepared_run = max(prepared_runs,
|
def takes_precedence(r):
|
||||||
key=lambda r: r.priority_key())
|
return prepared_max is None or r.priority_key() > prepared_max
|
||||||
except ValueError:
|
|
||||||
# there are no existing prepared runs - go ahead with <candidate>
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# prepare <candidate> (as well) only if it has higher priority than
|
|
||||||
# the highest priority prepared run
|
|
||||||
if top_prepared_run.priority_key() >= candidate.priority_key():
|
|
||||||
return None
|
|
||||||
|
|
||||||
if candidate.due_date is None or candidate.due_date < now:
|
candidate = max(filter(is_runnable, pending_runs),
|
||||||
|
key=lambda r: r.priority_key(),
|
||||||
|
default=None)
|
||||||
|
if candidate is not None and takes_precedence(candidate):
|
||||||
return candidate
|
return candidate
|
||||||
else:
|
|
||||||
return candidate.due_date - now
|
return min((r.due_date - now for r in pending_runs
|
||||||
|
if (not is_runnable(r) and takes_precedence(r))),
|
||||||
|
default=None)
|
||||||
|
|
||||||
async def _do(self):
|
async def _do(self):
|
||||||
while True:
|
while True:
|
||||||
|
|
|
@ -28,7 +28,18 @@ class BackgroundExperiment(EnvExperiment):
|
||||||
sleep(0.2)
|
sleep(0.2)
|
||||||
except TerminationRequested:
|
except TerminationRequested:
|
||||||
self.set_dataset("termination_ok", True,
|
self.set_dataset("termination_ok", True,
|
||||||
broadcast=True, save=False)
|
broadcast=True, archive=False)
|
||||||
|
|
||||||
|
|
||||||
|
class CheckPauseBackgroundExperiment(EnvExperiment):
|
||||||
|
def build(self):
|
||||||
|
self.setattr_device("scheduler")
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
while True:
|
||||||
|
while not self.scheduler.check_pause():
|
||||||
|
sleep(0.2)
|
||||||
|
self.scheduler.pause()
|
||||||
|
|
||||||
|
|
||||||
def _get_expid(name):
|
def _get_expid(name):
|
||||||
|
@ -117,6 +128,161 @@ class SchedulerCase(unittest.TestCase):
|
||||||
scheduler.notifier.publish = None
|
scheduler.notifier.publish = None
|
||||||
loop.run_until_complete(scheduler.stop())
|
loop.run_until_complete(scheduler.stop())
|
||||||
|
|
||||||
|
def test_pending_priority(self):
|
||||||
|
"""Check due dates take precedence over priorities when waiting to
|
||||||
|
prepare."""
|
||||||
|
loop = self.loop
|
||||||
|
handlers = {}
|
||||||
|
scheduler = Scheduler(_RIDCounter(0), handlers, None)
|
||||||
|
handlers["scheduler_check_pause"] = scheduler.check_pause
|
||||||
|
|
||||||
|
expid_empty = _get_expid("EmptyExperiment")
|
||||||
|
|
||||||
|
expid_bg = _get_expid("CheckPauseBackgroundExperiment")
|
||||||
|
# Suppress the SystemExit backtrace when worker process is killed.
|
||||||
|
expid_bg["log_level"] = logging.CRITICAL
|
||||||
|
|
||||||
|
high_priority = 3
|
||||||
|
middle_priority = 2
|
||||||
|
low_priority = 1
|
||||||
|
late = time() + 100000
|
||||||
|
early = time() + 1
|
||||||
|
|
||||||
|
expect = [
|
||||||
|
{
|
||||||
|
"path": [],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": {
|
||||||
|
"repo_msg": None,
|
||||||
|
"priority": low_priority,
|
||||||
|
"pipeline": "main",
|
||||||
|
"due_date": None,
|
||||||
|
"status": "pending",
|
||||||
|
"expid": expid_bg,
|
||||||
|
"flush": False
|
||||||
|
},
|
||||||
|
"key": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": {
|
||||||
|
"repo_msg": None,
|
||||||
|
"priority": high_priority,
|
||||||
|
"pipeline": "main",
|
||||||
|
"due_date": late,
|
||||||
|
"status": "pending",
|
||||||
|
"expid": expid_empty,
|
||||||
|
"flush": False
|
||||||
|
},
|
||||||
|
"key": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": {
|
||||||
|
"repo_msg": None,
|
||||||
|
"priority": middle_priority,
|
||||||
|
"pipeline": "main",
|
||||||
|
"due_date": early,
|
||||||
|
"status": "pending",
|
||||||
|
"expid": expid_empty,
|
||||||
|
"flush": False
|
||||||
|
},
|
||||||
|
"key": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [0],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "preparing",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [0],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "prepare_done",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [0],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "running",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [2],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "preparing",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [2],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "prepare_done",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [0],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "paused",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [2],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "running",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [2],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "run_done",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [0],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "running",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [2],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "analyzing",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [2],
|
||||||
|
"action": "setitem",
|
||||||
|
"value": "deleting",
|
||||||
|
"key": "status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": [],
|
||||||
|
"action": "delitem",
|
||||||
|
"key": 2
|
||||||
|
},
|
||||||
|
]
|
||||||
|
done = asyncio.Event()
|
||||||
|
expect_idx = 0
|
||||||
|
def notify(mod):
|
||||||
|
nonlocal expect_idx
|
||||||
|
self.assertEqual(mod, expect[expect_idx])
|
||||||
|
expect_idx += 1
|
||||||
|
if expect_idx >= len(expect):
|
||||||
|
done.set()
|
||||||
|
scheduler.notifier.publish = notify
|
||||||
|
|
||||||
|
scheduler.start()
|
||||||
|
|
||||||
|
scheduler.submit("main", expid_bg, low_priority)
|
||||||
|
scheduler.submit("main", expid_empty, high_priority, late)
|
||||||
|
scheduler.submit("main", expid_empty, middle_priority, early)
|
||||||
|
|
||||||
|
loop.run_until_complete(done.wait())
|
||||||
|
scheduler.notifier.publish = None
|
||||||
|
loop.run_until_complete(scheduler.stop())
|
||||||
|
|
||||||
def test_pause(self):
|
def test_pause(self):
|
||||||
loop = self.loop
|
loop = self.loop
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue