compiler: actually implement interleaving correctly (calls are still broken).

The previous implementation was completely wrong: it always advanced
the global timeline by the same amount as the non-interleaved basic
block did.

The new implementation only advances the global timeline by
the difference between its current time and the virtual time of
the branch, which requires it to adjust the delay instructions.

Previously, the delay expression was present in the IR twice: once
as the iodelay.Expr transformation-visible form, and once as regular
IR instructions, with the latter form being passed to the delay_mu
builtin and advancing the runtime timeline.

As a result of this change, this strategy is no longer valid:
we can meaningfully mutate the iodelay.Expr form but not the IR
instruction form. Thus, IR instructions are no longer generated for
delay expressions, and the LLVM lowering pass now has to lower
the iodelay.Expr objects as well.

This works OK for flat `with parallel:` expressions, but breaks down
outside of `with parallel:` or when calls are present. The reasons
it breaks down are as follows:

  * Outside of `with parallel:`, delay() and delay_mu() must accept
    any expression, but iodelay.Expr's are not nearly expressive
    enough. So, the IR instruction form must actually be kept as well.

  * A delay instruction is currently inserted after a call to
    a user-defined function; this delay instruction introduces
    a point where basic block reordering is possible as well as
    provides delay information. However, the callee knows nothing
    about the context in which it is called, which means that
    the runtime timeline is advanced twice. So, a new terminator
    instruction must be added that combines the properties of delay
    and call instructions (and another for delay and invoke as well).
This commit is contained in:
whitequark 2015-11-21 00:02:47 +08:00
parent 73c358a59a
commit 50e7b44d04
5 changed files with 73 additions and 30 deletions

View File

@ -1418,6 +1418,15 @@ class ARTIQIRGenerator(algorithm.Visitor):
return self.append(ir.Alloc(attributes, typ))
def _make_delay(self, delay):
if not iodelay.is_const(delay, 0):
after_delay = self.add_block()
self.append(ir.Delay(delay,
{var_name: self.current_args[var_name]
for var_name in delay.free_vars()},
after_delay))
self.current_block = after_delay
def visit_builtin_call(self, node):
# A builtin by any other name... Ignore node.func, just use the type.
typ = node.func.type
@ -1520,7 +1529,7 @@ class ARTIQIRGenerator(algorithm.Visitor):
return self.append(ir.Arith(ast.Mult(loc=None), now_mu_float, self.ref_period))
else:
assert False
elif types.is_builtin(typ, "delay") or types.is_builtin(typ, "at"):
elif types.is_builtin(typ, "at"):
if len(node.args) == 1 and len(node.keywords) == 0:
arg = self.visit(node.args[0])
arg_mu_float = self.append(ir.Arith(ast.Div(loc=None), arg, self.ref_period))
@ -1528,8 +1537,7 @@ class ARTIQIRGenerator(algorithm.Visitor):
self.append(ir.Builtin(typ.name + "_mu", [arg_mu], builtins.TNone()))
else:
assert False
elif types.is_builtin(typ, "now_mu") or types.is_builtin(typ, "delay_mu") \
or types.is_builtin(typ, "at_mu"):
elif types.is_builtin(typ, "now_mu") or types.is_builtin(typ, "at_mu"):
return self.append(ir.Builtin(typ.name,
[self.visit(arg) for arg in node.args], node.type))
elif types.is_builtin(typ, "mu_to_seconds"):
@ -1546,6 +1554,9 @@ class ARTIQIRGenerator(algorithm.Visitor):
return self.append(ir.Coerce(arg_mu, builtins.TInt(types.TValue(64))))
else:
assert False
elif types.is_builtin(typ, "delay") or types.is_builtin(typ, "delay_mu"):
assert node.iodelay is not None
self._make_delay(node.iodelay)
elif types.is_exn_constructor(typ):
return self.alloc_exn(node.type, *[self.visit(arg_node) for arg_node in node.args])
elif types.is_constructor(typ):
@ -1557,18 +1568,7 @@ class ARTIQIRGenerator(algorithm.Visitor):
typ = node.func.type.find()
if types.is_builtin(typ):
insn = self.visit_builtin_call(node)
# Temporary.
if node.iodelay is not None and not iodelay.is_const(node.iodelay, 0):
after_delay = self.add_block()
self.append(ir.Delay(node.iodelay,
{var_name: self.current_args[var_name]
for var_name in node.iodelay.free_vars()},
after_delay))
self.current_block = after_delay
return insn
return self.visit_builtin_call(node)
if types.is_function(typ):
func = self.visit(node.func)

View File

@ -77,6 +77,9 @@ class Interleaver:
index, source_block = min(enumerate(source_blocks), key=time_after_block)
source_block_delay = iodelay_of_block(source_block)
new_target_time = source_times[index] + source_block_delay
target_time_delta = new_target_time - target_time
target_terminator = target_block.terminator()
if isinstance(target_terminator, (ir.Delay, ir.Branch)):
target_terminator.set_target(source_block)
@ -85,8 +88,15 @@ class Interleaver:
else:
assert False
target_block = source_block
target_time += source_block_delay
source_terminator = source_block.terminator()
if target_time_delta > 0:
assert isinstance(source_terminator, ir.Delay)
source_terminator.expr = iodelay.Const(target_time_delta)
else:
source_terminator.replace_with(ir.Branch(source_terminator.target()))
target_block = source_block
target_time = new_target_time
new_source_block = postdom_tree.immediate_dominator(source_block)
assert (new_source_block is not None)
@ -98,4 +108,4 @@ class Interleaver:
del source_times[index]
else:
source_blocks[index] = new_source_block
source_times[index] = target_time
source_times[index] = new_target_time

View File

@ -7,7 +7,7 @@ import os
from pythonparser import ast, diagnostic
from llvmlite_artiq import ir as ll
from ...language import core as language_core
from .. import types, builtins, ir
from .. import types, builtins, ir, iodelay
llvoid = ll.VoidType()
@ -784,12 +784,6 @@ class LLVMIRGenerator:
return self.map(insn.operands[0])
elif insn.op == "now_mu":
return self.llbuilder.load(self.llbuiltin("now"), name=insn.name)
elif insn.op == "delay_mu":
interval, = insn.operands
llnowptr = self.llbuiltin("now")
llnow = self.llbuilder.load(llnowptr)
lladjusted = self.llbuilder.add(llnow, self.map(interval))
return self.llbuilder.store(lladjusted, llnowptr)
elif insn.op == "at_mu":
time, = insn.operands
return self.llbuilder.store(self.map(time), self.llbuiltin("now"))
@ -1068,8 +1062,6 @@ class LLVMIRGenerator:
def process_Branch(self, insn):
return self.llbuilder.branch(self.map(insn.target()))
process_Delay = process_Branch
def process_BranchIf(self, insn):
return self.llbuilder.cbranch(self.map(insn.condition()),
self.map(insn.if_true()), self.map(insn.if_false()))
@ -1150,3 +1142,16 @@ class LLVMIRGenerator:
return llexn
def process_Delay(self, insn):
def map_delay(expr):
if isinstance(expr, iodelay.Const):
return ll.Constant(lli64, int(expr.value))
else:
assert False
llnowptr = self.llbuiltin("now")
llnow = self.llbuilder.load(llnowptr)
lladjusted = self.llbuilder.add(llnow, map_delay(insn.expr))
self.llbuilder.store(lladjusted, llnowptr)
return self.llbuilder.branch(self.map(insn.target()))

View File

@ -0,0 +1,25 @@
# RUN: %python -m artiq.compiler.testbench.jit %s >%t
# RUN: OutputCheck %s --file-to-check=%t
def g():
with parallel:
with sequential:
print("A", now_mu())
delay_mu(2)
#
print("B", now_mu())
with sequential:
print("C", now_mu())
delay_mu(2)
#
print("D", now_mu())
delay_mu(2)
#
print("E", now_mu())
# CHECK-L: A 0
# CHECK-L: B 2
# CHECK-L: C 2
# CHECK-L: D 2
# CHECK-L: E 4
g()

View File

@ -6,17 +6,20 @@ def g():
with sequential:
print("A", now_mu())
delay_mu(3)
#
print("B", now_mu())
with sequential:
print("C", now_mu())
delay_mu(2)
#
print("D", now_mu())
delay_mu(2)
#
print("E", now_mu())
# CHECK-L: C 0
# CHECK-L: A 2
# CHECK-L: D 5
# CHECK-L: B 7
# CHECK-L: E 7
# CHECK-L: B 3
# CHECK-L: D 3
# CHECK-L: E 4
g()