Host object invariant constant folding #54

Closed
opened 2021-10-07 16:00:18 +08:00 by pca006132 · 5 comments

#33 is done, but we have not yet implemented constant folding for host object kernel invariants.

#33 is done, but we have not yet implemented constant folding for host object kernel invariants.
sb10q added the
high-priority
label 2021-11-05 18:19:09 +08:00
sb10q added this to the Prealpha milestone 2021-11-19 16:50:13 +08:00

We probably want some inlining passes.

from min_artiq import *

@nac3
class Demo:
    core: KernelInvariant[Core]
    led0: KernelInvariant[TTLOut]

    def __init__(self):
        self.core = Core()
        self.led0 = TTLOut(self.core, 18)

    @kernel
    def run(self):
        self.led0.pulse(100.*ms)

if __name__ == "__main__":
    Demo().run()

Before:

000000d4 <__modinit__>:
      d4: 17 05 00 00  	auipc	a0, 0
      d8: 03 25 05 77  	lw	a0, 1904(a0)
      dc: 17 03 00 00  	auipc	t1, 0
      e0: 67 00 43 13  	jr	308(t1)

000000e4 <min_artiqTTLOut.off.0>:
      e4: 93 05 00 00  	mv	a1, zero
      e8: 17 03 00 00  	auipc	t1, 0
      ec: 67 00 83 13  	jr	312(t1)

000000f0 <__main__Demo.run.0>:
      f0: 03 25 45 00  	lw	a0, 4(a0)
      f4: 97 05 00 00  	auipc	a1, 0
      f8: 93 85 c5 6a  	addi	a1, a1, 1708
      fc: 07 b5 05 00  	fld	fa0, 0(a1)
     100: 17 03 00 00  	auipc	t1, 0
     104: 67 00 03 13  	jr	304(t1)

00000108 <min_artiqCore.delay.0>:
     108: 13 01 01 ff  	addi	sp, sp, -16
     10c: 23 26 11 00  	sw	ra, 12(sp)
     110: 97 00 00 00  	auipc	ra, 0
     114: e7 80 00 13  	jalr	304(ra)
     118: 17 06 00 00  	auipc	a2, 0
     11c: 03 26 06 73  	lw	a2, 1840(a2)
     120: 83 26 06 00  	lw	a3, 0(a2)
     124: 03 27 86 00  	lw	a4, 8(a2)
     128: b3 85 b6 00  	add	a1, a3, a1
     12c: 33 05 a7 00  	add	a0, a4, a0
     130: b3 36 e5 00  	sltu	a3, a0, a4
     134: b3 85 d5 00  	add	a1, a1, a3
     138: 0f 00 10 03  	fence	rw, w
     13c: 23 20 b6 00  	sw	a1, 0(a2)
     140: 0f 00 10 03  	fence	rw, w
     144: 23 24 a6 00  	sw	a0, 8(a2)
     148: 83 20 c1 00  	lw	ra, 12(sp)
     14c: 13 01 01 01  	addi	sp, sp, 16
     150: 67 80 00 00  	ret

00000154 <min_artiqTTLOut.pulse.0>:
     154: 13 01 01 ff  	addi	sp, sp, -16
     158: 23 26 11 00  	sw	ra, 12(sp)
     15c: 23 24 81 00  	sw	s0, 8(sp)
     160: 27 30 81 00  	fsd	fs0, 0(sp)
     164: 53 04 a5 22  	fmv.d	fs0, fa0
     168: 13 04 05 00  	mv	s0, a0
     16c: 97 00 00 00  	auipc	ra, 0
     170: e7 80 40 0e  	jalr	228(ra)
     174: 03 25 04 00  	lw	a0, 0(s0)
     178: 53 05 84 22  	fmv.d	fa0, fs0
     17c: 97 00 00 00  	auipc	ra, 0
     180: e7 80 40 0e  	jalr	228(ra)
     184: 13 05 04 00  	mv	a0, s0
     188: 07 34 01 00  	fld	fs0, 0(sp)
     18c: 03 24 81 00  	lw	s0, 8(sp)
     190: 83 20 c1 00  	lw	ra, 12(sp)
     194: 13 01 01 01  	addi	sp, sp, 16
     198: 17 03 00 00  	auipc	t1, 0
     19c: 67 00 83 0d  	jr	216(t1)

000001a0 <min_artiqTTLOut.set_o.0>:
     1a0: 03 25 85 00  	lw	a0, 8(a0)
     1a4: 93 f5 15 00  	andi	a1, a1, 1
     1a8: 17 03 00 00  	auipc	t1, 0
     1ac: 67 00 83 0d  	jr	216(t1)

000001b0 <min_artiqTTLOut.on.0>:
     1b0: 93 05 10 00  	addi	a1, zero, 1
     1b4: 17 03 00 00  	auipc	t1, 0
     1b8: 67 00 c3 06  	jr	108(t1)

000001bc <min_artiqCore.seconds_to_mu.0>:
     1bc: 13 01 01 ff  	addi	sp, sp, -16
     1c0: 23 26 11 00  	sw	ra, 12(sp)
     1c4: 07 30 05 00  	fld	ft0, 0(a0)
     1c8: 53 75 05 1a  	fdiv.d	fa0, fa0, ft0
     1cc: 97 00 00 00  	auipc	ra, 0
     1d0: e7 80 40 0c  	jalr	196(ra)
     1d4: 53 15 05 c2  	fcvt.w.d	a0, fa0, rtz
     1d8: 93 55 f5 41  	srai	a1, a0, 31
     1dc: 83 20 c1 00  	lw	ra, 12(sp)
     1e0: 13 01 01 01  	addi	sp, sp, 16
     1e4: 67 80 00 00  	ret

After #114:

000000d4 <min_artiqTTLOut.pulse.0>:
      d4: 13 01 01 ff  	addi	sp, sp, -16
      d8: 23 26 11 00  	sw	ra, 12(sp)
      dc: 23 24 81 00  	sw	s0, 8(sp)
      e0: 27 30 81 00  	fsd	fs0, 0(sp)
      e4: 53 04 a5 22  	fmv.d	fs0, fa0
      e8: 17 04 00 00  	auipc	s0, 0
      ec: 03 24 c4 7a  	lw	s0, 1964(s0)
      f0: 13 05 04 00  	mv	a0, s0
      f4: 97 00 00 00  	auipc	ra, 0
      f8: e7 80 c0 14  	jalr	332(ra)
      fc: 17 05 00 00  	auipc	a0, 0
     100: 03 25 c5 79  	lw	a0, 1948(a0)
     104: 53 05 84 22  	fmv.d	fa0, fs0
     108: 97 00 00 00  	auipc	ra, 0
     10c: e7 80 80 14  	jalr	328(ra)
     110: 13 05 04 00  	mv	a0, s0
     114: 07 34 01 00  	fld	fs0, 0(sp)
     118: 03 24 81 00  	lw	s0, 8(sp)
     11c: 83 20 c1 00  	lw	ra, 12(sp)
     120: 13 01 01 01  	addi	sp, sp, 16
     124: 17 03 00 00  	auipc	t1, 0
     128: 67 00 c3 13  	jr	316(t1)

0000012c <min_artiqTTLOut.set_o.0>:
     12c: 93 f5 15 00  	andi	a1, a1, 1
     130: 37 15 00 00  	lui	a0, 1
     134: 13 05 05 20  	addi	a0, a0, 512
     138: 17 03 00 00  	auipc	t1, 0
     13c: 67 00 83 13  	jr	312(t1)

00000140 <__modinit__>:
     140: 17 05 00 00  	auipc	a0, 0
     144: 03 25 c5 75  	lw	a0, 1884(a0)
     148: 17 03 00 00  	auipc	t1, 0
     14c: 67 00 83 13  	jr	312(t1)

00000150 <min_artiqCore.delay.0>:
     150: 13 01 01 ff  	addi	sp, sp, -16
     154: 23 26 11 00  	sw	ra, 12(sp)
     158: 17 05 00 00  	auipc	a0, 0
     15c: 03 25 05 74  	lw	a0, 1856(a0)
     160: 97 00 00 00  	auipc	ra, 0
     164: e7 80 00 13  	jalr	304(ra)
     168: 17 06 00 00  	auipc	a2, 0
     16c: 03 26 86 73  	lw	a2, 1848(a2)
     170: 83 26 06 00  	lw	a3, 0(a2)
     174: 03 27 86 00  	lw	a4, 8(a2)
     178: b3 85 b6 00  	add	a1, a3, a1
     17c: 33 05 a7 00  	add	a0, a4, a0
     180: b3 36 e5 00  	sltu	a3, a0, a4
     184: b3 85 d5 00  	add	a1, a1, a3
     188: 0f 00 10 03  	fence	rw, w
     18c: 23 20 b6 00  	sw	a1, 0(a2)
     190: 0f 00 10 03  	fence	rw, w
     194: 23 24 a6 00  	sw	a0, 8(a2)
     198: 83 20 c1 00  	lw	ra, 12(sp)
     19c: 13 01 01 01  	addi	sp, sp, 16
     1a0: 67 80 00 00  	ret

000001a4 <__main__Demo.run.0>:
     1a4: 17 05 00 00  	auipc	a0, 0
     1a8: 03 25 05 6f  	lw	a0, 1776(a0)
     1ac: 97 05 00 00  	auipc	a1, 0
     1b0: 93 85 c5 63  	addi	a1, a1, 1596
     1b4: 07 b5 05 00  	fld	fa0, 0(a1)
     1b8: 17 03 00 00  	auipc	t1, 0
     1bc: 67 00 83 0e  	jr	232(t1)

000001c0 <min_artiqTTLOut.off.0>:
     1c0: 17 05 00 00  	auipc	a0, 0
     1c4: 03 25 45 6d  	lw	a0, 1748(a0)
     1c8: 93 05 00 00  	mv	a1, zero
     1cc: 17 03 00 00  	auipc	t1, 0
     1d0: 67 00 43 0e  	jr	228(t1)

000001d4 <min_artiqTTLOut.on.0>:
     1d4: 17 05 00 00  	auipc	a0, 0
     1d8: 03 25 05 6c  	lw	a0, 1728(a0)
     1dc: 93 05 10 00  	addi	a1, zero, 1
     1e0: 17 03 00 00  	auipc	t1, 0
     1e4: 67 00 03 0d  	jr	208(t1)

000001e8 <min_artiqCore.seconds_to_mu.0>:
     1e8: 13 01 01 ff  	addi	sp, sp, -16
     1ec: 23 26 11 00  	sw	ra, 12(sp)
     1f0: 17 05 00 00  	auipc	a0, 0
     1f4: 13 05 05 60  	addi	a0, a0, 1536
     1f8: 07 30 05 00  	fld	ft0, 0(a0)
     1fc: 53 75 05 1a  	fdiv.d	fa0, fa0, ft0
     200: 97 00 00 00  	auipc	ra, 0
     204: e7 80 00 0c  	jalr	192(ra)
     208: 53 15 05 c2  	fcvt.w.d	a0, fa0, rtz
     20c: 93 55 f5 41  	srai	a1, a0, 31
     210: 83 20 c1 00  	lw	ra, 12(sp)
     214: 13 01 01 01  	addi	sp, sp, 16
     218: 67 80 00 00  	ret

The improvement from this optimization does not seem very large.

We probably want some inlining passes. ```python from min_artiq import * @nac3 class Demo: core: KernelInvariant[Core] led0: KernelInvariant[TTLOut] def __init__(self): self.core = Core() self.led0 = TTLOut(self.core, 18) @kernel def run(self): self.led0.pulse(100.*ms) if __name__ == "__main__": Demo().run() ``` Before: ``` 000000d4 <__modinit__>: d4: 17 05 00 00 auipc a0, 0 d8: 03 25 05 77 lw a0, 1904(a0) dc: 17 03 00 00 auipc t1, 0 e0: 67 00 43 13 jr 308(t1) 000000e4 <min_artiqTTLOut.off.0>: e4: 93 05 00 00 mv a1, zero e8: 17 03 00 00 auipc t1, 0 ec: 67 00 83 13 jr 312(t1) 000000f0 <__main__Demo.run.0>: f0: 03 25 45 00 lw a0, 4(a0) f4: 97 05 00 00 auipc a1, 0 f8: 93 85 c5 6a addi a1, a1, 1708 fc: 07 b5 05 00 fld fa0, 0(a1) 100: 17 03 00 00 auipc t1, 0 104: 67 00 03 13 jr 304(t1) 00000108 <min_artiqCore.delay.0>: 108: 13 01 01 ff addi sp, sp, -16 10c: 23 26 11 00 sw ra, 12(sp) 110: 97 00 00 00 auipc ra, 0 114: e7 80 00 13 jalr 304(ra) 118: 17 06 00 00 auipc a2, 0 11c: 03 26 06 73 lw a2, 1840(a2) 120: 83 26 06 00 lw a3, 0(a2) 124: 03 27 86 00 lw a4, 8(a2) 128: b3 85 b6 00 add a1, a3, a1 12c: 33 05 a7 00 add a0, a4, a0 130: b3 36 e5 00 sltu a3, a0, a4 134: b3 85 d5 00 add a1, a1, a3 138: 0f 00 10 03 fence rw, w 13c: 23 20 b6 00 sw a1, 0(a2) 140: 0f 00 10 03 fence rw, w 144: 23 24 a6 00 sw a0, 8(a2) 148: 83 20 c1 00 lw ra, 12(sp) 14c: 13 01 01 01 addi sp, sp, 16 150: 67 80 00 00 ret 00000154 <min_artiqTTLOut.pulse.0>: 154: 13 01 01 ff addi sp, sp, -16 158: 23 26 11 00 sw ra, 12(sp) 15c: 23 24 81 00 sw s0, 8(sp) 160: 27 30 81 00 fsd fs0, 0(sp) 164: 53 04 a5 22 fmv.d fs0, fa0 168: 13 04 05 00 mv s0, a0 16c: 97 00 00 00 auipc ra, 0 170: e7 80 40 0e jalr 228(ra) 174: 03 25 04 00 lw a0, 0(s0) 178: 53 05 84 22 fmv.d fa0, fs0 17c: 97 00 00 00 auipc ra, 0 180: e7 80 40 0e jalr 228(ra) 184: 13 05 04 00 mv a0, s0 188: 07 34 01 00 fld fs0, 0(sp) 18c: 03 24 81 00 lw s0, 8(sp) 190: 83 20 c1 00 lw ra, 12(sp) 194: 13 01 01 01 addi sp, sp, 16 198: 17 03 00 00 auipc t1, 0 19c: 67 00 
83 0d jr 216(t1) 000001a0 <min_artiqTTLOut.set_o.0>: 1a0: 03 25 85 00 lw a0, 8(a0) 1a4: 93 f5 15 00 andi a1, a1, 1 1a8: 17 03 00 00 auipc t1, 0 1ac: 67 00 83 0d jr 216(t1) 000001b0 <min_artiqTTLOut.on.0>: 1b0: 93 05 10 00 addi a1, zero, 1 1b4: 17 03 00 00 auipc t1, 0 1b8: 67 00 c3 06 jr 108(t1) 000001bc <min_artiqCore.seconds_to_mu.0>: 1bc: 13 01 01 ff addi sp, sp, -16 1c0: 23 26 11 00 sw ra, 12(sp) 1c4: 07 30 05 00 fld ft0, 0(a0) 1c8: 53 75 05 1a fdiv.d fa0, fa0, ft0 1cc: 97 00 00 00 auipc ra, 0 1d0: e7 80 40 0c jalr 196(ra) 1d4: 53 15 05 c2 fcvt.w.d a0, fa0, rtz 1d8: 93 55 f5 41 srai a1, a0, 31 1dc: 83 20 c1 00 lw ra, 12(sp) 1e0: 13 01 01 01 addi sp, sp, 16 1e4: 67 80 00 00 ret ``` After https://git.m-labs.hk/M-Labs/nac3/pulls/114: ``` 000000d4 <min_artiqTTLOut.pulse.0>: d4: 13 01 01 ff addi sp, sp, -16 d8: 23 26 11 00 sw ra, 12(sp) dc: 23 24 81 00 sw s0, 8(sp) e0: 27 30 81 00 fsd fs0, 0(sp) e4: 53 04 a5 22 fmv.d fs0, fa0 e8: 17 04 00 00 auipc s0, 0 ec: 03 24 c4 7a lw s0, 1964(s0) f0: 13 05 04 00 mv a0, s0 f4: 97 00 00 00 auipc ra, 0 f8: e7 80 c0 14 jalr 332(ra) fc: 17 05 00 00 auipc a0, 0 100: 03 25 c5 79 lw a0, 1948(a0) 104: 53 05 84 22 fmv.d fa0, fs0 108: 97 00 00 00 auipc ra, 0 10c: e7 80 80 14 jalr 328(ra) 110: 13 05 04 00 mv a0, s0 114: 07 34 01 00 fld fs0, 0(sp) 118: 03 24 81 00 lw s0, 8(sp) 11c: 83 20 c1 00 lw ra, 12(sp) 120: 13 01 01 01 addi sp, sp, 16 124: 17 03 00 00 auipc t1, 0 128: 67 00 c3 13 jr 316(t1) 0000012c <min_artiqTTLOut.set_o.0>: 12c: 93 f5 15 00 andi a1, a1, 1 130: 37 15 00 00 lui a0, 1 134: 13 05 05 20 addi a0, a0, 512 138: 17 03 00 00 auipc t1, 0 13c: 67 00 83 13 jr 312(t1) 00000140 <__modinit__>: 140: 17 05 00 00 auipc a0, 0 144: 03 25 c5 75 lw a0, 1884(a0) 148: 17 03 00 00 auipc t1, 0 14c: 67 00 83 13 jr 312(t1) 00000150 <min_artiqCore.delay.0>: 150: 13 01 01 ff addi sp, sp, -16 154: 23 26 11 00 sw ra, 12(sp) 158: 17 05 00 00 auipc a0, 0 15c: 03 25 05 74 lw a0, 1856(a0) 160: 97 00 00 00 auipc ra, 0 164: e7 80 00 13 jalr 304(ra) 168: 17 
06 00 00 auipc a2, 0 16c: 03 26 86 73 lw a2, 1848(a2) 170: 83 26 06 00 lw a3, 0(a2) 174: 03 27 86 00 lw a4, 8(a2) 178: b3 85 b6 00 add a1, a3, a1 17c: 33 05 a7 00 add a0, a4, a0 180: b3 36 e5 00 sltu a3, a0, a4 184: b3 85 d5 00 add a1, a1, a3 188: 0f 00 10 03 fence rw, w 18c: 23 20 b6 00 sw a1, 0(a2) 190: 0f 00 10 03 fence rw, w 194: 23 24 a6 00 sw a0, 8(a2) 198: 83 20 c1 00 lw ra, 12(sp) 19c: 13 01 01 01 addi sp, sp, 16 1a0: 67 80 00 00 ret 000001a4 <__main__Demo.run.0>: 1a4: 17 05 00 00 auipc a0, 0 1a8: 03 25 05 6f lw a0, 1776(a0) 1ac: 97 05 00 00 auipc a1, 0 1b0: 93 85 c5 63 addi a1, a1, 1596 1b4: 07 b5 05 00 fld fa0, 0(a1) 1b8: 17 03 00 00 auipc t1, 0 1bc: 67 00 83 0e jr 232(t1) 000001c0 <min_artiqTTLOut.off.0>: 1c0: 17 05 00 00 auipc a0, 0 1c4: 03 25 45 6d lw a0, 1748(a0) 1c8: 93 05 00 00 mv a1, zero 1cc: 17 03 00 00 auipc t1, 0 1d0: 67 00 43 0e jr 228(t1) 000001d4 <min_artiqTTLOut.on.0>: 1d4: 17 05 00 00 auipc a0, 0 1d8: 03 25 05 6c lw a0, 1728(a0) 1dc: 93 05 10 00 addi a1, zero, 1 1e0: 17 03 00 00 auipc t1, 0 1e4: 67 00 03 0d jr 208(t1) 000001e8 <min_artiqCore.seconds_to_mu.0>: 1e8: 13 01 01 ff addi sp, sp, -16 1ec: 23 26 11 00 sw ra, 12(sp) 1f0: 17 05 00 00 auipc a0, 0 1f4: 13 05 05 60 addi a0, a0, 1536 1f8: 07 30 05 00 fld ft0, 0(a0) 1fc: 53 75 05 1a fdiv.d fa0, fa0, ft0 200: 97 00 00 00 auipc ra, 0 204: e7 80 00 0c jalr 192(ra) 208: 53 15 05 c2 fcvt.w.d a0, fa0, rtz 20c: 93 55 f5 41 srai a1, a0, 31 210: 83 20 c1 00 lw ra, 12(sp) 214: 13 01 01 01 addi sp, sp, 16 218: 67 80 00 00 ret ``` The optimization does not seem that high.

We can do this if you want:

diff --git a/nac3artiq/src/lib.rs b/nac3artiq/src/lib.rs
index 61e16b9..cdbd69f 100644
--- a/nac3artiq/src/lib.rs
+++ b/nac3artiq/src/lib.rs
@@ -478,8 +478,9 @@ impl Nac3 {
         let working_directory = self.working_directory.path().to_owned();
         let f = Arc::new(WithCall::new(Box::new(move |module| {
             let builder = PassManagerBuilder::create();
-            builder.set_optimization_level(OptimizationLevel::Default);
+            builder.set_optimization_level(OptimizationLevel::Aggressive);
             let passes = PassManager::create(());
+            builder.set_inliner_with_threshold(255);
             builder.populate_module_pass_manager(&passes);
             passes.run_on(module);
 
@@ -508,7 +509,7 @@ impl Nac3 {
                     &triple,
                     "",
                     &features,
-                    OptimizationLevel::Default,
+                    OptimizationLevel::Aggressive,
                     RelocMode::PIC,
                     CodeModel::Default,
                 )
@@ -520,8 +521,9 @@ impl Nac3 {
                     &working_directory.join(&format!("{}.o", module.get_name().to_str().unwrap())),
                 )
                 .expect("couldn't write module to file");
+            println!("{}", module.print_to_string().to_str().unwrap());
         })));
-        let thread_names: Vec<String> = (0..4).map(|i| format!("module{}", i)).collect();
+        let thread_names: Vec<String> = (0..1).map(|i| format!("module{}", i)).collect();
         let threads: Vec<_> = thread_names
             .iter()
             .map(|s| Box::new(ArtiqCodeGenerator::new(s.to_string(), self.time_fns)))

We would also set the linkage of the functions to private, to avoid duplicating the function bodies. However, this hurts LLVM codegen performance. We can also try LTO if you want.

We can do this if you want: ```rust diff --git a/nac3artiq/src/lib.rs b/nac3artiq/src/lib.rs index 61e16b9..cdbd69f 100644 --- a/nac3artiq/src/lib.rs +++ b/nac3artiq/src/lib.rs @@ -478,8 +478,9 @@ impl Nac3 { let working_directory = self.working_directory.path().to_owned(); let f = Arc::new(WithCall::new(Box::new(move |module| { let builder = PassManagerBuilder::create(); - builder.set_optimization_level(OptimizationLevel::Default); + builder.set_optimization_level(OptimizationLevel::Aggressive); let passes = PassManager::create(()); + builder.set_inliner_with_threshold(255); builder.populate_module_pass_manager(&passes); passes.run_on(module); @@ -508,7 +509,7 @@ impl Nac3 { &triple, "", &features, - OptimizationLevel::Default, + OptimizationLevel::Aggressive, RelocMode::PIC, CodeModel::Default, ) @@ -520,8 +521,9 @@ impl Nac3 { &working_directory.join(&format!("{}.o", module.get_name().to_str().unwrap())), ) .expect("couldn't write module to file"); + println!("{}", module.print_to_string().to_str().unwrap()); }))); - let thread_names: Vec<String> = (0..4).map(|i| format!("module{}", i)).collect(); + let thread_names: Vec<String> = (0..1).map(|i| format!("module{}", i)).collect(); let threads: Vec<_> = thread_names .iter() .map(|s| Box::new(ArtiqCodeGenerator::new(s.to_string(), self.time_fns))) ``` And set the linkage of functions to be private to prevent duplicating the function body. But this hurts LLVM codegen performance. We can also try LTO if you want.
pca006132 removed the
high-priority
label 2021-11-28 17:20:36 +08:00

It is crucial that this kind of code does get optimized. A lot of experiments rely on this.

The LLVM IR generated after this patch looks fine to me.

Maybe keep several threads, then combine the LLVM modules into one, as suggested in #18 (comment)?

Yes, we should set the linkage to private to prevent duplication.

It is crucial that this kind of code does get optimized. A lot of experiments rely on this. The LLVM IR generated after this patch looks fine to me. Maybe keep several threads, then combine the LLVM modules into one, as suggested in https://git.m-labs.hk/M-Labs/nac3/issues/18#issuecomment-3054? Yes, we should set the linkage to private to prevent duplication.
sb10q added the
high-priority
label 2021-11-28 21:30:32 +08:00
pca006132 was assigned by sb10q 2021-12-03 12:09:35 +08:00

We probably want some inlining passes.

from min_artiq import *

@nac3
class Demo:
    core: KernelInvariant[Core]
    led0: KernelInvariant[TTLOut]

    def __init__(self):
        self.core = Core()
        self.led0 = TTLOut(self.core, 18)

    @kernel
    def run(self):
        self.led0.pulse(100.*ms)

if __name__ == "__main__":
    Demo().run()

This is now optimized to

; ModuleID = 'main'
source_filename = "main"

@now = external local_unnamed_addr global i64
@"140614884076656" = global { double } { double 1.000000e-09 }
@"140614884077040" = global { { double }*, i32, i32 } { { double }* @"140614884076656", i32 18, i32 4608 }
@"140614884077280" = local_unnamed_addr global { { double }*, { { double }*, i32, i32 }* } { { double }* @"140614884076656", { { double }*, i32, i32 }* @"140614884077040" }

define void @__modinit__() local_unnamed_addr {
init:
  tail call void @rtio_output(i32 4608, i32 1)
  %now_hi.i.i.i = load i32, i32* bitcast (i64* @now to i32*), align 4
  %now_lo.i.i.i = load i32, i32* bitcast (i64* getelementptr inbounds (i64, i64* @now, i64 1) to i32*), align 4
  %now_zext_hi.i.i.i = zext i32 %now_hi.i.i.i to i64
  %now_shifted_zext_hi.i.i.i = shl nuw i64 %now_zext_hi.i.i.i, 32
  %now_zext_lo.i.i.i = zext i32 %now_lo.i.i.i to i64
  %now_or.i.i.i = or i64 %now_shifted_zext_hi.i.i.i, %now_zext_lo.i.i.i
  %now_add.i.i.i = add i64 %now_or.i.i.i, 100000000
  %now_lshr.i.i.i = lshr i64 %now_add.i.i.i, 32
  %now_trunc.i.i.i = trunc i64 %now_lshr.i.i.i to i32
  %now_trunc1.i.i.i = trunc i64 %now_add.i.i.i to i32
  store atomic i32 %now_trunc.i.i.i, i32* bitcast (i64* @now to i32*) seq_cst, align 4
  store atomic i32 %now_trunc1.i.i.i, i32* bitcast (i64* getelementptr inbounds (i64, i64* @now, i64 1) to i32*) seq_cst, align 4
  tail call void @rtio_output(i32 4608, i32 0)
  ret void
}

declare void @rtio_output(i32 %0, i32 %1) local_unnamed_addr

In disassembly:

module.elf:	file format elf32-littleriscv


Disassembly of section .text:

000000d4 <__modinit__>:
      d4: 13 01 01 ff  	addi	sp, sp, -16
      d8: 23 26 11 00  	sw	ra, 12(sp)
      dc: 23 24 81 00  	sw	s0, 8(sp)
      e0: 37 15 00 00  	lui	a0, 1
      e4: 13 04 05 20  	addi	s0, a0, 512
      e8: 93 05 10 00  	addi	a1, zero, 1
      ec: 13 05 04 00  	mv	a0, s0
      f0: 97 00 00 00  	auipc	ra, 0
      f4: e7 80 00 08  	jalr	128(ra)
      f8: 17 05 00 00  	auipc	a0, 0
      fc: 03 25 c5 34  	lw	a0, 844(a0)
     100: 83 25 85 00  	lw	a1, 8(a0)
     104: 03 26 05 00  	lw	a2, 0(a0)
     108: b7 e6 f5 05  	lui	a3, 24414
     10c: 93 86 06 10  	addi	a3, a3, 256
     110: b3 86 d5 00  	add	a3, a1, a3
     114: b3 b5 b6 00  	sltu	a1, a3, a1
     118: b3 05 b6 00  	add	a1, a2, a1
     11c: 0f 00 10 03  	fence	rw, w
     120: 23 20 b5 00  	sw	a1, 0(a0)
     124: 0f 00 10 03  	fence	rw, w
     128: 23 24 d5 00  	sw	a3, 8(a0)
     12c: 13 05 04 00  	mv	a0, s0
     130: 93 05 00 00  	mv	a1, zero
     134: 03 24 81 00  	lw	s0, 8(sp)
     138: 83 20 c1 00  	lw	ra, 12(sp)
     13c: 13 01 01 01  	addi	sp, sp, 16
     140: 17 03 00 00  	auipc	t1, 0
     144: 67 00 03 03  	jr	48(t1)

Disassembly of section .plt:

00000150 <.plt>:
     150: 97 03 00 00  	auipc	t2, 0
     154: 33 03 c3 41  	sub	t1, t1, t3
     158: 03 ae 83 27  	lw	t3, 632(t2)
     15c: 13 03 43 fd  	addi	t1, t1, -44
     160: 93 82 83 27  	addi	t0, t2, 632
     164: 13 53 23 00  	srli	t1, t1, 2
     168: 83 a2 42 00  	lw	t0, 4(t0)
     16c: 67 00 0e 00  	jr	t3
     170: 17 0e 00 00  	auipc	t3, 0
     174: 03 2e 0e 26  	lw	t3, 608(t3)
     178: 67 03 0e 00  	jalr	t1, t3
     17c: 13 00 00 00  	nop
> We probably want some inlining passes. > > ```python > from min_artiq import * > > @nac3 > class Demo: > core: KernelInvariant[Core] > led0: KernelInvariant[TTLOut] > > def __init__(self): > self.core = Core() > self.led0 = TTLOut(self.core, 18) > > @kernel > def run(self): > self.led0.pulse(100.*ms) > > if __name__ == "__main__": > Demo().run() > ``` > This is now optimized to ``` ; ModuleID = 'main' source_filename = "main" @now = external local_unnamed_addr global i64 @"140614884076656" = global { double } { double 1.000000e-09 } @"140614884077040" = global { { double }*, i32, i32 } { { double }* @"140614884076656", i32 18, i32 4608 } @"140614884077280" = local_unnamed_addr global { { double }*, { { double }*, i32, i32 }* } { { double }* @"140614884076656", { { double }*, i32, i32 }* @"140614884077040" } define void @__modinit__() local_unnamed_addr { init: tail call void @rtio_output(i32 4608, i32 1) %now_hi.i.i.i = load i32, i32* bitcast (i64* @now to i32*), align 4 %now_lo.i.i.i = load i32, i32* bitcast (i64* getelementptr inbounds (i64, i64* @now, i64 1) to i32*), align 4 %now_zext_hi.i.i.i = zext i32 %now_hi.i.i.i to i64 %now_shifted_zext_hi.i.i.i = shl nuw i64 %now_zext_hi.i.i.i, 32 %now_zext_lo.i.i.i = zext i32 %now_lo.i.i.i to i64 %now_or.i.i.i = or i64 %now_shifted_zext_hi.i.i.i, %now_zext_lo.i.i.i %now_add.i.i.i = add i64 %now_or.i.i.i, 100000000 %now_lshr.i.i.i = lshr i64 %now_add.i.i.i, 32 %now_trunc.i.i.i = trunc i64 %now_lshr.i.i.i to i32 %now_trunc1.i.i.i = trunc i64 %now_add.i.i.i to i32 store atomic i32 %now_trunc.i.i.i, i32* bitcast (i64* @now to i32*) seq_cst, align 4 store atomic i32 %now_trunc1.i.i.i, i32* bitcast (i64* getelementptr inbounds (i64, i64* @now, i64 1) to i32*) seq_cst, align 4 tail call void @rtio_output(i32 4608, i32 0) ret void } declare void @rtio_output(i32 %0, i32 %1) local_unnamed_addr ``` In disassembly: ``` module.elf: file format elf32-littleriscv Disassembly of section .text: 000000d4 <__modinit__>: d4: 13 01 01 ff 
addi sp, sp, -16 d8: 23 26 11 00 sw ra, 12(sp) dc: 23 24 81 00 sw s0, 8(sp) e0: 37 15 00 00 lui a0, 1 e4: 13 04 05 20 addi s0, a0, 512 e8: 93 05 10 00 addi a1, zero, 1 ec: 13 05 04 00 mv a0, s0 f0: 97 00 00 00 auipc ra, 0 f4: e7 80 00 08 jalr 128(ra) f8: 17 05 00 00 auipc a0, 0 fc: 03 25 c5 34 lw a0, 844(a0) 100: 83 25 85 00 lw a1, 8(a0) 104: 03 26 05 00 lw a2, 0(a0) 108: b7 e6 f5 05 lui a3, 24414 10c: 93 86 06 10 addi a3, a3, 256 110: b3 86 d5 00 add a3, a1, a3 114: b3 b5 b6 00 sltu a1, a3, a1 118: b3 05 b6 00 add a1, a2, a1 11c: 0f 00 10 03 fence rw, w 120: 23 20 b5 00 sw a1, 0(a0) 124: 0f 00 10 03 fence rw, w 128: 23 24 d5 00 sw a3, 8(a0) 12c: 13 05 04 00 mv a0, s0 130: 93 05 00 00 mv a1, zero 134: 03 24 81 00 lw s0, 8(sp) 138: 83 20 c1 00 lw ra, 12(sp) 13c: 13 01 01 01 addi sp, sp, 16 140: 17 03 00 00 auipc t1, 0 144: 67 00 03 03 jr 48(t1) Disassembly of section .plt: 00000150 <.plt>: 150: 97 03 00 00 auipc t2, 0 154: 33 03 c3 41 sub t1, t1, t3 158: 03 ae 83 27 lw t3, 632(t2) 15c: 13 03 43 fd addi t1, t1, -44 160: 93 82 83 27 addi t0, t2, 632 164: 13 53 23 00 srli t1, t1, 2 168: 83 a2 42 00 lw t0, 4(t0) 16c: 67 00 0e 00 jr t3 170: 17 0e 00 00 auipc t3, 0 174: 03 2e 0e 26 lw t3, 608(t3) 178: 67 03 0e 00 jalr t1, t3 17c: 13 00 00 00 nop ```

Excellent! Well done.

Excellent! Well done.
Sign in to join this conversation.
No Milestone
No Assignees
2 Participants
Notifications
Due Date
The due date is invalid or out of range. Please use the format 'yyyy-mm-dd'.

No due date set.

Dependencies

No dependencies set.

Reference: M-Labs/nac3#54
There is no content yet.