phinode的创建和消除

1. SSA构建算法

依赖:

  1. dom tree
  2. DF
  3. llvm ir基础知识

对以下c代码

1
2
3
4
int foo(int a){
int b = a;
return b;
}

clang会生成类似这种IR

1
2
3
4
5
6
7
8
define i32 @foo(i32 %a){
%pa = alloc i32
%pb = alloc i32
store %a, %pa
store %a, %pb
%b1 = load %pb
ret i32 %b1
}

我们能看到llvm采用了alloc+load+store指令来实现c中局部变量的存放。
这里参数处理i32 %a; store %a, %pa实际上是个比较复杂的话题,按下不表。
这段IR的有很多冗余的load/store, 有没有什么办法消除掉呢?
SROA(mem2reg)!
预期的IR

1
2
3
define i32 @foo(i32 %a){
ret i32 %a
}

如果控制流比较复杂呢?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
int b = a;
if(a > 10){
b = b + a;
}else{
b = b * a;
}
return b;
=>
ir
bb0:
%pb = alloc i32
store %a, %pb
%c = CMP_GT %a, 10
br %c, %bb1, %bb2

bb1:
%b0 = load %pb
%b1 = add %b0, %a
store %b1, %pb
br %bb4

bb3:
%b2 = load %pb
%b3 = mul %b2, %a
store %b3, %pb
br %bb4
bb4:
%b4 = load %pb
ret %b4

这里我们发现,如果想要正确删除掉这些load/store,我们需要引入phinode,而且必须拿到控制流信息。phinode是个逻辑节点,在实际的CPU上没对应的指令/寄存器表示。所以我们在生成实际的汇编前需要再删除掉phi。

接下来看如何实现SSA构建:

原理分为两步, insertPHI, Rename, 伪代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

procedure InsertPHI(variable_list)

    for each variable v in variable_list
        WorkList ← ∅
        EverOnWorkList ← ∅
        AlreadyHasPhiFunc ← ∅
        for each node n containing an assignment to v
            WorkList ← WorkList ∪ {n}
        end

        EverOnWorkList ← WorkList
        while WorkList ≠ ∅
            Remove some node n from WorkList
            for each d ∈ DF(n)
                if d ∉ AlreadyHasPhiFunc
                    Insert a φ-function for v at d
                    AlreadyHasPhiFunc ← AlreadyHasPhiFunc ∪ {d}
                    if d ∉ EverOnWorkList
                        WorkList ← WorkList ∪ {d}
                        EverOnWorkList ← EverOnWorkList ∪ {d}
                    end
                end
            end
        end
    end
end procedure



procedure GenName(variable v)
    vn = new Value(v)
    Push vn onto Stacks[v]
    return v
end procedure



procedure Rename(block b)

    if b previously visited return
    for each φ-function p in b
        v = LHS(p)
        vn = GenName(v) and replace v with vn
    end

    for each statement s in b (in order)
        for each variable v ∈ RHS(s)
            replace v by Top(Stacks[v])
        end

        for each variable v ∈ LHS(s)
            vn = GenName(v) and replace v with vn
        end

        for each s ∈ succ(b) (in CFG)
            j ← position in s’s φ-function corresponding to block b
            for each φ-function p in s
                replace the jth operand of RHS(p) by Top(Stacks[v])
            end
        end
    end

    for each s ∈ child(b) (in DT)
        Rename(s)
    end

    for each φ-function or statement t in b
        for each vi ∈ LHS(t)
            Pop(Stacks[v])
        end
    end
end procedure

SSAConstruction:
    InsertPHI(list)
    Rename(entry)
end

我们看到这种伪代码的形式和LLVMIR区别很大。因为IR格式不一样,接下来看llvm ir下如何实现mem2reg

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
mem2reg(Function f){
list = []
for inst in f.entry(){
if inst is alloc and isPromotable(inst){
// 有些alloc不能被删除,like volatile
list.push(inst)
}
}
if list.empty() return

Phi2Alloc = InsertPHI(list)

// init map: alloc -> stack<value>
// with first value undef
incomings_map = {}
for alloc in list {
incomings_map[alloc] = [undef]
}

Rename(f.entry(), incomings_map, Phi2Alloc)
}

InsertPHI(list){
Phi2Alloc = {}
for alloc in list{
WorkList = []
for user in alloc.use_list(){
if user is store {
WorkList.push(user.parent())
}
}
EverOnWorkList = WorkList.clone()
AlreadyHasPhi = []

while !WorkList.empty() {
n = WorkList.pop()

for d in DF(n) {
if d not in AlreadyHasPhi {
phi = create_phi()
phi.set_all_incomings(undef)
d.insert_front(phi)
Phi2Alloc[phi] = alloc

AlreadyHasPhi.add(d)
if d not in EverOnWorkList {
WorkList.push(d)
EverOnWorkList.push(d)
}
}
}
}
}
return Phi2Alloc
}

PhiRename(bb, incomings_map, Phi2Alloc){
if bb visited, return
for inst in bb {
if inst is phi {
if inst not in Phi2Alloc
continue
alloc = Phi2Alloc[inst]
incomings_map[alloc].push(phi) // update top

continue
}
if inst is load {
// %inst = load %address
ad = inst.get_address();
if(ad is alloc) {
top = incomings_map[ad].top();
inst.replaceAllUsesWith(top)
}
continue;
}
if inst is store {
ad = inst.get_address()
// store %x, %address
if ad is alloc {
// update top
incomings_map[ad].push(inst.getValue());
}
}
}
for succ in b.getSuccessors() { // cfg
for phi in succ.getPhiNodes() {
alloc = Phi2Alloc[phi]
top = incomings_map[alloc].top()
phi.update_by_block(b, top) // update for incoming
}
}

for child in DomTree(b).children() {
// may cause stack over flow...
Rename(child, incomings_map, Phi2Alloc)
}

// do pop stack
for inst in b {
if inst is phi {
alloc = Phi2Alloc[phi]
incomings_map[alloc].pop()
continue
}

if inst is store {
ad = inst.getAddress()
incomings_map[ad].pop()
}
}
}

当然,以上代码不是llvm mem2reg实际实现代码。

llvm实现做了一些优化,domtree遍历优化,特殊情况处理等。


2. SSA destruction

如何销毁SSA形式,删除phinode。

llvm的phi-elimination是在mir阶段发生。

一般来说,SSA destruction是将phinode替换为copy指令。

  1. 暴力做法,对于p = phi (%a1, %bb1), (%a2, %bb2) ....,在bb1,bb2… 尾部插入 p = copy %a... 然后删除掉phi节点。但是该做法结果不对。

考虑到lost copy problem和swap problem:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

// Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency
// https://inria.hal.science/inria-00349925v1/document

lost copy problem:

b0:
x1 =...
jmp b1

b1:
x2 = phi(x1, x3)
x3 = x2 + 1
...
jmp p, b1, b2


-----------------------

swap problem:

b0:
a1 =...
b1 =...
jmp b1

b1:
a2 = phi(a1, b2)
b2 = phi(b1, a2)
...
jmp p, b1, b2

我们的暴力算法会生成

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

swap problem:
b0:
a1 = ...
b1 = ...
a2 = copy a1
b2 = copy b1
jump b1
b1:
a2 = copy b2 <--- 没有swap
b2 = copy a2

if p jump b1
jump b2
b2:
...


--------------------------------------------

lost-copy-problem

b0:
x1 = ...
x2 = copy x1
jump b1
b1:
x3 = x2 + 1
x2 = copy x3
if p jump b1
jump b2
b2:
... = x2 <---- 这里错误

split critical edges

图中红边就是critical edge:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

digraph G {
// 不可见节点(不显示标签)
xxx [label="", shape=none, width=0, height=0];
others [label="", shape=none, width=0, height=0];

// 可见节点
b1 [label="b1", shape=circle, style=filled, fillcolor=lightgray];
b2 [label="b2", shape=circle, style=filled, fillcolor=lightgray];

// 边
xxx -> b2 [];
b1 -> b2 [color=red, penwidth=2];
b1 -> others [];

// 调整布局
rankdir=TD; // 从左到右排列
}

对lost copy和swap应用该算法:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

lost-copy-problem

b0:
x1 =...
x2 = copy x1
jmp b1

b1:
x3 = x2 + 1
jmp p, b3, b2

b3:
x2 = copy x3
jmp b1

b2:
... = x2

---------------------------------------------------

swap problem

bb0:
a1 =...
b1 =...
a2 = copy a1
b2 = copy b1
jmp bb1

bb1:

jmp p, bb3, bb2

bb3:
a2 = copy b2 <--- 没有swap
b2 = copy a2
jmp bb1

bb2:
... = a2
... = b2

swap problem结果看起来不太对哦。
看起来是插入copy出了问题, 再看一眼a2=phi(a1, b2);b2=phi(b1,a2) a2, b2形成了环。
是个环。。。

我们发现swap problem中

  • a2和b2的生命周期重叠了
  • 并且插入copy方式也需要改进

isolating phi + parallel copy

先介绍几个概念

  • CSSA(Conventional SSA) form is defined as SSA form for which each phi-web is interference-free.
  • TSSA(Transformed SSA) form is non-conventional SSA, may have phi-web is not interference-free.

swap problem和lost-copy problem中 都是TSSA,而不是CSSA。
如何将TSSA转换为CSSA?并且在CSSA中插入copy是否还需要注意类似swapproblem的情况?

所以我们接下来的算法步骤是:

  1. Insert parallel copies for all φ-functions (TSSA => CSSA)
  2. eliminate phis in CSSA
  3. Sequentialize parallel copies, possibly with one more variable and some additional copies
  4. some optimization

首先转换为cssa格式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

lost copy problem

b0:
x1 =...
x1' = copy x1
jmp b1

b1:
x2' = phi(x1', x3')
x2 = copy x2'
x3 = x2 + 1
x3' = copy x3

jmp p, b1, b2

b2:
... = x2


----------------------------------------

swap problem:

B0:
a1 =...
b1 =...
a1' = copy a1
b1' = copy b1
jmp B1

B1:
a2' = phi(a1', b2')
b2' = phi(b1', a2')
a2 = copy a2'
b2 = copy b2'
b2' = copy b2
a2' = copy a2
...
jmp p, B1, B2

B2:
... = a2
... = b2

消除phi节点后:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

lost copy problem

b0:
x1 =...
x1' = copy x1
x2' = copy x1' // x1' in x2' = phi(x1', x3')
jmp b1

b1:
x2 = copy x2'
x3 = x2 + 1
x3' = copy x3
x2' = x3' // x3' in x2' = phi(x1', x3')
jmp p, b1, b2

b2:
... = x2


----------------------------------------

swap problem:

B0:
a1 =...
b1 =...
a1' = copy a1
b1' = copy b1
a2' = copy a1' // a1' in a2' = phi(a1', b2')
b2' = copy b1' // b1' in b2' = phi(b1', a2')
jmp B1

B1:
a2 = copy a2'
b2 = copy b2'
b2' = copy b2
a2' = copy a2
a2' = copy b2' // a2' in a2' = phi(a1', b2')
b2' = copy a2' // b2' in b2' = phi(b1', a2')
jmp p, B1, B2

B2:
... = a2
... = b2

上一小节提到的swap中copy的问题仍然存在,所以接下来要介绍,parallel copies的概念。
我们将这些插入的copy指令视为parallel copies,然后采用算法求解copy插入顺序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
// Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency (https://inria.hal.science/inria-00349925v1/document)

// @args
// Set P of parallel copies of the form a -> b, a != b
// n: one extra fresh variable
// @output: List of copies in sequential order
def parallel_copy_sequentialization(P:set, n: variable)

ready = []
to_do = []
pred(n) = none // a map

for a -> b in p
loc(b) = none // init
pred(b) = none
end for

for a -> b in p
loc(a) = a /* needed and not copied yet */
pred(b) = a /* (unique) predecessor *
to_do.append(b) /* copy into b to be done */

for a->b in p
if loc(b) == none
ready.append(b) /* b is not used and can be overwritten */

while to_do != []
while ready != []
b = ready.pop()
a = pred(b)
c = loc(a)
emit copy c -> b

loc(a) = b
if a == c and pred(a) != none
ready.append(a)
b = to_do.pop()
l = loc(pred(b))
if b == l
emit copy b -> n
loc(b) = n
ready.append(b)

不幸的是,这个算法有点小问题。
实际上是个图遍历算法:cc09.pdf

我们用python写个demo:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
class Copy:
    def __init__(self, src, dst):
        self.src = src
        self.dst = dst
    def __repr__(self):
        return "%s <- %s" % (self.dst, self.src)

def seq_copy(seq):
    ready = []
    todo = []
    pred = {}
    loc = {}
    n = "tmp"

    pred[n] = None
    for i in seq:
        a = i.src
        b = i.dst
        loc[b] = None
        pred[a] = None

    for i in seq:
        a = i.src
        b = i.dst
        loc[a] = a
        pred[b] = a
        todo.append(b)

    for i in seq:
        a = i.src
        b = i.dst
        if loc[b] is None:
            ready.append(b)


    while len(todo) != 0:
        while len(ready) != 0:
            b = ready.pop()
            a = pred[b]
            c = loc[a]
            print("emit copy {} <- {}".format(b, c))
            loc[a] = b
            if a ==c and pred[a] is not None:
                ready.append(a)
        b = todo.pop(0)
        l = loc[pred[b]]

        if b == l: # <<<<< 这里
            print("emit copy {}<- {}".format(n, b))
            loc[b] = n
            ready.append(b)
           
seq= [
    Copy("b", "a"),
    Copy("a", "b"),
]

seq_copy(seq)

运行脚本,啥也不输出。
但是如果我们将 b==l改成b!=l, 就会输出
emit copy tmp<- a
emit copy a <- b
emit copy b <- tmp

至此,我们完成了phi-elimination的一半,合法化问题解决了。但是性能问题没解决。

对于lost copy problem,在BB块前插入copy指令是必须的。(如果采用Split Critical Edges的方式,循环的代码质量会下降)
对于swap problem,需检测环的出现。

copy插入位置问题,lost copy 和swap problem插入顺序还不一样。
这帮人写论文能不能靠谱点

llvm采用的算法

llvm有split critical edge也有不依赖split critical edge的实现。
parallel copy面对如下代码还是有些问题

1
2
3
4
5
6
7
8
9
10
// 格式化整型的代码实现
int value = ....;
do{
int a = value % base;
value = value / base;

buf[len++] = a;

} while(value)

原因还是copy插入顺序。(parallel copies的遍历顺序)
解决方法,额。
根据llvm中split critical edges的方法。
如果phi的incoming block是critical edge 的src block,例如 bb0:a0 = phi a1,bb1, a2, bb2 其中bb2是critical edge,那么在bb1尾部创建tmp = copy a1, 在bb0头部a0 = copy tmp, 在bb2尾部tmp = copy a2.
如果没有critical edge, 直接copy,bb1尾部创建a0 = copy a1,bb2尾部a0 = copy a2.

很简单的实现,但是还是挺有效。。。

https://gcc.godbolt.org/z/aMWPjK3a9

为什么不split block呢?因为跳转太多会导致性能下降,特别是在loop中。


优化问题

关键字:copy de-coalescing

即如何最小化插入copy指令

  1. 在CSSA上面对parallel copies 优化
  2. 先生成很多冗余的copy指令,然后再优化这些copy指令

第一种方式有些困难,个人认为难点在于parallel copies相关生命周期如何计算。
第二种方式就很传统了, interference graph + union find, 不干涉的放在一个点集里面。


others

我们重新关注下parallel copies 。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

对于


bb1:
a1 = ...
b1 = ...
c1 = ...
jump bb

bb2:
a2 = ...
b2 = ...
c2 = ...
jump bb

bb3:
a3 = ...
b3 = ...
c3 = ...
jump bb

bb:
a = phi(a1, a2, a3)
b = phi(b1, b2, b3)
c = phi(c1, c2, c3)


注意\\可能会被识别为\, 用\newline更好

$\begin{bmatrix} a \newline b \newline c \end{bmatrix}
= Φ \begin{bmatrix} a1 & a2 & a3 \newline b1 & b2 & b3 \newline c1 &c2 & c3 \end{bmatrix} $

那么对于bb1来说,就有parallel copies: $a \gets a1, b \gets b1, c \gets c1$。
其对应的Location Transfer Graph为$G = (V, E), V = \lbrace a,b,c, a1, b1, c1\rbrace, E = \lbrace a \gets a1, b \gets b1, c \gets c1 \rbrace$

再看下swap problem里面的

1
2
a2 = phi(a1, b2)
b2 = phi(b1, a2)
  1. 前驱1的parallel copies
1
2
3
flowchart TD
a1 --> a2
b1 --> b2
  1. 前驱2的parallel copies
1
2
3
flowchart TD
a2 --> b2
b2 --> a2

参考


llvm实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
int cond_random();

void swap_p(int a, int b, int *arr){

while(cond_random() ){
int tmp = a;
a = b;
b = tmp;
}
*arr++ = a;
*arr++ = b;
}


void lost_copy(int a, int *p){

while(cond_random()){
a+=1;
}
*p = a;
}

lost copy

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

# Machine code for function lost_copy(int, int*): IsSSA, TracksLiveness
Function Live Ins: $edi in %3, $rsi in %4

bb.0.entry:
successors: %bb.1(0x80000000); %bb.1(100.00%)
liveins: $edi, $rsi
%4:gr64 = COPY killed $rsi
%3:gr32 = COPY killed $edi
%0:gr32 = DEC32r killed %3:gr32(tied-def 0), implicit-def dead $eflags; example.cpp:19:5

bb.1.while.cond:
; predecessors: %bb.0, %bb.1
successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)

%1:gr32 = PHI %0:gr32, %bb.0, %2:gr32, %bb.1, debug-instr-number 1
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:19:11
CALL64pcrel32 target-flags(x86-plt) @cond_random(), <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax; example.cpp:19:11
ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:19:11
%5:gr32 = COPY killed $eax; example.cpp:19:11
%2:gr32 = INC32r killed %1:gr32(tied-def 0), implicit-def dead $eflags; example.cpp:19:5
TEST32rr killed %5:gr32, %5:gr32, implicit-def $eflags; example.cpp:19:11
JCC_1 %bb.1, 5, implicit killed $eflags; example.cpp:19:5
JMP_1 %bb.2; example.cpp:19:5

bb.2.while.end:
; predecessors: %bb.1

MOV32mr killed %4:gr64, 1, $noreg, 0, $noreg, killed %2:gr32 :: (store (s32) into %ir.p); example.cpp:22:8
RET 0; example.cpp:23:1

# End machine code for function lost_copy(int, int*).


# Machine code for function lost_copy(int, int*): NoPHIs, TracksLiveness
Function Live Ins: $edi in %3, $rsi in %4

bb.0.entry:
successors: %bb.1(0x80000000); %bb.1(100.00%)
liveins: $edi, $rsi
%4:gr64 = COPY killed $rsi
%3:gr32 = COPY killed $edi
%0:gr32 = DEC32r killed %3:gr32(tied-def 0), implicit-def dead $eflags; example.cpp:19:5
%6:gr32 = COPY killed %0:gr32

bb.1.while.cond:
; predecessors: %bb.0, %bb.1
successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)

%1:gr32 = COPY killed %6:gr32
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:19:11
CALL64pcrel32 target-flags(x86-plt) @cond_random(), <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax; example.cpp:19:11
ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:19:11
%5:gr32 = COPY killed $eax; example.cpp:19:11
%2:gr32 = INC32r killed %1:gr32(tied-def 0), implicit-def dead $eflags; example.cpp:19:5
TEST32rr killed %5:gr32, %5:gr32, implicit-def $eflags; example.cpp:19:11
%6:gr32 = COPY %2:gr32
JCC_1 %bb.1, 5, implicit killed $eflags; example.cpp:19:5
JMP_1 %bb.2; example.cpp:19:5

bb.2.while.end:
; predecessors: %bb.1

MOV32mr killed %4:gr64, 1, $noreg, 0, $noreg, killed %2:gr32 :: (store (s32) into %ir.p); example.cpp:22:8
RET 0; example.cpp:23:1

# End machine code for function lost_copy(int, int*).

# Machine code for function lost_copy(int, int*): NoPHIs, TracksLiveness, TiedOpsRewritten
Function Live Ins: $edi in %3, $rsi in %4

0B bb.0.entry:
successors: %bb.1(0x80000000); %bb.1(100.00%)
liveins: $edi, $rsi
16B %4:gr64 = COPY $rsi
32B %6:gr32 = COPY $edi
64B %6:gr32 = DEC32r %6:gr32(tied-def 0), implicit-def dead $eflags; example.cpp:19:5

96B bb.1.while.cond:
; predecessors: %bb.0, %bb.1
successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)

128B ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:19:11
144B CALL64pcrel32 target-flags(x86-plt) @cond_random(), <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax; example.cpp:19:11
160B ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:19:11
176B %5:gr32 = COPY killed $eax; example.cpp:19:11
208B %6:gr32 = INC32r %6:gr32(tied-def 0), implicit-def dead $eflags; example.cpp:19:5
224B TEST32rr %5:gr32, %5:gr32, implicit-def $eflags; example.cpp:19:11
256B JCC_1 %bb.1, 5, implicit killed $eflags; example.cpp:19:5
272B JMP_1 %bb.2; example.cpp:19:5

288B bb.2.while.end:
; predecessors: %bb.1

304B MOV32mr %4:gr64, 1, $noreg, 0, $noreg, %6:gr32 :: (store (s32) into %ir.p); example.cpp:22:8
320B RET 0; example.cpp:23:1

# End machine code for function lost_copy(int, int*).

swap

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

# Machine code for function swap_p(int, int, int*): IsSSA, TracksLiveness
Function Live Ins: $edi in %2, $esi in %3, $rdx in %4

bb.0.entry:
successors: %bb.1(0x80000000); %bb.1(100.00%)
liveins: $edi, $esi, $rdx
%4:gr64 = COPY killed $rdx
%3:gr32 = COPY killed $esi
%2:gr32 = COPY killed $edi

bb.1.while.cond:
; predecessors: %bb.0, %bb.1
successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)

%0:gr32 = PHI %3:gr32, %bb.0, %1:gr32, %bb.1, debug-instr-number 2
%1:gr32 = PHI %2:gr32, %bb.0, %0:gr32, %bb.1, debug-instr-number 1
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:7:11
CALL64pcrel32 target-flags(x86-plt) @cond_random(), <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax; example.cpp:7:11
ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:7:11
%5:gr32 = COPY killed $eax; example.cpp:7:11
TEST32rr killed %5:gr32, %5:gr32, implicit-def $eflags; example.cpp:7:11
JCC_1 %bb.1, 5, implicit killed $eflags; example.cpp:7:5
JMP_1 %bb.2; example.cpp:7:5

bb.2.while.end:
; predecessors: %bb.1

MOV32mr %4:gr64, 1, $noreg, 0, $noreg, killed %1:gr32 :: (store (s32) into %ir.arr); example.cpp:12:12
MOV32mr killed %4:gr64, 1, $noreg, 4, $noreg, killed %0:gr32 :: (store (s32) into %ir.incdec.ptr); example.cpp:13:12
RET 0; example.cpp:14:1

# End machine code for function swap_p(int, int, int*).


# Machine code for function swap_p(int, int, int*): NoPHIs, TracksLiveness
Function Live Ins: $edi in %2, $esi in %3, $rdx in %4

bb.0.entry:
successors: %bb.1(0x80000000); %bb.1(100.00%)
liveins: $edi, $esi, $rdx
%4:gr64 = COPY killed $rdx
%3:gr32 = COPY killed $esi
%2:gr32 = COPY killed $edi
%6:gr32 = COPY killed %3:gr32
%7:gr32 = COPY killed %2:gr32

bb.1.while.cond:
; predecessors: %bb.0, %bb.1
successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)

%1:gr32 = COPY killed %7:gr32
%0:gr32 = COPY killed %6:gr32
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:7:11
CALL64pcrel32 target-flags(x86-plt) @cond_random(), <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax; example.cpp:7:11
ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:7:11
%5:gr32 = COPY killed $eax; example.cpp:7:11
TEST32rr killed %5:gr32, %5:gr32, implicit-def $eflags; example.cpp:7:11
%6:gr32 = COPY %1:gr32
%7:gr32 = COPY %0:gr32
JCC_1 %bb.1, 5, implicit killed $eflags; example.cpp:7:5
JMP_1 %bb.2; example.cpp:7:5

bb.2.while.end:
; predecessors: %bb.1

MOV32mr %4:gr64, 1, $noreg, 0, $noreg, killed %1:gr32 :: (store (s32) into %ir.arr); example.cpp:12:12
MOV32mr killed %4:gr64, 1, $noreg, 4, $noreg, killed %0:gr32 :: (store (s32) into %ir.incdec.ptr); example.cpp:13:12
RET 0; example.cpp:14:1

# End machine code for function swap_p(int, int, int*).


# Machine code for function swap_p(int, int, int*): NoPHIs, TracksLiveness, TiedOpsRewritten
Function Live Ins: $edi in %2, $esi in %3, $rdx in %4

0B bb.0.entry:
successors: %bb.1(0x80000000); %bb.1(100.00%)
liveins: $edi, $esi, $rdx
16B %4:gr64 = COPY $rdx
32B %6:gr32 = COPY $esi
48B %7:gr32 = COPY $edi

96B bb.1.while.cond:
; predecessors: %bb.0, %bb.1
successors: %bb.2(0x04000000), %bb.1(0x7c000000); %bb.2(3.12%), %bb.1(96.88%)

112B %1:gr32 = COPY %7:gr32
128B %7:gr32 = COPY %6:gr32
144B ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:7:11
160B CALL64pcrel32 target-flags(x86-plt) @cond_random(), <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax; example.cpp:7:11
176B ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp; example.cpp:7:11
192B %5:gr32 = COPY killed $eax; example.cpp:7:11
208B TEST32rr %5:gr32, %5:gr32, implicit-def $eflags; example.cpp:7:11
224B %6:gr32 = COPY %1:gr32
256B JCC_1 %bb.1, 5, implicit killed $eflags; example.cpp:7:5
272B JMP_1 %bb.2; example.cpp:7:5

288B bb.2.while.end:
; predecessors: %bb.1

304B MOV32mr %4:gr64, 1, $noreg, 0, $noreg, %1:gr32 :: (store (s32) into %ir.arr); example.cpp:12:12
320B MOV32mr %4:gr64, 1, $noreg, 4, $noreg, %7:gr32 :: (store (s32) into %ir.incdec.ptr); example.cpp:13:12
336B RET 0; example.cpp:14:1

# End machine code for function swap_p(int, int, int*).