wazevo: refactors liveness analysis (#1821)

This refactors the liveness analysis and switches to a different algorithm, which reduces the compilation time of the Python binary from 30s to 12s. Signed-off-by: Takeshi Yoneda <t.y.mathetake@gmail.com>
2023-10-31 08:12:17 +09:00
parent 695b49e94d
commit 6f16354ec7
17 changed files with 833 additions and 341 deletions
--- a/internal/engine/wazevo/backend/backend_test.go
+++ b/internal/engine/wazevo/backend/backend_test.go
@@ -266,10 +266,8 @@ L2 (SSA Block: blk1):
 L1 (SSA Block: blk0):
 	mov x128?, x0
 	mov x131?, xzr
-	cbz w131?, (L2)
-L3 (SSA Block: blk1):
-	ret
-L2 (SSA Block: blk2):
+	cbnz w131?, L2
+L3 (SSA Block: blk2):
 	movz x132?, #0x3, lsl 0
 	str w132?, [x128?]
 	mov x133?, sp
@@ -277,18 +275,16 @@ L2 (SSA Block: blk2):
 	adr x134?, #0x0
 	str x134?, [x128?, #0x30]
 	exit_sequence x128?
+L2 (SSA Block: blk1):
+	ret
 `,
 			afterFinalizeARM64: `
 L1 (SSA Block: blk0):
 	stp x30, xzr, [sp, #-0x10]!
 	str xzr, [sp, #-0x10]!
 	mov x8, xzr
-	cbz w8, #0x10 L2
-L3 (SSA Block: blk1):
-	add sp, sp, #0x10
-	ldr x30, [sp], #0x10
-	ret
-L2 (SSA Block: blk2):
+	cbnz w8, #0x34 (L2)
+L3 (SSA Block: blk2):
 	movz x8, #0x3, lsl 0
 	str w8, [x0]
 	mov x8, sp
@@ -296,6 +292,10 @@ L2 (SSA Block: blk2):
 	adr x8, #0x0
 	str x8, [x0, #0x30]
 	exit_sequence x0
+L2 (SSA Block: blk1):
+	add sp, sp, #0x10
+	ldr x30, [sp], #0x10
+	ret
 `,
 		},
 		{
@@ -397,10 +397,10 @@ L2 (SSA Block: blk1):
 			afterLoweringARM64: `
 L1 (SSA Block: blk0):
 	mov x131?, xzr
-	cbnz w131?, L2
-L3 (SSA Block: blk2):
+	cbz w131?, (L2)
+L3 (SSA Block: blk1):
 	b L4
-L2 (SSA Block: blk1):
+L2 (SSA Block: blk2):
 L4 (SSA Block: blk3):
 	ret
 `,
@@ -409,10 +409,10 @@ L1 (SSA Block: blk0):
 	stp x30, xzr, [sp, #-0x10]!
 	str xzr, [sp, #-0x10]!
 	mov x8, xzr
-	cbnz w8, #0x8 (L2)
-L3 (SSA Block: blk2):
+	cbz w8, #0x8 L2
+L3 (SSA Block: blk1):
 	b #0x4 (L4)
-L2 (SSA Block: blk1):
+L2 (SSA Block: blk2):
 L4 (SSA Block: blk3):
 	add sp, sp, #0x10
 	ldr x30, [sp], #0x10
@@ -424,25 +424,25 @@ L4 (SSA Block: blk3):
 			afterLoweringARM64: `
 L1 (SSA Block: blk0):
 	mov x131?, xzr
-	cbnz w131?, L2
-L3 (SSA Block: blk2):
-	ret
-L2 (SSA Block: blk1):
+	cbz w131?, (L2)
+L3 (SSA Block: blk1):
 L4 (SSA Block: blk3):
 	ret
+L2 (SSA Block: blk2):
+	ret
 `,
 			afterFinalizeARM64: `
 L1 (SSA Block: blk0):
 	stp x30, xzr, [sp, #-0x10]!
 	str xzr, [sp, #-0x10]!
 	mov x8, xzr
-	cbnz w8, #0x10 (L2)
-L3 (SSA Block: blk2):
+	cbz w8, #0x10 L2
+L3 (SSA Block: blk1):
+L4 (SSA Block: blk3):
 	add sp, sp, #0x10
 	ldr x30, [sp], #0x10
 	ret
-L2 (SSA Block: blk1):
-L4 (SSA Block: blk3):
+L2 (SSA Block: blk2):
 	add sp, sp, #0x10
 	ldr x30, [sp], #0x10
 	ret
@@ -453,30 +453,30 @@ L4 (SSA Block: blk3):
 			afterLoweringARM64: `
 L1 (SSA Block: blk0):
 	mov x132?, xzr
-	cbnz w132?, L2
-L3 (SSA Block: blk2):
+	cbz w132?, (L2)
+L3 (SSA Block: blk1):
+	mov x131?, xzr
+	mov x0, x131?
+	ret
+L2 (SSA Block: blk2):
 L4 (SSA Block: blk3):
 	mov x130?, xzr
 	mov x0, x130?
 	ret
-L2 (SSA Block: blk1):
-	mov x131?, xzr
-	mov x0, x131?
-	ret
 `,
 			afterFinalizeARM64: `
 L1 (SSA Block: blk0):
 	stp x30, xzr, [sp, #-0x10]!
 	str xzr, [sp, #-0x10]!
 	mov x8, xzr
-	cbnz w8, #0x14 (L2)
-L3 (SSA Block: blk2):
-L4 (SSA Block: blk3):
+	cbz w8, #0x14 L2
+L3 (SSA Block: blk1):
 	mov x0, xzr
 	add sp, sp, #0x10
 	ldr x30, [sp], #0x10
 	ret
-L2 (SSA Block: blk1):
+L2 (SSA Block: blk2):
+L4 (SSA Block: blk3):
 	mov x0, xzr
 	add sp, sp, #0x10
 	ldr x30, [sp], #0x10
@@ -489,12 +489,12 @@ L2 (SSA Block: blk1):
 L1 (SSA Block: blk0):
 	mov x130?, x2
 	mov x131?, x3
-	cbnz w130?, L2
-L3 (SSA Block: blk2):
-	mov x132?, x131?
-	b L4
-L2 (SSA Block: blk1):
+	cbz w130?, (L2)
+L3 (SSA Block: blk1):
 	mov x132?, x130?
+	b L4
+L2 (SSA Block: blk2):
+	mov x132?, x131?
 L4 (SSA Block: blk3):
 	mov x0, x132?
 	ret
@@ -503,13 +503,13 @@ L4 (SSA Block: blk3):
 L1 (SSA Block: blk0):
 	stp x30, xzr, [sp, #-0x10]!
 	str xzr, [sp, #-0x10]!
-	cbnz w2, #0x8 (L2)
-L3 (SSA Block: blk2):
+	cbz w2, #0x8 L2
+L3 (SSA Block: blk1):
 	b #0x8 (L4)
-L2 (SSA Block: blk1):
-	mov x3, x2
+L2 (SSA Block: blk2):
+	mov x2, x3
 L4 (SSA Block: blk3):
-	mov x0, x3
+	mov x0, x2
 	add sp, sp, #0x10
 	ldr x30, [sp], #0x10
 	ret
@@ -589,9 +589,8 @@ L5 (SSA Block: blk3):
 L1 (SSA Block: blk0):
 	stp x30, xzr, [sp, #-0x10]!
 	str xzr, [sp, #-0x10]!
-	mov x8, x2
 L2 (SSA Block: blk1):
-	cbnz w8, #0x8 (L4)
+	cbnz w2, #0x8 (L4)
 	b #0x10 (L3)
 L4 (SSA Block: blk5):
 	add sp, sp, #0x10
@@ -599,7 +598,7 @@ L4 (SSA Block: blk5):
 	ret
 L3 (SSA Block: blk4):
 L5 (SSA Block: blk3):
-	orr w8, wzr, #0x1
+	orr w2, wzr, #0x1
 	b #-0x18 (L2)
 `,
 		},
@@ -639,30 +638,30 @@ L1 (SSA Block: blk0):
 	sub sp, sp, #0x10
 	orr x27, xzr, #0x10
 	str x27, [sp, #-0x10]!
-	mov x9, x0
-	mov x8, x1
-	str x8, [x9, #0x8]
-	mov x0, x9
-	mov x1, x8
-	str x9, [sp, #0x10]
-	str x8, [sp, #0x18]
+	mov x8, x0
+	mov x9, x1
+	str x9, [x8, #0x8]
+	mov x0, x8
+	mov x1, x9
+	str x8, [sp, #0x10]
+	str x9, [sp, #0x18]
 	bl f1
-	ldr x8, [sp, #0x18]
-	ldr x9, [sp, #0x10]
+	ldr x9, [sp, #0x18]
+	ldr x8, [sp, #0x10]
 	mov x2, x0
-	str x8, [x9, #0x8]
-	mov x0, x9
-	mov x1, x8
+	str x9, [x8, #0x8]
+	mov x0, x8
+	mov x1, x9
 	movz w3, #0x5, lsl 0
-	str x9, [sp, #0x10]
-	str x8, [sp, #0x18]
+	str x8, [sp, #0x10]
+	str x9, [sp, #0x18]
 	bl f2
-	ldr x8, [sp, #0x18]
-	ldr x9, [sp, #0x10]
+	ldr x9, [sp, #0x18]
+	ldr x8, [sp, #0x10]
 	mov x2, x0
-	str x8, [x9, #0x8]
-	mov x0, x9
-	mov x1, x8
+	str x9, [x8, #0x8]
+	mov x0, x8
+	mov x1, x9
 	bl f3
 	add sp, sp, #0x10
 	add sp, sp, #0x10
@@ -1623,11 +1622,12 @@ L1 (SSA Block: blk0):
 L1 (SSA Block: blk0):
 	stp x30, xzr, [sp, #-0x10]!
 	str xzr, [sp, #-0x10]!
-	mov x3, x2
+	mov x9, x2
 	str x1, [x0, #0x8]
 	ldr x8, [x1, #0x8]
 	ldr x1, [x1, #0x10]
-	mov x2, x3
+	mov x2, x9
+	mov x3, x9
 	bl x8
 	add sp, sp, #0x10
 	ldr x30, [sp], #0x10
--- a/internal/engine/wazevo/backend/compiler.go
+++ b/internal/engine/wazevo/backend/compiler.go
@@ -102,6 +102,9 @@ type Compiler interface {

 	// Emit4Bytes appends 4 bytes to the buffer. Used during the code emission.
 	Emit4Bytes(b uint32)
+
+	// LoopNestingForestRoots returns the roots of the loop nesting forest.
+	LoopNestingForestRoots() []ssa.BasicBlock
 }

 // RelocationInfo represents the relocation information for a call instruction.
@@ -389,3 +392,8 @@ func (c *compiler) Emit4Bytes(b uint32) {
 func (c *compiler) Buf() []byte {
 	return c.buf
 }
+
+// LoopNestingForestRoots implements Compiler.LoopNestingForestRoots.
+func (c *compiler) LoopNestingForestRoots() []ssa.BasicBlock {
+	return c.ssaBuilder.LoopNestingForestRoots()
+}
--- a/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
+++ b/internal/engine/wazevo/backend/isa/arm64/machine_regalloc.go
@@ -17,7 +17,8 @@ type (
 		// labelToRegAllocBlockIndex maps label to the index of reversePostOrderBlocks.
 		labelToRegAllocBlockIndex map[label]int
 		// vs is used for regalloc.Instr Defs() and Uses() methods, defined here for reuse.
-		vs []regalloc.VReg
+		vs                     []regalloc.VReg
+		loopNestingForestRoots []ssa.BasicBlock
 	}

 	// regAllocBlockImpl implements regalloc.Block.
@@ -28,7 +29,8 @@ type (
 		l   label
 		pos *labelPosition
 		// instrImpl is re-used for all instructions in this block.
-		instrImpl regAllocInstrImpl
+		instrImpl                 regAllocInstrImpl
+		loopNestingForestChildren []ssa.BasicBlock
 	}

 	// regAllocInstrImpl implements regalloc.Instr.
@@ -151,6 +153,56 @@ func (r *regAllocBlockImpl) Pred(i int) regalloc.Block {
 	return &r.f.reversePostOrderBlocks[index]
 }

+// Succs implements regalloc.Block Succs.
+func (r *regAllocBlockImpl) Succs() int {
+	return r.sb.Succs()
+}
+
+// Succ implements regalloc.Block Succ.
+func (r *regAllocBlockImpl) Succ(i int) regalloc.Block {
+	sb := r.sb
+	succ := sb.Succ(i)
+	if succ.ReturnBlock() {
+		return nil
+	}
+	l := r.f.m.ssaBlockIDToLabels[succ.ID()]
+	index := r.f.labelToRegAllocBlockIndex[l]
+	return &r.f.reversePostOrderBlocks[index]
+}
+
+// LoopHeader implements regalloc.Block LoopHeader.
+func (r *regAllocBlockImpl) LoopHeader() bool {
+	return r.sb.LoopHeader()
+}
+
+// LoopNestingForestRoots implements regalloc.Function LoopNestingForestRoots.
+func (f *regAllocFunctionImpl) LoopNestingForestRoots() int {
+	f.loopNestingForestRoots = f.m.compiler.LoopNestingForestRoots()
+	return len(f.loopNestingForestRoots)
+}
+
+// LoopNestingForestRoot implements regalloc.Function LoopNestingForestRoot.
+func (f *regAllocFunctionImpl) LoopNestingForestRoot(i int) regalloc.Block {
+	blk := f.loopNestingForestRoots[i]
+	l := f.m.ssaBlockIDToLabels[blk.ID()]
+	index := f.labelToRegAllocBlockIndex[l]
+	return &f.reversePostOrderBlocks[index]
+}
+
+// LoopNestingForestChildren implements regalloc.Block LoopNestingForestChildren.
+func (r *regAllocBlockImpl) LoopNestingForestChildren() int {
+	r.loopNestingForestChildren = r.sb.LoopNestingForestChildren()
+	return len(r.loopNestingForestChildren)
+}
+
+// LoopNestingForestChild implements regalloc.Block LoopNestingForestChild.
+func (r *regAllocBlockImpl) LoopNestingForestChild(i int) regalloc.Block {
+	blk := r.loopNestingForestChildren[i]
+	l := r.f.m.ssaBlockIDToLabels[blk.ID()]
+	index := r.f.labelToRegAllocBlockIndex[l]
+	return &r.f.reversePostOrderBlocks[index]
+}
+
 // InstrIteratorBegin implements regalloc.Block InstrIteratorBegin.
 func (r *regAllocBlockImpl) InstrIteratorBegin() regalloc.Instr {
 	r.instrImpl.i = r.pos.begin
@@ -170,6 +222,25 @@ func (r *regAllocBlockImpl) InstrIteratorNext() regalloc.Instr {
 	}
 }

+// InstrRevIteratorBegin implements regalloc.Block InstrRevIteratorBegin.
+func (r *regAllocBlockImpl) InstrRevIteratorBegin() regalloc.Instr {
+	r.instrImpl.i = r.pos.end
+	return &r.instrImpl
+}
+
+// InstrRevIteratorNext implements regalloc.Block InstrRevIteratorNext.
+func (r *regAllocBlockImpl) InstrRevIteratorNext() regalloc.Instr {
+	for {
+		instr := r.instrIteratorRevNext()
+		if instr == nil {
+			return nil
+		} else if instr.i.addedBeforeRegAlloc {
+			// Only concerned about the instruction added before regalloc.
+			return instr
+		}
+	}
+}
+
 // BlockParams implements regalloc.Block BlockParams.
 func (r *regAllocBlockImpl) BlockParams() []regalloc.VReg {
 	c := r.f.m.compiler
@@ -190,6 +261,15 @@ func (r *regAllocBlockImpl) instrIteratorNext() *regAllocInstrImpl {
 	return &r.instrImpl
 }

+func (r *regAllocBlockImpl) instrIteratorRevNext() *regAllocInstrImpl {
+	cur := r.instrImpl.i
+	if r.pos.begin == cur {
+		return nil
+	}
+	r.instrImpl.i = cur.prev
+	return &r.instrImpl
+}
+
 // Entry implements regalloc.Block Entry.
 func (r *regAllocBlockImpl) Entry() bool { return r.sb.EntryBlock() }

--- a/internal/engine/wazevo/backend/isa/arm64/util_test.go
+++ b/internal/engine/wazevo/backend/isa/arm64/util_test.go
@@ -61,6 +61,8 @@ type mockCompiler struct {
 	buf         []byte
 }

+func (m *mockCompiler) LoopNestingForestRoots() []ssa.BasicBlock { panic("TODO") }
+
 func (m *mockCompiler) SourceOffsetInfo() []backend.SourceOffsetInfo { return nil }

 func (m *mockCompiler) AddSourceOffsetInfo(int64, ssa.SourceOffset) {}
--- a/internal/engine/wazevo/backend/regalloc/api.go
+++ b/internal/engine/wazevo/backend/regalloc/api.go
@@ -35,12 +35,17 @@ type (
 		ReloadRegisterAfter(v VReg, instr Instr)
 		// Done tells the implementation that register allocation is done, and it can finalize the stack
 		Done()
+		// LoopNestingForestRoots returns the number of roots of the loop nesting forest in a function.
+		LoopNestingForestRoots() int
+		// LoopNestingForestRoot returns the i-th root of the loop nesting forest in a function.
+		LoopNestingForestRoot(i int) Block
 	}

 	// Block is a basic block in the CFG of a function, and it consists of multiple instructions, and predecessor Block(s).
 	Block interface {
 		// ID returns the unique identifier of this block.
 		ID() int
+		// BlockParams returns the virtual registers used as the parameters of this block.
 		BlockParams() []VReg
 		// InstrIteratorBegin returns the first instruction in this block. Instructions added after lowering must be skipped.
 		// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
@@ -48,12 +53,26 @@ type (
 		// InstrIteratorNext returns the next instruction in this block. Instructions added after lowering must be skipped.
 		// Note: multiple Instr(s) will not be held at the same time, so it's safe to use the same impl for the return Instr.
 		InstrIteratorNext() Instr
+		// InstrRevIteratorBegin is the same as InstrIteratorBegin, but in the reverse order.
+		InstrRevIteratorBegin() Instr
+		// InstrRevIteratorNext is the same as InstrIteratorNext, but in the reverse order.
+		InstrRevIteratorNext() Instr
 		// Preds returns the number of predecessors of this block in the CFG.
 		Preds() int
 		// Pred returns the i-th predecessor of this block in the CFG.
 		Pred(i int) Block
 		// Entry returns true if the block is for the entry block.
 		Entry() bool
+		// Succs returns the number of successors of this block in the CFG.
+		Succs() int
+		// Succ returns the i-th successor of this block in the CFG.
+		Succ(i int) Block
+		// LoopHeader returns true if this block is a loop header.
+		LoopHeader() bool
+		// LoopNestingForestChildren returns the number of children of this block in the loop nesting forest.
+		LoopNestingForestChildren() int
+		// LoopNestingForestChild returns the i-th child of this block in the loop nesting forest.
+		LoopNestingForestChild(i int) Block
 	}

 	// Instr is an instruction in a block, abstracting away the underlying ISA.
--- a/internal/engine/wazevo/backend/regalloc/api_test.go
+++ b/internal/engine/wazevo/backend/regalloc/api_test.go
@@ -12,6 +12,7 @@ type (
 		iter            int
 		blocks          []*mockBlock
 		befores, afters []storeOrReloadInfo
+		lnfRoots        []*mockBlock
 	}

 	storeOrReloadInfo struct {
@@ -22,12 +23,15 @@ type (

 	// mockBlock implements Block.
 	mockBlock struct {
-		id           int
-		instructions []*mockInstr
-		preds        []*mockBlock
-		_preds       []Block
-		iter         int
-		_entry       bool
+		id             int
+		instructions   []*mockInstr
+		preds, succs   []*mockBlock
+		_preds, _succs []Block
+		iter           int
+		_entry         bool
+		_loop          bool
+		lnfChildren    []*mockBlock
+		blockParams    []VReg
 	}

 	// mockInstr implements Instr.
@@ -41,6 +45,10 @@ func newMockFunction(blocks ...*mockBlock) *mockFunction {
 	return &mockFunction{blocks: blocks}
 }

+func (m *mockFunction) loopNestingForestRoots(blocks ...*mockBlock) {
+	m.lnfRoots = blocks
+}
+
 func newMockBlock(id int, instructions ...*mockInstr) *mockBlock {
 	return &mockBlock{id: id, instructions: instructions}
 }
@@ -75,6 +83,8 @@ func (m *mockBlock) String() string {
 func (m *mockBlock) addPred(b *mockBlock) {
 	m.preds = append(m.preds, b)
 	m._preds = append(m._preds, b)
+	b._succs = append(b._succs, m)
+	b.succs = append(b.succs, m)
 }

 func (m *mockInstr) use(uses ...VReg) *mockInstr {
@@ -87,6 +97,12 @@ func (m *mockInstr) def(defs ...VReg) *mockInstr {
 	return m
 }

+func (m *mockBlock) loop(children ...*mockBlock) *mockBlock {
+	m._loop = true
+	m.lnfChildren = children
+	return m
+}
+
 func (m *mockBlock) entry() *mockBlock {
 	m._entry = true
 	return m
@@ -194,13 +210,35 @@ func (m *mockBlock) InstrIteratorNext() Instr {
 	return ret
 }

+// InstrRevIteratorBegin implements Block.
+func (m *mockBlock) InstrRevIteratorBegin() Instr {
+	if len(m.instructions) == 0 {
+		return nil
+	}
+	m.iter = len(m.instructions)
+	return m.InstrRevIteratorNext()
+}
+
+// InstrRevIteratorNext implements Block.
+func (m *mockBlock) InstrRevIteratorNext() Instr {
+	m.iter--
+	if m.iter < 0 {
+		return nil
+	}
+	return m.instructions[m.iter]
+}
+
 // Preds implements Block.
 func (m *mockBlock) Preds() int {
 	return len(m._preds)
 }

 // BlockParams implements Block.
-func (m *mockBlock) BlockParams() []VReg { return nil }
+func (m *mockBlock) BlockParams() []VReg { return m.blockParams }
+
+func (m *mockBlock) blockParam(v VReg) {
+	m.blockParams = append(m.blockParams, v)
+}

 // Pred implements Instr.
 func (m *mockBlock) Pred(i int) Block { return m._preds[i] }
@@ -248,3 +286,31 @@ var (
 	_ Block    = (*mockBlock)(nil)
 	_ Instr    = (*mockInstr)(nil)
 )
+
+func (m *mockFunction) LoopNestingForestRoots() int {
+	return len(m.lnfRoots)
+}
+
+func (m *mockFunction) LoopNestingForestRoot(i int) Block {
+	return m.lnfRoots[i]
+}
+
+func (m *mockBlock) LoopHeader() bool {
+	return m._loop
+}
+
+func (m *mockBlock) Succs() int {
+	return len(m.succs)
+}
+
+func (m *mockBlock) Succ(i int) Block {
+	return m.succs[i]
+}
+
+func (m *mockBlock) LoopNestingForestChildren() int {
+	return len(m.lnfChildren)
+}
+
+func (m *mockBlock) LoopNestingForestChild(i int) Block {
+	return m.lnfChildren[i]
+}
--- a/internal/engine/wazevo/backend/regalloc/regalloc.go
+++ b/internal/engine/wazevo/backend/regalloc/regalloc.go
@@ -73,6 +73,7 @@ type (
 		nodes2     []*node
 		nodes3     []*node
 		dedup      []bool
+		blks       []Block
 	}

 	// blockInfo is a per-block information used during the register allocation.
@@ -145,17 +146,21 @@ const (
 	pcStride    = pcDefOffset + 1
 )

+// phiBlk returns the block that defines the given phi value, nil otherwise.
+func (a *Allocator) phiBlk(id VRegID) Block {
+	if int(id) >= len(a.phiBlocks) {
+		return nil
+	}
+	return a.phiBlocks[id]
+}
+
 // liveAnalysis constructs Allocator.blockInfos.
-// The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.4.
-//
-// TODO: this might not be efficient. We should be able to leverage dominance tree, etc.
+// The algorithm here is described in https://pfalcon.github.io/ssabook/latest/book-full.pdf Chapter 9.2.
 func (a *Allocator) livenessAnalysis(f Function) {
 	// First, we need to allocate blockInfos.
+	var maxBlockID int
 	for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() { // Order doesn't matter.
 		info := a.allocateBlockInfo(blk.ID())
-		if blk.Entry() {
-			continue
-		}
 		// If this is not the entry block, we should define phi nodes, which are not defined by instructions.
 		for _, p := range blk.BlockParams() {
 			info.defs[p] = 0 // Earliest definition is at the beginning of the block.
@@ -166,137 +171,145 @@ func (a *Allocator) livenessAnalysis(f Function) {
 			}
 			a.phiBlocks[pid] = blk
 		}
+		if blk.ID() > maxBlockID {
+			maxBlockID = blk.ID()
+		}
 	}

-	// Gathers all defs, lastUses, and VRegs in use (into a.vs).
-	a.vs = a.vs[:0]
-	for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() {
-		info := a.blockInfoAt(blk.ID())
+	if maxBlockID >= len(a.dedup) {
+		a.dedup = append(a.dedup, make([]bool, maxBlockID+1)...)
+	}

-		// We have to do a first pass to find the lowest VRegID in the block;
-		// this is used to reduce memory utilization in the VRegTable, which
-		// can avoid allocating memory for registers zero to minVRegID-1.
-		minVRegID := VRegIDMinSet{}
+	// Run the Algorithm 9.2 in the book.
+	for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() {
+		blkID := blk.ID()
+		info := a.allocateBlockInfo(blkID)
+
+		ns := blk.Succs()
+		for i := 0; i < ns; i++ {
+			succ := blk.Succ(i)
+			if succ == nil {
+				continue
+			}
+
+			succID := succ.ID()
+			if !a.dedup[succID] { // This means the back edge.
+				continue
+			}
+
+			succInfo := a.blockInfoAt(succID)
+			for v := range succInfo.liveIns {
+				if a.phiBlk(v.ID()) != succ {
+					info.liveOuts[v] = struct{}{}
+					info.liveIns[v] = struct{}{}
+				}
+			}
+		}
+
+		var pc programCounter
+		var minVRegID VRegIDMinSet
 		for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() {
-			for _, use := range instr.Uses() {
+			uses := instr.Uses()
+			for _, use := range uses {
 				if !use.IsRealReg() {
 					minVRegID.Observe(use)
 				}
 			}
+			pc += pcStride
 		}
 		info.lastUses.Reset(minVRegID)

-		var pc programCounter
-		for instr := blk.InstrIteratorBegin(); instr != nil; instr = blk.InstrIteratorNext() {
-			var srcVR, dstVR VReg
-			for _, use := range instr.Uses() {
-				srcVR = use
-				pos := pc + pcUseOffset
-				if use.IsRealReg() {
-					info.addRealRegUsage(use, pos)
-				} else {
-					info.lastUses.Insert(use, pos)
-				}
-			}
-			for _, def := range instr.Defs() {
-				dstVR = def
+		for instr := blk.InstrRevIteratorBegin(); instr != nil; instr = blk.InstrRevIteratorNext() {
+			pc -= pcStride
+			var use, def VReg
+			for _, def = range instr.Defs() {
 				defID := def.ID()
 				pos := pc + pcDefOffset
 				if def.IsRealReg() {
 					info.realRegDefs[defID] = append(info.realRegDefs[defID], pos)
 				} else {
-					if _, ok := info.defs[def]; !ok {
-						// This means that this VReg is defined multiple times in a series of instructions
-						// e.g. loading arbitrary constant in arm64, and we only need the earliest
-						// definition to construct live range.
-						info.defs[def] = pos
+					info.defs[def] = pos
+					delete(info.liveIns, def)
+				}
+			}
+			for _, use = range instr.Uses() {
+				pos := pc + pcUseOffset
+				if use.IsRealReg() {
+					id := use.ID()
+					info.realRegUses[id] = append(info.realRegUses[id], pos)
+				} else {
+					if info.lastUses.Lookup(use) < 0 {
+						info.lastUses.Insert(use, pos)
 					}
-					a.vs = append(a.vs, def)
+					info.liveIns[use] = struct{}{}
 				}
 			}
+
 			if instr.IsCopy() {
-				id := int(dstVR.ID())
-				if id < len(a.phiBlocks) && a.phiBlocks[id] != nil {
-					info.liveOuts[dstVR] = struct{}{}
-				}
-				a.recordCopyRelation(dstVR, srcVR)
+				a.recordCopyRelation(def, use)
+			}
+
+			// If the destination is a phi value, handle it depending on whether the source is a real register.
+			if def.Valid() && a.phiBlk(def.ID()) != nil {
+				if use.Valid() && use.IsRealReg() {
+					// If the source is a real register, this is the beginning of the function, and
+					// therefore we need to add the definition of the real register.
+					r := use.ID()
+					info.realRegDefs[r] = append(info.realRegDefs[r], 0)
+				} else {
+					// Otherwise, this is the definition of the phi value for the successor block.
+					// So we need to make it outlive the block.
+					info.liveOuts[def] = struct{}{}
+				}
 			}
-			pc += pcStride
-		}
-		if wazevoapi.RegAllocLoggingEnabled {
-			fmt.Printf("prepared block info for block[%d]:\n%s\n\n", blk.ID(), info.Format(a.regInfo))
 		}
+		a.dedup[blkID] = true
 	}

-	// Run the Algorithm 9.9. in the book. This will construct blockInfo.liveIns and blockInfo.liveOuts.
-	for _, phi := range a.phis {
-		blk := a.phiBlocks[phi.ID()]
-		a.beginUpAndMarkStack(f, phi, true, blk)
+	nrs := f.LoopNestingForestRoots()
+	for i := 0; i < nrs; i++ {
+		root := f.LoopNestingForestRoot(i)
+		a.loopTreeDFS(root)
 	}
-	for _, v := range a.vs {
-		if v.IsRealReg() {
-			// Real registers do not need to be tracked in liveOuts and liveIns because they are not allocation targets.
-			panic("BUG")
-		}
-		a.beginUpAndMarkStack(f, v, false, nil)
+
+	// Clears the dedup array for the next function.
+	for i := 0; i <= maxBlockID; i++ {
+		a.dedup[i] = false
 	}
 }

-func (a *Allocator) beginUpAndMarkStack(f Function, v VReg, isPhi bool, phiDefinedAt Block) {
-	for blk := f.PostOrderBlockIteratorBegin(); blk != nil; blk = f.PostOrderBlockIteratorNext() {
-		if blk.Preds() == 0 && !blk.Entry() {
-			panic(fmt.Sprintf("block without predecessor must be optimized out by the compiler: %d", blk.ID()))
+// loopTreeDFS implements the Algorithm 9.3 in the book in an iterative way.
+func (a *Allocator) loopTreeDFS(entry Block) {
+	a.blks = a.blks[:0]
+	a.blks = append(a.blks, entry)
+
+	for len(a.blks) > 0 {
+		tail := len(a.blks) - 1
+		loop := a.blks[tail]
+		a.blks = a.blks[:tail]
+		a.vs = a.vs[:0]
+
+		info := a.blockInfoAt(loop.ID())
+		for v := range info.liveIns {
+			if a.phiBlk(v.ID()) != loop {
+				a.vs = append(a.vs, v)
+				info.liveOuts[v] = struct{}{}
+			}
 		}
-		info := a.blockInfoAt(blk.ID())
-		if !info.lastUses.Contains(v) {
-			continue
+
+		cn := loop.LoopNestingForestChildren()
+		for i := 0; i < cn; i++ {
+			child := loop.LoopNestingForestChild(i)
+			childID := child.ID()
+			childInfo := a.blockInfoAt(childID)
+			for _, v := range a.vs {
+				childInfo.liveIns[v] = struct{}{}
+				childInfo.liveOuts[v] = struct{}{}
+			}
+			if child.LoopHeader() {
+				a.blks = append(a.blks, child)
+			}
 		}
-		// TODO: we might want to avoid recursion here.
-		a.upAndMarkStack(blk, v, isPhi, phiDefinedAt, 0)
-	}
-}
-
-// upAndMarkStack is the Algorithm 9.10. in the book named Up_and_Mark_Stack(B, v).
-//
-// We recursively call this, so passing `depth` for debugging.
-func (a *Allocator) upAndMarkStack(b Block, v VReg, isPhi bool, phiDefinedAt Block, depth int) {
-	if wazevoapi.RegAllocLoggingEnabled {
-		fmt.Printf("%supAndMarkStack for %v at %v\n", strings.Repeat("\t", depth), v, b.ID())
-	}
-
-	info := a.blockInfoAt(b.ID())
-	if _, ok := info.defs[v]; ok && !isPhi {
-		return // Defined in this block, so no need to go further climbing up.
-	}
-	// v must be in liveIns.
-	if _, ok := info.liveIns[v]; ok {
-		return // But this case, it is already visited. (maybe by, for example, sibling blocks).
-	}
-	if wazevoapi.RegAllocLoggingEnabled {
-		fmt.Printf("%sadding %v live-in at block[%d]\n", strings.Repeat("\t", depth), v, b.ID())
-	}
-
-	// Now we can safely mark v as a part of live-in
-	info.liveIns[v] = struct{}{}
-
-	// Plus if this is this block has the definition of this phi, we can stop climbing up.
-	if b == phiDefinedAt {
-		return
-	}
-
-	preds := b.Preds()
-	if preds == 0 {
-		panic(fmt.Sprintf("BUG: block has no predecessors while requiring live-in: blk%d", b.ID()))
-	}
-
-	// and climb up the CFG.
-	for i := 0; i < preds; i++ {
-		pred := b.Pred(i)
-		if wazevoapi.RegAllocLoggingEnabled {
-			fmt.Printf("%sadding %v live-out at block[%d]\n", strings.Repeat("\t", depth+1), v, pred.ID())
-		}
-		a.blockInfoAt(pred.ID()).liveOuts[v] = struct{}{}
-		a.upAndMarkStack(pred, v, isPhi, phiDefinedAt, depth+1)
 	}
 }

@@ -362,10 +375,11 @@ func (a *Allocator) buildLiveRangesForNonReals(info *blockInfo) {
 			// v is defined here and live-out, so it is live-through.
 			end = math.MaxInt32
 		} else {
-			if end = info.lastUses.Lookup(v); end == -1 {
+			end = info.lastUses.Lookup(v)
+			if end == -1 {
 				// This case the defined value is not used at all.
 				end = defPos
-			} // Otherwise v is killed at defPos.
+			}
 		}
 		n := a.getOrAllocateNode(v)
 		intervalNode := info.intervalMng.insert(n, defPos, end)
@@ -398,8 +412,17 @@ func (a *Allocator) buildLiveRangesForReals(info *blockInfo) {
 					a.regInfo.RealRegName(r), len(defs), len(uses),
 				),
 			)
+		} else if len(uses) == 0 {
+			continue
 		}

+		sort.Slice(uses, func(i, j int) bool {
+			return uses[i] < uses[j]
+		})
+		sort.Slice(defs, func(i, j int) bool {
+			return defs[i] < defs[j]
+		})
+
 		for i := range uses {
 			n := a.allocateNode()
 			n.r = r
@@ -514,17 +537,6 @@ func (a *Allocator) allocateNode() (n *node) {
 	return
 }

-func (i *blockInfo) addRealRegUsage(v VReg, pc programCounter) {
-	id := v.ID()
-	defs := i.realRegDefs[id]
-	if len(defs) == 0 {
-		// If the definition not found yet but used, this must be a function preamble,
-		// so we let's assume it is defined at the beginning.
-		i.realRegDefs[id] = append(i.realRegDefs[id], 0)
-	}
-	i.realRegUses[id] = append(i.realRegUses[id], pc)
-}
-
 // Format is for debugging.
 func (i *blockInfo) Format(ri *RegisterInfo) string {
 	var buf strings.Builder
--- a/internal/engine/wazevo/backend/regalloc/regalloc_test.go
+++ b/internal/engine/wazevo/backend/regalloc/regalloc_test.go
@@ -1,6 +1,8 @@
 package regalloc

 import (
+	"fmt"
+	"sort"
 	"testing"

 	"github.com/tetratelabs/wazero/internal/testing/require"
@@ -24,7 +26,7 @@ func makeVRegTable(vregs map[VReg]programCounter) (table VRegTable) {
 func TestAllocator_livenessAnalysis(t *testing.T) {
 	const realRegID, realRegID2 = 50, 100
 	realReg, realReg2 := FromRealReg(realRegID, RegTypeInt), FromRealReg(realRegID2, RegTypeInt)
-	const phiVReg = 12345
+	phiVReg := VReg(12345).SetRegType(RegTypeInt)
 	for _, tc := range []struct {
 		name  string
 		setup func() Function
@@ -47,7 +49,32 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 				},
 			},
 		},
-
+		{
+			name: "single block with real reg",
+			setup: func() Function {
+				realVReg := FromRealReg(10, RegTypeInt)
+				param := VReg(1)
+				ret := VReg(2)
+				blk := newMockBlock(0,
+					newMockInstr().def(param).use(realVReg),
+					newMockInstr().def(ret).use(param, param),
+					newMockInstr().def(realVReg).use(ret),
+				).entry()
+				blk.blockParam(param)
+				return newMockFunction(blk)
+			},
+			exp: map[int]*blockInfo{
+				0: {
+					defs: map[VReg]programCounter{1: 1, 2: pcDefOffset + pcStride},
+					lastUses: makeVRegTable(map[VReg]programCounter{
+						1: pcStride + pcUseOffset,
+						2: pcStride*2 + pcUseOffset,
+					}),
+					realRegUses: [vRegIDReservedForRealNum][]programCounter{10: {0}},
+					realRegDefs: [vRegIDReservedForRealNum][]programCounter{10: {pcDefOffset + pcStride*2}},
+				},
+			},
+		},
 		{
 			name: "straight",
 			// b0 -> b1 -> b2
@@ -168,7 +195,7 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 					liveOuts:    map[VReg]struct{}{1000: {}},
 					lastUses:    makeVRegTable(map[VReg]programCounter{2: pcUseOffset}),
 					realRegUses: [vRegIDReservedForRealNum][]programCounter{realRegID2: {pcUseOffset}},
-					realRegDefs: [vRegIDReservedForRealNum][]programCounter{realRegID2: {0}},
+					realRegDefs: [vRegIDReservedForRealNum][]programCounter{},
 				},
 				3: {
 					liveIns:  map[VReg]struct{}{1000: {}},
@@ -251,6 +278,7 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 				b1 := newMockBlock(1,
 					newMockInstr().def(9999),
 				)
+				b1.blockParam(phiVReg)
 				b2 := newMockBlock(2,
 					newMockInstr().def(100).use(phiVReg, 9999),
 				)
@@ -259,7 +287,9 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 					newMockInstr().use(100),
 				)
 				b4 := newMockBlock(4,
-					newMockInstr().def(phiVReg).use(54321),
+					newMockInstr().def(phiVReg).use(54321).
+						// Make sure this is the PHI defining instruction.
+						asCopy(),
 				)
 				b5 := newMockBlock(
 					4, newMockInstr().use(54321),
@@ -270,7 +300,10 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 				b3.addPred(b2)
 				b4.addPred(b3)
 				b5.addPred(b3)
-				return newMockFunction(b0, b1, b2, b3, b4, b5)
+				b1.loop(b2, b3, b4, b5)
+				f := newMockFunction(b0, b1, b2, b3, b4, b5)
+				f.loopNestingForestRoots(b1)
+				return f
 			},
 			exp: map[int]*blockInfo{
 				0: {
@@ -289,7 +322,7 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 				1: {
 					liveIns:  map[VReg]struct{}{phiVReg: {}},
 					liveOuts: map[VReg]struct{}{phiVReg: {}, 9999: {}},
-					defs:     map[VReg]programCounter{9999: pcDefOffset},
+					defs:     map[VReg]programCounter{phiVReg: 0, 9999: pcDefOffset},
 					lastUses: makeVRegTable(map[VReg]programCounter{}),
 				},
 				2: {
@@ -312,7 +345,61 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "multiple pass alive",
+			setup: func() Function {
+				v := VReg(9999)
+				b0 := newMockBlock(0, newMockInstr().def(v)).entry()

+				b1, b2, b3, b4, b5, b6 := newMockBlock(1), newMockBlock(2),
+					newMockBlock(3, newMockInstr().use(v)),
+					newMockBlock(4), newMockBlock(5), newMockBlock(6)
+
+				b1.addPred(b0)
+				b4.addPred(b0)
+				b2.addPred(b1)
+				b5.addPred(b2)
+				b2.addPred(b5)
+				b6.addPred(b2)
+				b3.addPred(b6)
+				b3.addPred(b4)
+				f := newMockFunction(b0, b1, b2, b4, b5, b6, b3)
+				f.loopNestingForestRoots(b2)
+				return f
+			},
+			exp: map[int]*blockInfo{
+				0: {
+					liveOuts: map[VReg]struct{}{9999: {}},
+					defs:     map[VReg]programCounter{9999: pcDefOffset},
+					lastUses: makeVRegTable(nil),
+				},
+				1: {
+					liveIns:  map[VReg]struct{}{9999: {}},
+					liveOuts: map[VReg]struct{}{9999: {}},
+					lastUses: makeVRegTable(nil),
+				},
+				2: {
+					liveIns:  map[VReg]struct{}{9999: {}},
+					liveOuts: map[VReg]struct{}{9999: {}},
+					lastUses: makeVRegTable(nil),
+				},
+				3: {
+					liveIns:  map[VReg]struct{}{9999: {}},
+					lastUses: makeVRegTable(map[VReg]programCounter{9999: pcUseOffset}),
+				},
+				4: {
+					liveIns:  map[VReg]struct{}{9999: {}},
+					liveOuts: map[VReg]struct{}{9999: {}},
+					lastUses: makeVRegTable(nil),
+				},
+				5: {lastUses: makeVRegTable(nil)},
+				6: {
+					liveIns:  map[VReg]struct{}{9999: {}},
+					liveOuts: map[VReg]struct{}{9999: {}},
+					lastUses: makeVRegTable(nil),
+				},
+			},
+		},
 		{
 			//           -----+
 			//           v    |
@@ -321,10 +408,14 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 			//      +----+
 			name: "Fig. 9.2 in paper",
 			setup: func() Function {
-				b0 := newMockBlock(0, newMockInstr().def(99999)).entry()
+				b0 := newMockBlock(0,
+					newMockInstr().def(99999),
+					newMockInstr().def(phiVReg).use(111).asCopy(),
+				).entry()
 				b1 := newMockBlock(1, newMockInstr().use(99999))
-				b2 := newMockBlock(2)
-				b3 := newMockBlock(3)
+				b1.blockParam(phiVReg)
+				b2 := newMockBlock(2, newMockInstr().def(88888).use(phiVReg, phiVReg))
+				b3 := newMockBlock(3, newMockInstr().def(phiVReg).use(88888).asCopy())
 				b4 := newMockBlock(4)
 				b1.addPred(b0)
 				b1.addPred(b2)
@@ -332,145 +423,86 @@ func TestAllocator_livenessAnalysis(t *testing.T) {
 				b2.addPred(b3)
 				b3.addPred(b2)
 				b4.addPred(b3)
-				return newMockFunction(b0, b1, b2, b3, b4)
+
+				b1.loop(b2)
+				b2.loop(b3)
+				f := newMockFunction(b0, b1, b2, b3, b4)
+				f.loopNestingForestRoots(b1)
+				return f
 			},
 			exp: map[int]*blockInfo{
 				0: {
-					defs:     map[VReg]programCounter{99999: pcDefOffset},
-					liveOuts: map[VReg]struct{}{99999: {}},
-					lastUses: makeVRegTable(nil),
+					defs:     map[VReg]programCounter{99999: pcDefOffset, phiVReg: pcStride + pcDefOffset},
+					liveOuts: map[VReg]struct{}{99999: {}, phiVReg: {}},
+					liveIns:  map[VReg]struct{}{111: {}},
+					lastUses: makeVRegTable(map[VReg]programCounter{111: pcStride + pcUseOffset}),
 				},
 				1: {
-					liveIns:  map[VReg]struct{}{99999: {}},
-					liveOuts: map[VReg]struct{}{99999: {}},
+					defs:     map[VReg]programCounter{phiVReg: 0},
+					liveIns:  map[VReg]struct{}{99999: {}, phiVReg: {}},
+					liveOuts: map[VReg]struct{}{99999: {}, phiVReg: {}},
 					lastUses: makeVRegTable(map[VReg]programCounter{99999: pcUseOffset}),
 				},
 				2: {
-					liveIns:  map[VReg]struct{}{99999: {}},
-					liveOuts: map[VReg]struct{}{99999: {}},
-					lastUses: makeVRegTable(nil),
+					liveIns:  map[VReg]struct{}{99999: {}, phiVReg: {}},
+					liveOuts: map[VReg]struct{}{99999: {}, 88888: {}, phiVReg: {}},
+					defs:     map[VReg]programCounter{88888: pcDefOffset},
+					lastUses: makeVRegTable(map[VReg]programCounter{phiVReg: pcUseOffset}),
 				},
 				3: {
-					liveIns:  map[VReg]struct{}{99999: {}},
-					liveOuts: map[VReg]struct{}{99999: {}},
-					lastUses: makeVRegTable(nil),
+					liveIns:  map[VReg]struct{}{99999: {}, phiVReg: {}, 88888: {}},
+					liveOuts: map[VReg]struct{}{99999: {}, phiVReg: {}},
+					defs:     map[VReg]programCounter{phiVReg: pcDefOffset},
+					lastUses: makeVRegTable(map[VReg]programCounter{88888: pcUseOffset}),
 				},
 				4: {
 					lastUses: makeVRegTable(nil),
 				},
 			},
 		},
-
-		//      2
-		//      ^              +----+
-		//      |              v    |
-		// 0 -> 1 -> 3 -> 4 -> 5 -> 6 -> 9
-		//      ^    |         ^         |
-		//      |    v         |         |
-		//      |    7 -> 8 ---+         |
-		//      |    ^    |              |
-		//      |    +----+              |
-		//      +------------------------+
-		{
-			name: "Fig. 9.1 in paper",
-			setup: func() Function {
-				b0 := newMockBlock(0).entry()
-				b1 := newMockBlock(1)
-				b2 := newMockBlock(2)
-				b3 := newMockBlock(3,
-					newMockInstr().def(100),
-				)
-				b4 := newMockBlock(4)
-				b5 := newMockBlock(5,
-					newMockInstr().use(100),
-				)
-				b6 := newMockBlock(6)
-				b7 := newMockBlock(7)
-				b8 := newMockBlock(8)
-				b9 := newMockBlock(9)
-
-				b1.addPred(b0)
-				b1.addPred(b9)
-
-				b2.addPred(b1)
-
-				b3.addPred(b1)
-
-				b4.addPred(b3)
-
-				b5.addPred(b4)
-				b5.addPred(b6)
-				b5.addPred(b8)
-
-				b6.addPred(b5)
-
-				b7.addPred(b3)
-				b7.addPred(b8)
-
-				b8.addPred(b7)
-
-				b9.addPred(b6)
-				return newMockFunction(b0, b1, b2, b3, b4, b7, b8, b5, b6, b9)
-			},
-			exp: map[int]*blockInfo{
-				0: {
-					lastUses: makeVRegTable(nil),
-				},
-				1: {
-					lastUses: makeVRegTable(nil),
-				},
-				2: {
-					lastUses: makeVRegTable(nil),
-				},
-				3: {
-					defs:     map[VReg]programCounter{100: pcDefOffset},
-					liveOuts: map[VReg]struct{}{100: {}},
-					lastUses: makeVRegTable(nil),
-				},
-				4: {
-					liveIns:  map[VReg]struct{}{100: {}},
-					liveOuts: map[VReg]struct{}{100: {}},
-					lastUses: makeVRegTable(nil),
-				},
-				5: {
-					liveIns:  map[VReg]struct{}{100: {}},
-					liveOuts: map[VReg]struct{}{100: {}},
-					lastUses: makeVRegTable(map[VReg]programCounter{100: pcUseOffset}),
-				},
-				6: {
-					liveIns:  map[VReg]struct{}{100: {}},
-					liveOuts: map[VReg]struct{}{100: {}},
-					lastUses: makeVRegTable(nil),
-				},
-				7: {
-					liveIns:  map[VReg]struct{}{100: {}},
-					liveOuts: map[VReg]struct{}{100: {}},
-					lastUses: makeVRegTable(nil),
-				},
-				8: {
-					liveIns:  map[VReg]struct{}{100: {}},
-					liveOuts: map[VReg]struct{}{100: {}},
-					lastUses: makeVRegTable(nil),
-				},
-				9: {
-					lastUses: makeVRegTable(nil),
-				},
-			},
-		},
 	} {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 			f := tc.setup()
-			a := NewAllocator(&RegisterInfo{})
+			a := NewAllocator(&RegisterInfo{
+				RealRegName: func(r RealReg) string {
+					return fmt.Sprintf("r%d", r)
+				},
+			})
 			a.livenessAnalysis(f)
 			for blockID := range a.blockInfos {
-				actual := a.blockInfos[blockID]
-				exp := tc.exp[blockID]
-				initMapInInfo(exp)
-				saved := actual.intervalMng
-				actual.intervalMng = nil // Don't compare intervalManager.
-				require.Equal(t, exp, actual, "\n[exp for block[%d]]\n%v\n[actual for block[%d]]\n%v", blockID, exp, blockID, actual)
-				actual.intervalMng = saved
+				t.Run(fmt.Sprintf("block_id=%d", blockID), func(t *testing.T) {
+					actual := a.blockInfos[blockID]
+					exp := tc.exp[blockID]
+					initMapInInfo(exp)
+					fmt.Printf("\n[exp for block[%d]]\n%v\n[actual for block[%d]]\n%v\n",
+						blockID, exp.Format(a.regInfo), blockID, actual.Format(a.regInfo))
+
+					require.Equal(t, exp.liveOuts, actual.liveOuts, "live outs")
+					require.Equal(t, exp.liveIns, actual.liveIns, "live ins")
+					require.Equal(t, exp.defs, actual.defs, "defs")
+					for i := range exp.realRegUses {
+						_exp, _actual := exp.realRegUses[i], actual.realRegUses[i]
+						sort.Slice(_exp, func(i, j int) bool {
+							return _exp[i] < _exp[j]
+						})
+						sort.Slice(_actual, func(i, j int) bool {
+							return _actual[i] < _actual[j]
+						})
+						require.Equal(t, _exp, _actual, "real reg use[%d]", i)
+					}
+					for i := range exp.realRegDefs {
+						_exp, _actual := exp.realRegDefs[i], actual.realRegDefs[i]
+						sort.Slice(_exp, func(i, j int) bool {
+							return _exp[i] < _exp[j]
+						})
+						sort.Slice(_actual, func(i, j int) bool {
+							return _actual[i] < _actual[j]
+						})
+						require.Equal(t, _exp, _actual, "real defs[%d]", i)
+					}
+					require.Equal(t, exp.lastUses, actual.lastUses, "last uses")
+				})
 			}

 			// Sanity check: buildLiveRanges should not panic.
--- a/internal/engine/wazevo/ssa/basic_block.go
+++ b/internal/engine/wazevo/ssa/basic_block.go
@@ -52,14 +52,30 @@ type BasicBlock interface {

 	// Valid is true if this block is still valid even after optimizations.
 	Valid() bool
+
 	// BeginPredIterator returns the first predecessor of this block.
 	BeginPredIterator() BasicBlock
+
 	// NextPredIterator returns the next predecessor of this block.
 	NextPredIterator() BasicBlock
+
 	// Preds returns the number of predecessors of this block.
 	Preds() int
+
 	// Pred returns the i-th predecessor of this block.
 	Pred(i int) BasicBlock
+
+	// Succs returns the number of successors of this block.
+	Succs() int
+
+	// Succ returns the i-th successor of this block.
+	Succ(i int) BasicBlock
+
+	// LoopHeader returns true if this block is a loop header.
+	LoopHeader() bool
+
+	// LoopNestingForestChildren returns the children of this block in the loop nesting forest.
+	LoopNestingForestChildren() []BasicBlock
 }

 type (
@@ -93,6 +109,10 @@ type (
 		// This is modified during the subPassLoopDetection pass.
 		loopHeader bool

+		// loopNestingForestChildren holds the children of this block in the loop nesting forest.
+		// Non-empty only if this block is a loop header (i.e. loopHeader=true).
+		loopNestingForestChildren []BasicBlock
+
 		// reversePostOrder is used to sort all the blocks in the function in reverse post order.
 		// This is used in builder.LayoutBlocks.
 		reversePostOrder int
@@ -234,6 +254,16 @@ func (bb *basicBlock) Pred(i int) BasicBlock {
 	return bb.preds[i].blk
 }

+// Succs implements BasicBlock.Succs.
+func (bb *basicBlock) Succs() int {
+	return len(bb.success)
+}
+
+// Succ implements BasicBlock.Succ.
+func (bb *basicBlock) Succ(i int) BasicBlock {
+	return bb.success[i]
+}
+
 // Root implements BasicBlock.Root.
 func (bb *basicBlock) Root() *Instruction {
 	return bb.rootInstr
@@ -256,6 +286,7 @@ func resetBasicBlock(bb *basicBlock) {
 	bb.unknownValues = make(map[Variable]Value)
 	bb.lastDefinitions = make(map[Variable]Value)
 	bb.reversePostOrder = -1
+	bb.loopNestingForestChildren = bb.loopNestingForestChildren[:0]
 }

 // addPred adds a predecessor to this block specified by the branch instruction.
@@ -342,3 +373,13 @@ func (bb *basicBlock) validate(b *builder) {
 func (bb *basicBlock) String() string {
 	return strconv.Itoa(int(bb.id))
 }
+
+// LoopNestingForestChildren implements BasicBlock.LoopNestingForestChildren.
+func (bb *basicBlock) LoopNestingForestChildren() []BasicBlock {
+	return bb.loopNestingForestChildren
+}
+
+// LoopHeader implements BasicBlock.LoopHeader.
+func (bb *basicBlock) LoopHeader() bool {
+	return bb.loopHeader
+}
--- a/internal/engine/wazevo/ssa/builder.go
+++ b/internal/engine/wazevo/ssa/builder.go
@@ -121,6 +121,9 @@ type Builder interface {

 	// SetCurrentSourceOffset sets the current source offset. The incoming instruction will be annotated with this offset.
 	SetCurrentSourceOffset(line SourceOffset)
+
+	// LoopNestingForestRoots returns the roots of the loop nesting forest.
+	LoopNestingForestRoots() []BasicBlock
 }

 // NewBuilder returns a new Builder implementation.
@@ -167,6 +170,9 @@ type builder struct {
 	// The index is blockID of the BasicBlock.
 	dominators []*basicBlock

+	// loopNestingForestRoots are the roots of the loop nesting forest.
+	loopNestingForestRoots []BasicBlock
+
 	// The followings are used for optimization passes/deterministic compilation.
 	instStack                      []*Instruction
 	blkVisited                     map[*basicBlock]int
@@ -208,6 +214,7 @@ func (b *builder) Init(s *Signature) {
 	b.blkStack = b.blkStack[:0]
 	b.blkStack2 = b.blkStack2[:0]
 	b.dominators = b.dominators[:0]
+	b.loopNestingForestRoots = b.loopNestingForestRoots[:0]

 	for i := 0; i < b.basicBlocksPool.Allocated(); i++ {
 		blk := b.basicBlocksPool.View(i)
@@ -249,6 +256,7 @@ func (b *builder) AnnotateValue(value Value, a string) {
 // AllocateInstruction implements Builder.AllocateInstruction.
 func (b *builder) AllocateInstruction() *Instruction {
 	instr := b.instructionsPool.Allocate()
+	instr.id = b.instructionsPool.Allocated()
 	return instr
 }

@@ -827,12 +835,6 @@ func (b *builder) LayoutBlocks() {
 			bs = append(bs, blk.Name())
 		}
 		fmt.Println("ordered blocks: ", strings.Join(bs, ", "))
-		bs = bs[:0]
-		for visited := range b.blkVisited {
-			bs = append(bs, visited.Name())
-		}
-		sort.Slice(bs, func(i, j int) bool { return bs[i] < bs[j] })
-		fmt.Println("visited blocks: ", strings.Join(bs, ", "))
 	}

 	if wazevoapi.SSAValidationEnabled {
@@ -844,6 +846,9 @@ func (b *builder) LayoutBlocks() {
 		}
 	}

+	// Critical edges are split, so we fix the loop nesting forest.
+	buildLoopNestingForest(b)
+
 	// Reuse the stack for the next iteration.
 	b.blkStack2 = uninsertedTrampolines[:0]

@@ -966,6 +971,11 @@ func (b *builder) splitCriticalEdge(pred, succ *basicBlock, predInfo *basicBlock
 	// where trampoline is a new basic block which is created to split the critical edge.

 	trampoline := b.allocateBasicBlock()
+	if int(trampoline.id) >= len(b.dominators) {
+		b.dominators = append(b.dominators, make([]*basicBlock, trampoline.id+1)...)
+	}
+	b.dominators[trampoline.id] = pred
+
 	originalBranch := predInfo.branch

 	// Replace originalBranch with the newBranch.
@@ -1034,3 +1044,8 @@ func (b *builder) InsertUndefined() {
 	instr.opcode = OpcodeUndefined
 	b.InsertInstruction(instr)
 }
+
+// LoopNestingForestRoots implements Builder.LoopNestingForestRoots.
+func (b *builder) LoopNestingForestRoots() []BasicBlock {
+	return b.loopNestingForestRoots
+}
--- a/internal/engine/wazevo/ssa/builder_test.go
+++ b/internal/engine/wazevo/ssa/builder_test.go
@@ -341,7 +341,7 @@ func TestBuilder_LayoutBlocks(t *testing.T) {
 				b.Seal(b2)
 				b.Seal(b3)
 			},
-			exp: []BasicBlockID{0, 2, 1, 3},
+			exp: []BasicBlockID{0, 1, 2, 3},
 		},
 		{
 			name: "loop towards loop header in fallthrough",
@@ -462,7 +462,7 @@ func TestBuilder_LayoutBlocks(t *testing.T) {
 				b.Seal(b5)
 			},
 			// The trampoline 6 is placed right after 4, which is the hot path of the loop.
-			exp: []BasicBlockID{0, 1, 2, 3, 4, 6, 5},
+			exp: []BasicBlockID{0, 1, 3, 2, 4, 6, 5},
 		},
 		{
 			name: "multiple critical edges",
--- a/internal/engine/wazevo/ssa/instructions.go
+++ b/internal/engine/wazevo/ssa/instructions.go
@@ -16,6 +16,8 @@ type Opcode uint32
 // for all instructions, and therefore each field has different meaning
 // depending on Opcode.
 type Instruction struct {
+	// id is the unique ID of this instruction. IDs ascend in allocation order, which follows the program order.
+	id         int
 	opcode     Opcode
 	u1, u2     uint64
 	v          Value
--- a/internal/engine/wazevo/ssa/pass.go
+++ b/internal/engine/wazevo/ssa/pass.go
@@ -2,6 +2,7 @@ package ssa

 import (
 	"fmt"
+	"sort"

 	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
 )
@@ -13,6 +14,7 @@ import (
 // Note that passes suffixed with "Opt" are the optimization passes, meaning that they edit the instructions and blocks
 // while the other passes are not, like passEstimateBranchProbabilities does not edit them, but only calculates the additional information.
 func (b *builder) RunPasses() {
+	passSortSuccessors(b)
 	passDeadBlockEliminationOpt(b)
 	passRedundantPhiEliminationOpt(b)
 	// The result of passCalculateImmediateDominators will be used by various passes below.
@@ -350,3 +352,24 @@ func passNopInstElimination(b *builder) {
 		}
 	}
 }
+
+// passSortSuccessors sorts the successors of each block in the natural program order.
+func passSortSuccessors(b *builder) {
+	for i := 0; i < b.basicBlocksPool.Allocated(); i++ {
+		blk := b.basicBlocksPool.View(i)
+		sort.SliceStable(blk.success, func(i, j int) bool {
+			iBlk, jBlk := blk.success[i], blk.success[j]
+			if jBlk.ReturnBlock() {
+				return true
+			}
+			if iBlk.ReturnBlock() {
+				return false
+			}
+			iRoot, jRoot := iBlk.rootInstr, jBlk.rootInstr
+			if iRoot == nil || jRoot == nil { // For testing.
+				return true
+			}
+			return iBlk.rootInstr.id < jBlk.rootInstr.id
+		})
+	}
+}
--- a/internal/engine/wazevo/ssa/pass_cfg.go
+++ b/internal/engine/wazevo/ssa/pass_cfg.go
@@ -1,5 +1,12 @@
 package ssa

+import (
+	"fmt"
+	"strings"
+
+	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
+)
+
 // passCalculateImmediateDominators calculates immediate dominators for each basic block.
 // The result is stored in b.dominators. This make it possible for the following passes to
 // use builder.isDominatedBy to check if a block is dominated by another block.
@@ -156,3 +163,39 @@ func subPassLoopDetection(b *builder) {
 		}
 	}
 }
+
+// buildLoopNestingForest builds the loop nesting forest for the function.
+// This must be called after critical edge splitting since it relies on the final CFG.
+func buildLoopNestingForest(b *builder) {
+	ent := b.entryBlk()
+	doms := b.dominators
+	for _, blk := range b.reversePostOrderedBasicBlocks {
+		n := doms[blk.id]
+		for !n.loopHeader && n != ent {
+			n = doms[n.id]
+		}
+
+		if n == ent && blk.loopHeader {
+			b.loopNestingForestRoots = append(b.loopNestingForestRoots, blk)
+		} else if n == ent {
+		} else if n.loopHeader {
+			n.loopNestingForestChildren = append(n.loopNestingForestChildren, blk)
+		}
+	}
+
+	if wazevoapi.SSALoggingEnabled {
+		for _, root := range b.loopNestingForestRoots {
+			printLoopNestingForest(root.(*basicBlock), 0)
+		}
+	}
+}
+
+func printLoopNestingForest(root *basicBlock, depth int) {
+	fmt.Println(strings.Repeat("\t", depth), "loop nesting forest root:", root.ID())
+	for _, child := range root.loopNestingForestChildren {
+		fmt.Println(strings.Repeat("\t", depth+1), "child:", child.ID())
+		if child.LoopHeader() {
+			printLoopNestingForest(child.(*basicBlock), depth+2)
+		}
+	}
+}
--- a/internal/engine/wazevo/ssa/pass_cfg_test.go
+++ b/internal/engine/wazevo/ssa/pass_cfg_test.go
@@ -1,6 +1,7 @@
 package ssa

 import (
+	"sort"
 	"testing"

 	"github.com/tetratelabs/wazero/internal/testing/require"
@@ -505,6 +506,22 @@ func TestBuilder_passCalculateImmediateDominators(t *testing.T) {
 			},
 			expLoops: map[BasicBlockID]struct{}{1: {}, 6: {}},
 		},
+		{
+			name: "merge after loop",
+			edges: edgesCase{
+				0: {3, 1},
+				1: {2},
+				2: {1, 3},
+				3: {4},
+			},
+			expDoms: map[BasicBlockID]BasicBlockID{
+				1: 0,
+				2: 1,
+				3: 0,
+				4: 3,
+			},
+			expLoops: map[BasicBlockID]struct{}{1: {}},
+		},
 	} {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
@@ -524,3 +541,142 @@ func TestBuilder_passCalculateImmediateDominators(t *testing.T) {
 		})
 	}
 }
+
+func TestBuildLoopNestingForest(t *testing.T) {
+	type expLoopNestingForest struct {
+		roots    []BasicBlockID
+		children map[BasicBlockID][]BasicBlockID
+	}
+
+	for _, tc := range []struct {
+		name   string
+		edges  edgesCase
+		expLNF expLoopNestingForest
+	}{
+		{
+			name: "linear",
+			// 0 -> 1 -> 2 -> 3 -> 4
+			edges: edgesCase{
+				0: {1},
+				1: {2},
+				2: {3},
+				3: {4},
+			},
+		},
+		{
+			name: "loop",
+			// 0 -> 1 -> 2
+			//      ^    |
+			//      |    v
+			//      |--- 3
+			edges: edgesCase{
+				0: {1},
+				1: {2},
+				2: {3},
+				3: {1},
+			},
+			expLNF: expLoopNestingForest{
+				roots: []BasicBlockID{1},
+				children: map[BasicBlockID][]BasicBlockID{
+					1: {2, 3},
+				},
+			},
+		},
+		{
+			name: "two independent loops",
+			//      0
+			//      |
+			//      v
+			//      1 --> 2 --> 3
+			//      ^           |
+			//      v           v
+			//      4 <---------5
+			//      |
+			//      v
+			//      6 --> 7 --> 8
+			//      ^           |
+			//      v           v
+			//      9 <---------10
+			edges: map[BasicBlockID][]BasicBlockID{
+				0:  {1},
+				1:  {2, 4},
+				2:  {3},
+				3:  {5},
+				4:  {1, 6},
+				5:  {4},
+				6:  {7, 9},
+				7:  {8},
+				8:  {10},
+				9:  {6},
+				10: {9},
+			},
+			expLNF: expLoopNestingForest{
+				roots: []BasicBlockID{1},
+				children: map[BasicBlockID][]BasicBlockID{
+					1: {2, 3, 4, 5, 6},
+					6: {7, 8, 9, 10},
+				},
+			},
+		},
+		{
+			//
+			//                  +-----+
+			//                  |     |
+			//                  v     |
+			//    0 ---> 1 ---> 2 --> 3 ---> 4
+			//           ^      |
+			//           |      |
+			//           +------+
+			//
+			name: "Fig. 9.2", // in "SSA-based Compiler Design".
+			edges: map[BasicBlockID][]BasicBlockID{
+				0: {1},
+				1: {2},
+				2: {1, 3},
+				3: {2, 4},
+			},
+			expLNF: expLoopNestingForest{
+				roots: []BasicBlockID{1},
+				children: map[BasicBlockID][]BasicBlockID{
+					1: {2},
+					2: {3, 4},
+				},
+			},
+		},
+	} {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			b := constructGraphFromEdges(tc.edges)
+			// buildLoopNestingForest requires passCalculateImmediateDominators to be done.
+			passCalculateImmediateDominators(b)
+			buildLoopNestingForest(b)
+
+			blocks := map[BasicBlockID]*basicBlock{}
+			for blk := b.blockIteratorBegin(); blk != nil; blk = b.blockIteratorNext() {
+				blocks[blk.id] = blk
+			}
+
+			// Check the result of buildLoopNestingForest.
+			var forestRoots []BasicBlockID
+			for _, root := range b.loopNestingForestRoots {
+				forestRoots = append(forestRoots, root.(*basicBlock).id)
+			}
+			sort.Slice(forestRoots, func(i, j int) bool {
+				return forestRoots[i] < forestRoots[j]
+			})
+			require.Equal(t, tc.expLNF.roots, forestRoots)
+
+			for expBlkID, blk := range blocks {
+				expChildren := tc.expLNF.children[expBlkID]
+				var actualChildren []BasicBlockID
+				for _, child := range blk.loopNestingForestChildren {
+					actualChildren = append(actualChildren, child.(*basicBlock).id)
+				}
+				sort.Slice(actualChildren, func(i, j int) bool {
+					return actualChildren[i] < actualChildren[j]
+				})
+				require.Equal(t, expChildren, actualChildren, "block %d", expBlkID)
+			}
+		})
+	}
+}
--- a/internal/engine/wazevo/ssa/ssa_test.go
+++ b/internal/engine/wazevo/ssa/ssa_test.go
@@ -1,8 +1,6 @@
 package ssa

-import (
-	"sort"
-)
+import "sort"

 // edgesCase is a map from BasicBlockID to its successors.
 type edgesCase map[BasicBlockID][]BasicBlockID
@@ -34,14 +32,10 @@ func constructGraphFromEdges(edges edgesCase) (b *builder) {
 		blocks[blk.id] = blk
 	}

-	// To have a consistent behavior in test, we sort the pairs.
+	// To get consistent behavior in tests, we sort the pairs by fromID.
 	sort.Slice(pairs, func(i, j int) bool {
 		xf, yf := pairs[i][0], pairs[j][0]
-		xt, yt := pairs[i][1], pairs[j][1]
-		if xf < yf {
-			return true
-		}
-		return xt < yt
+		return xf < yf
 	})

 	// Add edges.
--- a/internal/engine/wazevo/testcases/testcases.go
+++ b/internal/engine/wazevo/testcases/testcases.go
@@ -327,7 +327,6 @@ var (
 		Module: SingleFunctionModule(i32_v, []byte{
 			wasm.OpcodeLoop, blockSignature_vv,
 			wasm.OpcodeBlock, blockSignature_vv,
-
 			wasm.OpcodeLocalGet, 0,
 			wasm.OpcodeBrIf, 2,
 			wasm.OpcodeEnd,