machine programming – branching ceng331: introduction to computer systems 6 th lecture
DESCRIPTION
Machine Programming – Branching CENG331: Introduction to Computer Systems 6 th Lecture. Instructor: Erol Sahin. Acknowledgement: Most of the slides are adapted from the ones prepared by R.E. Bryant, D.R. O’Hallaron of Carnegie-Mellon Univ. Conditional Branch Example. absdiff: - PowerPoint PPT PresentationTRANSCRIPT
Machine Programming – Branching. CENG331: Introduction to Computer Systems, 6th Lecture
Instructor: Erol Sahin
Acknowledgement: Most of the slides are adapted from the ones prepared by R.E. Bryant, D.R. O’Hallaron of Carnegie-Mellon Univ.
Conditional Branch Example
int absdiff(int x, int y){ int result; if (x > y) { result = x-y; } else { result = y-x; } return result;}
absdiff:pushl %ebpmovl %esp, %ebpmovl 8(%ebp), %edxmovl 12(%ebp), %eaxcmpl %eax, %edxjle .L7subl %eax, %edxmovl %edx, %eax
.L8:leaveret
.L7:subl %edx, %eaxjmp .L8
Body1
Setup
Finish
Body2
Conditional Branch Example (Cont.)int goto_ad(int x, int y){ int result; if (x <= y) goto Else; result = x-y;Exit: return result;Else: result = y-x; goto Exit;}
C allows “goto” as means of transferring control Closer to machine-level
programming style Generally considered bad coding
style
absdiff:pushl %ebpmovl %esp, %ebpmovl 8(%ebp), %edxmovl 12(%ebp), %eaxcmpl %eax, %edxjle .L7subl %eax, %edxmovl %edx, %eax
.L8:leaveret
.L7:subl %edx, %eaxjmp .L8
Conditional Branch Example (Cont.)int goto_ad(int x, int y){ int result; if (x <= y) goto Else; result = x-y;Exit: return result;Else: result = y-x; goto Exit;}
absdiff:pushl %ebpmovl %esp, %ebpmovl 8(%ebp), %edxmovl 12(%ebp), %eaxcmpl %eax, %edxjle .L7subl %eax, %edxmovl %edx, %eax
.L8:leaveret
.L7:subl %edx, %eaxjmp .L8
Conditional Branch Example (Cont.)int goto_ad(int x, int y){ int result; if (x <= y) goto Else; result = x-y;Exit: return result;Else: result = y-x; goto Exit;}
absdiff:pushl %ebpmovl %esp, %ebpmovl 8(%ebp), %edxmovl 12(%ebp), %eaxcmpl %eax, %edxjle .L7subl %eax, %edxmovl %edx, %eax
.L8:leaveret
.L7:subl %edx, %eaxjmp .L8
Conditional Branch Example (Cont.)int goto_ad(int x, int y){ int result; if (x <= y) goto Else; result = x-y;Exit: return result;Else: result = y-x; goto Exit;}
absdiff:pushl %ebpmovl %esp, %ebpmovl 8(%ebp), %edxmovl 12(%ebp), %eaxcmpl %eax, %edxjle .L7subl %eax, %edxmovl %edx, %eax
.L8:leaveret
.L7:subl %edx, %eaxjmp .L8
Conditional Branch Example (Cont.)int goto_ad(int x, int y){ int result; if (x <= y) goto Else; result = x-y;Exit: return result;Else: result = y-x; goto Exit;}
absdiff:pushl %ebpmovl %esp, %ebpmovl 8(%ebp), %edxmovl 12(%ebp), %eaxcmpl %eax, %edxjle .L7subl %eax, %edxmovl %edx, %eax
.L8:leaveret
.L7:subl %edx, %eaxjmp .L8
C Codeval = Test ? Then-Expr : Else-Expr;
Goto Versionnt = !Test;if (nt) goto Else;val = Then-Expr;
Done:. . .
Else: val = Else-Expr; goto Done;
General Conditional Expression Translation
Test is an expression returning an integer: = 0 is interpreted as false, ≠ 0 is interpreted as true
Create separate code regions for then & else expressions
Execute appropriate one
val = x>y ? x-y : y-x;
Conditionals: x86-64absdiff: # x in %edi, y in %esimovl %edi, %eax # eax = xmovl %esi, %edx # edx = ysubl %esi, %eax # eax = x-ysubl %edi, %edx # edx = y-xcmpl %esi, %edi # x:ycmovle %edx, %eax # eax=edx if <=ret
int absdiff( int x, int y){ int result; if (x > y) { result = x-y; } else { result = y-x; } return result;}
Will disappearBlackboard?
Conditionals: x86-64
Conditional move instruction cmovC src, dest Move value from src to dest if condition C holds More efficient than conditional branching (simple control flow) But overhead: both branches are evaluated
absdiff: # x in %edi, y in %esimovl %edi, %eax # eax = xmovl %esi, %edx # edx = ysubl %esi, %eax # eax = x-ysubl %edi, %edx # edx = y-xcmpl %esi, %edi # x:ycmovle %edx, %eax # eax=edx if <=ret
int absdiff( int x, int y){ int result; if (x > y) { result = x-y; } else { result = y-x; } return result;}
C Code
Conditional Move Versionval1 = Then-Expr;val2 = Else-Expr;val1 = val2 if !Test;
General Form with Conditional Move
Both values get computed Overwrite then-value with else-value if condition doesn’t hold Don’t use when:
Then or else expressions have side effects Then and else expressions are too expensive
val = Test ? Then-Expr : Else-Expr;
C Codeint fact_do(int x){ int result = 1; do { result *= x; x = x-1; } while (x > 1);
return result;}
Goto Versionint fact_goto(int x){ int result = 1;loop: result *= x; x = x-1; if (x > 1) goto loop; return result;}
“Do-While” Loop Example
Use backward branch to continue looping Only take branch when “while” condition holds
Goto Versionintfact_goto(int x){ int result = 1;
loop: result *= x; x = x-1; if (x > 1) goto loop;
return result;}
“Do-While” Loop CompilationRegisters:%edx x%eax result
fact_goto:pushl %ebp # Setupmovl %esp,%ebp # Setupmovl $1,%eax # eax = 1movl 8(%ebp),%edx # edx = x
.L11:imull %edx,%eax # result *= xdecl %edx # x--cmpl $1,%edx # Compare x : 1jg .L11 # if > goto loop
movl %ebp,%esp # Finishpopl %ebp # Finishret # Finish
Assembly
Will disappearBlackboard?
Goto Versionintfact_goto(int x){ int result = 1;
loop: result *= x; x = x-1; if (x > 1) goto loop;
return result;}
“Do-While” Loop CompilationRegisters:%edx x%eax result
fact_goto:pushl %ebp # Setupmovl %esp,%ebp # Setupmovl $1,%eax # eax = 1movl 8(%ebp),%edx # edx = x
.L11:imull %edx,%eax # result *= xdecl %edx # x--cmpl $1,%edx # Compare x : 1jg .L11 # if > goto loop
movl %ebp,%esp # Finishpopl %ebp # Finishret # Finish
Assembly
C Codedo Body while (Test);
Goto Versionloop: Body if (Test) goto loop
General “Do-While” Translation
Body:
Test returns an integer: = 0 is interpreted as false, ≠ 0 is interpreted as true
{ Statement1; Statement2; … Statementn;}
C Codeint fact_while(int x){ int result = 1; while (x > 1) {
result *= x; x = x-1; };
return result;}
Goto Version #1int fact_while_goto(int x){ int result = 1;loop: if (!(x > 1)) goto done; result *= x; x = x-1; goto loop;done: return result;}
“While” Loop Example
Is this code equivalent to the do-while version? Must jump out of loop if test fails
C Codeint fact_while(int x){ int result = 1; while (x > 1) { result *= x; x = x-1; }; return result;}
Goto Version #2int fact_while_goto2(int x){ int result = 1; if (!(x > 1)) goto done; loop: result *= x; x = x-1; if (x > 1) goto loop;done: return result;}
Alternative “While” Loop Translation
Historically used by GCC Uses same inner loop as do-
while version Guards loop entry with extra
test
While versionwhile (Test) Body
Do-While Version if (!Test) goto done; do Body while(Test);done:
General “While” Translation
Goto Version if (!Test) goto done;loop: Body if (Test) goto loop;done:
C Codeint fact_while(int x){ int result = 1; while (x > 1) { result *= x; x = x-1; }; return result;}
Goto Versionint fact_while_goto3(int x){ int result = 1; goto middle; loop: result *= x; x = x-1;middle: if (x > 1) goto loop; return result;}
New Style “While” Loop Translation
Recent technique for GCC Both IA32 & x86-64
First iteration jumps over body computation within loop
C Codewhile (Test) Body
Jump-to-Middle While Translation
Avoids duplicating test code Unconditional goto incurs no
performance penalty. “for” loops are compiled in similar fashion
Goto Versiongoto middle;loop: Bodymiddle: if (Test) goto loop;
Goto (Previous) Version if (!Test) goto done;loop: Body if (Test) goto loop;done:
int fact_while(int x){ int result = 1; while (x > 1) { result *= x; x--; }; return result;}
# x in %edx, result in %eax jmp .L34 # goto Middle.L35: # Loop: imull %edx, %eax # result *= x decl %edx # x--.L34: # Middle: cmpl $1, %edx # x:1 jg .L35 # if >, goto Loop
Jump-to-Middle Example
“For” Loop Example: Square-and-Multiply
Algorithm Exploit bit representation: $p = p_0 + 2p_1 + 2^2 p_2 + \dots + 2^{n-1} p_{n-1}$
Gives: $x^p = z_0 \cdot z_1^{2} \cdot (z_2^{2})^{2} \cdots (\dots((z_{n-1}^{2})^{2})\dots)^{2}$
where $z_i = 1$ when $p_i = 0$ and $z_i = x$ when $p_i = 1$
Complexity O(log p)
/* Compute x raised to nonnegative power p */int ipwr_for(int x, unsigned p){
int result;for (result = 1; p != 0; p = p>>1) {
if (p & 0x1) result *= x; x = x*x; } return result;}
n–1 times
Example
$3^{10} = 3^{2} \cdot 3^{8}$
$= 3^{2} \cdot ((3^{2})^{2})^{2}$
ipwr Computation/* Compute x raised to nonnegative power p */int ipwr_for(int x, unsigned p){
int result;for (result = 1; p != 0; p = p>>1) {
if (p & 0x1) result *= x; x = x*x; } return result;}
before iteration | result | x | p   (initially x = 3, p = 10)
1 | 1 | 3 | 10 = $1010_2$
2 | 1 | 9 | 5 = $101_2$
3 | 9 | 81 | 2 = $10_2$
4 | 9 | 6561 | 1 = $1_2$
5 | 59049 | 43046721 | 0
“For” Loop Example
for (Init; Test; Update) Body
int result; for (result = 1; p != 0; p = p>>1) { if (p & 0x1) result *= x; x = x*x; }
General Form
Initresult = 1
Testp != 0
Updatep = p >> 1
Body { if (p & 0x1) result *= x; x = x*x; }
“For” → “While” → “Do-While”
for (Init; Test; Update ) Body
Init;while (Test ) { Body Update ;}
Goto Version Init; if (!Test) goto done;loop: Body Update ; if (Test) goto loop;done:
While VersionFor Version
Do-While Version Init; if (!Test) goto done; do { Body Update ; } while (Test)done:
For-Loop: Compilation #1
for (Init; Test; Update ) Body
Goto Version Init; if (!Test) goto done;loop: Body Update ; if (Test) goto loop;done:
For Version for (result = 1; p != 0; p = p>>1){ if (p & 0x1) result *= x; x = x*x;}
result = 1; if (p == 0) goto done;loop: if (p & 0x1) result *= x; x = x*x; p = p >> 1; if (p != 0) goto loop;done:
“For” → “While” (Jump-to-Middle)
for (Init; Test; Update ) Body
Init;while (Test ) { Body Update ;}
Init; goto middle;loop: Body Update ;middle: if (Test) goto loop;done:
While Version
For Version
Goto Version
For-Loop: Compilation #2
for (Init; Test; Update ) Body
Init; goto middle;loop: Body Update ;middle: if (Test) goto loop;done:
For Version
Goto Version
for (result = 1; p != 0; p = p>>1){ if (p & 0x1) result *= x; x = x*x;}
result = 1;goto middle;loop: if (p & 0x1) result *= x; x = x*x; p = p >> 1;middle: if (p != 0) goto loop;done:
Implementing Loops IA32
All loops translated into form based on “do-while”
x86-64 Also make use of “jump to middle”
Why the difference IA32 compiler developed for machine where all operations costly x86-64 compiler developed for machine where unconditional
branches incur (almost) no overhead
Switch Statement Example
Multiple case labels Here: 5, 6
Fall through cases Here: 2
Missing cases Here: 4
long switch_eg (long x, long y, long z){ long w = 1; switch(x) { case 1: w = y*z; break; case 2: w = y/z; /* Fall Through */ case 3: w += z; break; case 5: case 6: w -= z; break; default: w = 2; } return w;}
Jump Table Structure
Code Block0
Targ0:
Code Block1
Targ1:
Code Block2
Targ2:
Code Blockn–1
Targn-1:
•••
Targ0Targ1Targ2
Targn-1
•••
jtab:
target = JTab[x];goto *target;
switch(x) { case val_0: Block 0 case val_1: Block 1 • • • case val_n-1: Block n–1}
Switch Form
Approximate Translation
Jump Table Jump Targets
Switch Statement Example (IA32)
Setup: switch_eg:pushl %ebp # Setupmovl %esp, %ebp # Setuppushl %ebx # Setupmovl $1, %ebx # w = 1movl 8(%ebp), %edx # edx = xmovl 16(%ebp), %ecx # ecx = zcmpl $6, %edx # x:6ja .L61 # if > goto defaultjmp *.L62(,%edx,4) # goto JTab[x]
long switch_eg(long x, long y, long z){ long w = 1; switch(x) { . . . } return w;}
Will disappearBlackboard?
Switch Statement Example (IA32)
Setup: switch_eg:pushl %ebp # Setupmovl %esp, %ebp # Setuppushl %ebx # Setupmovl $1, %ebx # w = 1movl 8(%ebp), %edx # edx = xmovl 16(%ebp), %ecx # ecx = zcmpl $6, %edx # x:6ja .L61 # if > goto defaultjmp *.L62(,%edx,4) # goto JTab[x]
long switch_eg(long x, long y, long z){ long w = 1; switch(x) { . . . } return w;}
Indirect jump
Jump table.section .rodata .align 4.L62:
.long .L61 # x = 0
.long .L56 # x = 1
.long .L57 # x = 2
.long .L58 # x = 3
.long .L61 # x = 4
.long .L60 # x = 5
.long .L60 # x = 6
Assembly Setup Explanation Table Structure
Each target requires 4 bytes Base address at .L62
JumpingDirect: jmp .L61 Jump target is denoted by label .L61
Indirect: jmp *.L62(,%edx,4) Start of jump table: .L62 Must scale by factor of 4 (labels are 32 bits = 4 bytes on IA32) Fetch target from effective address .L62 + edx*4
Only for 0 ≤ x ≤ 6
.section .rodata .align 4.L62:.long .L61 # x = 0.long .L56 # x = 1.long .L57 # x = 2.long .L58 # x = 3.long .L61 # x = 4.long .L60 # x = 5.long .L60 # x = 6
Jump table
Jump Table
.section .rodata .align 4.L62:.long .L61 # x = 0.long .L56 # x = 1.long .L57 # x = 2.long .L58 # x = 3.long .L61 # x = 4.long .L60 # x = 5.long .L60 # x = 6
Jump table switch(x) { case 1: // .L56 w = y*z; break; case 2: // .L57 w = y/z; /* Fall Through */ case 3: // .L58 w += z; break; case 5: case 6: // .L60 w -= z; break; default: // .L61 w = 2; }
Code Blocks (Partial).L61: // Default case
movl $2, %ebx # w = 2movl %ebx, %eax # Return wpopl %ebxleaveret
.L57: // Case 2:movl 12(%ebp), %eax # ycltd # Div prepidivl %ecx # y/z movl %eax, %ebx # w = y/z
# Fall through.L58: // Case 3:
addl %ecx, %ebx # w+= zmovl %ebx, %eax # Return wpopl %ebxleaveret
switch(x) { . . . case 2: // .L57 w = y/z; /* Fall Through */ case 3: // .L58 w += z; break; . . . default: // .L61 w = 2; }
x86-64 Switch Implementation
.section .rodata .align 8.L62:.quad .L55 # x = 0.quad .L50 # x = 1.quad .L51 # x = 2.quad .L52 # x = 3.quad .L55 # x = 4.quad .L54 # x = 5.quad .L54 # x = 6
Jump Table
Same general idea, adapted to 64-bit code Table entries 64 bits (pointers) Cases use revised code
.L50: // Case 1:movq %rsi, %r8 # w = yimulq %rdx, %r8 # w *= zmovq %r8, %rax # Return wret
switch(x) { case 1: // .L50 w = y*z; break; . . . }
IA32 Object Code Setup
Label .L61 becomes address 0x8048630 Label .L62 becomes address 0x80488dc
08048610 <switch_eg>: . . . 8048622: 77 0c ja 8048630 8048624: ff 24 95 dc 88 04 08 jmp *0x80488dc(,%edx,4)
switch_eg: . . .
ja .L61 # if > goto defaultjmp *.L62(,%edx,4) # goto JTab[x]
Assembly Code
Disassembled Object Code
IA32 Object Code (cont.) Jump Table
Doesn’t show up in disassembled code Can inspect using GDB gdb asm-cntl(gdb) x/7xw 0x80488dc
Examine 7 hexadecimal format “words” (4-bytes each) Use command “help x” to get format documentation
0x80488dc: 0x08048630 0x08048650 0x0804863a 0x08048642 0x08048630 0x08048649 0x08048649
Disassembled Targets 8048630: bb 02 00 00 00 mov $0x2,%ebx 8048635: 89 d8 mov %ebx,%eax 8048637: 5b pop %ebx 8048638: c9 leave 8048639: c3 ret 804863a: 8b 45 0c mov 0xc(%ebp),%eax 804863d: 99 cltd 804863e: f7 f9 idiv %ecx 8048640: 89 c3 mov %eax,%ebx 8048642: 01 cb add %ecx,%ebx 8048644: 89 d8 mov %ebx,%eax 8048646: 5b pop %ebx 8048647: c9 leave 8048648: c3 ret 8048649: 29 cb sub %ecx,%ebx 804864b: 89 d8 mov %ebx,%eax 804864d: 5b pop %ebx 804864e: c9 leave 804864f: c3 ret 8048650: 8b 5d 0c mov 0xc(%ebp),%ebx 8048653: 0f af d9 imul %ecx,%ebx 8048656: 89 d8 mov %ebx,%eax 8048658: 5b pop %ebx 8048659: c9 leave 804865a: c3 ret
Matching Disassembled Targets 8048630: bb 02 00 00 00 mov 8048635: 89 d8 mov 8048637: 5b pop 8048638: c9 leave 8048639: c3 ret 804863a: 8b 45 0c mov 804863d: 99 cltd 804863e: f7 f9 idiv 8048640: 89 c3 mov 8048642: 01 cb add 8048644: 89 d8 mov 8048646: 5b pop 8048647: c9 leave 8048648: c3 ret 8048649: 29 cb sub 804864b: 89 d8 mov 804864d: 5b pop 804864e: c9 leave 804864f: c3 ret 8048650: 8b 5d 0c mov 8048653: 0f af d9 imul 8048656: 89 d8 mov 8048658: 5b pop 8048659: c9 leave 804865a: c3 ret
0x080486300x080486500x0804863a0x080486420x080486300x080486490x08048649
x86-64 Object Code Setup
Label .L55 becomes address 0x0000000000400716 Label .L62 becomes address 0x0000000000400990
0000000000400700 <switch_eg>: . . . 40070d: 77 07 ja 400716 40070f: ff 24 fd 90 09 40 00 jmpq *0x400990(,%rdi,8)
switch_eg: . . .
ja .L55 # if > goto defaultjmp *.L62(,%rdi,8) # goto JTab[x]
Assembly Code
Disassembled Object Code
x86-64 Object Code (cont.) Jump Table
Can inspect using GDB gdb asm-cntl(gdb) x/7xg 0x400990
Examine 7 hexadecimal format “giant words” (8-bytes each) Use command “help x” to get format documentation
0x400990: 0x0000000000400716 0x0000000000400739 0x0000000000400720 0x000000000040072b 0x0000000000400716 0x0000000000400732 0x0000000000400732
Summarizing C Control
if-then-else do-while while, for switch
Assembler Control Conditional jump Conditional move Indirect jump Compiler Must generate assembly code to
implement more complex control
Standard Techniques IA32 loops converted to do-while form x86-64 loops use jump-to-middle Large switch statements use jump tables Sparse switch statements may use
decision trees (not shown)
Conditions in CISC CISC machines generally have condition
code registers
Machine Programming – Procedures and IA32 StackCENG331: Introduction to Computer Systems7th Lecture
Instructor: Erol Sahin
Acknowledgement: Most of the slides are adapted from the ones prepared by R.E. Bryant, D.R. O’Hallaron of Carnegie-Mellon Univ.
IA32 Stack
Region of memory managed with stack discipline
Grows toward lower addresses
Register %esp contains lowest stack address= address of “top” element
Stack Pointer: %esp
Stack GrowsDown
IncreasingAddresses
Stack “Top”
Stack “Bottom”
IA32 Stack: Push
pushl Src Fetch operand at Src Decrement %esp by 4 Write operand at address given
by %esp
Stack GrowsDown
IncreasingAddresses
Stack “Top”
Stack “Bottom”
Stack Pointer: %esp -4
IA32 Stack: Pop
Stack Pointer: %esp
Stack GrowsDown
IncreasingAddresses
Stack “Top”
Stack “Bottom” popl Dest
Read operand at address %esp Increment %esp by 4 Write operand to Dest
+4
Procedure Control Flow Use stack to support procedure call and return Procedure call: call label
Push return address on stack Jump to label
Return address: Address of instruction beyond call Example from disassembly804854e: e8 3d 06 00 00 call 8048b90 <main>
8048553: 50 pushl %eax Return address = 0x8048553
Procedure return: ret Pop address from stack Jump to address
%esp
%eip
%esp
%eip 0x804854e
0x108
0x1080x10c0x110
0x104
0x804854e
0x8048553123
Procedure Call Example
0x1080x10c0x110
123
0x108
call 8048b90
804854e: e8 3d 06 00 00 call 8048b90 <main>8048553: 50 pushl %eax
0x8048b90
0x104
%eip: program counter
%esp
%eip
0x104
%esp
%eip 0x80485910x8048591
0x1040x104
0x1080x10c0x110
0x8048553123
Procedure Return Example
0x1080x10c0x110
123
ret
8048591: c3 ret
0x108
0x8048553
0x8048553
%eip: program counter
Stack-Based Languages Languages that support recursion
e.g., C, Pascal, Java Code must be “Reentrant”
Multiple simultaneous instantiations of single procedure Need some place to store state of each instantiation
Arguments Local variables Return pointer
Stack discipline State for given procedure needed for limited time
From when called to when return Callee returns before caller does
Stack allocated in Frames state for single procedure instantiation
Call Chain Exampleyoo(…){
••who();••}
who(…){
• • •amI();• • •amI();• • •}
amI(…){
••amI();••}
yoo
who
amI
amI
amI
ExampleCall Chain
amI
Procedure amI is recursive
Frame for
proc
Frame Pointer: %ebp
Stack Frames Contents
Local variables Return information Temporary space
Management Space allocated when enter procedure
“Set-up” code Deallocated when return
“Finish” code
Stack Pointer: %esp
PreviousFrame
Stack “Top”
Exampleyoo(…){
••who();••}
yoo
who
amI
amI
amI
amI
yoo%ebp
%esp
Stack
who(…){
• • •amI();• • •amI();• • •}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI(…){
••amI();••}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI
amI(…){
••amI();••}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI
amI
amI(…){
••amI();••}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI
amI
amI
amI(…){
••amI();••}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI
amI
amI(…){
••amI();••}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI
who(…){
• • •amI();• • •amI();• • •}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI(…){
•••••}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
amI
who(…){
• • •amI();• • •amI();• • •}
Exampleyoo
who
amI
amI
amI
amI
yoo
%ebp
%esp
Stack
who
Exampleyoo(…){
••who();••}
yoo
who
amI
amI
amI
amI
yoo%ebp
%esp
Stack
IA32/Linux Stack Frame Current Stack Frame (“Top” to Bottom)
“Argument build:”Parameters for function about to call
Local variablesIf can’t keep in registers
Saved register context Old frame pointer
Caller Stack Frame Return address Pushed by call instruction Arguments for this call
Return Addr
SavedRegisters
+Local
Variables
ArgumentBuild
Old %ebp
Arguments
CallerFrame
Frame pointer%ebp
Stack pointer%esp
Revisiting swap
void swap(int *xp, int *yp) { int t0 = *xp; int t1 = *yp; *xp = t1; *yp = t0;}
int zip1 = 15213;int zip2 = 91125;
void call_swap(){ swap(&zip1, &zip2);}
call_swap:• • •pushl $zip2 # Global Varpushl $zip1 # Global Varcall swap• • •
&zip2&zip1Rtn adr %esp
ResultingStack•
••
Calling swap from call_swap
Revisiting swap
void swap(int *xp, int *yp) { int t0 = *xp; int t1 = *yp; *xp = t1; *yp = t0;}
swap:pushl %ebpmovl %esp,%ebppushl %ebx
movl 12(%ebp),%ecxmovl 8(%ebp),%edxmovl (%ecx),%eaxmovl (%edx),%ebxmovl %eax,(%edx)movl %ebx,(%ecx)
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
Body
SetUp
Finish
Do on blackboard?
swap Setup #1
swap:pushl %ebpmovl %esp,%ebppushl %ebx
Resulting Stack
&zip2&zip1Rtn adr %esp
Entering Stack
•••
%ebp
ypxp
Rtn adr
Old %ebp
%ebp•••
%esp
swap Setup #1
swap:pushl %ebpmovl %esp,%ebppushl %ebx
&zip2&zip1Rtn adr %esp
Entering Stack
•••
%ebp
ypxp
Rtn adr
Old %ebp
%ebp•••
%esp
swap Setup #1
swap:pushl %ebpmovl %esp,%ebppushl %ebx
&zip2&zip1Rtn adr %esp
Entering Stack
•••
%ebp
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
Resulting Stack
swap Setup #1
swap:pushl %ebpmovl %esp,%ebppushl %ebx
&zip2&zip1Rtn adr %esp
Entering Stack
•••
%ebp
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
1284
swap Setup #1
&zip2&zip1Rtn adr %esp
Entering Stack
•••
%ebp
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
Resulting Stack
Old %ebx
movl 12(%ebp),%ecx # get ypmovl 8(%ebp),%edx # get xp. . .
Offset relative to %ebp
swap Finish #1
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap’s Stack
Old %ebx
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
Resulting Stack
Old %ebx
Observation: Saved and restored register %ebx
swap Finish #2
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap’s Stack
Old %ebx
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
ypxp
Rtn adr
Old %ebp %ebp
•••
%espOld %ebx
swap Finish #2
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap’s Stack
Old %ebx
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
Resulting Stack
swap Finish #2
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap’s Stack
Old %ebx
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap Finish #3
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap’s Stack
Old %ebx
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
Resulting Stack
ypxp
Rtn adr
%ebp•••
%esp
swap Finish #4
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap’s Stack
Old %ebx
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
ypxp
Rtn adr
%ebp•••
%esp
swap Finish #4
ypxp
Rtn adr
Old %ebp %ebp
•••
%esp
swap’s Stack
Old %ebx
movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
ypxp
%ebp•••
%esp
Resulting Stack
Observation Saved & restored register %ebx Didn’t do so for %eax, %ecx, or %edx
Disassembled swap080483a4 <swap>: 80483a4: 55 push %ebp 80483a5: 89 e5 mov %esp,%ebp 80483a7: 53 push %ebx 80483a8: 8b 55 08 mov 0x8(%ebp),%edx 80483ab: 8b 4d 0c mov 0xc(%ebp),%ecx 80483ae: 8b 1a mov (%edx),%ebx 80483b0: 8b 01 mov (%ecx),%eax 80483b2: 89 02 mov %eax,(%edx) 80483b4: 89 19 mov %ebx,(%ecx) 80483b6: 5b pop %ebx 80483b7: c9 leave 80483b8: c3 ret
8048409: e8 96 ff ff ff call 80483a4 <swap> 804840e: 8b 45 f8 mov 0xfffffff8(%ebp),%eax
Calling Code
Register Saving Conventions When procedure yoo calls who:
yoo is the caller who is the callee
Can Register be used for temporary storage?
Contents of register %edx overwritten by who
yoo:• • •movl $15213, %edxcall whoaddl %edx, %eax
• • •ret
who:• • •movl 8(%ebp), %edxaddl $91125, %edx
• • •ret
Register Saving Conventions When procedure yoo calls who:
yoo is the caller who is the callee
Can register be used for temporary storage? Conventions
“Caller Save” Caller saves temporary in its frame before calling
“Callee Save” Callee saves temporary in its frame before using
IA32/Linux Register Usage %eax, %edx, %ecx
Caller saves prior to call if values are used later
%eax also used to return integer
value
%ebx, %esi, %edi Callee saves if wants to
use them
%esp, %ebp special
%eax%edx%ecx%ebx%esi%edi%esp%ebp
Caller-SaveTemporaries
Callee-SaveTemporaries
Special
int rfact(int x){ int rval; if (x <= 1) return 1; rval = rfact(x-1); return rval * x;}
.globl rfact.type
rfact,@functionrfact:
pushl %ebpmovl %esp,%ebppushl %ebxmovl 8(%ebp),%ebxcmpl $1,%ebxjle .L78leal -1(%ebx),%eaxpushl %eaxcall rfactimull %ebx,%eaxjmp .L79.align 4
.L78:movl $1,%eax
.L79:movl -4(%ebp),%ebxmovl %ebp,%esppopl %ebpret
Recursive Factorial
Registers %eax used without first saving %ebx used, but saved at
beginning & restored at end
Pointer Code
void s_helper (int x, int *accum){ if (x <= 1) return; else { int z = *accum * x; *accum = z; s_helper (x-1,accum); }}
int sfact(int x){ int val = 1; s_helper(x, &val); return val;}
Top-Level CallRecursive Procedure
Pass pointer to update location
Temp.Space
%esp
Creating & Initializing Pointer
int sfact(int x){ int val = 1; s_helper(x, &val); return val;}
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
Variable val must be stored on stack Because: Need to create pointer to it
Compute pointer as -4(%ebp) Push on stack as second argument
Initial part of sfact
xRtn adr
Old %ebp %ebp 0 4 8
-4 val = 1
Unused-12 -8
-16
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
Temp.Space
%esp
Creating & Initializing Pointer
int sfact(int x){ int val = 1; s_helper(x, &val); return val;}
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
Variable val must be stored on stack Because: Need to create pointer to it
Compute pointer as -4(%ebp) Push on stack as second argument
Initial part of sfact
xRtn adr
Old %ebp %ebp 0 4 8
-4 val = 1
Unused-12 -8
-16
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
_sfact:pushl %ebp # Save %ebpmovl %esp,%ebp # Set %ebpsubl $16,%esp # Add 16 bytes movl 8(%ebp),%edx # edx = xmovl $1,-4(%ebp) # val = 1
Passing Pointerint sfact(int x){ int val = 1; s_helper(x, &val); return val;}
leal -4(%ebp),%eax # Compute &valpushl %eax # Push on stackpushl %edx # Push xcall s_helper # callmovl -4(%ebp),%eax # Return val• • • # Finish
Calling s_helper from sfact
xRtn adr
Old %ebp %ebp 0 4 8
val = 1 -4
Unused-12 -8
-16
%espx&val
Stack at time of call
leal -4(%ebp),%eax # Compute &valpushl %eax # Push on stackpushl %edx # Push xcall s_helper # callmovl -4(%ebp),%eax # Return val• • • # Finish
leal -4(%ebp),%eax # Compute &valpushl %eax # Push on stackpushl %edx # Push xcall s_helper # callmovl -4(%ebp),%eax # Return val• • • # Finish
val=x!
Passing Pointerint sfact(int x){ int val = 1; s_helper(x, &val); return val;}
leal -4(%ebp),%eax # Compute &valpushl %eax # Push on stackpushl %edx # Push xcall s_helper # callmovl -4(%ebp),%eax # Return val• • • # Finish
Calling s_helper from sfact
xRtn adr
Old %ebp %ebp 0 4 8
val = 1 -4
Unused-12 -8
-16
%espx&val
Stack at time of call
leal -4(%ebp),%eax # Compute &valpushl %eax # Push on stackpushl %edx # Push xcall s_helper # callmovl -4(%ebp),%eax # Return val• • • # Finish
leal -4(%ebp),%eax # Compute &valpushl %eax # Push on stackpushl %edx # Push xcall s_helper # callmovl -4(%ebp),%eax # Return val• • • # Finish
val=x!
IA 32 Procedure Summary The Stack Makes Recursion Work
Private storage for each instance of procedure call Instantiations don’t clobber each other Addressing of locals + arguments can be
relative to stack positions Managed by stack discipline
Procedures return in inverse order of calls IA32 Procedures Combination of Instructions
+ Conventions Call / Ret instructions Register usage conventions
Caller / Callee save %ebp and %esp
Stack frame organization conventions
Return Addr
SavedRegisters
+Local
Variables
ArgumentBuild
Old %ebp
Arguments
CallerFrame
%ebp
%esp
Today Procedures (x86-64) Arrays
One-dimensional Multi-dimensional (nested) Multi-level
Structures
%rax
%rbx
%rcx
%rdx
%rsi
%rdi
%rsp
%rbp
x86-64 Integer Registers
Twice the number of registers Accessible as 8, 16, 32, 64 bits
%eax
%ebx
%ecx
%edx
%esi
%edi
%esp
%ebp
%r8
%r9
%r10
%r11
%r12
%r13
%r14
%r15
%r8d
%r9d
%r10d
%r11d
%r12d
%r13d
%r14d
%r15d
%rax
%rbx
%rcx
%rdx
%rsi
%rdi
%rsp
%rbp
x86-64 Integer Registers
%r8
%r9
%r10
%r11
%r12
%r13
%r14
%r15Callee saved Callee saved
Callee saved
Callee saved
C: Callee saved
Callee saved
Callee saved
Stack pointer
Used for linking
Return value
Argument #4
Argument #1
Argument #3
Argument #2
Argument #6
Argument #5
x86-64 Registers Arguments passed to functions via registers
If more than 6 integral parameters, then pass rest on stack These registers can be used as caller-saved as well
All references to stack frame via stack pointer Eliminates need to update %ebp/%rbp
Other Registers 6+1 callee saved 2 or 3 have special uses
x86-64 Long Swap
Operands passed in registers First (xp) in %rdi, second (yp) in %rsi 64-bit pointers
No stack operations required (except ret) Avoiding stack
Can hold all local information in registers
void swap(long *xp, long *yp) { long t0 = *xp; long t1 = *yp; *xp = t1; *yp = t0;}
swap:movq (%rdi), %rdxmovq (%rsi), %raxmovq %rax, (%rdi)movq %rdx, (%rsi)ret
x86-64 Locals in the Red Zone
Avoiding Stack Pointer Change Can hold all information within small
window beyond stack pointer
/* Swap, using local array */void swap_a(long *xp, long *yp) { volatile long loc[2]; loc[0] = *xp; loc[1] = *yp; *xp = loc[1]; *yp = loc[0];}
swap_a: movq (%rdi), %rax movq %rax, -24(%rsp) movq (%rsi), %rax movq %rax, -16(%rsp) movq -16(%rsp), %rax movq %rax, (%rdi) movq -24(%rsp), %rax movq %rax, (%rsi) ret
rtn Ptr
unused
%rsp
−8loc[1]loc[0]
−16
−24
x86-64 NonLeaf without Stack Frame No values held while swap
being invoked
No callee save registers needed
long scount = 0;
/* Swap a[i] & a[i+1] */void swap_ele_se (long a[], int i){ swap(&a[i], &a[i+1]); scount++;}
swap_ele_se: movslq %esi,%rsi # Sign extend i leaq (%rdi,%rsi,8), %rdi # &a[i] leaq 8(%rdi), %rsi # &a[i+1] call swap # swap() incq scount(%rip) # scount++; ret
x86-64 Call using Jump
long scount = 0;
/* Swap a[i] & a[i+1] */void swap_ele(long a[], int i){ swap(&a[i], &a[i+1]);}
swap_ele: movslq %esi,%rsi # Sign extend i leaq (%rdi,%rsi,8), %rdi # &a[i] leaq 8(%rdi), %rsi # &a[i+1] jmp swap # swap()
Will disappearBlackboard?
x86-64 Call using Jump When swap executes ret,
it will return from swap_ele
Possible since swap is a “tail call”(no instructions afterwards)
long scount = 0;
/* Swap a[i] & a[i+1] */void swap_ele(long a[], int i){ swap(&a[i], &a[i+1]);}
swap_ele: movslq %esi,%rsi # Sign extend i leaq (%rdi,%rsi,8), %rdi # &a[i] leaq 8(%rdi), %rsi # &a[i+1] jmp swap # swap()
x86-64 Stack Frame Example
Keeps values of a and i in callee save registers
Must set up stack frame to save these registers
long sum = 0;/* Swap a[i] & a[i+1] */void swap_ele_su (long a[], int i){ swap(&a[i], &a[i+1]); sum += a[i];}
swap_ele_su: movq %rbx, -16(%rsp) movslq %esi,%rbx movq %r12, -8(%rsp) movq %rdi, %r12 leaq (%rdi,%rbx,8), %rdi subq $16, %rsp leaq 8(%rdi), %rsi call swap movq (%r12,%rbx,8), %rax addq %rax, sum(%rip) movq (%rsp), %rbx movq 8(%rsp), %r12 addq $16, %rsp ret
Blackboard?
Understanding x86-64 Stack Frameswap_ele_su: movq %rbx, -16(%rsp) # Save %rbx movslq %esi,%rbx # Extend & save i movq %r12, -8(%rsp) # Save %r12 movq %rdi, %r12 # Save a leaq (%rdi,%rbx,8), %rdi # &a[i] subq $16, %rsp # Allocate stack frame leaq 8(%rdi), %rsi # &a[i+1] call swap # swap() movq (%r12,%rbx,8), %rax # a[i] addq %rax, sum(%rip) # sum += a[i] movq (%rsp), %rbx # Restore %rbx movq 8(%rsp), %r12 # Restore %r12 addq $16, %rsp # Deallocate stack frame ret
Understanding x86-64 Stack Frameswap_ele_su: movq %rbx, -16(%rsp) # Save %rbx movslq %esi,%rbx # Extend & save i movq %r12, -8(%rsp) # Save %r12 movq %rdi, %r12 # Save a leaq (%rdi,%rbx,8), %rdi # &a[i] subq $16, %rsp # Allocate stack frame leaq 8(%rdi), %rsi # &a[i+1] call swap # swap() movq (%r12,%rbx,8), %rax # a[i] addq %rax, sum(%rip) # sum += a[i] movq (%rsp), %rbx # Restore %rbx movq 8(%rsp), %r12 # Restore %r12 addq $16, %rsp # Deallocate stack frame ret
rtn addr%r12
%rsp−8
%rbx−16
rtn addr%r12
%rsp+8
%rbx
Interesting Features of Stack Frame Allocate entire frame at once
All stack accesses can be relative to %rsp Do by decrementing stack pointer Can delay allocation, since safe to temporarily use red zone
Simple deallocation Increment stack pointer No base/frame pointer needed
x86-64 Procedure Summary Heavy use of registers
Parameter passing More temporaries since more registers
Minimal use of stack Sometimes none Allocate/deallocate entire block
Many tricky optimizations What kind of stack frame to use Calling with jump Various allocation techniques
Today Procedures (x86-64) Arrays
One-dimensional Multi-dimensional (nested) Multi-level
Structures
Basic Data Types Integral
Stored & operated on in general (integer) registers Signed vs. unsigned depends on instructions used
Intel | GAS | Bytes | C
byte | b | 1 | [unsigned] char
word | w | 2 | [unsigned] short
double word | l | 4 | [unsigned] int
quad word | q | 8 | [unsigned] long int (x86-64)
Floating Point Stored & operated on in floating point registers
Intel | GAS | Bytes | C
Single | s | 4 | float
Double | l | 8 | double
Extended | t | 10/12/16 | long double
Array Allocation Basic Principle
T A[L]; Array of data type T and length L Contiguously allocated region of L * sizeof(T) bytes
char string[12];
x x + 12
int val[5];
x x + 4 x + 8 x + 12 x + 16 x + 20
double a[3];
x x + 8 x + 16 x + 24
char *p[3];
IA32 (4-byte pointers): x x + 4 x + 8 x + 12
x86-64 (8-byte pointers): x x + 8 x + 16 x + 24
Array Access Basic Principle
T A[L]; Array of data type T and length L Identifier A can be used as a pointer to array element 0: Type T*
Reference | Type | Value
val[4] | int | 3
val | int * | x
val+1 | int * | x + 4
&val[2] | int * | x + 8
val[5] | int | ??
*(val+1) | int | 5
val + i | int * | x + 4*i
int val[5]; 1 5 2 1 3
x x + 4 x + 8 x + 12 x + 16 x + 20
Will disappearBlackboard?
Array Access Basic Principle
T A[L]; Array of data type T and length L Identifier A can be used as a pointer to array element 0: Type T*
Reference | Type | Value
val[4] | int | 3
val | int * | x
val+1 | int * | x + 4
&val[2] | int * | x + 8
val[5] | int | ??
*(val+1) | int | 5
val + i | int * | x + 4*i
int val[5]; 1 5 2 1 3
x x + 4 x + 8 x + 12 x + 16 x + 20
Array Example
Declaration “zip_dig cmu” equivalent to “int cmu[5]” Example arrays were allocated in successive 20 byte blocks
Not guaranteed to happen in general
typedef int zip_dig[5];
zip_dig cmu = { 1, 5, 2, 1, 3 };zip_dig mit = { 0, 2, 1, 3, 9 };zip_dig ucb = { 9, 4, 7, 2, 0 };
zip_dig cmu; 1 5 2 1 3
16 20 24 28 32 36
zip_dig mit; 0 2 1 3 9
36 40 44 48 52 56
zip_dig ucb; 9 4 7 2 0
56 60 64 68 72 76
Array Accessing Example
Register %edx contains starting address of array
Register %eax contains array index
Desired digit at 4*%eax + %edx
Use memory reference (%edx,%eax,4)
int get_digit (zip_dig z, int dig){ return z[dig];}
# %edx = z # %eax = dig
movl (%edx,%eax,4),%eax # z[dig]
IA32
zip_dig cmu; 1 5 2 1 3
16 20 24 28 32 36
Referencing Examples
Reference | Address | Value | Guaranteed?
mit[3] | 36 + 4*3 = 48 | 3
mit[5] | 36 + 4*5 = 56 | 9
mit[-1] | 36 + 4*-1 = 32 | 3
cmu[15] | 16 + 4*15 = 76 | ??
zip_dig cmu; 1 5 2 1 3
16 20 24 28 32 36
zip_dig mit; 0 2 1 3 9
36 40 44 48 52 56
zip_dig ucb; 9 4 7 2 0
56 60 64 68 72 76
Will disappearBlackboard?
Referencing Examples
Reference | Address | Value | Guaranteed?
mit[3] | 36 + 4*3 = 48 | 3
mit[5] | 36 + 4*5 = 56 | 9
mit[-1] | 36 + 4*-1 = 32 | 3
cmu[15] | 16 + 4*15 = 76 | ??
No bound checking Out of range behavior implementation-dependent No guaranteed relative allocation of different arrays
Guaranteed? (in row order): Yes, No, No, No
zip_dig cmu; 1 5 2 1 3
16 20 24 28 32 36
zip_dig mit; 0 2 1 3 9
36 40 44 48 52 56
zip_dig ucb; 9 4 7 2 0
56 60 64 68 72 76
int zd2int(zip_dig z){ int i; int zi = 0; for (i = 0; i < 5; i++) { zi = 10 * zi + z[i]; } return zi;}
Array Loop Example
int zd2int(zip_dig z){ int zi = 0; int *zend = z + 4; do { zi = 10 * zi + *z; z++; } while (z <= zend); return zi;}
Original
Transformed As generated by GCC Eliminate loop variable i Convert array code to
pointer code Express in do-while form
(no test at entrance)
# %ecx = z
xorl %eax,%eax # zi = 0
leal 16(%ecx),%ebx # zend = z+4
.L59:
leal (%eax,%eax,4),%edx # 5*zi
movl (%ecx),%eax # *z
addl $4,%ecx # z++
leal (%eax,%edx,2),%eax # zi = *z + 2*(5*zi)
cmpl %ebx,%ecx # z : zend
jle .L59 # if <= goto loop
Array Loop Implementation (IA32)
int zd2int(zip_dig z){ int zi = 0; int *zend = z + 4; do { zi = 10 * zi + *z; z++; } while(z <= zend); return zi;}
Will disappearBlackboard?
# %ecx = z
xorl %eax,%eax # zi = 0
leal 16(%ecx),%ebx # zend = z+4
.L59:
leal (%eax,%eax,4),%edx # 5*zi
movl (%ecx),%eax # *z
addl $4,%ecx # z++
leal (%eax,%edx,2),%eax # zi = *z + 2*(5*zi)
cmpl %ebx,%ecx # z : zend
jle .L59 # if <= goto loop
Array Loop Implementation (IA32) Registers
%ecx z%eax zi%ebx zend
Computations 10*zi + *z implemented as *z + 2*(zi+4*zi)
z++ increments by 4
int zd2int(zip_dig z){ int zi = 0; int *zend = z + 4; do { zi = 10 * zi + *z; z++; } while(z <= zend); return zi;}
# %ecx = z
xorl %eax,%eax # zi = 0
leal 16(%ecx),%ebx # zend = z+4
.L59:
leal (%eax,%eax,4),%edx # 5*zi
movl (%ecx),%eax # *z
addl $4,%ecx # z++
leal (%eax,%edx,2),%eax # zi = *z + 2*(5*zi)
cmpl %ebx,%ecx # z : zend
jle .L59 # if <= goto loop
int zd2int(zip_dig z){ int zi = 0; int *zend = z + 4; do { zi = 10 * zi + *z; z++; } while(z <= zend); return zi;}
# %ecx = z
xorl %eax,%eax # zi = 0
leal 16(%ecx),%ebx # zend = z+4
.L59:
leal (%eax,%eax,4),%edx # 5*zi
movl (%ecx),%eax # *z
addl $4,%ecx # z++
leal (%eax,%edx,2),%eax # zi = *z + 2*(5*zi)
cmpl %ebx,%ecx # z : zend
jle .L59 # if <= goto loop
int zd2int(zip_dig z){ int zi = 0; int *zend = z + 4; do { zi = 10 * zi + *z; z++; } while(z <= zend); return zi;}
# %ecx = z
xorl %eax,%eax # zi = 0
leal 16(%ecx),%ebx # zend = z+4
.L59:
leal (%eax,%eax,4),%edx # 5*zi
movl (%ecx),%eax # *z
addl $4,%ecx # z++
leal (%eax,%edx,2),%eax # zi = *z + 2*(5*zi)
cmpl %ebx,%ecx # z : zend
jle .L59 # if <= goto loop
int zd2int(zip_dig z){ int zi = 0; int *zend = z + 4; do { zi = 10 * zi + *z; z++; } while(z <= zend); return zi;}
# %ecx = z
xorl %eax,%eax # zi = 0
leal 16(%ecx),%ebx # zend = z+4
.L59:
leal (%eax,%eax,4),%edx # 5*zi
movl (%ecx),%eax # *z
addl $4,%ecx # z++
leal (%eax,%edx,2),%eax # zi = *z + 2*(5*zi)
cmpl %ebx,%ecx # z : zend
jle .L59 # if <= goto loop
int zd2int(zip_dig z){ int zi = 0; int *zend = z + 4; do { zi = 10 * zi + *z; z++; } while(z <= zend); return zi;}
Nested Array Example
“zip_dig pgh[4]” equivalent to “int pgh[4][5]” Variable pgh: array of 4 elements, allocated contiguously Each element is an array of 5 int’s, allocated contiguously
“Row-Major” ordering of all elements guaranteed
#define PCOUNT 4zip_dig pgh[PCOUNT] = {{1, 5, 2, 0, 6}, {1, 5, 2, 1, 3 }, {1, 5, 2, 1, 7 }, {1, 5, 2, 2, 1 }};
zip_digpgh[4];
76 96 116 136 156
1 5 2 0 6 1 5 2 1 3 1 5 2 1 7 1 5 2 2 1
Multidimensional (Nested) Arrays Declaration
T A[R][C]; 2D array of data type T R rows, C columns Type T element requires K bytes
Array Size R * C * K bytes
Arrangement Row-Major Ordering
A[0][0] A[0][C-1]
A[R-1][0]
• • •
• • • A[R-1][C-1]
•••
•••
int A[R][C];
• • •A[0][0]
A[0]
[C-1]• • •
A[1][0]
A[1]
[C-1]• • •
A[R-1][0]
A[R-1][C-1]
• • •
4*R*C Bytes
• • •
Nested Array Row Access Row Vectors
A[i] is array of C elements Each element of type T requires K bytes Starting address A + i * (C * K)
• • •A
[i][0]
A[i]
[C-1]
A[i]
• • •A
[R-1][0]
A[R-1][C-1]
A[R-1]
• • •
A
• • •A
[0][0]
A[0]
[C-1]
A[0]
A+i*C*4 A+(R-1)*C*4
int A[R][C];
Nested Array Row Access Code
int *get_pgh_zip(int index){ return pgh[index];}
# %eax = index
leal (%eax,%eax,4),%eax # 5 * index
leal pgh(,%eax,4),%eax # pgh + (20 * index)
#define PCOUNT 4zip_dig pgh[PCOUNT] = {{1, 5, 2, 0, 6}, {1, 5, 2, 1, 3 }, {1, 5, 2, 1, 7 }, {1, 5, 2, 2, 1 }};
Will disappearBlackboard?
What data type is pgh[index]? What is its starting address?
Nested Array Row Access Code
Row Vector pgh[index] is array of 5 int’s Starting address pgh+20*index
IA32 Code Computes and returns address Compute as pgh + 4*(index+4*index)
int *get_pgh_zip(int index){ return pgh[index];}
# %eax = index
leal (%eax,%eax,4),%eax # 5 * index
leal pgh(,%eax,4),%eax # pgh + (20 * index)
#define PCOUNT 4zip_dig pgh[PCOUNT] = {{1, 5, 2, 0, 6}, {1, 5, 2, 1, 3 }, {1, 5, 2, 1, 7 }, {1, 5, 2, 2, 1 }};
• • •
Nested Array Row Access Array Elements
A[i][j] is element of type T, which requires K bytes Address A + i * (C * K) + j * K = A + (i * C + j)* K
• • • • • •A[i][j]
A[i]
• • •A
[R-1][0]
A[R-1][C-1]
A[R-1]
• • •
A
• • •A
[0][0]
A[0]
[C-1]
A[0]
A+i*C*4 A+(R-1)*C*4
int A[R][C];
A+i*C*4+j*4
Nested Array Element Access Code
Array Elements pgh[index][dig] is int Address: pgh + 20*index + 4*dig
IA32 Code Computes address pgh + 4*dig + 4*(index+4*index) movl performs memory reference
int get_pgh_digit (int index, int dig){ return pgh[index][dig];}
# %ecx = dig
# %eax = index
leal 0(,%ecx,4),%edx # 4*dig
leal (%eax,%eax,4),%eax # 5*index
movl pgh(%edx,%eax,4),%eax # *(pgh + 4*dig + 20*index)
Strange Referencing Examples
Reference | Address | Value | Guaranteed?
pgh[3][3] | 76+20*3+4*3 = 148 | 2
pgh[2][5] | 76+20*2+4*5 = 136 | 1
pgh[2][-1] | 76+20*2+4*-1 = 112 | 3
pgh[4][-1] | 76+20*4+4*-1 = 152 | 1
pgh[0][19] | 76+20*0+4*19 = 152 | 1
pgh[0][-1] | 76+20*0+4*-1 = 72 | ??
zip_digpgh[4];
76 96 116 136 156
1 5 2 0 6 1 5 2 1 3 1 5 2 1 7 1 5 2 2 1
Will disappear
Strange Referencing Examples
Reference | Address | Value | Guaranteed?
pgh[3][3] | 76+20*3+4*3 = 148 | 2
pgh[2][5] | 76+20*2+4*5 = 136 | 1
pgh[2][-1] | 76+20*2+4*-1 = 112 | 3
pgh[4][-1] | 76+20*4+4*-1 = 152 | 1
pgh[0][19] | 76+20*0+4*19 = 152 | 1
pgh[0][-1] | 76+20*0+4*-1 = 72 | ??
Code does not do any bounds checking Ordering of elements within array guaranteed
Guaranteed? (in row order): Yes, Yes, Yes, Yes, Yes, No
zip_digpgh[4];
76 96 116 136 156
1 5 2 0 6 1 5 2 1 3 1 5 2 1 7 1 5 2 2 1
Multi-Level Array Example Variable univ denotes
array of 3 elements Each element is a pointer
4 bytes Each pointer points to array
of int’s
zip_dig cmu = { 1, 5, 2, 1, 3 };zip_dig mit = { 0, 2, 1, 3, 9 };zip_dig ucb = { 9, 4, 7, 2, 0 };
#define UCOUNT 3int *univ[UCOUNT] = {mit, cmu, ucb};
160: 36
164: 16
168: 56
univ
cmu
mit
ucb
1 5 2 1 3
16 20 24 28 32 36
0 2 1 3 9
36 40 44 48 52 56
9 4 7 2 0
56 60 64 68 72 76
Element Access in Multi-Level Array
# %ecx = index
# %eax = dig
leal 0(,%ecx,4),%edx # 4*index
movl univ(%edx),%edx # Mem[univ+4*index]
movl (%edx,%eax,4),%eax # Mem[...+4*dig]
int get_univ_digit (int index, int dig){ return univ[index][dig];}
Will disappearBlackboard?
Element Access in Multi-Level Array
Computation (IA32) Element access Mem[Mem[univ+4*index]+4*dig] Must do two memory reads
First get pointer to row array Then access element within array
# %ecx = index
# %eax = dig
leal 0(,%ecx,4),%edx # 4*index
movl univ(%edx),%edx # Mem[univ+4*index]
movl (%edx,%eax,4),%eax # Mem[...+4*dig]
int get_univ_digit (int index, int dig){ return univ[index][dig];}
Array Element Accesses
int get_pgh_digit (int index, int dig){ return pgh[index][dig];}
int get_univ_digit (int index, int dig){ return univ[index][dig];}
Nested array Multi-level array
Access looks similar, but element:
Mem[pgh+20*index+4*dig] Mem[Mem[univ+4*index]+4*dig]
Strange Referencing Examples
Reference | Address | Value | Guaranteed?
univ[2][3] | 56+4*3 = 68 | 2
univ[1][5] | 16+4*5 = 36 | 0
univ[2][-1] | 56+4*-1 = 52 | 9
univ[3][-1] | ?? | ??
univ[1][12] | 16+4*12 = 64 | 7
160: 36
164: 16
168: 56
univ
cmu
mit
ucb
1 5 2 1 3
16 20 24 28 32 36
0 2 1 3 9
36 40 44 48 52 56
9 4 7 2 0
56 60 64 68 72 76
Will disappear
Strange Referencing Examples
Reference | Address | Value | Guaranteed?
univ[2][3] | 56+4*3 = 68 | 2
univ[1][5] | 16+4*5 = 36 | 0
univ[2][-1] | 56+4*-1 = 52 | 9
univ[3][-1] | ?? | ??
univ[1][12] | 16+4*12 = 64 | 7
Code does not do any bounds checking Ordering of elements in different arrays not guaranteed
Guaranteed? (in row order): Yes, No, No, No, No
160: 36
164: 16
168: 56
univ
cmu
mit
ucb
1 5 2 1 3
16 20 24 28 32 36
0 2 1 3 9
36 40 44 48 52 56
9 4 7 2 0
56 60 64 68 72 76
Using Nested Arrays Strengths
C compiler handles doubly subscripted arrays
Generates very efficient code Avoids multiply in index
computation
Limitation Only works for fixed array size
#define N 16typedef int fix_matrix[N][N];
/* Compute element i,k of fixed matrix product */int fix_prod_ele(fix_matrix a, fix_matrix b, int i, int k){ int j; int result = 0; for (j = 0; j < N; j++) result += a[i][j]*b[j][k]; return result;}
a b
i-th row
j-th columnx
Dynamic Nested Arrays Strength
Can create matrix of any size Programming
Must do index computation explicitly
Performance Accessing single element costly Must do multiplication
int * new_var_matrix(int n){ return (int *) calloc(sizeof(int), n*n);}
int var_ele (int *a, int i, int j, int n){ return a[i*n+j];}
movl 12(%ebp),%eax # i
movl 8(%ebp),%edx # a
imull 20(%ebp),%eax # n*i
addl 16(%ebp),%eax # n*i+j
movl (%edx,%eax,4),%eax # Mem[a+4*(i*n+j)]
Dynamic Array Multiplication
Without Optimizations Multiplies: 3
2 for subscripts 1 for data
Adds: 4 2 for array indexing 1 for loop index 1 for data
/* Compute element i,k of variable matrix product */int var_prod_ele (int *a, int *b, int i, int k, int n){ int j; int result = 0; for (j = 0; j < n; j++) result += a[i*n+j] * b[j*n+k]; return result;}
Optimizing Dynamic Array Multiplication Optimizations
Performed when set optimization level to -O2
Code Motion Expression i*n can be
computed outside loop Strength Reduction
Incrementing j has effect of incrementing j*n+k by n
Operations count 4 adds, 1 mult
Compiler can optimize regular access patterns
{ int j; int result = 0; for (j = 0; j < n; j++) result += a[i*n+j] * b[j*n+k]; return result;}
{ int j; int result = 0; int iTn = i*n; int jTnPk = k; for (j = 0; j < n; j++) { result += a[iTn+j] * b[jTnPk]; jTnPk += n; } return result;}
struct rec { int i; int a[3]; int *p;};
IA32 Assembly
# %eax = val
# %edx = r
movl %eax,(%edx) # Mem[r] = val
void set_i(struct rec *r, int val){ r->i = val;}
Structures
Concept Contiguously-allocated region of memory Refer to members within structure by names Members may be of different types
Accessing Structure Member
Memory Layout (members, with byte offsets below): i a p
0 4 16 20
# %ecx = idx
# %edx = r
leal 0(,%ecx,4),%eax # 4*idx
leal 4(%eax,%edx),%eax # r+4*idx+4
int *find_a (struct rec *r, int idx){ return &r->a[idx];}
Generating Pointer to Structure Member
Generating Pointer to Array Element Offset of each structure
member determined at compile time
struct rec { int i; int a[3]; int *p;};
i a p
0 4 16 20
r (start of struct)   r+4+4*idx (address of a[idx])
struct rec { int i; int a[3]; int *p;};
# %edx = r
movl (%edx),%ecx # r->i
leal 0(,%ecx,4),%eax # 4*(r->i)
leal 4(%edx,%eax),%eax # r+4+4*(r->i)
movl %eax,16(%edx) # Update r->p
void set_p(struct rec *r){ r->p = &r->a[r->i];}
Structure Referencing (Cont.) C Code
i a p0 4 16 20
i a0 4 16 20
Element i