Why "subq" as allocate by ICC-v10.0 but not as prologue, but ICC-v11.0 uses "pushq" as prologue?

srimks · ‎01-21-2009

Hi All.

Below is a piece of CPP code and behaviour of asm due to ICC-v11.0 & ICC-v10.0 -

--
#include
#include

#define MAX 1024

int main()
{
int i, j;
int num[MAX], isort[MAX], cluster[MAX][MAX];

for (j = 0; j < MAX; j++) {
num[j] = 0;
isort[j] = j;
for (i = 0; i < MAX; i++) {
cluster[j][i] = 0;
}
}
printf("%d %d %d\n",num[64],isort[78],cluster[384][74]);
return 0;
}
--

whose asm using ICC-v11.0 has been created with command -
$ icpc -fno-builtin test.cpp -S
test.cpp(14): (col. 6) remark: LOOP WAS VECTORIZED.

as
---
# -- Machine type EFI2
# mark_description "Intel C++ Compiler Professional for applications running on Intel 64, Version 11.0 Build 20081105 %";
# mark_description "s";
# mark_description "-fno-builtin -S";
.file "test.cpp"
.section .ctors, "wa"
.text
..TXTST0:
# -- Begin main
# mark_begin;
.align 16,0x90
.globl main
main:
..B1.1: # Preds ..B1.0
..___tag_value_main.1: #7.1
pushq %rbp #7.1 //save frame pointer
..___tag_value_main.2: #
movq %rsp, %rbp #7.1 //set new FP
..___tag_value_main.3: #
andq $-128, %rsp #7.1
subq $4202496, %rsp #7.1 //allocate stack space
movl $3, %edi #7.1
..___tag_value_main.5: #7.1
call __intel_new_proc_init #7.1
..___tag_value_main.6: #
# LOE rbx r12 r13 r14 r15
..B1.9: # Preds ..B1.1
stmxcsr 4194304(%rsp) #7.1
orl $32832, 4194304(%rsp) #7.1
ldmxcsr 4194304(%rsp) #7.1
lea (%rsp), %rdx #15.11
xorl %esi, %esi #11.2
xorl %ecx, %ecx #
movq %rdx, %rax #
pxor %xmm0, %xmm0 #15.27
# LOE rax rdx rbx rsi r12 r13 r14 r15 ecx xmm0
..B1.2: # Preds ..B1.4 ..B1.9
movl %ecx, 4198400(%rsp,%rsi,4) #13.11
movl $0, 4194304(%rsp,%rsi,4) #12.11
xorl %r8d, %r8d #14.6
movq %rax, %rdi #12.11
# LOE rax rdx rbx rsi rdi r8 r12 r13 r14 r15 ecx xmm0
..B1.3: # Preds ..B1.3 ..B1.2
movdqa %xmm0, (%rdx,%r8,4) #15.11
movdqa %xmm0, 16(%rdi) #15.11
movdqa %xmm0, 32(%rdi) #15.11
movdqa %xmm0, 48(%rdi) #15.11
addq $64, %rdi #14.6
addq $16, %r8 #14.6
cmpq $1024, %r8 #14.6
jl ..B1.3 # Prob 99% #14.6
# LOE rax rdx rbx rsi rdi r8 r12 r13 r14 r15 ecx xmm0
..B1.4: # Preds ..B1.3
addq $4096, %rax #11.2
addq $4096, %rdx #11.2
incl %ecx #11.2
incq %rsi #11.2
cmpq $1024, %rsi #11.2
jl ..B1.2 # Prob 99% #11.2
# LOE rax rdx rbx rsi r12 r13 r14 r15 ecx xmm0
..B1.5: # Preds ..B1.4
movl 4194560(%rsp), %esi #18.2
movl 4198712(%rsp), %edx #18.2
movl 1573160(%rsp), %ecx #18.2
movl $_2__STRING.0.0, %edi #18.2
xorl %eax, %eax #18.2
..___tag_value_main.7: #18.2
call printf #18.2
..___tag_value_main.8: #
# LOE rbx r12 r13 r14 r15
..B1.6: # Preds ..B1.5
xorl %eax, %eax #19.9
movq %rbp, %rsp #19.9
popq %rbp #19.9
..___tag_value_main.9: #
ret #19.9
.align 16,0x90
..___tag_value_main.11: #
# LOE
# mark_end;
.type main,@function
.size main,.-main
.data
# -- End main
.text
# -- Begin __sti__$E
# mark_begin;
.align 16,0x90
__sti__$E:
..B2.1: # Preds ..B2.0
..___tag_value___sti__$E.12: #
pushq %rsi #
..___tag_value___sti__$E.14: #
movl $_ZSt8__ioinit.0, %edi #77.25
..___tag_value___sti__$E.15: #77.25
call _ZNSt8ios_base4InitC1Ev #77.25
..___tag_value___sti__$E.16: #
# LOE rbx rbp r12 r13 r14 r15
..B2.2: # Preds ..B2.1
movl $_ZNSt8ios_base4InitD1Ev, %edi #77.25
....
....
---

and the asm using ICC-v10.0 has been created as below -
$ /opt/intel/cce/10.0.023/bin/icpc test.cpp -S
test.cpp(14): (col. 6) remark: LOOP WAS VECTORIZED.

as
--
# -- Machine type EFI2
# mark_description "Intel C++ Compiler for applications running on Intel 64, Version 10.0 Build 20070426 %s";
# mark_description "-S";
.file "test.cpp"
.section .ctors, "wa"
.text
..TXTST0:
# -- Begin main
# mark_begin;
.align 2,0x90
.globl main
main:
..B1.1: # Preds ..B1.0
..___tag_value_main.1: #7.1
subq $4202504, %rsp #7.1 // Query- Why allocate for stack space has been used for subq here but prologue is not being called?
..___tag_value_main.9: #
movl $3, %edi #7.1
..___tag_value_main.10: #7.1
call __intel_new_proc_init #7.1
..___tag_value_main.11: #
# LOE rbx rbp r12 r13 r14 r15
..B1.9: # Preds ..B1.1
stmxcsr (%rsp) #7.1
orl $32832, (%rsp) #7.1
ldmxcsr (%rsp) #7.1
xorl %esi, %esi #
xorl %ecx, %ecx #
xorl %edx, %edx #
movl $4096, %eax #
pxor %xmm0, %xmm0 #15.27
# LOE rax rcx rbx rbp rsi r12 r13 r14 r15 edx xmm0
..B1.2: # Preds ..B1.9 ..B1.4
movl %edx, 4096(%rsp,%rsi) #13.11
movl $0, (%rsp,%rsi) #12.11
movq %rcx, %rdi #12.11
# LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm0
..B1.3: # Preds ..B1.3 ..B1.2
movdqa %xmm0, 8192(%rsp,%rdi) #15.11
movdqa %xmm0, 8208(%rsp,%rdi) #15.11
movdqa %xmm0, 8224(%rsp,%rdi) #15.11
movdqa %xmm0, 8240(%rsp,%rdi) #15.11
movdqa %xmm0, 8256(%rsp,%rdi) #15.11
movdqa %xmm0, 8272(%rsp,%rdi) #15.11
movdqa %xmm0, 8288(%rsp,%rdi) #15.11
movdqa %xmm0, 8304(%rsp,%rdi) #15.11
addq $128, %rdi #14.6
cmpq %rax, %rdi #14.6
jl ..B1.3 # Prob 99% #14.6
# LOE rax rcx rbx rbp rsi rdi r12 r13 r14 r15 edx xmm0
..B1.4: # Preds ..B1.3
addq $4, %rsi #11.2
addq $4096, %rax #11.2
addq $4096, %rcx #11.2
addl $1, %edx #11.2
cmpq $4198400, %rax #11.2
jl ..B1.2 # Prob 99% #11.2
# LOE rax rcx rbx rbp rsi r12 r13 r14 r15 edx xmm0
..B1.5: # Preds ..B1.4
movl 256(%rsp), %esi #18.2
movl 4408(%rsp), %edx #18.2
movl 1581352(%rsp), %ecx #18.2
movl $_2__STRING.0.0, %edi #18.2
xorl %eax, %eax #18.2
..___tag_value_main.12: #18.2
call printf #18.2
..___tag_value_main.13: #
# LOE rbx rbp r12 r13 r14 r15
..B1.6: # Preds ..B1.5
xorl %eax, %eax #19.9
addq $4202504, %rsp #19.9
..___tag_value_main.14: #
ret #19.9
.align 2,0x90
..___tag_value_main.15: #
# LOE
# mark_end;
.type main,@function
.size main,.-main
.data
# -- End main
.text
# -- Begin __sti__$E
# mark_begin;
.align 2,0x90
__sti__$E:
..B2.1: # Preds ..B2.0
..___tag_value___sti__$E.16: #
pushq %rsi #
..___tag_value___sti__$E.24: #
movl $_ZSt8__ioinit.0, %edi #77.25
..___tag_value___sti__$E.25: #77.25
call _ZNSt8ios_base4InitC1Ev #77.25
..___tag_value___sti__$E.26: #
# LOE rbx rbp r12 r13 r14 r15
..B2.2: # Preds ..B2.1
movl $_ZNSt8ios_base4InitD1Ev, %edi #77.25
movl $_ZSt8__ioinit.0, %esi #77.25
...
...
--

Query:

(a) Why, with ICC-v10.0, does the stack allocation for main start with a "subq" instruction, and why is no "pushq" prologue observed?

(b) Does ICC-v11.0 have any specific reason for using a "pushq" prologue for main rather than the plain "subq" allocation used by ICC-v10.0?

(c) The sizes of the above asm files are 9120 & 8792 respectively. Surely the use of "-fno-builtin" with ICC-v11.0 shouldn't bloat the asm size?

(d) The ICC-v10.0 asm contains only "pushq %rsi" — i.e. register "rsi", which is used to pass an argument to a function, is pushed, but "rbp" is not — whereas the ICC-v11.0 asm pushes both "rbp", a callee-saved register, and "rsi", which is used to pass an argument to a function. Any insights w.r.t. performance?

(e) If the same code above is compiled with ICC-v11.0 using "-fno-builtin", the allocation in main starts with "subq", but without "-fno-builtin" it starts with "pushq". Also, using "-fno-builtin" generates a larger asm file that contains the "MOVNTDQ" instruction. Normally, use of MOVNTDQ speeds up the code, but here the opposite happens. Why such weird behaviour?

~BR

Sergey_M_Intel1 · ‎01-27-2009

(1) Version 11.0 of ICC generates a frame for main that is aligned to 128 bytes. This is intentional behavior and is done for performance reasons, though not for this specific example. This is where all the uses of %rbp arise (it is used to save the value of %rsp before alignment). If you are confused by this extra alignment, please put your code into a function other than main. Consider the prologue for main with 10.1:

main:

subq $4198408, %rsp

vs 11.0 prologue:

main:

pushq %rbp

movq %rsp, %rbp

andq $-128, %rsp

subq $4198400, %rsp

(2) The -fno-builtin option causes the compiler not to expand intrinsics inline. The code in the example doesn't make use of any intrinsics, which might suggest that this option should have no effect here. Interestingly, it does affect the way the pattern of setting memory to zero is recognized. It may be considered a "bug" in the sense that the behavior is not what one would expect, but I don't feel too strongly about it. What matters is that both variants are correct with respect to the semantics of the option, and both look reasonably adequate in terms of performance. Generally it is true that the use of -fno-builtin results in smaller though slower code, but it is an incorrect assumption that the code will always be smaller. If you are really interested in code size vs. performance, please use the -Os option.

(3) The pushq %rsi instruction that you refer to in (d) has nothing to do with parameter passing for routine __sti__$E. It is just an easy way to adjust the stack pointer by 8 bytes to make sure that it is properly aligned to a 16-byte boundary at the subsequent call, to conform to the x86-64 ABI. The popq %rcx is its counterpart in the function epilog.

(4) I cannot reproduce the behavior you describe with regard to MOVNTDQ, and your asm snippets don't contain any MOVNTDQ either. Maybe you have more details that you haven't shared?

Regards,

-Sergey

srimks · ‎04-24-2009

Hello,

Thanks for your valuable input.

I have new query similar to above as below -

(a) For multiple C++ package file, when I do vectorizations (calling of pragma's) within that file within section of code, I get starting and ending asm as -
{
44d960: 55 push %rbp
44d961: 48 83 ec 50 sub $0x50,%rsp
44d965: 49 89 f0 mov %rsi,%r8
44d968: 4c 63 c9 movslq %ecx,%r9
...

...
44dc84: 48 83 c4 50 add $0x50,%rsp
44dc88: 5d pop %rbp
44dc89: c3 retq
44dc8a: 90 nop
44dc8b: 48 8d 74 26 00 lea 0x0(%rsi),%rsi
}

(b) But the same code w/o using any pragma's call, the starting & ending asm are as -
{
44d960: 48 83 ec 68 sub $0x68,%rsp
44d964: 49 89 f9 mov %rdi,%r9
44d967: 49 89 d0 mov %rdx,%r8
44d96a: 4c 63 d1 movslq %ecx,%r10
..
..
..
44dc4e: 48 83 c4 68 add $0x68,%rsp
44dc52: c3 retq
44dc53: 90 nop
44dc54: 48 8d 74 26 00 lea 0x0(%rsi),%rsi
44dc59: 48 8d bf 00 00 00 00 lea 0x0(%rdi),%rdi
}
---

Query:
(1) Can the difference between having the PUSH/POP pair with the vectorization pragmas and not having it without them be explained?

(2) Without the pragmas, the asm in (b) has two "lea" instructions at the end, and at the start it has sub, mov, mov & movslq rather than the sequence seen with the pragmas. Why do the pragmas bring such a difference?

Sorry, I didn't think of creating a new thread.

~BR

Sergey_M_Intel1 · ‎11-02-2009

Quoting - srimks

Hello,

Thanks for your valuable input.

I have new query similar to above as below -

(a) For multiple C++ package file, when I do vectorizations (calling of pragma's) within that file within section of code, I get starting and ending asm as -
{
44d960: 55 push %rbp
44d961: 48 83 ec 50 sub $0x50,%rsp
44d965: 49 89 f0 mov %rsi,%r8
44d968: 4c 63 c9 movslq %ecx,%r9
...

...
44dc84: 48 83 c4 50 add $0x50,%rsp
44dc88: 5d pop %rbp
44dc89: c3 retq
44dc8a: 90 nop
44dc8b: 48 8d 74 26 00 lea 0x0(%rsi),%rsi
}

(b) But the same code w/o using any pragma's call, the starting & ending asm are as -
{
44d960: 48 83 ec 68 sub $0x68,%rsp
44d964: 49 89 f9 mov %rdi,%r9
44d967: 49 89 d0 mov %rdx,%r8
44d96a: 4c 63 d1 movslq %ecx,%r10
..
..
..
44dc4e: 48 83 c4 68 add $0x68,%rsp
44dc52: c3 retq
44dc53: 90 nop
44dc54: 48 8d 74 26 00 lea 0x0(%rsi),%rsi
44dc59: 48 8d bf 00 00 00 00 lea 0x0(%rdi),%rdi
}
---

Query:
(1) Can the difference between having the PUSH/POP pair with the vectorization pragmas and not having it without them be explained?

(2) Without the pragmas, the asm in (b) has two "lea" instructions at the end, and at the start it has sub, mov, mov & movslq rather than the sequence seen with the pragmas. Why do the pragmas bring such a difference?

Sorry, I didn't think of creating a new thread.

~BR

The prolog/epilog sequences (i.e. what you refer to as "starting and ending asm") can indeed be very sensitive to the other code in the routine and to the level of optimization applied. A couple of simple examples of what can change them are different register pressure and/or different alignment constraints for the local variables. You should not, however, expect to be able to reason about why one sequence or another was chosen - it is completely implementation dependent. You may always make a nice guess, but it would be incorrect to make any assumptions based on such a guess.

BTW, the "lea" instructions you are seeing at the end are just NOPs that are never executed; they are there for code alignment purposes only.