Vectorization in C++

velvia · ‎06-01-2014

Hi,

The following code vectorizes nicely in Fortran, but I can't make it vectorize in C++.

! Sums the harmonic series 1/1 + 1/2 + ... + 1/n in a single accumulator
! and prints the result with list-directed output.
program main
    implicit none

    ! Kind parameter used for every real in this program.
    ! NOTE(review): kind 8 is presumably an 8-byte (double precision) real
    ! with ifort, the compiler named in this thread; kind numbers are not portable.
    integer, parameter :: dp = 8
    ! Number of terms: 2 * 10**9 = 2,000,000,000.
    ! NOTE(review): assumes the default integer is at least 32 bits wide.
    integer, parameter :: n = 2 * 10**9

    real(dp) :: somme   ! running sum ("somme" is French for "sum")
    integer :: i        ! loop index, default integer kind

    somme = 0.0_dp
    ! Each iteration converts the integer index to real(dp) and adds its
    ! reciprocal to the accumulator.
    do i = 1, n
        somme = somme + 1.0_dp / real(i, dp)
    end do

    ! List-directed write of the final sum to the default output unit.
    write (*,*) somme
end program main

And the C++ version is the following:

#include <iostream>

// Sums the harmonic series 1/1 + 1/2 + ... + 1/n in a single double
// accumulator and prints the result to std::cout.
// argc and argv are not used by the body.
int main (int argc, char const *argv[])
{
	using namespace std;
	
	// Number of terms (2,000,000,000), held in a signed int.
	const int n {2000000000};
	// Running sum ("somme" is French for "sum").
	double somme {0.0};
	// The index is a size_t while the bound is an int, so the comparison
	// converts n to the unsigned type before comparing.
	// NOTE(review): the Fortran version of this loop uses a default integer
	// index; the wider unsigned index here may be relevant to the
	// vectorization difference discussed in this thread -- not confirmed.
	for(size_t i = 1; i <= n; ++i)
	{
		// Convert the index to double and accumulate its reciprocal.
		somme += 1.0 / static_cast<double>(i);
	}
	// Prints the sum; no newline or explicit flush is written.
	cout << somme;
	
	return 0;
}

I compile the Fortran version with ifort -Ofast and the C++ version with icpc -Ofast. The Fortran version is 2x faster as it uses vector instructions (checked in the assembly code), but the C++ version does not. Is there a reason for this?

Best regards,

François

Bernard · ‎06-01-2014

Can you post disassembly of C++ version?

velvia · ‎06-02-2014

Hi Iliyapolak,

Here is the disassembly of the Fortran version :

# Disassembly of the Fortran build (AT&T operand order: source, destination).
# Block 1: set up the frame, align %rsp down to a 128-byte boundary, reserve
# 0x80 bytes, then call the Intel runtime feature-init routine with
# %edi = 3 and %rsi = 0.
Block 1:
pushq  %rbp
mov %rsp, %rbp
and $0xffffffffffffff80, %rsp
sub $0x80, %rsp
mov $0x0, %rsi
mov $0x3, %edi
callq  0x46e8b0 <__intel_new_feature_proc_init>
# Block 2: store MXCSR to the stack, OR in 0x8040 (bits 15 and 6, i.e.
# flush-to-zero and denormals-are-zero in the MXCSR layout), reload it,
# then call for_set_reentrancy with %edi = 0x47e290.
Block 2:
stmxcsrl  (%rsp)
mov $0x47e290, %edi
orl  $0x8040, (%rsp)
ldmxcsrl  (%rsp)
callq  0x406370 <for_set_reentrancy>
# Block 3: loop setup.
#   %xmm8 is zeroed and copied into %xmm7, %xmm9, %xmm5, %xmm4, %xmm3, %xmm2
#   and %xmm1: eight packed-double accumulators, all starting at 0.
#   %xmm6 = the 32-bit value 2 broadcast to all four lanes (index step).
#   %xmm0 = 0x200000001 in its low 64 bits, i.e. 32-bit lanes {1, 2}: the
#   first two loop indices.
#   %eax = 0 is the count of terms processed so far.
Block 3:
mov $0x2, %eax
mov $0x200000001, %rdx
pxor %xmm8, %xmm8
movaps %xmm8, %xmm7
movaps %xmm8, %xmm9
movaps %xmm8, %xmm5
movaps %xmm8, %xmm4
movd %eax, %xmm0
movaps %xmm8, %xmm3
pshufd $0x0, %xmm0, %xmm6
movaps %xmm8, %xmm2
movaps %xmm8, %xmm1
movq %rdx, %xmm0
xor %eax, %eax
# Block 4: vectorized loop body. The same four-step pattern is repeated eight
# times, interleaved by the scheduler:
#   cvtdq2pd : convert the two low 32-bit indices in %xmm0 to two doubles
#   movapsx  : load a 16-byte constant via a %rip-relative address
#              (presumably the packed literal 1.0 -- the data is not shown)
#   divpd    : constant / converted indices
#   addpd    : add the quotient into one of the eight accumulators
# paddd %xmm6, %xmm0 advances both indices by 2 between groups.
# %eax grows by 0x10 (16 terms) per iteration and the loop repeats while
# %eax is below 0x77359400 (2,000,000,000), using an unsigned compare (jb).
Block 4:
cvtdq2pd %xmm0, %xmm10
movapsx  0x7b5c7(%rip), %xmm11
paddd %xmm6, %xmm0
divpd %xmm10, %xmm11
cvtdq2pd %xmm0, %xmm12
addpd %xmm11, %xmm8
paddd %xmm6, %xmm0
add $0x10, %eax
cvtdq2pd %xmm0, %xmm14
paddd %xmm6, %xmm0
cmp $0x77359400, %eax
movapsx  0x7b597(%rip), %xmm13
cvtdq2pd %xmm0, %xmm10
divpd %xmm12, %xmm13
paddd %xmm6, %xmm0
movapsx  0x7b581(%rip), %xmm15
cvtdq2pd %xmm0, %xmm12
addpd %xmm13, %xmm7
divpd %xmm14, %xmm15
paddd %xmm6, %xmm0
cvtdq2pd %xmm0, %xmm14
addpd %xmm15, %xmm9
movapsx  0x7b55c(%rip), %xmm11
paddd %xmm6, %xmm0
divpd %xmm10, %xmm11
movapsx  0x7b54b(%rip), %xmm10
divpd %xmm14, %xmm10
cvtdq2pd %xmm0, %xmm15
addpd %xmm10, %xmm3
addpd %xmm11, %xmm5
paddd %xmm6, %xmm0
cvtdq2pd %xmm0, %xmm10
movapsx  0x7b526(%rip), %xmm13
paddd %xmm6, %xmm0
movapsx  0x7b51a(%rip), %xmm11
divpd %xmm12, %xmm13
movapsx  0x7b50d(%rip), %xmm12
divpd %xmm15, %xmm11
divpd %xmm10, %xmm12
addpd %xmm13, %xmm4
addpd %xmm11, %xmm2
addpd %xmm12, %xmm1
jb 0x402ccc <Block 4>
# Block 5: reduction and output.
#   The eight accumulators are added pairwise down into %xmm8.
#   unpckhpd moves the high double of %xmm8 into the low half of %xmm0, and
#   addsd adds it to the low double of %xmm8, giving the scalar total.
#   The total is stored at (%rsp) and for_write_seq_lis is called; the
#   remaining register and stack setup presumably forms that call's
#   arguments (its signature is not shown here).
Block 5:
addpd %xmm7, %xmm8
addpd %xmm5, %xmm9
addpd %xmm3, %xmm4
addpd %xmm1, %xmm2
addpd %xmm9, %xmm8
addpd %xmm2, %xmm4
addpd %xmm4, %xmm8
movaps %xmm8, %xmm0
lea 0x10(%rsp), %rdi
unpckhpd %xmm8, %xmm0
lea (%rsp), %r8
mov $0x1208384ff00, %rdx
mov $0x47e298, %ecx
mov $0xffffffff, %esi
xor %eax, %eax
movq  $0x0, 0x10(%rsp)
addsd %xmm0, %xmm8
movsdq  %xmm8, (%rsp)
callq  0x407730 <for_write_seq_lis>
# Block 6: set %eax = 1, restore the caller's stack and frame pointer, return.
Block 6:
mov $0x1, %eax
mov %rbp, %rsp
popq  %rbp
retq  
# Block 7: a nop after the return (presumably alignment padding).
Block 7:
nopl  %eax, (%rax)

Here is the disassembly of the C++ version

# Disassembly of the C++ build (AT&T operand order: source, destination).
# Block 1: set up the frame, align %rsp down to a 128-byte boundary, reserve
# 0x80 bytes, then call the Intel runtime feature-init routine with
# %edi = 3 and %rsi = 0.
Block 1:
pushq  %rbp
mov %rsp, %rbp
and $0xffffffffffffff80, %rsp
sub $0x80, %rsp
mov $0x0, %rsi
mov $0x3, %edi
callq  0x400cc0 <__intel_new_feature_proc_init>
# Block 2: store MXCSR to the stack, OR in 0x8040 (bits 15 and 6, i.e.
# flush-to-zero and denormals-are-zero in the MXCSR layout) and reload it.
#   %eax  = 1 : initial loop index
#   %xmm1 = 8-byte constant loaded via a %rip-relative address
#           (presumably the literal 1.0 -- the data is not shown)
#   %xmm0 = 0 : running sum
Block 2:
stmxcsrl  (%rsp)
mov $0x1, %eax
orl  $0x8040, (%rsp)
ldmxcsrl  (%rsp)
movsdq  0xe19(%rip), %xmm1
pxor %xmm0, %xmm0
# Block 3: scalar loop body, one term per iteration.
#   %xmm2 is zeroed before cvtsi2sd converts the 64-bit index in %rax into it.
#   %xmm3 = copy of the constant in %xmm1, then divided by the converted index.
#   The quotient is added to the running sum in %xmm0.
#   The index is incremented and the loop repeats while it is below or equal
#   to 0x77359400 (2,000,000,000), using an unsigned compare (jbe).
Block 3:
pxor %xmm2, %xmm2
movaps %xmm1, %xmm3
cvtsi2sd %rax, %xmm2
divsd %xmm2, %xmm3
inc %rax
addsd %xmm3, %xmm0
cmp $0x77359400, %rax
jbe 0x400c53 <Block 3>
# Block 4: call _ZNSolsEd, which demangles to std::ostream::operator<<(double).
# %xmm0 still holds the sum; %edi = 0x603840 is presumably the address of
# std::cout (the symbol is not shown in the listing).
Block 4:
mov $0x603840, %edi
callq  0x4009d0 <_ZNSolsEd>
# Block 5: set %eax = 0, restore the caller's stack and frame pointer, return.
Block 5:
xor %eax, %eax
mov %rbp, %rsp
popq  %rbp
retq  
# Block 6: nops after the return (presumably alignment padding).
Block 6:
nopl  %eax, (%rax,%rax,1)
nopl  %eax, (%rax,%rax,1)

MalReddy_Y_Intel · ‎06-02-2014

Hi,

In C++ the loop is vectorized for lower trip counts (i.e. n <= 1000000000). I can also reproduce the issue for the specified trip count, and I have escalated it to the compiler development team.

I will inform you when there is an update.

Thanks,

Reddy

Bernard · ‎06-02-2014

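Besides the vectorization, the Fortran disassembly also appears to use unrolling (eight packed divides per loop iteration, i.e. 8x on top of the 2-wide vectors, for 16 terms per iteration), together with RIP-relative addressing.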

TimP · ‎06-02-2014

I find it useful to specify unroll amount. With normal loop lengths, improvements in vector remainder in the most recent compilers allow more aggressive unrolling.

The ifort %rip references presumably are to data placed there before entering the displayed code.

Bernard · ‎06-02-2014

Those %RIP addresses are not incremented by 16 bytes. I think that those are some kind of temporary variables.

jimdempseyatthecove · ‎06-02-2014

The %rip is for instruction-pointer-relative addressing. This (presumably) is the relative address offset to the 1.0 literal constant. The address of the 1.0 is not changing (in other words, the sum of the instruction pointer and the relative offset does not change). I am surprised the Fortran optimization did not lift the 1.0 into a register outside the loop.

Jim Dempsey

Bernard · ‎06-02-2014

>>>This (presumably) is the relative address offset to the 1.0 literal constant.>>>

Yes it seems so. I was slightly confused because of AT&T syntax.

jimdempseyatthecove · ‎06-02-2014

Note that the C++ optimization did lift the 1.0 into a register outside the loop (though it did not unroll).

Jim Dempsey

Bernard · ‎06-03-2014

The Fortran version also made wise use of separate temporary variables (one accumulator per unrolled vector) for the summation while unrolling.