- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
the original purpose of my test was to compare the performance of a loop using pointer arithmetic to one using simple array indexing...
eg.
A = array; B = array2; C = array3;
for ( i=0;i<1000;i++ )
*A++ = *B++ * *C++;
versus
for ( i=0;i<1000;i++ )
array[i] = array2[i]*array3[i];
(The actual test code can be found in the attachment)
The test code was compiled with the gnu g++ compiler and the intel icc compiler, and several sets of optimization options were used...
g++ -march=athlon-xp -mfpmath=sse -o gcc-xp.o loop.cpp -c
icc -D__ICC__ loop.cpp -o icc-std
icc -D__ICC__ loop.cpp -o icc-o2 -O2
icc -march=pentium4 -mcpu=pentium4 -xW -tpp7 -D__ICC__ loop.cpp -o icc-p4
icc -march=pentiumpro -mcpu=pentiumpro -xi -tpp6 -D__ICC__ loop.cpp -o icc-pro
When I ran all code on a p4 machine, I got the strangest results... It seems like the icc-p4 optimized version does the multiplication about 30 times faster than other optimizations... (except gcc-p4, which is 2 times faster than eg gcc-std or icc-std, as I would expect if it uses SIMD to process 2 multiplications at the same time)
Can anyone give an explanation for the results I'm getting?
eg.
A = array; B = array2; C = array3;
for ( i=0;i<1000;i++ )
*A++ = *B++ * *C++;
versus
for ( i=0;i<1000;i++ )
array[i] = array2[i]*array3[i];
(The actual test code can be found in the attachment)
The test code was compiled with the gnu g++ compiler and the intel icc compiler, and several sets of optimization options were used...
g++ -march=athlon-xp -mfpmath=sse -o gcc-xp.o loop.cpp -c
icc -D__ICC__ loop.cpp -o icc-std
icc -D__ICC__ loop.cpp -o icc-o2 -O2
icc -march=pentium4 -mcpu=pentium4 -xW -tpp7 -D__ICC__ loop.cpp -o icc-p4
icc -march=pentiumpro -mcpu=pentiumpro -xi -tpp6 -D__ICC__ loop.cpp -o icc-pro
When I ran all code on a p4 machine, I got the strangest results... It seems like the icc-p4 optimized version does the multiplication about 30 times faster than other optimizations... (except gcc-p4, which is 2 times faster than eg gcc-std or icc-std, as I would expect if it uses SIMD to process 2 multiplications at the same time)
Can anyone give an explanation for the results I'm getting?
Link Copied
5 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
It seems I'm unable to attach C source, so here it is, loop.cpp
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cassert>
// Makes std names (cout, endl) usable without qualification in the code below.
using namespace std;
// NOTE(review): presumably the outer repeat counts of the timed loops (plain
// tests vs. multiplication tests); the loop headers are mangled in this copy
// of the post, so confirm against the original attachment.
#define REPITITIONS 1000
#define REPITITIONS_MUL 400
// Number of doubles in each of the three global arrays declared below.
#define ARRAY_SIZE 100000
// When compiled with -D__ICC__ the arrays are declared with
// __declspec(align(16)); otherwise they are plain global arrays.
#ifdef __ICC__
double __declspec(align(16)) array[ARRAY_SIZE];
double __declspec(align(16)) array2[ARRAY_SIZE];
double __declspec(align(16)) array3[ARRAY_SIZE];
#else
double array[ARRAY_SIZE];
double array2[ARRAY_SIZE];
double array3[ARRAY_SIZE];
#endif
void random_fill( void* buf )
{
srandom( 0xDEADFACE );
unsigned* b = (unsigned*) buf;
assert( sizeof( unsigned ) * 2 == sizeof( double ) );
int i;
for ( i=0;i b = random();
}
int main( void )
{
FILE* file;
unsigned start,end;
int i,j,k;
double *pD,*pA,*pB;
cout << "TEST 1 : zeroing out a REPITITIONS element array" << endl;
cout << " -- using standard loop" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k for ( j=0;j array = 0.0;
end = clock();
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
cout << " -- using pointer arithmetic" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k {
pD = array;
for ( j=0;j *pD++ = 1.0;
}
end = clock();
assert( pD == &array[ARRAY_SIZE] );
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
random_fill( array2 );
random_fill( array3 );
file = fopen( "output.txt","wb" );
cout << "TEST 2 : Vector multiplication " << endl;
cout << " -- using standard loop" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k for ( j=0;j array = array2 * array3;
end = clock();
fwrite( array, 1, ARRAY_SIZE * sizeof( double ), file );
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
cout << " -- using pointer arithmetic" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k {
pD = array; pA = array2; pB = array3;
for ( j=0;j *pD++ = *pA++ * *pB++;
}
end = clock();
fwrite( array, 1, ARRAY_SIZE * sizeof( double ), file );
assert( pD == &array[ARRAY_SIZE] );
assert( pA == &array2[ARRAY_SIZE] );
assert( pB == &array3[ARRAY_SIZE] );
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
fclose( file );
}
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cassert>
// Makes std names (cout, endl) usable without qualification in the code below.
using namespace std;
// NOTE(review): presumably the outer repeat counts of the timed loops (plain
// tests vs. multiplication tests); the loop headers are mangled in this copy
// of the post, so confirm against the original attachment.
#define REPITITIONS 1000
#define REPITITIONS_MUL 400
// Number of doubles in each of the three global arrays declared below.
#define ARRAY_SIZE 100000
// When compiled with -D__ICC__ the arrays are declared with
// __declspec(align(16)); otherwise they are plain global arrays.
#ifdef __ICC__
double __declspec(align(16)) array[ARRAY_SIZE];
double __declspec(align(16)) array2[ARRAY_SIZE];
double __declspec(align(16)) array3[ARRAY_SIZE];
#else
double array[ARRAY_SIZE];
double array2[ARRAY_SIZE];
double array3[ARRAY_SIZE];
#endif
void random_fill( void* buf )
{
srandom( 0xDEADFACE );
unsigned* b = (unsigned*) buf;
assert( sizeof( unsigned ) * 2 == sizeof( double ) );
int i;
for ( i=0;i
}
int main( void )
{
FILE* file;
unsigned start,end;
int i,j,k;
double *pD,*pA,*pB;
cout << "TEST 1 : zeroing out a REPITITIONS element array" << endl;
cout << " -- using standard loop" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k
end = clock();
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
cout << " -- using pointer arithmetic" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k
pD = array;
for ( j=0;j
}
end = clock();
assert( pD == &array[ARRAY_SIZE] );
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
random_fill( array2 );
random_fill( array3 );
file = fopen( "output.txt","wb" );
cout << "TEST 2 : Vector multiplication " << endl;
cout << " -- using standard loop" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k
end = clock();
fwrite( array, 1, ARRAY_SIZE * sizeof( double ), file );
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
cout << " -- using pointer arithmetic" << endl;
for ( i=0;i<9;i++ )
{
start = clock();
for ( k=0;k
pD = array; pA = array2; pB = array3;
for ( j=0;j
}
end = clock();
fwrite( array, 1, ARRAY_SIZE * sizeof( double ), file );
assert( pD == &array[ARRAY_SIZE] );
assert( pA == &array2[ARRAY_SIZE] );
assert( pB == &array3[ARRAY_SIZE] );
cout << end - start << " # ";
if ( i%3 == 2 )
cout << endl;
}
fclose( file );
}
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
The mystery lies here:
Example times, debug build, +/- 5%:
TEST 1 : zeroing out a REPITITIONS element array
-- using standard loop
1187 # 1531 # 1250 #
1203 # 1187 # 1188 #
1218 # 1188 # 1203 #
-- using pointer arithmetic
1188 # 1203 # 1265 #
1188 # 1219 # 1187 #
1187 # 1203 # 1203 #
TEST 2 : Vector multiplication
-- using standard loop
42188 # 39516 # 40453 #
40360 # 39797 # 39735 #
39468 # 38625 # 41547 #
-- using pointer arithmetic
39781 # 39984 # 42485 #
40203 # 39344 # 39266 #
38437 # 38437 # 38422 #
rand() generates 15bit unsigned integers; this means all 17 MSBs are zeroed; in double type, MSB contains sign, next 11 bits contain biased exponent. If exponent == 0 the number is denormal; denormals mean penalty for CPUs like Pentium 4.
Super-optimization you mentioned probably turns on DAZ (denormals are zero) mode, which causes the CPU to treat all denormals (and in your example all values, whole arrays) as zeros; of course, with no penalty in multiplication.
Proof - change random_fill() to following:
Example times, debug build, +/- 5%:
TEST 1 : zeroing out a REPITITIONS element array
-- using standard loop
1171 # 1516 # 1141 #
1140 # 1125 # 1156 #
1141 # 1140 # 1141 #
-- using pointer arithmetic
1140 # 1141 # 1140 #
1141 # 1140 # 1157 #
1140 # 1141 # 1141 #
TEST 2 : Vector multiplication
-- using standard loop
906 # 1250 # 921 #
922 # 922 # 922 #
922 # 906 # 922 #
-- using pointer arithmetic
922 # 907 # 922 #
922 # 922 # 921 #
906 # 922 # 907 #
Here all values are normals, so multiplication executes with no penalty, 43x speedup.
Regards,
Anna
P.S. To Intel web team: Any chance that here, in the forums, sequence leftbracket-i-rightbracket will be treated normally inside leftbracket-pre-rightbracket (preformatted) section?
// rand()-based port of the original random_fill(): overwrites the buffer,
// viewed as ARRAY_SIZE * 2 unsigned words, with raw rand() values after
// seeding the generator with a fixed constant.
// The [j] subscript was eaten by the forum software and is restored here;
// without it the statement assigns an int to the pointer itself.
void random_fill(void* buf) {
    srand(0xDEADFACE);
    unsigned* b = (unsigned*)buf;
    for (int j = 0; j < ARRAY_SIZE * 2; j++)
        b[j] = rand();
}
Example times, debug build, +/- 5%:
TEST 1 : zeroing out a REPITITIONS element array
-- using standard loop
1187 # 1531 # 1250 #
1203 # 1187 # 1188 #
1218 # 1188 # 1203 #
-- using pointer arithmetic
1188 # 1203 # 1265 #
1188 # 1219 # 1187 #
1187 # 1203 # 1203 #
TEST 2 : Vector multiplication
-- using standard loop
42188 # 39516 # 40453 #
40360 # 39797 # 39735 #
39468 # 38625 # 41547 #
-- using pointer arithmetic
39781 # 39984 # 42485 #
40203 # 39344 # 39266 #
38437 # 38437 # 38422 #
rand() generates 15bit unsigned integers; this means all 17 MSBs are zeroed; in double type, MSB contains sign, next 11 bits contain biased exponent. If exponent == 0 the number is denormal; denormals mean penalty for CPUs like Pentium 4.
Super-optimization you mentioned probably turns on DAZ (denormals are zero) mode, which causes the CPU to treat all denormals (and in your example all values, whole arrays) as zeros; of course, with no penalty in multiplication.
Proof - change random_fill() to following:
// Variant of random_fill() that stores each rand() result converted to double,
// one value per array element (ARRAY_SIZE elements), instead of writing raw
// bit patterns into the buffer.
// The [j] subscript was eaten by the forum software and is restored here;
// without it the statement assigns a double to the pointer itself.
void random_fill(void* buf) {
    srand(0xDEADFACE);
    double* d = (double*)buf;
    for (int j = 0; j < ARRAY_SIZE; j++)
        d[j] = (double)rand();
}
Example times, debug build, +/- 5%:
TEST 1 : zeroing out a REPITITIONS element array
-- using standard loop
1171 # 1516 # 1141 #
1140 # 1125 # 1156 #
1141 # 1140 # 1141 #
-- using pointer arithmetic
1140 # 1141 # 1140 #
1141 # 1140 # 1157 #
1140 # 1141 # 1141 #
TEST 2 : Vector multiplication
-- using standard loop
906 # 1250 # 921 #
922 # 922 # 922 #
922 # 906 # 922 #
-- using pointer arithmetic
922 # 907 # 922 #
922 # 922 # 921 #
906 # 922 # 907 #
Here all values are normals, so multiplication executes with no penalty, 43x speedup.
Regards,
Anna
P.S. To Intel web team: Any chance that here, in the forums, sequence leftbracket-i-rightbracket will be treated normally inside leftbracket-pre-rightbracket (preformatted) section?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
thank you very much... That's one thing I didn't consider ;-)
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
When you specify the -xW compiler option, which generates Pentium 4 specific instructions (SSE2) and enables automatic vectorization, the compiler sets the FTZ mode for SSE calculations, which flushes denormals to zero. There is also an intrinsic function provided to turn on FTZ mode for SSE/SSE2 instructions - see below.
Details available from the Intel document that describes the SSE/SSE2 registers, including FTZ mode: http://or1cedar.intel.com/media/pdf/appnotes/sse2/w_fp_precision.pdf, page 33.
Intrinsic example:
#include "xmmintrin.h"
/* Switches the flush-to-zero field of the MXCSR register to "on" while
   leaving every other control/status bit as it was. */
void SSE_ftz() {
    const unsigned int current = _mm_getcsr();
    _mm_setcsr((current & ~_MM_FLUSH_ZERO_MASK) | _MM_FLUSH_ZERO_ON);
}
regards,
john
Message Edited by intel.software.network.support on 12-09-2005 01:42 PM
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
the DAZ and FTZ flags are two distinct animals
enabling DAZ you can minimize the occurrence of assists further than with only FTZ (on some codes)
AFAIK the DAZ mode isn't very well documented though
it's the bit #6 of the MXCSR (FTZ being bit #15)
you can set it with something like :
beware to set it only on targets supporting the feature (will crashhhhh otherwise)
enabling DAZ you can minimize the occurrence of assists further than with only FTZ (on some codes)
AFAIK the DAZ mode isn't very well documented though
it's the bit #6 of the MXCSR (FTZ being bit #15)
you can set it with something like :
_mm_setcsr(_mm_getcsr() | 0x40);
beware to set it only on targets supporting the feature (will crashhhhh otherwise)
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page