#include <iostream>
using namespace std;
int main()
{
/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/
__declspec(align(16)) double onevec[2], xvec[2], res[2];
xvec[0] = 0.5;
xvec[1] = 0.7;
onevec[0] = 1;
onevec[1] = 1;
__asm
{
mov eax,1; // counter initialization
movapd xmm0,onevec; // onevec = {1, 1} in xmm0
movapd xmm1,xvec; // xvec = {x1, x2} (2 values of the variable)
movapd xmm2,xmm0; // factorial initialization: factor = {1, 1} in xmm2
movapd xmm3,xmm1; // series linear term: {x1, x2} in xmm3
movapd xmm4,xmm0; // sum initialization: sum = {1, 1} in xmm4
addpd xmm4,xmm3; // 1st iteration: sum = {1,1} + {x1, x2}
itloop: inc eax; // next iteration
addpd xmm2,xmm0; // {n, n} -> {n+1, n+1}
mulpd xmm3,xmm1; // x^n -> x^(n+1) for both x1 & x2
divpd xmm3,xmm2; // x^(n+1)/n! -> x^(n+1)/(n+1)!
addpd xmm4,xmm3; // sum = sum + x^(n+1)/(n+1)! for both x1 & x2 ->
// iteration completed
cmp eax,6; // loop for eax = 2, 3, ... , 7
jle itloop;
movapd res,xmm4; // save result
}
cout << res[0] << " " << res[1] << endl;
return 0;
}
and its disassembly:
/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/
__declspec(align(16)) double onevec[2], xvec[2], res[2];
xvec[0] = 0.5;
011414C0 fld qword ptr [__real@3fe0000000000000 (1147858h)]
011414C6 fstp qword ptr [xvec]
xvec[1] = 0.7;
011414C9 fld qword ptr [__real@3fe6666666666666 (1147848h)]
011414CF fstp qword ptr [ebp-38h]
onevec[0] = 1;
011414D2 fld1
011414D4 fstp qword ptr [onevec]
onevec[1] = 1;
011414D7 fld1
011414D9 fstp qword ptr [ebp-18h]
__asm
{
mov eax,1; // counter initialization
011414DC mov eax,1
movapd xmm0,onevec; // onevec = {1, 1} in xmm0
011414E1 movapd xmm0,xmmword ptr [onevec]
movapd xmm1,xvec; // xvec = {x1, x2} (2 values of the variable)
011414E6 movapd xmm1,xmmword ptr [xvec]
movapd xmm2,xmm0; // factorial initialization: factor = {1, 1} in xmm2
011414EB movapd xmm2,xmm0
movapd xmm3,xmm1; // series linear term: {x1, x2} in xmm3
011414EF movapd xmm3,xmm1
movapd xmm4,xmm0; // sum initialization: sum = {1, 1} in xmm4
011414F3 movapd xmm4,xmm0
addpd xmm4,xmm3; // 1st iteration: sum = {1,1} + {x1, x2}
011414F7 addpd xmm4,xmm3
itloop: inc eax; // next iteration
011414FB inc eax
addpd xmm2,xmm0; // {n, n} -> {n+1, n+1}
011414FC addpd xmm2,xmm0
mulpd xmm3,xmm1; // x^n -> x^(n+1) for both x1 & x2
01141500 mulpd xmm3,xmm1
divpd xmm3,xmm2; // x^(n+1)/n! -> x^(n+1)/(n+1)!
01141504 divpd xmm3,xmm2
addpd xmm4,xmm3; // sum = sum + x^(n+1)/(n+1)! for both x1 & x2 ->
01141508 addpd xmm4,xmm3
// iteration completed
cmp eax,6; // loop for eax = 2, 3, ... , 7
0114150C cmp eax,6
jle itloop;
0114150F jle itloop (11414FBh)
movapd res,xmm4; // save result
01141511 movapd xmmword ptr [res],xmm4
}
Of course, it strictly conforms to the inline assembly code! (how could it be different?).
Now, here is the intrinsics-based code as I was able to write it:
#include <iostream>
#include <emmintrin.h>
using namespace std;
int main()
{
/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/
__declspec(align(16)) double res[2];
const int imax = 6;
double x1 = 0.5;
double x2 = 0.7;
double w = 1;
register int i = 1;
double acc = 0.001;
__m128d one = _mm_set1_pd(w); // one = {1, 1}
__m128d x = _mm_set_pd(x2, x1); // x = {x1, x2}
__m128d factor = one; // factorial initialization: factor = {1, 1}
__m128d sum = one; // sum initialization: sum = {1, 1}
__m128d term = x; // linear term: {x1, x2}
sum = _mm_add_pd(sum, term); // 1st iteration: sum = {1+x1, 1+x2}
do
{
i++;
factor = _mm_add_pd(factor, one); // {n, n} -> {n+1, n+1}
term = _mm_mul_pd(term, x); // x^n -> x^(n+1)
term = _mm_div_pd(term, factor); // x^(n+1)/n! -> x^(n+1)/(n+1)!
sum = _mm_add_pd(sum, term); // sum -> sum + x^(n+1)/(n+1)!
} while(i<=imax);
_mm_store_pd(res, sum);
cout << res[0] << " " << res[1] << endl;
return 0;
}
and its disassembly:
/***** computes e^x = 1 + x + x^2/2! + x^3/3! + ... *****/
__declspec(align(16)) double res[2];
const int imax = 6;
002A14C0 mov dword ptr [imax],6
double x1 = 0.5;
002A14C7 fld qword ptr [__real@3fe0000000000000 (2A7868h)]
002A14CD fstp qword ptr [x1]
double x2 = 0.7;
002A14D0 fld qword ptr [__real@3fe6666666666666 (2A7858h)]
002A14D6 fstp qword ptr [x2]
double w = 1;
002A14D9 fld1
002A14DB fstp qword ptr [w]
register int i = 1;
002A14DE mov dword ptr [i],1
double acc = 0.001;
002A14E5 fld qword ptr [__real@3f50624dd2f1a9fc (2A7838h)]
002A14EB fstp qword ptr [acc]
__m128d one = _mm_set1_pd(w); // one = {1, 1}
002A14EE movsd xmm0,mmword ptr [w]
002A14F3 shufpd xmm0,xmm0,0
002A14F8 movapd xmmword ptr [ebp-2C0h],xmm0
002A1500 movapd xmm0,xmmword ptr [ebp-2C0h]
002A1508 movapd xmmword ptr [one],xmm0
__m128d x = _mm_set_pd(x2, x1); // x = {x1, x2}
002A1510 movsd xmm0,mmword ptr [x1]
002A1515 movsd xmm1,mmword ptr [x2]
002A151A unpcklpd xmm0,xmm1
002A151E movapd xmmword ptr [ebp-2A0h],xmm0
002A1526 movapd xmm0,xmmword ptr [ebp-2A0h]
002A152E movapd xmmword ptr [x],xmm0
__m128d factor = one; // factorial initialization: factor = {1, 1}
002A1536 movapd xmm0,xmmword ptr [one]
002A153E movapd xmmword ptr [factor],xmm0
__m128d sum = one; // sum initialization: sum = {1, 1}
002A1546 movapd xmm0,xmmword ptr [one]
002A154E movapd xmmword ptr [sum],xmm0
__m128d term = x; // linear term: {x1, x2}
002A1556 movapd xmm0,xmmword ptr [x]
002A155E movapd xmmword ptr [term],xmm0
sum = _mm_add_pd(sum, term); // 1st iteration: sum = {1+x1, 1+x2}
002A1566 movapd xmm0,xmmword ptr [term]
002A156E movapd xmm1,xmmword ptr [sum]
002A1576 addpd xmm1,xmm0
002A157A movapd xmmword ptr [ebp-280h],xmm1
002A1582 movapd xmm0,xmmword ptr [ebp-280h]
002A158A movapd xmmword ptr [sum],xmm0
do
{
i++;
002A1592 mov eax,dword ptr [i]
002A1595 add eax,1
002A1598 mov dword ptr [i],eax
factor = _mm_add_pd(factor, one); // {n, n} -> {n+1, n+1}
002A159B movapd xmm0,xmmword ptr [one]
002A15A3 movapd xmm1,xmmword ptr [factor]
002A15AB addpd xmm1,xmm0
002A15AF movapd xmmword ptr [ebp-260h],xmm1
002A15B7 movapd xmm0,xmmword ptr [ebp-260h]
002A15BF movapd xmmword ptr [factor],xmm0
term = _mm_mul_pd(term, x); // x^n -> x^(n+1)
002A15C7 movapd xmm0,xmmword ptr [x]
002A15CF movapd xmm1,xmmword ptr [term]
002A15D7 mulpd xmm1,xmm0
002A15DB movapd xmmword ptr [ebp-240h],xmm1
002A15E3 movapd xmm0,xmmword ptr [ebp-240h]
002A15EB movapd xmmword ptr [term],xmm0
term = _mm_div_pd(term, factor); // x^(n+1)/n! -> x^(n+1)/(n+1)!
002A15F3 movapd xmm0,xmmword ptr [factor]
002A15FB movapd xmm1,xmmword ptr [term]
002A1603 divpd xmm1,xmm0
002A1607 movapd xmmword ptr [ebp-220h],xmm1
002A160F movapd xmm0,xmmword ptr [ebp-220h]
002A1617 movapd xmmword ptr [term],xmm0
sum = _mm_add_pd(sum, term); // sum -> sum + x^(n+1)/(n+1)!
002A161F movapd xmm0,xmmword ptr [term]
002A1627 movapd xmm1,xmmword ptr [sum]
002A162F addpd xmm1,xmm0
002A1633 movapd xmmword ptr [ebp-200h],xmm1
002A163B movapd xmm0,xmmword ptr [ebp-200h]
002A1643 movapd xmmword ptr [sum],xmm0
} while(i<=imax);
002A164B cmp dword ptr [i],6
002A164F jle main+102h (2A1592h)
_mm_store_pd(res, sum);
002A1655 movapd xmm0,xmmword ptr [sum]
002A165D movapd xmmword ptr [res],xmm0
It seems clear that, with the intrinsics instructions I used, the variables are saved to memory and reloaded into xmm registers at each step of the loop. This results in a loop of 29 machine instructions, whereas the inline asm loop contains only 7.
I am anything but a programming expert (in fact, I am a physicist). Comments from experts would be appreciated. Are there intrinsics instructions that would avoid writing intermediate variable values to memory? Another question: Visual Studio C++ does not accept inline assembly in x64. How about the Intel compilers?
So what I'd recommend is this:
1) Check your algorithm. A better algorithm is by far the easiest way to get a performance improvement.
2) Use somebody else's code :). Performance libraries like MKL (which includes BLAS, LAPACK, etc.), VML, and IPP can save you a tremendous amount of time by providing expertly-optimized implementations of many common (and many not-so-common) mathematical and statistical methods. Don't fool yourself into thinking you can write faster matrix code than you can find in these libraries!
3) Look for parallelization opportunities. Cilk and TBB (available with Intel Parallel Studio) make it pretty easy to write simple parallel code (as do the parallel extensions in VS2010). Avoid tricky parallel code: you can spend hours debugging incredibly subtle race conditions and locking problems if you try to get too clever. MKL, VML, and IPP often use parallelization automatically (they use OpenMP internally).
4) If you must write the core code yourself, write it first in C/C++. This is easier to get correct, gets you a working program sooner, and later gives you a reference to check the correctness of your optimized code.
5) Profile the code running on a realistic data set. Find your hotspots (usually in the innermost loops).
6) Optimize those hotspots in order of priority using intrinsics. Compare the results produced by the intrinsics to the results produced by your C/C++ code, and compare the performance too. Note that it may be difficult to achieve a performance improvement: the compiler is often very good at optimizing C/C++ code directly. You may have a better chance of improving performance by improving locality of access (i.e. using blocked/tiled algorithms to improve cache utilization) than by writing with intrinsics. (A minimal sketch of blocking follows this list.)
7) Only in extreme cases rewrite code to use inline assembly. Again, if you're not a proficient ASM programmer then you probably won't get a performance improvement.
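To make the blocking remark in point 6 concrete, here is a minimal sketch of a tiled access pattern, using a matrix transpose as the example; the tile size BS and the transpose_blocked kernel are illustrative assumptions, not code from any particular library:

#include <algorithm>

// Cache-blocked (tiled) transpose: walk the matrix in BS x BS tiles so the
// touched rows of both source and destination stay resident in cache while
// a tile is processed. BS = 64 is an illustrative, tunable choice.
const int BS = 64;

void transpose_blocked(const double* a, double* b, int n)
{
    for (int ii = 0; ii < n; ii += BS)
        for (int jj = 0; jj < n; jj += BS)
            for (int i = ii; i < std::min(ii + BS, n); ++i)
                for (int j = jj; j < std::min(jj + BS, n); ++j)
                    b[j * n + i] = a[i * n + j];   // b = transpose of a
}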
BTW, the Intel compiler does accept inline assembly in x64 (unlike the VS compiler). If you're looking for the fastest code, I'd recommend using the Intel compiler - Parallel Studio integrates fairly seamlessly within Visual Studio and includes Cilk, TBB, and the IPP library as well as a very nice performance profiling environment. Parallel Studio Ex (a little pricier) adds MKL, VML, and an even more capable performance profiling tool (VTune) - among other things.
Hope this helps!
I agree that modern Intel compilers produce better optimized code than we can write by hand. BTW, I'd be curious to see what improvement they could bring to my very short inline asm code, apart from loop unrolling. At my university, the Intel compilers are installed at a central facility and access is mainly via the command line, which makes debugging somewhat unpleasant. I'll try to get a disassembly. Anyway, it's just curiosity!
As much as possible, I try to use existing function libraries; I don't want to reinvent the wheel! Unfortunately, I often don't find what I'm looking for. In my present research, I use Legendre toroidal functions and so far haven't found anything up to date.
Many thanks again. I hope I didn't take too much of your time.
void vexp(double* __restrict ax, double* __restrict ares, int n)
{
    const int imax = 6;
    #pragma simd
    for (int k = 0; k < n; k++) {
        double x = ax[k];
        double term = x;
        double factor = 1.0;
        double sum = 1.0 + term;
        for (int i = 0; i < imax; i++) {
            factor += 1.0;
            term = term * x / factor;
            sum += term;
        }
        ares[k] = sum;
    }
}
The __restrict keyword tells the compiler that the ax and ares array pointers don't alias (i.e. don't overlap) - this can significantly improve efficiency. The #pragma simd directive is a suggestion to the compiler to use SIMD instructions for the code that follows. You'll probably find that you can create efficient code more easily by learning about using these kinds of keywords and directives in standard C/C++ code than by using intrinsics: if you try to insert your intrinsics-based code in the sample above, you will probably find that you just slow things down.
BTW, if you substitute float for double in the sample, the efficiency goes up dramatically: not only is there half as much data to move, there is a SIMD reciprocal approximation instruction for floats that is much faster than the division instruction - the compiler knows this, and uses it when possible. You can learn a lot by looking at the assembly language generated by the compiler!
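To illustrate the float reciprocal remark (a sketch only; the helper name is made up): the SSE instruction rcpps gives roughly 12 bits of precision, so compilers typically pair it with one Newton-Raphson refinement step, along these lines:

#include <xmmintrin.h>

// Approximate a/b for four packed floats: rcpps gives r ~= 1/b with about
// 12 bits of precision; one Newton-Raphson step r' = r*(2 - b*r) roughly
// doubles that, and a*r' then approximates the quotient.
__m128 fast_div_ps(__m128 a, __m128 b)
{
    __m128 r = _mm_rcp_ps(b);
    r = _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(b, r)));
    return _mm_mul_ps(a, r);
}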
So I guess this just reinforces a point I made earlier: always start with a simple non-optimized implementation of your algorithm first so you can verify correctness as you optimize! And don't assume helpful little things like #pragma simd are as simple and foolproof as they look :).
If anyone knows why #pragma simd is failing in such a simple case, I'd love to hear about it! The documentation is quite unhelpful - I tried a couple of things but couldn't come up with a way to coerce the compiler to generate correct code without manually unrolling the inner loop. It's a little worrisome that this directive changes the semantics of the loop code without issuing any warnings.
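For reference, the manual unrolling mentioned above might look like the following sketch (assuming imax = 6 as in the earlier examples, so the series runs through x^7/7!):

// vexp with the inner series loop unrolled by hand; each term is derived
// from the previous one, with the factorials folded into the constants.
void vexp_unrolled(double* __restrict ax, double* __restrict ares, int n)
{
    for (int k = 0; k < n; ++k) {
        double x  = ax[k];
        double t2 = x  * x / 2.0;   // x^2/2!
        double t3 = t2 * x / 3.0;   // x^3/3!
        double t4 = t3 * x / 4.0;   // x^4/4!
        double t5 = t4 * x / 5.0;   // x^5/5!
        double t6 = t5 * x / 6.0;   // x^6/6!
        double t7 = t6 * x / 7.0;   // x^7/7!
        ares[k] = 1.0 + x + t2 + t3 + t4 + t5 + t6 + t7;
    }
}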
There is a separate pragma for unroll-and-jam, in case that was the intent for putting the #pragma simd in the position shown. The usual usage for promotion of vector reduction would be to put #pragma simd on the inner reduction loop and specify the reduction and private variables. Of course, this example isn't suitable for vector reduction.
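For readers unfamiliar with that idiom, here is a minimal sketch of #pragma simd on an inner reduction loop (Intel-compiler specific; the dot product is just a stand-in example):

// The reduction clause names the accumulator explicitly, licensing the
// compiler to reorder the partial sums across SIMD lanes.
double dot(const double* a, const double* b, int n)
{
    double s = 0.0;
#pragma simd reduction(+:s)
    for (int i = 0; i < n; ++i)
        s += a[i] * b[i];
    return s;
}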
1. Regarding the number of terms: the number of series terms used to calculate a power of the Exponential constant affects accuracy and has to be taken into account;
2. Performance evaluations are as follows for 12 terms calculated:
( 10,000,000 calls for every implementation / 1 sec = 1000 ticks / All Optimizations disabled )
- Inline Assembler ~= 5,516 ticks
- Intrinsic Functions ~= 10,828 ticks
- Recursive in C ~= 19,188 ticks
3. A couple of words about the accuracy of calculations:
Let's assume that the true value of the Exponential constant is 2.718281828459045 (up to 15 digits).
Then, if x equals 1, here are calculations with different numbers of terms in the Exponential series for the 'double' data type (FPU init-settings: Default):
Dbl Epsilon : 0.0000000000000002220446
Exp-constant is : 2.7182818284590450000000
x=1.0 -> E^ 1.0 is: 2.7182818284590451000000 Number of Terms: 18
Note: limitation of the IEEE 754 standard for the double data type reached
x=1.0 -> E^ 1.0 is: 2.7182818284590451000000 Number of Terms: 17 - 15 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284590420000000 Number of Terms: 16 - 14 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284589945000000 Number of Terms: 15 - 11 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284582297000000 Number of Terms: 14 - 11 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818284467589000000 Number of Terms: 13 - 10 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818282861687000000 Number of Terms: 12 - 9 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818261984929000000 Number of Terms: 11 - 8 digits matched
x=1.0 -> E^ 1.0 is: 2.7182818011463845000000 Number of Terms: 10 - 7 digits matched
x=1.0 -> E^ 1.0 is: 2.7182815255731922000000 Number of Terms: 9 - 6 digits matched
x=1.0 -> E^ 1.0 is: 2.7182787698412696000000 Number of Terms: 8 - 4 digits matched
x=1.0 -> E^ 1.0 is: 2.7182539682539684000000 Number of Terms: 7 - 4 digits matched
x=1.0 -> E^ 1.0 is: 2.7180555555555554000000 Number of Terms: 6 - 3 digits matched
x=1.0 -> E^ 1.0 is: 2.7166666666666668000000 Number of Terms: 5 - 2 digits matched
In order to provide the best double-precision accuracy of the Exponential constant compared to the true value, the calculation has to be done with at least 17 terms.
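A quick sanity check of that claim (a rough bound assuming exact arithmetic, not part of the original measurements): for x = 1 the truncation error after n terms is the series tail, which is bounded geometrically,

\[
\sum_{k=n}^{\infty} \frac{1}{k!} \;<\; \frac{1}{n!}\cdot\frac{n+1}{n},
\qquad
\sum_{k=17}^{\infty} \frac{1}{k!} \approx 3.0\times 10^{-15},
\qquad
\sum_{k=18}^{\infty} \frac{1}{k!} \approx 1.7\times 10^{-16}.
\]

So 17 terms leave an error of a few parts in 10^15, and 18 terms push the truncation error below one ulp of e (about 4.4e-16), consistent with the table above.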
In case of calculations up to the 17th term, the accuracy is:
Abs Error: 0.000000000000000
Rel Error : 0.000000000000000
Pct Error : 0.000000000000000 %
In case of calculations up to the 16th term, the accuracy is:
Abs Error: -0.000000000000003
Rel Error : -0.000000000000001
Pct Error : -0.000000000000114 %
4. Regarding optimizations: of course, vectorization is a good thing, but it looks like everybody forgot some very simple ways to optimize an algorithm...
- The classic expression to calculate the Exponential constant using series, that is
e^x = 1 + x + (x^2)/2! + (x^3)/3! + (x^4)/4! + ... ( *1 )
could be normalized in order to reduce the number of multiplications when calculating powers of x;
- Factorials could be replaced by constants, 2!=2, 3!=6, 4!=24, etc., in order to speed up calculations;
- Calculations could be done backwards, starting with the higher terms (smaller values), in order to minimize rounding errors and improve the accuracy of the final value;
- The instruction 'mov eax, 1' could be replaced with the shorter 'xor eax, eax' followed by 'inc eax'.
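A small sketch combining two of those points, precomputed reciprocal factorials and backwards summation (the helper name exp7 and the fixed 8-term count are assumptions made for illustration):

// e^x through x^7/7!: 1/k! is precomputed, and the terms are accumulated
// backwards (smallest first, for x around 1) to reduce rounding error.
double exp7(double x)
{
    static const double inv_fact[8] =
        { 1.0, 1.0, 1.0/2, 1.0/6, 1.0/24, 1.0/120, 1.0/720, 1.0/5040 };
    double p[8];
    p[0] = 1.0;
    for (int k = 1; k < 8; ++k)
        p[k] = p[k-1] * x;              // p[k] = x^k
    double sum = 0.0;
    for (int k = 7; k >= 0; --k)
        sum += p[k] * inv_fact[k];      // x^k/k!, smallest terms first
    return sum;
}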
5. Did you know that the Exponential constant can be calculated using Horner's Rule?
Here is an example with the same accuracy as the classic expression (*1) up to the 12th term:
...
dExpCalcV = ((((((((((( 1.0L + 1.0L / 12.0L ) / 11.0L + 1.0L ) / 10.0L + 1.0L ) / 9.0L +
1.0L ) / 8.0L + 1.0L ) / 7.0L + 1.0L ) / 6.0L +
1.0L ) / 5.0L + 1.0L ) / 4.0L + 1.0L ) / 3.0L + 1.0L ) / 2.0L + 1.0L ) + 1.0L;
...
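The same scheme extends to a general argument x; a sketch (the 12-term count mirrors the constant expression above):

// Horner evaluation of 1 + x + x^2/2! + ... + x^12/12!, working outward
// from the innermost factor: no explicit powers or factorials are formed.
double exp_horner(double x)
{
    double s = 1.0;
    for (int k = 12; k >= 1; --k)
        s = 1.0 + s * x / (double)k;
    return s;
}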
6. Here are some real results:
Best regards,
Sergey
I detected a little issue in Sub-Test 4 (for Intrinsic Functions) and the corrected results are as follows:
...
Sub-Test 4 - Using Intrinsic Functions
Exp-constant is : 2.718281828459045100
Intrinsic-Based completed in: 11844 ticks
Exp-constant is : 2.718281828286168700 Rel Error: -0.000000000063597666 ( 12 terms accuracy )
Exp-constant is : 2.718281828286168700 Rel Error: -0.000000000063597666 ( 12 terms accuracy )
...
In the case of calculating the Exponential constant using the series expression up to the 12th term, the accuracy of all three implementations is the same:
...
...Rel Error: -0.000000000063597666 ( 12 terms accuracy )...
...
Thanks again for your help.
That's a "sad" reality of modern software development: software developers, even experienced ones (!!!), are too reliant on optimizations done by a C/C++ compiler.
When there are 1,000 lines of code in C, C++, or assembler instead of 100, it's an illusion that a C/C++ compiler will make that code significantly faster by enabling some "cool" optimization(s). Of course, it could make it faster, but not dramatically.
I'm forced to do highly portable programming due to the strict requirements of my current project, and when it comes to optimizations they are always disabled in both Debug and Release configurations.
>>I'm forced to do highly portable programming due to the strict requirements of my current project, and when it comes to optimizations they are always disabled in both Debug and Release configurations.
Options such as Intel /fp:source and options to turn off interprocedural analysis are provided to meet requirements of portability.
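For example, on Windows the corresponding Intel compiler invocation would be along these lines (the file name and the exact switch set are assumptions):

icl /O2 /fp:source /Qipo- vexp.cpp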
>>to meet requirements of portability...
That's good to know. Thank you for your feedback, Tim!
