- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Cross-posting from http://stackoverflow.com/q/15836542/90475
Here's my code... I intend to run IPP FFT over millions of 1D traces by calling Correlate(): **performance matters**.
In Visual Studio profiler, I can see IPP DFT calls taking a lion's share of time when running with a low number of processes.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using ipp;
namespace GregsFavNamespace
{
public class Correlation
{
readonly int _traceSampleCount, _sizeDftSpec, _sizeDftInitBuf, _sizeDftWorkBuf;
readonly byte[] _dftSpecBuf, _workBuffer;
readonly Ipp32fc[] _sweep_spectral_conjugate, _interleavedData_temporal, _interleavedData_spectral, _interleavedDataWithSweep_spectral, _interleavedData_temporalResult;
public Correlation(float[] sweep, int sweepSampleCount, int traceSampleCount, bool shouldNormalizeData = false, int sweepDelay = 0)
{
_traceSampleCount = traceSampleCount;
// Greg Chernis: the following initialization code is based on
// http://software.intel.com/en-us/articles/how-to-use-intel-ipp-s-1d-fourier-transform-functions
int ippDivisionAlgorithm = (int)FftDivisionFlag.IPP_FFT_DIV_INV_BY_N; // alternatively, use IPP_FFT_NODIV_BY_ANY
IppHintAlgorithm ippPerformanceHint = IppHintAlgorithm.ippAlgHintAccurate; // alternatively, use ippAlgHintFast
unsafe
{
IppStatus result;
fixed (int* p_sizeDftSpec = &_sizeDftSpec, p_sizeDftInitBuf = &_sizeDftInitBuf, p_sizeDftWorkBuf = &_sizeDftWorkBuf)
result = sp.ippsDFTGetSize_C_32fc(traceSampleCount, ippDivisionAlgorithm, ippPerformanceHint,
p_sizeDftSpec, p_sizeDftInitBuf, p_sizeDftWorkBuf);
if (result != IppStatus.ippStsNoErr)
throw new ApplicationException(result.ToString());
}
byte[] dftInitBuf = new byte[_sizeDftInitBuf];
_dftSpecBuf = new byte[_sizeDftSpec];
_workBuffer = new byte[_sizeDftWorkBuf];
unsafe
{
IppStatus result;
fixed (byte* p_dftInitBuf = dftInitBuf)
fixed (byte* p_dftSpecBuf = _dftSpecBuf)
{
var p_dftSpec = (IppsDFTSpec_C_32fc*)p_dftSpecBuf;
result = sp.ippsDFTInit_C_32fc(traceSampleCount, ippDivisionAlgorithm, ippPerformanceHint,
p_dftSpec, p_dftInitBuf);
}
if (result != IppStatus.ippStsNoErr)
throw new ApplicationException(result.ToString());
}
// Compute this sweep transformation once and re-use it as many times as we have traces to process
_sweep_spectral_conjugate = PrepareSweepForCorrelation(sweep, sweepSampleCount, sweepDelay);
// Pre-allocate these buffers, as they are frequently re-used: as many times as we have traces to process
_interleavedData_temporal = new Ipp32fc[traceSampleCount];
_interleavedData_spectral = new Ipp32fc[traceSampleCount];
_interleavedDataWithSweep_spectral = new Ipp32fc[traceSampleCount];
_interleavedData_temporalResult = new Ipp32fc[traceSampleCount];
}
private static float CalculateNormalizationFactor(float[] trace1, int traceSampleCount)
{
double normalizationFactor = 0.0;
for (uint i = 0; i < traceSampleCount; i++)
normalizationFactor += trace1 * trace1;
if (normalizationFactor == 0.0)
return 1.0f;
return (float)(1.0 / Math.Sqrt(normalizationFactor));
}
private Ipp32fc[] PrepareSweepForCorrelation(float[] sweep, int sweepSampleCount, int sweepDelay)
{
var sweep_temporal = new Ipp32fc[_traceSampleCount];
ApplySweepDelay(sweep, sweep_temporal, sweepSampleCount, sweepDelay);
return ComputeConjugateOfSweepSpectra(sweep_temporal, _traceSampleCount);
}
private static void ApplySweepDelay(float[] sweepSrc, Ipp32fc[] sweepDst, int sweepSampleCount, int delay)
{
if (delay > 0)
interleave4(sweepSrc.Skip(delay).ToArray(), null, sweepDst, sweepSampleCount - delay);
else if (delay < 0)
interleave4(sweepSrc, null, sweepDst.Skip(delay).ToArray(), sweepSampleCount - delay);
else if (delay == 0)
interleave4(sweepSrc, null, sweepDst, sweepSampleCount);
}
private unsafe Ipp32fc[] ComputeConjugateOfSweepSpectra(Ipp32fc[] sweep_temporal, int sweepSampleCount)
{
var sweep_spectral = new Ipp32fc[sweepSampleCount];
var sweep_spectral_conjugate = new Ipp32fc[sweepSampleCount];
fixed (Ipp32fc*
p_sweep_temporal = sweep_temporal,
p_sweep_spectral = sweep_spectral,
p_sweep_spectral_conjugate = sweep_spectral_conjugate)
{
fixed (byte* p_workBuffer = _workBuffer)
fixed (byte* p_dftSpecBuf = _dftSpecBuf)
{
var p_dftSpec = (IppsDFTSpec_C_32fc*)p_dftSpecBuf;
// Forward Fourier to spectra domain
sp.ippsDFTFwd_CToC_32fc(p_sweep_temporal, p_sweep_spectral, p_dftSpec, p_workBuffer);
}
// Conjugate
sp.ippsConj_32fc(p_sweep_spectral, p_sweep_spectral_conjugate, sweepSampleCount);
}
return sweep_spectral_conjugate;
}
public void Correlate(float[] trace1, float[] trace2, float[] trace1Result, float[] trace2Result)
{
// Blend traces 1 and 2 into _interleavedData_temporal (array of complex numbers for Complex-to-Complex Fourier)
interleave4(trace1, trace2, _interleavedData_temporal, _traceSampleCount);
CorrelateBothTraces();
// Get back correlated traces 1 and 2 from _interleavedData_spectral (array of complex numbers for Complex-to-Complex DFT)
uninterleave4(_interleavedData_temporalResult, trace1Result, trace2Result, _traceSampleCount);
}
unsafe private void CorrelateBothTraces()
{
// GC: performance is critical for the following three unmanaged calls:
// all arrays should be preallocated upfront
fixed (Ipp32fc* p_sweep_spectral_conjugate = _sweep_spectral_conjugate,
p_interleavedData_temporal = _interleavedData_temporal,
p_interleavedData_spectral = _interleavedData_spectral,
p_interleavedDataWithSweep_spectral = _interleavedDataWithSweep_spectral,
p_interleavedData_temporalResult = _interleavedData_temporalResult)
fixed (byte* p_workBuffer = _workBuffer, p_dftSpecBuf = _dftSpecBuf)
{
var p_dftSpec = (IppsDFTSpec_C_32fc*)p_dftSpecBuf;
// Forward Fourier to spectra domain
sp.ippsDFTFwd_CToC_32fc(p_interleavedData_temporal, p_interleavedData_spectral, p_dftSpec, p_workBuffer);
// Complex multiply conjugate of sweep spectra by blended trace data spectra
sp.ippsMul_32fc(p_sweep_spectral_conjugate, p_interleavedData_spectral, p_interleavedDataWithSweep_spectral, _traceSampleCount);
// Inverse Fourier to time domain, correlated
sp.ippsDFTInv_CToC_32fc(p_interleavedDataWithSweep_spectral, p_interleavedData_temporalResult, p_dftSpec, p_workBuffer);
}
}
static void interleave4(float[] reals, float[] imaginaries, Ipp32fc[] complexOutput, int sampleCount)
{
int i = 0;
if (reals != null && imaginaries != null)
{
for (i = 0; i < sampleCount; i++)
{
complexOutput.re = reals;
complexOutput.im = imaginaries;
}
}
else if (reals != null)
{
for (i = 0; i < sampleCount; ++i)
{
complexOutput.re = reals;
complexOutput.im = 0;
}
}
else if (imaginaries != null)
{
for (i = 0; i < sampleCount; ++i)
{
complexOutput.re = 0;
complexOutput.im = imaginaries;
}
}
}
static void uninterleave4(Ipp32fc[] complex, float[] realsOut, float[] imaginariesOut, int sampleCount)
{
if (realsOut != null && imaginariesOut != null)
{
for (int i = 0; i < sampleCount; i++)
{
realsOut = complex.re;
imaginariesOut = complex.im;
}
}
else if (realsOut != null)
{
for (int i = 0; i < sampleCount; i++)
realsOut = complex.re;
}
else if (imaginariesOut != null)
{
for (int i = 0; i < sampleCount; i++)
imaginariesOut = complex.im;
}
}
}
}
In Intel's IPP samples, there are generated P/Invoke signatures that I use to call the code from C# (I modified them slightly, but they still work):
using System;
using System.Security;
using System.Runtime.InteropServices;
namespace ipp {
public enum IppStatus { }
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
[DebuggerDisplay("({re},{im})")]
public struct Ipp32fc
{
public float re;
public float im;
public Ipp32fc(float re, float im)
{
this.re = re;
this.im = im;
}
};
[SuppressUnmanagedCodeSecurityAttribute()]
[DllImport(ipp.sp.libname)]
public static extern
IppStatus ippsDFTGetSize_C_32fc(int length, int flag, IppHintAlgorithm hint, int* pSpecSize, int* pSpecBufferSize, int* pBufferSize);
[SuppressUnmanagedCodeSecurityAttribute()]
[DllImport(ipp.sp.libname)]
public static extern
IppStatus ippsDFTInit_C_32fc(int length, int flag, IppHintAlgorithm hint, IppsDFTSpec_C_32fc* pSpec, byte* pSpecBuffer);
[SuppressUnmanagedCodeSecurityAttribute()]
[DllImport(ipp.sp.libname)] public static extern
IppStatus ippsDFTFwd_CToC_32fc ( Ipp32fc *pSrc, Ipp32fc *pDst, IppsDFTSpec_C_32fc *pDFTSpec, byte *pBuffer );
[SuppressUnmanagedCodeSecurityAttribute()]
[DllImport(ipp.sp.libname)] public static extern
IppStatus ippsDFTFwd_CToC_32fc ( Ipp32fc *pSrc, Ipp32fc *pDst, IppsDFTSpec_C_32fc *pDFTSpec, byte *pBuffer );
[SuppressUnmanagedCodeSecurityAttribute()]
[DllImport(ipp.sp.libname)] public static extern
IppStatus ippsMul_32fc ( Ipp32fc *pSrc1, Ipp32fc *pSrc2, Ipp32fc *pDst, int len );
[SuppressUnmanagedCodeSecurityAttribute()]
[DllImport(ipp.sp.libname)] public static extern
IppStatus ippsDFTInv_CToC_32fc ( Ipp32fc *pSrc, Ipp32fc *pDst, IppsDFTSpec_C_32fc *pDFTSpec, byte *pBuffer );
}
Here are my run times with a small dataset (keeping the trace size such that working set is 128K):
Process count | CPU utilization | Wall time to complete (seconds)
2 10 90.8
4 20 48.6
6 25 37.1
8 30 31.1
10 40 28.6
12 50 27.0 <-- perf should not improve past this point due to 12 floating point units in this machine, but IMHO it should scale better
14 55 27.0
16 60 27.6
.NET GC load is reasonable (less than 10% always, 0-1% most of the time).
I see a lot of calls to IppZero_8u on the top of call stack as it runs
I see a lot of [Transition from Managed To Unmanaged]: I plan to switch to C++/CLI in hopes of addressing this issue.
I suspect that IppZero_8u hinders scalability, as it (possibly) tries to access main memory, and is having to synchronize between cores.
How can I pinpoint what prevents scaling? What would you recommend to improve scalability of the algorithm?
----------
Details
- This was performed on dual Xeon 5650 with 6 cores each (6 floating point processors x 2). Intel IPP library correctly detects SSE 4.2 on this processor configuration
- This is an asymmetric machine with two memory nodes, 96GB total DDR3 RAM
- 32K L1 and 256K L2, 12 MB shared L3
- I have tried in-place FFT instead of out-of-place DFT. It scales even less, taking the same amount of time with 6 processes as it does with 2: 40 seconds
- I/O is not a bottleneck, since the same work finishes in 6 seconds without involving correlation
- Intel Integrated Performance Primitives 7.1
- .NET 4.5 is running with Server GC enabled
- I prefer to do this high-level orchestration from a managed language to avoid instability due to memory buffer overruns (fear of clumsy programming and insufficient testing)
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Can you check the post here? It discussed the internal FF threading in the IPP:
http://software.intel.com/en-us/forums/topic/283657
Thanks,
Chao
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
from your code I see that you are trying to implement cross-correlation through convolution theorem. It is better to use ippsCrossCorr function for this purpose - it is threaded, it is highly optimized internaly, etc. Regarding your algorithm - why do you use DFT? why not FFT? - FFT is significanly faster. why do you use interleave/deinterleave? there are special functions in IPP - complex to real and back conversions. Also there are as real as well as complex versions of FFT/DFT - you can use them directly without pre/after conversions. For multiplication in the freq domain for real transforms you can use MulPack function...etc.
regards, Igor
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Igor,
I am trying to retrofit existing code to work with IPP. I have an in-place FFT imlementation as well. It seems that Chao Y's answer is better, as it questions the validity of my expectations of linear performance improvements with a dataset that might not fit into L2. In fact, I re-ran in-place FFT under debugger and saw that two complex inputs and a work buffer are 384 KB in length. This does not fit into Xeon 5650's 256KB L2.
from your code I see that you are trying to implement cross-correlation through convolution theorem. It is better to use ippsCrossCorr function for this purpose - it is threaded, it is highly optimized internaly, etc.
Existing code prepares and reuses one of the inputs (correlation kernel), dramatically reducing calculations. I followed existing implementation. How can I achieve the same effect with the IPP library method?
Regarding your algorithm - why do you use DFT? why not FFT? - FFT is significanly faster. why do you use interleave/deinterleave?
Out-of-place DFT for clarity while debugging. I choose not to optimize and make code harder to debug unless there is a clear need for that. Now I know that everything has to fit into L2, and that would drive the optimization.
There are special functions in IPP - complex to real and back conversions.
This did not prove to be a problem when written in C#. Profiler shows less than 1% of the time spent there, so I left it alone. Nice to know I have options to simplify this, will have a good look.
Also there are real as well as complex versions of FFT/DFT - you can use them directly without pre/after conversions.
I see doubling the speed when in old C# code when I process two signals at once. Textbooks commonly call for this approach too. I will have to see if using real version of FFT/DFT is any faster.
For multiplication in the freq domain for real transforms you can use MulPack function...etc.
I would like to learn more about effective use of the library. If you have more options to list, could you please spell them out directly.
Thank you for all the help!
-Greg Chernis
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Chao Y (Intel) wrote:
Hi,
Can you check the post here? It discussed the internal FF threading in the IPP:
http://software.intel.com/en-us/forums/topic/283657
Thanks,
Chao
Chao, This was helpful!
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Gregory,
if your signal is real - you don't need to convert it to complex - use real FFT functions - RToPack, then MulPackConj (special function for correlation) and finally - Inv FFT PackToR - as all this stuff is based on real transform symmetry - it takes 2x less memory for temp buffers.
Regards, Igor
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Igor,
This is exactly what I was looking for. Our common signal length scenarios place us in 2^13 to 2^14 samples. I am already seeing great scaling with 4096 samples (2^12), but performance degrades with 2^13 and 2^14 signal lengths, since it doesn't seem to fit into L2.
Halving temp buffers should allow me to see better CPU scaling with 2^13 signal lengths, boosting our performace in those cases.
Thank you for pointing me in the right direction!
-Greg
Igor Astakhov (Intel) wrote:
Gregory,
if your signal is real - you don't need to convert it to complex - use real FFT functions - RToPack, then MulPackConj (special function for correlation) and finally - Inv FFT PackToR - as all this stuff is based on real transform symmetry - it takes 2x less memory for temp buffers.
Regards, Igor
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Gentlemen,
Please find my benchmark results attached. Computations were done on complex data (2 signals interleaved into complex for purposes of correlation).
- Sergey, do the supplied performance levels appear correct?
- What buffer size(s) I should be inspecting to determine if data "fits" onto L2? That is, how do I know that FFT will run entirely within L2 and allow for healthy scaling with additional cores.
- Igor, after reading the Intel IPP 7.1 reference manual on signal processing, I am not entirely clear on how to perform real data correlation. You've describe functionality that does not correlate with what's in the manuals :-) Could you point at an introductory example?
Thank you,
-Greg
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
hi Greg,
here it is:
Ipp##fvr* pTmp1 = NULL;\
Ipp##fvr* pTmp2 = (Ipp##fvr*)s2;\
Ipp8u* pBuffer = NULL;\
IppsFFTSpec_R_##fvr* ctxFft;\
int order = 1, fftSize = srcLen2, n, i = 0;\
IppStatus status = ippStsNoErr;\
\
n = srcLen1 + srcLen2 - 1;\
if( srcLen1 < srcLen2 ) { SWAP( srcLen2, srcLen1, fftSize ); SWAP( s2, s1, pTmp2 ); i ++; }\
\
/* if srcLen1 is nearly equal srcLen2 then the same order FFT */\
if(( srcLen1 < CRITER_FRAME * srcLen2 )||( strtPoint > srcLen2 )) {\
if(i) { SWAP( srcLen1, srcLen2, fftSize ); SWAP( s1, s2, pTmp2 ); }\
while( 1 << order < n ) ++order;\
fftSize = 1 << order;\
status = ippsFFTInitAlloc_R_##fvr( &ctxFft, order, IPP_FFT_DIV_INV_BY_N, ippAlgHintNone );\
if( ippStsNoErr != status) return status;\
status = ippsFFTGetBufSize_R_##fvr( ctxFft, &i );\
if( ippStsNoErr <= status ) {\
pBuffer = ippsMalloc_8u( i );\
pTmp1 = ippsMalloc_##fvr( 2 * fftSize );\
if( NULL != pTmp1) {\
pTmp2 = pTmp1 + fftSize;\
ippsCopy_##fvr( s1, pTmp1, srcLen1 );\
ippsZero_##fvr( pTmp1 + srcLen1, fftSize - srcLen1 );\
status = ippsFFTFwd_RToPack_##fvr( pTmp1, pTmp1, ctxFft, pBuffer );\
if( ippStsNoErr <= status ) {\
ippsZero_##fvr( pTmp2, srcLen1 - 1 );\
ippsCopy_##fvr( s2, pTmp2 + srcLen1 - 1, srcLen2 );\
ippsZero_##fvr( pTmp2 + n, fftSize - n );\
status = ippsFFTFwd_RToPack_##fvr( pTmp2, pTmp2, ctxFft, pBuffer );\
if( ippStsNoErr <= status ) {\
ippsMulPackConj_##fvr##_I( pTmp2, pTmp1, fftSize );\
status = ippsFFTInv_PackToR_##fvr( pTmp1, pTmp2, ctxFft, pBuffer );\
if( ippStsNoErr <= status )\
ippsCopy_##fvr( pTmp2 + strtPoint, pDst, len );\
}\
}\
} else status = ippStsMemAllocErr;\
ippsFFTFree_R_##fvr( ctxFft );\
ippsFree( pTmp1 );\
ippsFree( pBuffer );\
}\
}\
it is better to use ippsCrossCorr function - in addition to method mentioned above it also uses "framed" approach in case of big difference between lengths of vectors
regards, Igor
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Sergey,
I am interested in relative performance of various trace sizes and in scaling performance much more so than in absolute performace of this particular setup. Let me try Igor's newly supplied code sample. I will provide an updated performance sheet once that's done.
Thanks for all the help,
-Greg Chernis
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Igor,
I was able to implement cross-correlation in a manner that you've outlined. Thank you for the code sample.
Clearly, there is a benefit of simplifying higher-level code that calls into cross-correlation routine, as it no longer has to pair up the traces.
Performance was very similar to the solution that uses an interleaved pair of traces in complex number domain. On 4-core Westmere and Sandy Bridge machines, one improvement was seen with 64 x 1024 samples in double precision. Processing single trace at a time allowed the data to fit into L3 and, as a result, produced a 20-30% speedup.
I expect that we'll see performance improvements on the dual Xeon machine, but it's tied up with other tests right now. I'll report back later in the day.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Similar improvements can be seen on dual Xeon machine with trace size of 64K samples:
In double precision with "accurate" hint, dividing by SQRT(N): 870 MB/sec when packing two traces and processing them together vs. 950 MB/sec when processing one trace at a time. This is fast enough for our current application.
Case closed. Igor and Sergey, thanks for all the help!
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page