<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Comparing FFT Performance MKL11 with 1 thread and 4 threads in Intel® oneAPI Math Kernel Library</title>
    <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948423#M15046</link>
    <description>&lt;P&gt;Hi All,&lt;/P&gt;
&lt;P&gt;I'm evaluating the performance (this time not MKL6 vs MKL11) of MKL11 with 1 thread versus 4 threads.&lt;/P&gt;
&lt;P&gt;The 4-thread version seems to be slower. Furthermore, the 4-thread implementation has a huge number of outliers. Does anyone have an explanation why?&lt;/P&gt;
&lt;P&gt;Below is the source (float and double are similar); I shortened it for a better overview.&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;Main function:&lt;/STRONG&gt;&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;int _tmain(int argc, _TCHAR* argv[])&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;{&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; int threads = 4; //or 1&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; mkl_set_num_threads(threads);&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS ); // Set a process priority to 'High'&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; TEST FUNCTION HERE&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; SetPriorityClass( GetCurrentProcess(), NORMAL_PRIORITY_CLASS ); // Restore the process priority to 'Normal'&lt;BR /&gt;}&lt;/STRONG&gt;&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;TEST FUNCTION&lt;/STRONG&gt;&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp; DFTI_DESCRIPTOR_HANDLE hand;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; cxdTimeLoops.alloc(loops);&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; //&amp;nbsp;&amp;nbsp;&amp;nbsp; FLOAT&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; k=0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; for (exp=exp_start;exp&amp;lt;=exp_stop;exp++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Nfft = (unsigned int) pow(2.0,exp);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber = 1; //seed&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;Nfft;i++) //get pseudo random signal&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber&amp;nbsp;&amp;nbsp;&amp;nbsp; = NextRand32(myRndNumber);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxfTimesig&lt;I&gt;&amp;nbsp; = ((float) myRndNumber / UINT_MAX)*2-1;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxfTimeaxis&lt;I&gt; = ((float) i + 1.0) / fs;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hand&amp;nbsp;&amp;nbsp; = 0;&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCreateDescriptor(&amp;amp;hand, DFTI_SINGLE, DFTI_REAL, 1, Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCommitDescriptor(hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hpfcTimer.Start(); //start timer for single execution&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiComputeForward(hand, cxfTimesig.ptr(), cxfFreqsig.ptr());&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeLoops&lt;I&gt; = hpfcTimer.Time();&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DftiFreeDescriptor(&amp;amp;hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = cxdTimeLoops[0];&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg = 0;&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg += cxdTimeLoops&lt;I&gt;;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = max(cxdTimeLoops&lt;I&gt;,dTimeMax);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = min(cxdTimeLoops&lt;I&gt;,dTimeMin);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg /= (double) loops;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k++;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; //&amp;nbsp;&amp;nbsp;&amp;nbsp; DOUBLE&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; k=0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; for (exp=exp_start;exp&amp;lt;=exp_stop;exp++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Nfft = (unsigned int) pow(2.0,exp);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdFreqsig.alloc(Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimesig.alloc(Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeaxis.alloc(Nfft);&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber = 1; //seed&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;Nfft;i++)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; //get pseudo random signal&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber&amp;nbsp;&amp;nbsp;&amp;nbsp; = NextRand32(myRndNumber);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimesig&lt;I&gt;&amp;nbsp; = ((double) myRndNumber / UINT_MAX)*2-1;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeaxis&lt;I&gt; = ((double) i + 1.0) / fs;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hand&amp;nbsp;&amp;nbsp; = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCreateDescriptor(&amp;amp;hand, DFTI_DOUBLE, DFTI_REAL, 1, Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCommitDescriptor(hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hpfcTimer.Start(); //start timer for single execution&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiComputeForward(hand, cxdTimesig.ptr(), cxdFreqsig.ptr());&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeLoops&lt;I&gt; = hpfcTimer.Time();&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DftiFreeDescriptor(&amp;amp;hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = cxdTimeLoops[0];&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg += cxdTimeLoops&lt;I&gt;;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = max(cxdTimeLoops&lt;I&gt;,dTimeMax);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = 
min(cxdTimeLoops&lt;I&gt;,dTimeMin);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg /= (double) loops;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k++;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;}&lt;/STRONG&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/P&gt;
&lt;P&gt;&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;dTimeAvg is plotted versus Nfft for float and double. I'm attaching the individual plots with min/max for visualizing the outliers.&lt;/P&gt;
&lt;P&gt;Thanks, Marian&lt;/P&gt;</description>
    <pubDate>Mon, 18 Feb 2013 09:29:03 GMT</pubDate>
    <dc:creator>Marian_L_</dc:creator>
    <dc:date>2013-02-18T09:29:03Z</dc:date>
    <item>
      <title>Comparing FFT Performance MKL11 with 1 thread and 4 threads</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948423#M15046</link>
      <description>&lt;P&gt;Hi All,&lt;/P&gt;
&lt;P&gt;I'm evaluating the performance (this time not MKL6 vs MKL11) of MKL11 with 1 thread versus 4 threads.&lt;/P&gt;
&lt;P&gt;The 4-thread version seems to be slower. Furthermore, the 4-thread implementation has a huge number of outliers. Does anyone have an explanation why?&lt;/P&gt;
&lt;P&gt;Below is the source (float and double are similar); I shortened it for a better overview.&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;Main function:&lt;/STRONG&gt;&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;int _tmain(int argc, _TCHAR* argv[])&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;{&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; int threads = 4; //or 1&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; mkl_set_num_threads(threads);&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS ); // Set a process priority to 'High'&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; TEST FUNCTION HERE&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; SetPriorityClass( GetCurrentProcess(), NORMAL_PRIORITY_CLASS ); // Restore the process priority to 'Normal'&lt;BR /&gt;}&lt;/STRONG&gt;&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;TEST FUNCTION&lt;/STRONG&gt;&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;&amp;nbsp; DFTI_DESCRIPTOR_HANDLE hand;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; cxdTimeLoops.alloc(loops);&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; //&amp;nbsp;&amp;nbsp;&amp;nbsp; FLOAT&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; k=0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; for (exp=exp_start;exp&amp;lt;=exp_stop;exp++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Nfft = (unsigned int) pow(2.0,exp);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber = 1; //seed&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;Nfft;i++) //get pseudo random signal&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber&amp;nbsp;&amp;nbsp;&amp;nbsp; = NextRand32(myRndNumber);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxfTimesig&lt;I&gt;&amp;nbsp; = ((float) myRndNumber / UINT_MAX)*2-1;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxfTimeaxis&lt;I&gt; = ((float) i + 1.0) / fs;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hand&amp;nbsp;&amp;nbsp; = 0;&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCreateDescriptor(&amp;amp;hand, DFTI_SINGLE, DFTI_REAL, 1, Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCommitDescriptor(hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hpfcTimer.Start(); //start timer for single execution&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiComputeForward(hand, cxfTimesig.ptr(), cxfFreqsig.ptr());&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeLoops&lt;I&gt; = hpfcTimer.Time();&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DftiFreeDescriptor(&amp;amp;hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = cxdTimeLoops[0];&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg = 0;&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg += cxdTimeLoops&lt;I&gt;;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = max(cxdTimeLoops&lt;I&gt;,dTimeMax);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = min(cxdTimeLoops&lt;I&gt;,dTimeMin);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg /= (double) loops;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k++;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; //&amp;nbsp;&amp;nbsp;&amp;nbsp; DOUBLE&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; k=0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; for (exp=exp_start;exp&amp;lt;=exp_stop;exp++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Nfft = (unsigned int) pow(2.0,exp);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdFreqsig.alloc(Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimesig.alloc(Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeaxis.alloc(Nfft);&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber = 1; //seed&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;Nfft;i++)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; //get pseudo random signal&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; myRndNumber&amp;nbsp;&amp;nbsp;&amp;nbsp; = NextRand32(myRndNumber);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimesig&lt;I&gt;&amp;nbsp; = ((double) myRndNumber / UINT_MAX)*2-1;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeaxis&lt;I&gt; = ((double) i + 1.0) / fs;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hand&amp;nbsp;&amp;nbsp; = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCreateDescriptor(&amp;amp;hand, DFTI_DOUBLE, DFTI_REAL, 1, Nfft);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiCommitDescriptor(hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR 
/&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hpfcTimer.Start(); //start timer for single execution&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiComputeForward(hand, cxdTimesig.ptr(), cxdFreqsig.ptr());&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeLoops&lt;I&gt; = hpfcTimer.Time();&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DftiFreeDescriptor(&amp;amp;hand);&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = cxdTimeLoops[0];&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg = 0;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg += cxdTimeLoops&lt;I&gt;;&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMax = max(cxdTimeLoops&lt;I&gt;,dTimeMax);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeMin = 
min(cxdTimeLoops&lt;I&gt;,dTimeMin);&lt;/I&gt;&lt;/STRONG&gt;&lt;I&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dTimeAvg /= (double) loops;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k++;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;}&lt;/STRONG&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/P&gt;
&lt;P&gt;&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;dTimeAvg is plotted versus Nfft for float and double. I'm attaching the individual plots with min/max for visualizing the outliers.&lt;/P&gt;
&lt;P&gt;Thanks, Marian&lt;/P&gt;</description>
      <pubDate>Mon, 18 Feb 2013 09:29:03 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948423#M15046</guid>
      <dc:creator>Marian_L_</dc:creator>
      <dc:date>2013-02-18T09:29:03Z</dc:date>
    </item>
    <item>
      <title>Dear Customer,</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948424#M15047</link>
      <description>&lt;P&gt;Marian,&lt;/P&gt;
&lt;P&gt;Can you please give me your machine/processor specifications? Do you observe this on every machine?&lt;/P&gt;
&lt;P&gt;- Sridevi&lt;/P&gt;</description>
      <pubDate>Tue, 19 Feb 2013 23:56:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948424#M15047</guid>
      <dc:creator>Sridevi_A_Intel</dc:creator>
      <dc:date>2013-02-19T23:56:00Z</dc:date>
    </item>
    <item>
      <title>&gt;&gt;...</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948425#M15048</link>
      <description>&amp;gt;&amp;gt;...
&amp;gt;&amp;gt;int _tmain(int argc, _TCHAR* argv[])
&amp;gt;&amp;gt;{
&amp;gt;&amp;gt;   int threads = 4; //or 1
&amp;gt;&amp;gt;   &lt;STRONG&gt;mkl_set_num_threads( threads )&lt;/STRONG&gt;;
&amp;gt;&amp;gt;
&amp;gt;&amp;gt;   &lt;STRONG&gt;SetPriorityClass&lt;/STRONG&gt;( &lt;STRONG&gt;GetCurrentProcess&lt;/STRONG&gt;(), &lt;STRONG&gt;HIGH_PRIORITY_CLASS&lt;/STRONG&gt; );
&amp;gt;&amp;gt;...
&amp;gt;&amp;gt;The &lt;STRONG&gt;4 thread version seems to be slower&lt;/STRONG&gt;.

You've changed the priority of the main thread and I don't think it will raise the priorities of &lt;STRONG&gt;OpenMP&lt;/STRONG&gt; threads. Then, when MKL creates &lt;STRONG&gt;OpenMP&lt;/STRONG&gt; threads they &lt;STRONG&gt;could have Normal priorities&lt;/STRONG&gt; and because of this all these four threads &lt;STRONG&gt;could be preempted&lt;/STRONG&gt; more often by the thread with the higher priority, that is your main application thread. Also, I'm not sure that this is the only problem in your test case. If &lt;STRONG&gt;Nfft&lt;/STRONG&gt; is too big, then &lt;STRONG&gt;cache related issues could negatively affect&lt;/STRONG&gt; performance.</description>
      <pubDate>Wed, 20 Feb 2013 04:48:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948425#M15048</guid>
      <dc:creator>SergeyKostrov</dc:creator>
      <dc:date>2013-02-20T04:48:00Z</dc:date>
    </item>
    <item>
      <title>Does MKL library have a</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948426#M15049</link>
      <description>&lt;P&gt;Does MKL library have a function which raises OpenMP thread's priority?&lt;/P&gt;</description>
      <pubDate>Wed, 20 Feb 2013 05:28:25 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948426#M15049</guid>
      <dc:creator>Bernard</dc:creator>
      <dc:date>2013-02-20T05:28:25Z</dc:date>
    </item>
    <item>
      <title>&gt;&gt;...Does MKL library have a</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948427#M15050</link>
      <description>&amp;gt;&amp;gt;...Does MKL library have a function which raises OpenMP thread's priority?

I don't know if MKL has it but I know that OpenMP specification 3.1 ( July 2011 ) does not have any functions to change priorities of OpenMP threads.</description>
      <pubDate>Wed, 20 Feb 2013 06:01:50 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948427#M15050</guid>
      <dc:creator>SergeyKostrov</dc:creator>
      <dc:date>2013-02-20T06:01:50Z</dc:date>
    </item>
    <item>
      <title>@Sridevi: It's an Intel Core</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948428#M15051</link>
      <description>&lt;P&gt;@Sridevi: It's an Intel Core i5-2500 @ 3.30 GHz, 8 GB RAM. I've only checked this on this computer.&lt;/P&gt;
&lt;P&gt;@Sergey: Would you wrap the priority raising around the for loop like this:&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;//Set to high priority&lt;BR /&gt;&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;for (i=0;i&amp;lt;loops;i++)&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; {&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; hpfcTimer.Start(); //start timer for single execution&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; status = DftiComputeForward(hand, cxfTimesig.ptr(), cxfFreqsig.ptr());&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; cxdTimeLoops[i] = hpfcTimer.Time();&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; }&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;//Set to normal priority&lt;BR /&gt;&lt;/STRONG&gt;&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;</description>
      <pubDate>Wed, 20 Feb 2013 06:19:41 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948428#M15051</guid>
      <dc:creator>Marian_L_</dc:creator>
      <dc:date>2013-02-20T06:19:41Z</dc:date>
    </item>
    <item>
      <title>Hi,</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948429#M15052</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;
&lt;P&gt;You measure FFT performance on power-of-two sizes in the range [exp_start, exp_stop] for real-to-complex transforms.&lt;/P&gt;
&lt;P&gt;What are they? Note that small sizes always use a sequential implementation.&lt;/P&gt;
&lt;P&gt;To check that the MKL implementation is threaded, please set the environment variable KMP_AFFINITY=compact,verbose and make sure you linked with the threaded MKL libraries.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 20 Feb 2013 07:27:50 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948429#M15052</guid>
      <dc:creator>barragan_villanueva_</dc:creator>
      <dc:date>2013-02-20T07:27:50Z</dc:date>
    </item>
    <item>
      <title>&gt;&gt;&gt; I know that OpenMP</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948430#M15053</link>
      <description>&lt;P&gt;&amp;gt;&amp;gt;&amp;gt;&amp;nbsp;I know that OpenMP specification 3.1 ( July 2011 ) does not have any functions to change priorities of OpenMP threads.&amp;gt;&amp;gt;&amp;gt;&lt;/P&gt;
&lt;P&gt;So the priority of OpenMP threads is hardcoded to be Normal.&lt;/P&gt;</description>
      <pubDate>Wed, 20 Feb 2013 08:32:54 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948430#M15053</guid>
      <dc:creator>Bernard</dc:creator>
      <dc:date>2013-02-20T08:32:54Z</dc:date>
    </item>
    <item>
      <title>Marian,</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948431#M15054</link>
      <description>Marian,

&amp;gt;&amp;gt;SetPriorityClass( GetCurrentProcess(), HIGH_PRIORITY_CLASS );

Did you try to comment that?</description>
      <pubDate>Wed, 20 Feb 2013 13:08:08 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948431#M15054</guid>
      <dc:creator>SergeyKostrov</dc:creator>
      <dc:date>2013-02-20T13:08:08Z</dc:date>
    </item>
    <item>
      <title>Hi,</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948432#M15055</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;
&lt;P&gt;Setting high priority&amp;nbsp;may not help; it is used&amp;nbsp;just for getting performance stability on an overloaded machine.&lt;/P&gt;
&lt;P&gt;However,&amp;nbsp;you can initialize OMP-threads in your program easily with required priority before using MKL functions:&lt;/P&gt;
&lt;P&gt;#pragma omp parallel&lt;/P&gt;
&lt;P&gt;{&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&amp;nbsp; SetThreadPriority( GetCurrentThread(),&lt;STRONG&gt;THREAD_PRIORITY_HIGHEST);&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;}&lt;/P&gt;
&lt;P&gt;so that MKL will use these threads doing parallel FFTs.&lt;/P&gt;
&lt;P&gt;BTW, how many real&amp;nbsp;CPUs are on your machine. And what about HT (hyper-threading)?&lt;/P&gt;</description>
      <pubDate>Thu, 21 Feb 2013 05:55:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948432#M15055</guid>
      <dc:creator>barragan_villanueva_</dc:creator>
      <dc:date>2013-02-21T05:55:00Z</dc:date>
    </item>
    <item>
      <title>@Victor:</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948433#M15056</link>
      <description>&lt;P&gt;@Victor:&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;What they are? You know, small sizes&amp;nbsp;have always sequential&amp;nbsp;implementation.&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;The FFT length is written on the abscissa, so the exponent runs from 10 to 20.&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;BTW, how many real&amp;nbsp;CPUs are on your machine. And what about HT (hyper-threading)?&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Intel Core i5-2500@3.30 GHz, according to &lt;A href="http://ark.intel.com/de/products/52209/Intel-Core-i5-2500-Processor-6M-Cache-up-to-3_70-GHz"&gt;this &lt;/A&gt;it has no hyper threading&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;SetThreadPriority( GetCurrentThread(),&lt;STRONG&gt;THREAD_PRIORITY_HIGHEST);&lt;/STRONG&gt;&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;&lt;STRONG&gt;&lt;/STRONG&gt;Sadly, that does not change the performance significantly.&lt;/P&gt;
&lt;P&gt;@Sergey:&lt;/P&gt;
&lt;BLOCKQUOTE&gt;
&lt;P&gt;&amp;gt;&amp;gt;SetPriorityClass( GetCurrentProcess(), HIGH_PRIORITY_CLASS );&lt;/P&gt;
&lt;P&gt;Did you try to comment that?&lt;/P&gt;
&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Yes, but it sets the priority of the whole process, not threads. So the placement should be irrelevant, correct?&lt;/P&gt;
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 21 Feb 2013 13:21:50 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948433#M15056</guid>
      <dc:creator>Marian_L_</dc:creator>
      <dc:date>2013-02-21T13:21:50Z</dc:date>
    </item>
    <item>
      <title>&gt;&gt;...Yes, but it sets the</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948434#M15057</link>
      <description>&amp;gt;&amp;gt;...Yes, but it sets the priority of the whole process, not threads. So the placement should be irrelevant,
&amp;gt;&amp;gt;correct?

Yes, and this is what I've seen in my tests in 2012.

Marian, I'll try to investigate it again and then post my new results ( I can't tell when it will be done ). Also, I actually have that task on my list for a long time and it's a right time to look into that.</description>
      <pubDate>Thu, 21 Feb 2013 13:40:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948434#M15057</guid>
      <dc:creator>SergeyKostrov</dc:creator>
      <dc:date>2013-02-21T13:40:00Z</dc:date>
    </item>
    <item>
      <title>Sergey, thanks for your help.</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948435#M15058</link>
      <description>&lt;P&gt;Sergey, thanks for your help. If you need some of my testing code or gnuplot export functions, let me know.&lt;/P&gt;</description>
      <pubDate>Thu, 21 Feb 2013 14:37:44 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948435#M15058</guid>
      <dc:creator>Marian_L_</dc:creator>
      <dc:date>2013-02-21T14:37:44Z</dc:date>
    </item>
    <item>
      <title>@marian</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948436#M15059</link>
      <description>&lt;P&gt;@marian&lt;/P&gt;
&lt;P&gt;Before running your tests can you measure overall system load? I would suggest doing it with the xperf tool. There is a possibility that your threads are preempted by code which is running at high IRQL (like driver's routines). As was suggested in another post by Sergey, it is recommended to disable some of the unneeded Windows services and even disable some of the hardware like network cards.&lt;/P&gt;</description>
      <pubDate>Sun, 24 Feb 2013 09:46:55 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948436#M15059</guid>
      <dc:creator>Bernard</dc:creator>
      <dc:date>2013-02-24T09:46:55Z</dc:date>
    </item>
    <item>
      <title>&gt;&gt;... If you need some of my</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948437#M15060</link>
      <description>&amp;gt;&amp;gt;... If you need &lt;STRONG&gt;some of my testing code&lt;/STRONG&gt; or gnuplot export functions, let me know...

Yes, it would be nice to look at it. Thanks, Marian.</description>
      <pubDate>Sun, 24 Feb 2013 20:23:37 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948437#M15060</guid>
      <dc:creator>SergeyKostrov</dc:creator>
      <dc:date>2013-02-24T20:23:37Z</dc:date>
    </item>
    <item>
      <title>@ iliyapolak: Thanks for your</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948438#M15061</link>
      <description>&lt;P&gt;@ iliyapolak: Thanks for your comment. If the performance is not good with my system, it is very likely the same on a customer's computer. I agree it could be some driver or something else. But I can't go into details of the root cause here, because I don't know what is on other systems, that I cannot control.&lt;/P&gt;
&lt;P&gt;@Sergey, I'll send you a pm.&lt;/P&gt;</description>
      <pubDate>Mon, 25 Feb 2013 10:49:21 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948438#M15061</guid>
      <dc:creator>Marian_L_</dc:creator>
      <dc:date>2013-02-25T10:49:21Z</dc:date>
    </item>
    <item>
      <title>&gt;&gt;...There is possibility</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948439#M15062</link>
      <description>&amp;gt;&amp;gt;...There is possibility that your threads are preempted by code which is running at high IRQL (like driver's routines)....

Many software developers simply do not care what IRQL, or a driver routine, etc, are in essence. If a software developer has a problem he / she could ask a question on a forum and a &lt;STRONG&gt;solution / proposal / hands-on-R&amp;amp;D, test, etc&lt;/STRONG&gt; is / are expected ( ideally ). I see that in Marian's case a real &lt;STRONG&gt;practical investigation&lt;/STRONG&gt; is really needed what is going on with MKL v11. It is clear that performance has degraded compared to older versions of MKL.

Best regards,
Sergey</description>
      <pubDate>Tue, 26 Feb 2013 00:49:31 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948439#M15062</guid>
      <dc:creator>SergeyKostrov</dc:creator>
      <dc:date>2013-02-26T00:49:31Z</dc:date>
    </item>
    <item>
      <title>&gt;&gt;&gt;I see that in Marian's</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948440#M15063</link>
      <description>&lt;P&gt;&amp;gt;&amp;gt;&amp;gt;I see that in Marian's case a real &lt;STRONG&gt;practical investigation&lt;/STRONG&gt; is really needed what is going on with MKL v11. It is clear that performance has degraded compared to older versions of MKL.&amp;gt;&amp;gt;&amp;gt;&lt;/P&gt;
&lt;P&gt;As far as it concerns performance of some program/software, at the beginning of the investigation you cannot exclude anything. I completely agree with you that any software developer should not care about the IRQL's and driver's routines, but in the case of software performance problems everything must be taken into account, even system load.&lt;/P&gt;</description>
      <pubDate>Tue, 26 Feb 2013 05:34:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948440#M15063</guid>
      <dc:creator>Bernard</dc:creator>
      <dc:date>2013-02-26T05:34:00Z</dc:date>
    </item>
    <item>
      <title>@Marian</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948441#M15064</link>
      <description>&lt;P&gt;@Marian&lt;/P&gt;
&lt;P&gt;I was not talking about the other system. Testing your code on another machine is important to understand the root cause. Regarding the problem, I believe that sometimes your software performance can degrade because of interference from the OS (its services) and the other code, maybe a more privileged one.&lt;/P&gt;</description>
      <pubDate>Tue, 26 Feb 2013 05:46:14 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948441#M15064</guid>
      <dc:creator>Bernard</dc:creator>
      <dc:date>2013-02-26T05:46:14Z</dc:date>
    </item>
    <item>
      <title>Marian,</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948442#M15065</link>
      <description>&lt;P&gt;Marian,&lt;/P&gt;
&lt;P&gt;The best time observed by the benchmark scales (decreases with the number of threads) but this best time is dominated by instability of measurement.&lt;/P&gt;
&lt;P&gt;Here are some tips to stabilize measurements.&lt;/P&gt;
&lt;OL&gt;
&lt;LI&gt;Pin threads to CPU cores using the KMP_AFFINITY environment variable or the Windows API for thread affinity&lt;/LI&gt;
&lt;LI&gt;Ensure the benchmark is single-threaded; if your use case is multi-threaded, you may want to look through &lt;A href="http://software.intel.com/en-us/articles/different-parallelization-techniques-and-intel-mkl-fft"&gt;http://software.intel.com/en-us/articles/different-parallelization-techniques-and-intel-mkl-fft&lt;/A&gt;&amp;nbsp;&lt;/LI&gt;
&lt;LI&gt;Prevent the cache warm-up time from dominating your performance measurement -- either increase the value of the loops variable in your code, or exclude from measurement the first call to DftiComputeForward for each Nfft.&lt;/LI&gt;
&lt;/OL&gt;
&lt;P&gt;Please let us know if the above tips help.&lt;/P&gt;
&lt;P&gt;Thanks,&lt;/P&gt;
&lt;P&gt;Evgueni.&lt;/P&gt;</description>
      <pubDate>Tue, 26 Feb 2013 12:03:30 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/Comparing-FFT-Performance-MKL11-with-1-thread-and-4-threads/m-p/948442#M15065</guid>
      <dc:creator>Evgueni_P_Intel</dc:creator>
      <dc:date>2013-02-26T12:03:30Z</dc:date>
    </item>
  </channel>
</rss>

