<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: vdAdd not faster than hand written unrolled C loop in Intel® oneAPI Math Kernel Library</title>
    <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/vdAdd-not-faster-than-hand-written-unrolled-C-loop/m-p/850387#M6527</link>
    <description>&lt;P&gt;&lt;FONT face="Arial" color="#000080" size="2"&gt;Thank you for the report, we will look for further optimization opportunities.&lt;/FONT&gt;&lt;/P&gt;</description>
    <pubDate>Mon, 18 Feb 2008 09:44:51 GMT</pubDate>
    <dc:creator>Eugeny_G_Intel</dc:creator>
    <dc:date>2008-02-18T09:44:51Z</dc:date>
    <item>
      <title>vdAdd not faster than hand written unrolled C loop</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/vdAdd-not-faster-than-hand-written-unrolled-C-loop/m-p/850386#M6526</link>
      <description>&lt;P&gt;We are evaluating and testing VML and it looks like vdAdd is not faster (if not slower) than a "normal" C loop. I have written a small program that exhibits the problem. Has anyone else run into this issue? Did I forget to configure or install something?&lt;/P&gt;
&lt;P&gt;Thanks in advance&lt;/P&gt;
&lt;P&gt;J Canedo&lt;/P&gt;&lt;PRE&gt;#include &amp;lt;iostream&amp;gt;&lt;/PRE&gt;&lt;PRE&gt;#define _WIN32_WINNT 0x400&lt;BR /&gt;#include &amp;lt;windows.h&amp;gt;&lt;BR /&gt;#include &amp;lt;psapi.h&amp;gt;&lt;BR /&gt;#include &amp;lt;omp.h&amp;gt;&lt;BR /&gt;#include &amp;lt;mkl.h&amp;gt;&lt;/PRE&gt;&lt;PRE&gt;////&lt;/PRE&gt;&lt;PRE&gt;template &amp;lt;typename Func&amp;gt;&lt;BR /&gt;class PerfTest&lt;BR /&gt;{&lt;BR /&gt;public:&lt;BR /&gt; &lt;BR /&gt; PerfTest(const size_t it, const std::string &amp;amp; l,&lt;BR /&gt; const Func &amp;amp; f = Func())&lt;BR /&gt; : func_(f), elapsed_(0), iterations_(it),&lt;BR /&gt; label("START Disasm ") &lt;BR /&gt; {&lt;BR /&gt; label += l;&lt;BR /&gt; }&lt;/PRE&gt;&lt;PRE&gt; void operator()(const size_t size,&lt;BR /&gt; const double * left, const double * right, double * result) const&lt;BR /&gt; {&lt;BR /&gt; OutputDebugString(label.c_str());&lt;BR /&gt; DWORD start = GetTickCount(), end = 0;&lt;BR /&gt; for (size_t i = 0; i != iterations_; ++i)&lt;BR /&gt; {&lt;BR /&gt; func_(size, left, right, result);&lt;BR /&gt; }&lt;BR /&gt; end = GetTickCount();&lt;BR /&gt; elapsed_ = end - start;&lt;BR /&gt; OutputDebugString("STOP Disasm");&lt;BR /&gt; }&lt;/PRE&gt;&lt;PRE&gt; DWORD elapsed() const { return elapsed_; }&lt;/PRE&gt;&lt;PRE&gt;private:&lt;/PRE&gt;&lt;PRE&gt; Func func_;&lt;BR /&gt; mutable DWORD elapsed_;&lt;BR /&gt; size_t iterations_;&lt;BR /&gt; std::string label;&lt;BR /&gt;};&lt;/PRE&gt;&lt;PRE&gt;////&lt;/PRE&gt;&lt;PRE&gt;struct Empty&lt;BR /&gt;{&lt;BR /&gt; inline void operator()(const size_t size, const double * left, const double * right, &lt;BR /&gt; double * result) const {}&lt;BR /&gt;};&lt;/PRE&gt;&lt;PRE&gt;struct MKLAdd&lt;BR /&gt;{&lt;BR /&gt; inline void operator()(const size_t size, const double * left, const double * right,&lt;BR /&gt; double * result) const&lt;BR /&gt; {&lt;BR /&gt; vdAdd(size, left, right, result);&lt;BR /&gt; }&lt;BR /&gt;};&lt;/PRE&gt;&lt;PRE&gt;struct SSECLoop&lt;BR /&gt;{&lt;BR /&gt; void operator()(const size_t vector_size, const double * left, const double * right,&lt;BR /&gt; double * result) const&lt;BR /&gt; {&lt;BR /&gt; for (size_t index = 0; index != vector_size; ++index)&lt;BR /&gt; {&lt;BR /&gt; result[index] = left[index] + right[index];&lt;BR /&gt; }&lt;BR /&gt; }&lt;BR /&gt;};&lt;/PRE&gt;&lt;PRE&gt;int main(int argc, char * argv[])&lt;BR /&gt;{&lt;BR /&gt; if (argc == 4)&lt;BR /&gt; {&lt;BR /&gt; size_t vector_size = atoi(argv[1]), iterations = atoi(argv[2]),&lt;BR /&gt; threads = atoi(argv[3]);&lt;/PRE&gt;&lt;PRE&gt; if (vector_size % 32)&lt;BR /&gt; {&lt;BR /&gt; std::cerr &amp;lt;&amp;lt; "Vector size must be 32 multiple
";&lt;BR /&gt; return -1;&lt;BR /&gt; }&lt;BR /&gt; &lt;BR /&gt; std::cout &amp;lt;&amp;lt; "Vector size " &amp;lt;&amp;lt; vector_size&lt;BR /&gt; &amp;lt;&amp;lt; "
Iterations " &amp;lt;&amp;lt; iterations&lt;BR /&gt; &amp;lt;&amp;lt; "
Threads " &amp;lt;&amp;lt; threads&lt;BR /&gt; &amp;lt;&amp;lt; "
Dynamic " &amp;lt;&amp;lt; mkl_get_dynamic()&lt;BR /&gt; &amp;lt;&amp;lt; "
Max threads " &amp;lt;&amp;lt; mkl_get_max_threads()&lt;BR /&gt; &amp;lt;&amp;lt; '
';&lt;BR /&gt; &lt;BR /&gt; // setup threads number&lt;/PRE&gt;&lt;PRE&gt; MKLVersion version;&lt;BR /&gt; MKLGetVersion(&amp;amp;version);&lt;/PRE&gt;&lt;PRE&gt; std::cout &amp;lt;&amp;lt; version.MajorVersion &amp;lt;&amp;lt; "." &amp;lt;&amp;lt; version.MinorVersion&lt;BR /&gt; &amp;lt;&amp;lt; "." &amp;lt;&amp;lt; version.BuildNumber &amp;lt;&amp;lt; " " &amp;lt;&amp;lt; version.ProductStatus&lt;BR /&gt; &amp;lt;&amp;lt; " " &amp;lt;&amp;lt; version.Processor &amp;lt;&amp;lt; '
';&lt;/PRE&gt;&lt;PRE&gt; mkl_set_num_threads(threads);&lt;BR /&gt; mkl_set_dynamic(1);&lt;/PRE&gt;&lt;PRE&gt;#pragma omp parallel default(shared)&lt;BR /&gt; {&lt;BR /&gt; DWORD_PTR mask = (1 &amp;lt;&amp;lt; omp_get_thread_num());&lt;BR /&gt; SetThreadAffinityMask(GetCurrentThread(), mask);&lt;BR /&gt; }&lt;/PRE&gt;&lt;PRE&gt; std::cout &amp;lt;&amp;lt; "
Dynamic " &amp;lt;&amp;lt; mkl_get_dynamic()&lt;BR /&gt; &amp;lt;&amp;lt; "
Max threads " &amp;lt;&amp;lt; mkl_get_max_threads()&lt;BR /&gt; &amp;lt;&amp;lt; '
';&lt;BR /&gt; &lt;BR /&gt; double * v1 = (double *) _aligned_malloc(vector_size * sizeof(double), 16),&lt;BR /&gt; * v2 = (double *) _aligned_malloc(vector_size * sizeof(double), 16);&lt;/PRE&gt;&lt;PRE&gt; for (size_t i = 0; i != vector_size; ++i)&lt;BR /&gt; {&lt;BR /&gt; v1[i] = i;&lt;BR /&gt; v2[i] = 2 * i;&lt;BR /&gt; }&lt;/PRE&gt;&lt;PRE&gt; unsigned int mode = vmlGetMode();&lt;/PRE&gt;&lt;PRE&gt; // Noop base time&lt;/PRE&gt;&lt;PRE&gt; PerfTest&amp;lt;Empty&amp;gt; noop(iterations, "Noop");&lt;BR /&gt; noop(0, 0, 0, 0);&lt;BR /&gt; std::cout &amp;lt;&amp;lt; "Noop elapsed " &amp;lt;&amp;lt; noop.elapsed() &amp;lt;&amp;lt; " clocks
";&lt;/PRE&gt;&lt;PRE&gt; //// MKL&lt;/PRE&gt;&lt;PRE&gt; PerfTest&amp;lt;MKLAdd&amp;gt; mkl(iterations, "MKL");&lt;BR /&gt; double * v3mkl = (double *) _aligned_malloc(vector_size * sizeof(double), 16);&lt;BR /&gt; memset(v3mkl, 0, vector_size * sizeof(double));&lt;BR /&gt; mkl(vector_size, v1, v2, v3mkl);&lt;BR /&gt; std::cout &amp;lt;&amp;lt; "MKL elapsed " &amp;lt;&amp;lt; mkl.elapsed() &amp;lt;&amp;lt; " clocks
";&lt;/PRE&gt;&lt;PRE&gt; //// normal loop&lt;BR /&gt; &lt;BR /&gt; PerfTest&amp;lt;SSECLoop&amp;gt; cloop(iterations, "CLoop");&lt;BR /&gt; double * v3cloop = (double *) _aligned_malloc(vector_size * sizeof(double), 16);&lt;BR /&gt; memset(v3cloop, 0, vector_size * sizeof(double));&lt;BR /&gt; cloop(vector_size, v1, v2, v3cloop);&lt;BR /&gt; std::cout &amp;lt;&amp;lt; "C loop elapsed " &amp;lt;&amp;lt; cloop.elapsed() &amp;lt;&amp;lt; " clocks
";&lt;BR /&gt; &lt;BR /&gt; // Compare results&lt;BR /&gt; &lt;BR /&gt; std::cout &amp;lt;&amp;lt; "MKL vs CLoop " &amp;lt;&amp;lt; memcmp(v3mkl, v3cloop, vector_size * sizeof(double)) &lt;BR /&gt; &amp;lt;&amp;lt; '
';&lt;BR /&gt; &lt;BR /&gt; _aligned_free(v1);&lt;BR /&gt; _aligned_free(v2);&lt;BR /&gt; _aligned_free(v3mkl);&lt;BR /&gt; _aligned_free(v3cloop);&lt;BR /&gt; }&lt;BR /&gt; else&lt;BR /&gt; {&lt;BR /&gt; std::cerr &amp;lt;&amp;lt; "Usage: " &amp;lt;&amp;lt; argv[0] &amp;lt;&amp;lt; " length iterations threads
n";&lt;BR /&gt; }&lt;BR /&gt; &lt;BR /&gt; return 0;&lt;BR /&gt;}&lt;BR /&gt;&lt;/PRE&gt;&lt;PRE&gt;Makefile&lt;/PRE&gt;&lt;PRE&gt;MKL_ROOT_DIR := C:/Program Files/Intel/MKL/10.0.1.015&lt;BR /&gt;MKL_INCLUDE_DIR := $(MKL_ROOT_DIR)/include&lt;BR /&gt;MKL_ARCHITECTURE := ia32&lt;BR /&gt;MKL_LIB_DIR := $(MKL_ROOT_DIR)/$(MKL_ARCHITECTURE)/lib&lt;/PRE&gt;&lt;PRE&gt;####&lt;/PRE&gt;&lt;PRE&gt;# MKL_INTERFACE := stdcall&lt;BR /&gt;MKL_INTERFACE := cdecl&lt;/PRE&gt;&lt;PRE&gt;MKL_LINK_MODE := dynamic&lt;BR /&gt;# MKL_LINK_MODE := static&lt;/PRE&gt;&lt;PRE&gt;####&lt;/PRE&gt;&lt;PRE&gt;ifeq ($(MKL_INTERFACE),cdecl)&lt;/PRE&gt;&lt;PRE&gt;MKL_DEFINES := -DMKL_VML_CDECL&lt;BR /&gt;MKL_INTERFACE_SUFFIX := _c&lt;/PRE&gt;&lt;PRE&gt;else&lt;/PRE&gt;&lt;PRE&gt;MKL_DEFINES := -DMKL_VML_STDCALL&lt;BR /&gt;MKL_INTERFACE_SUFFIX := _s&lt;/PRE&gt;&lt;PRE&gt;endif&lt;/PRE&gt;&lt;PRE&gt;ifeq ($(MKL_LINK_MODE),static)&lt;/PRE&gt;&lt;PRE&gt;CC_OPTS := -MT -openmp&lt;BR /&gt;MKL_LINK_MODE_SUFFIX := &lt;BR /&gt;MKL_RTL_LINK_MODE_SUFFIX := t&lt;/PRE&gt;&lt;PRE&gt;else&lt;/PRE&gt;&lt;PRE&gt;CC_OPTS := -MD -openmp -Ox&lt;BR /&gt;MKL_LINK_MODE_SUFFIX := _dll&lt;BR /&gt;MKL_RTL_LINK_MODE_SUFFIX := d&lt;/PRE&gt;&lt;PRE&gt;endif&lt;/PRE&gt;&lt;PRE&gt;MKL_INTERFACE_LIB := mkl_intel$(MKL_INTERFACE_SUFFIX)$(MKL_LINK_MODE_SUFFIX).lib&lt;BR /&gt;MKL_THREADING_LIB := mkl_sequential$(MKL_LINK_MODE_SUFFIX).lib # mkl_intel_thread$(MKL_LINK_MODE_SUFFIX).lib&lt;BR /&gt;MKL_COMPUTATION_LIB := mkl_core$(MKL_LINK_MODE_SUFFIX).lib&lt;BR /&gt;MKL_RUNTIME_LIB := # libiomp5m$(MKL_RTL_LINK_MODE_SUFFIX).lib&lt;/PRE&gt;&lt;PRE&gt;VTUNE_ROOT_DIR := C:/Program Files/Intel/VTune/Analyzer&lt;BR /&gt;VTUNE_INCLUDE_DIR := $(VTUNE_ROOT_DIR)/include&lt;BR /&gt;VTUNE_LIB_DIR := $(VTUNE_ROOT_DIR)/lib&lt;/PRE&gt;&lt;PRE&gt;all: test2.exe&lt;/PRE&gt;&lt;PRE&gt;test2.exe: test2.cpp&lt;BR /&gt;cl -Ox -arch:SSE2 $(MKL_DEFINES) -EHsc -nologo -I"$(MKL_INCLUDE_DIR)" $&amp;lt; -link -libpath:"$(MKL_LIB_DIR)" 
$(MKL_INTERFACE_LIB) $(MKL_THREADING_LIB) $(MKL_COMPUTATION_LIB) $(MKL_RUNTIME_LIB) psapi.lib&lt;/PRE&gt;&lt;PRE&gt;Output:&lt;/PRE&gt;&lt;PRE&gt;$ ./test2 2048 200000 1&lt;BR /&gt;Vector size 2048&lt;BR /&gt;Iterations 200000&lt;BR /&gt;Threads 1&lt;BR /&gt;Dynamic 1&lt;BR /&gt;Max threads 1&lt;BR /&gt;10.0.1 Product Intel Core 2 Duo Processor&lt;/PRE&gt;&lt;PRE&gt;Dynamic 1&lt;BR /&gt;Max threads 1&lt;BR /&gt;Noop elapsed 0 clocks&lt;BR /&gt;MKL elapsed 516 clocks&lt;BR /&gt;C loop elapsed 500 clocks&lt;BR /&gt;MKL vs CLoop 0&lt;/PRE&gt;&lt;PRE&gt;&lt;/PRE&gt;</description>
      <pubDate>Wed, 13 Feb 2008 17:51:55 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/vdAdd-not-faster-than-hand-written-unrolled-C-loop/m-p/850386#M6526</guid>
      <dc:creator>jcanedo</dc:creator>
      <dc:date>2008-02-13T17:51:55Z</dc:date>
    </item>
    <item>
      <title>Re: vdAdd not faster than hand written unrolled C loop</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/vdAdd-not-faster-than-hand-written-unrolled-C-loop/m-p/850387#M6527</link>
      <description>&lt;P&gt;&lt;FONT face="Arial" color="#000080" size="2"&gt;Thank you for the report, we will look for further optimization opportunities.&lt;/FONT&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 18 Feb 2008 09:44:51 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/vdAdd-not-faster-than-hand-written-unrolled-C-loop/m-p/850387#M6527</guid>
      <dc:creator>Eugeny_G_Intel</dc:creator>
      <dc:date>2008-02-18T09:44:51Z</dc:date>
    </item>
  </channel>
</rss>

