vdAdd not faster than hand written unrolled C loop

jcanedo · ‎02-13-2008

We are evaluating and testing VML, and it looks like vdAdd is no faster (if anything, slightly slower) than a "normal" C loop. I have written a small program that exhibits the problem. Has anyone else run into this issue? Did I forget to configure or install something?

Thanks in advance

J Canedo

#include

#define _WIN32_WINNT 0x400
#include 
#include 
#include 
#include

////

template 
class PerfTest
{
public:
 
 PerfTest(const size_t it, const std::string & l,
 const Func & f = Func())
 : func_(f), elapsed_(0), iterations_(it),
 label("START Disasm ") 
 {
 label += l;
 }

 void operator()(const size_t size,
 const double * left, const double * right, double * result) const
 {
 OutputDebugString(label.c_str());
 DWORD start = GetTickCount(), end = 0;
 for (size_t i = 0; i != iterations_; ++i)
 {
 func_(size, left, right, result);
 }
 end = GetTickCount();
 elapsed_ = end - start;
 OutputDebugString("STOP Disasm");
 }

 DWORD elapsed() const { return elapsed_; }

private:

 Func func_;
 mutable DWORD elapsed_;
 size_t iterations_;
 std::string label;
};

////

/// No-op functor with the same call signature as the other benchmark
/// functors; it ignores every argument and leaves `result` untouched.
struct Empty
{
    void operator()(const size_t size, const double * left, const double * right,
                    double * result) const
    {
        // Intentionally empty.
    }
};

/// Functor that forwards its arguments, unchanged and in the same order,
/// to vdAdd.
struct MKLAdd
{
    void operator()(const size_t size, const double * left, const double * right,
                    double * result) const
    {
        vdAdd(size, left, right, result);
    }
};

/// Element-wise addition written as an ordinary loop.
struct SSECLoop
{
    /// Stores left[k] + right[k] into result[k] for every k in
    /// [0, vector_size); does nothing when vector_size is 0.
    void operator()(const size_t vector_size, const double * left, const double * right,
                    double * result) const
    {
        size_t k = 0;
        while (k < vector_size)
        {
            const double sum = left[k] + right[k];
            result[k] = sum;
            ++k;
        }
    }
};

int main(int argc, char * argv[])
{
 if (argc == 4)
 {
 size_t vector_size = atoi(argv[1]), iterations = atoi(argv[2]),
 threads = atoi(argv[3]);

 if (vector_size % 32)
 {
 std::cerr << "Vector size must be 32 multiple
";
 return -1;
 }
 
 std::cout << "Vector size " << vector_size
 << "
Iterations " << iterations
 << "
Threads " << threads
 << "
Dynamic " << mkl_get_dynamic()
 << "
Max threads " << mkl_get
_max_threads()
 << '
';
 
 // setup threads number

 MKLVersion version;
 MKLGetVersion(&version);

 std::cout << version.MajorVersion << "." << version.MinorVersion
 << "." << version.BuildNumber << " " << version.ProductStatus
 << " " << version.Processor << '
';

 mkl_set_num_threads(threads);
 mkl_set_dynamic(1);

#pragma omp parallel default(shared)
 {
 DWORD_PTR mask = (1 << omp_get_thread_num());
 SetThreadAffinityMask(GetCurrentThread(), mask);
 }

 std::cout << "
Dynamic " << mkl_get_dynamic()
 << "
Max threads " << mkl_get_max_threads()
 << '
';
 
 double * v1 = (double *) _aligned_malloc(vector_size * sizeof(double), 16),
 * v2 = (double *) _aligned_malloc(vector_size * sizeof(double), 16);

 for (size_t i = 0; i != vector_size; ++i)
 {
 v1 = i;
 v2 = 2 * i;
 }

 unsigned int mode = vmlGetMode();

 // Noop base time

 PerfTest noop(iterations, "Noop");
 noop(0, 0, 0, 0);
 std::cout << "Noop elapsed " << noop.elapsed() << " clocks
";

 //// MKL

 PerfTest mkl(iterations, "MKL");
 double * v3mkl = (double *) _aligned_malloc(vector_size * sizeof(double), 16);
 memset(v3mkl, 0, vector_size * sizeof(double));
 mkl(vector_size, v1, v2, v3mkl);
 std::cout << "MKL elapsed " << mkl.elapsed() << " clocks
";

 //// normal loop
 
 PerfTest cloop(iterations, "CLoop");
 double * v3cloop = (double *) _aligned_malloc(vector_size * sizeof(double), 16);
 memset(v3cloop, 0, vector_size * sizeof(double));
 cloop(vector_size, v1, v2, v3cloop);
 std::cout << "C loop elapsed " << cloop.elapsed() << " clocks
";
 
 // Compare results
 
 std::cout << "MKL vs CLoop " << memcmp(v3mkl, v3cloop, vector_size * sizeof(double)) 
 << '
';
 
 _aligned_free(v1);
 _aligned_free(v2);
 _aligned_free(v3mkl);
 _aligned_free(v3cloop);
 }
 else
 {
 std::cerr << "Usage: " << argv[0] << " length iterations threads
n";
 }
 
 return 0;
}

Makefile

# Builds test2.exe from test2.cpp with the Microsoft compiler (cl) and links
# it against the MKL libraries selected by MKL_INTERFACE and MKL_LINK_MODE.

MKL_ROOT_DIR := C:/Program Files/Intel/MKL/10.0.1.015
MKL_INCLUDE_DIR := $(MKL_ROOT_DIR)/include
MKL_ARCHITECTURE := ia32
MKL_LIB_DIR := $(MKL_ROOT_DIR)/$(MKL_ARCHITECTURE)/lib

####

# MKL_INTERFACE := stdcall
MKL_INTERFACE := cdecl

MKL_LINK_MODE := dynamic
# MKL_LINK_MODE := static

####

ifeq ($(MKL_INTERFACE),cdecl)

MKL_DEFINES := -DMKL_VML_CDECL
MKL_INTERFACE_SUFFIX := _c

else

MKL_DEFINES := -DMKL_VML_STDCALL
MKL_INTERFACE_SUFFIX := _s

endif

# NOTE(review): CC_OPTS is assigned in both branches below but the test2.exe
# recipe never references it, so -MT/-MD and -openmp are not passed to cl.
# Confirm whether that is intended before wiring it in.
ifeq ($(MKL_LINK_MODE),static)

CC_OPTS := -MT -openmp
MKL_LINK_MODE_SUFFIX :=
MKL_RTL_LINK_MODE_SUFFIX := t

else

CC_OPTS := -MD -openmp -Ox
MKL_LINK_MODE_SUFFIX := _dll
MKL_RTL_LINK_MODE_SUFFIX := d

endif

MKL_INTERFACE_LIB := mkl_intel$(MKL_INTERFACE_SUFFIX)$(MKL_LINK_MODE_SUFFIX).lib
# Alternative threading layer: mkl_intel_thread$(MKL_LINK_MODE_SUFFIX).lib
MKL_THREADING_LIB := mkl_sequential$(MKL_LINK_MODE_SUFFIX).lib
MKL_COMPUTATION_LIB := mkl_core$(MKL_LINK_MODE_SUFFIX).lib
# Left empty; candidate value: libiomp5m$(MKL_RTL_LINK_MODE_SUFFIX).lib
MKL_RUNTIME_LIB :=

# NOTE(review): the VTUNE_* variables are not referenced by any rule in this file.
VTUNE_ROOT_DIR := C:/Program Files/Intel/VTune/Analyzer
VTUNE_INCLUDE_DIR := $(VTUNE_ROOT_DIR)/include
VTUNE_LIB_DIR := $(VTUNE_ROOT_DIR)/lib

.PHONY: all

all: test2.exe

# The include and library directories contain spaces, hence the quoting.
test2.exe: test2.cpp
	cl -Ox -arch:SSE2 $(MKL_DEFINES) -EHsc -nologo -I"$(MKL_INCLUDE_DIR)" $< -link -libpath:"$(MKL_LIB_DIR)" $(MKL_INTERFACE_LIB) $(MKL_THREADING_LIB) $(MKL_COMPUTATION_LIB) $(MKL_RUNTIME_LIB) psapi.lib

Output:

$ ./test2 2048 200000 1
Vector size 2048
Iterations 200000
Threads 1
Dynamic 1
Max threads 1
10.0.1 Product Intel Core 2 Duo Processor

Dynamic 1
Max threads 1
Noop elapsed 0 clocks
MKL elapsed 516 clocks
C loop elapsed 500 clocks
MKL vs CLoop 0

Eugeny_G_Intel · ‎02-18-2008

Thank you for the report, we will look for further optimization opportunities.