Intel® oneAPI Math Kernel Library
Ask questions and share information with other developers who use Intel® Math Kernel Library.
6981 Discussions

vdAdd not faster than hand written unrolled C loop

jcanedo
Beginner
416 Views

We are evaluating and testing VML, and it looks like vdAdd is no faster (if not slower) than a "normal" C loop. I have written a small program that exhibits the problem. Has anyone else run into this issue? Did I forget to configure or install something?

Thanks in advance

J Canedo

#define _WIN32_WINNT 0x400
#include <windows.h>
#include <malloc.h>
#include <omp.h>
#include <mkl.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
////
template 
class PerfTest
{
public:

PerfTest(const size_t it, const std::string & l,
const Func & f = Func())
: func_(f), elapsed_(0), iterations_(it),
label("START Disasm ")
{
label += l;
}
 void operator()(const size_t size,
const double * left, const double * right, double * result) const
{
OutputDebugString(label.c_str());
DWORD start = GetTickCount(), end = 0;
for (size_t i = 0; i != iterations_; ++i)
{
func_(size, left, right, result);
}
end = GetTickCount();
elapsed_ = end - start;
OutputDebugString("STOP Disasm");
}
 DWORD elapsed() const { return elapsed_; }
private:
 Func func_;
mutable DWORD elapsed_;
size_t iterations_;
std::string label;
};
////
/// Functor with the same call signature as the real kernels and an empty
/// body; timing it gives the cost of the harness loop alone.
struct Empty
{
    void operator()(const size_t size, const double * left, const double * right,
                    double * result) const
    {
        // Deliberately does nothing; the casts only mark the arguments as unused.
        (void)size;
        (void)left;
        (void)right;
        (void)result;
    }
};
/// Functor that forwards its arguments, unchanged and in order, to the VML
/// routine vdAdd.
struct MKLAdd
{
    // Defined in-class, hence implicitly inline.
    void operator()(const size_t size, const double * left, const double * right,
                    double * result) const
    {
        vdAdd(size, left, right, result);
    }
};
/// Functor implementing element-wise addition with a plain indexed loop:
/// result[i] = left[i] + right[i] for every i in [0, vector_size).
struct SSECLoop
{
    void operator()(const size_t vector_size, const double * left, const double * right,
                    double * result) const
    {
        for (size_t i = 0; i < vector_size; ++i)
        {
            // Both operands are read before the store, so the output may
            // alias either input.
            const double sum = left[i] + right[i];
            result[i] = sum;
        }
    }
};
int main(int argc, char * argv[])
{
if (argc == 4)
{
size_t vector_size = atoi(argv[1]), iterations = atoi(argv[2]),
threads = atoi(argv[3]);
 if (vector_size % 32)
{
std::cerr << "Vector size must be 32 multiple ";
return -1;
}

std::cout << "Vector size " << vector_size
<< " Iterations " << iterations
<< " Threads " << threads
<< " Dynamic " << mkl_get_dynamic()
<< " Max threads " << mkl_get _max_threads()
<< ' ';

// setup threads number
 MKLVersion version;
MKLGetVersion(&version);
 std::cout << version.MajorVersion << "." << version.MinorVersion
<< "." << version.BuildNumber << " " << version.ProductStatus
<< " " << version.Processor << ' ';
 mkl_set_num_threads(threads);
mkl_set_dynamic(1);
#pragma omp parallel default(shared)
{
DWORD_PTR mask = (1 << omp_get_thread_num());
SetThreadAffinityMask(GetCurrentThread(), mask);
}
 std::cout << "
Dynamic " << mkl_get_dynamic()
<< " Max threads " << mkl_get_max_threads()
<< ' ';

double * v1 = (double *) _aligned_malloc(vector_size * sizeof(double), 16),
* v2 = (double *) _aligned_malloc(vector_size * sizeof(double), 16);
 for (size_t i = 0; i != vector_size; ++i)
{
v1 = i;
v2 = 2 * i;
}
 unsigned int mode = vmlGetMode();
 // Noop base time
 PerfTest noop(iterations, "Noop");
noop(0, 0, 0, 0);
std::cout << "Noop elapsed " << noop.elapsed() << " clocks ";
 //// MKL
 PerfTest mkl(iterations, "MKL");
double * v3mkl = (double *) _aligned_malloc(vector_size * sizeof(double), 16);
memset(v3mkl, 0, vector_size * sizeof(double));
mkl(vector_size, v1, v2, v3mkl);
std::cout << "MKL elapsed " << mkl.elapsed() << " clocks ";
 //// normal loop

PerfTest cloop(iterations, "CLoop");
double * v3cloop = (double *) _aligned_malloc(vector_size * sizeof(double), 16);
memset(v3cloop, 0, vector_size * sizeof(double));
cloop(vector_size, v1, v2, v3cloop);
std::cout << "C loop elapsed " << cloop.elapsed() << " clocks ";

// Compare results

std::cout << "MKL vs CLoop " << memcmp(v3mkl, v3cloop, vector_size * sizeof(double))
<< ' ';

_aligned_free(v1);
_aligned_free(v2);
_aligned_free(v3mkl);
_aligned_free(v3cloop);
}
else
{
std::cerr << "Usage: " << argv[0] << " length iterations threads n";
}

return 0;
}
Makefile
# Location of the MKL installation and the target architecture sub-directory.
MKL_ROOT_DIR := C:/Program Files/Intel/MKL/10.0.1.015
MKL_INCLUDE_DIR := $(MKL_ROOT_DIR)/include
MKL_ARCHITECTURE := ia32
MKL_LIB_DIR := $(MKL_ROOT_DIR)/$(MKL_ARCHITECTURE)/lib
####
# Build configuration: calling convention and link mode.
# MKL_INTERFACE := stdcall
MKL_INTERFACE := cdecl
MKL_LINK_MODE := dynamic
# MKL_LINK_MODE := static
####
# The calling convention selects the VML define and the interface-library suffix.
ifeq ($(MKL_INTERFACE),cdecl)
MKL_DEFINES := -DMKL_VML_CDECL
MKL_INTERFACE_SUFFIX := _c
else
MKL_DEFINES := -DMKL_VML_STDCALL
MKL_INTERFACE_SUFFIX := _s
endif
# The link mode selects the library-name suffixes.
# NOTE: CC_OPTS is set here but the cl command in the test2.exe rule does not
# reference it, so -MT/-MD and -openmp are not passed to the compiler.
ifeq ($(MKL_LINK_MODE),static)
CC_OPTS := -MT -openmp
MKL_LINK_MODE_SUFFIX :=
MKL_RTL_LINK_MODE_SUFFIX := t
else
CC_OPTS := -MD -openmp -Ox
MKL_LINK_MODE_SUFFIX := _dll
MKL_RTL_LINK_MODE_SUFFIX := d
endif
MKL_INTERFACE_LIB := mkl_intel$(MKL_INTERFACE_SUFFIX)$(MKL_LINK_MODE_SUFFIX).lib
# The sequential threading layer is linked; the threaded alternative and its
# runtime library are commented out on the lines below.
MKL_THREADING_LIB := mkl_sequential$(MKL_LINK_MODE_SUFFIX).lib # mkl_intel_thread$(MKL_LINK_MODE_SUFFIX).lib
MKL_COMPUTATION_LIB := mkl_core$(MKL_LINK_MODE_SUFFIX).lib
MKL_RUNTIME_LIB := # libiomp5m$(MKL_RTL_LINK_MODE_SUFFIX).lib
# VTune paths; not referenced by any rule shown here.
VTUNE_ROOT_DIR := C:/Program Files/Intel/VTune/Analyzer
VTUNE_INCLUDE_DIR := $(VTUNE_ROOT_DIR)/include
VTUNE_LIB_DIR := $(VTUNE_ROOT_DIR)/lib
all: test2.exe
# The recipe line must begin with a tab character.
test2.exe: test2.cpp
	cl -Ox -arch:SSE2 $(MKL_DEFINES) -EHsc -nologo -I"$(MKL_INCLUDE_DIR)" $< -link -libpath:"$(MKL_LIB_DIR)" $(MKL_INTERFACE_LIB) $(MKL_THREADING_LIB) $(MKL_COMPUTATION_LIB) $(MKL_RUNTIME_LIB) psapi.lib
Output:
$ ./test2 2048 200000 1
Vector size 2048
Iterations 200000
Threads 1
Dynamic 1
Max threads 1
10.0.1 Product Intel Core 2 Duo Processor
Dynamic 1
Max threads 1
Noop elapsed 0 clocks
MKL elapsed 516 clocks
C loop elapsed 500 clocks
MKL vs CLoop 0

					
				
			
			
				
			
			
			
			
			
			
			
		
0 Kudos
1 Reply
Eugeny_G_Intel
Employee
415 Views

Thank you for the report, we will look for further optimization opportunities.

0 Kudos
Reply