2D convolution using VSL Conv

Phil_D_1 · ‎02-11-2015

Hi all,

I'm wondering if anyone can help. I'm trying to calculate a 2d convolution of two square arrays f(x,y) and g(x,y) using the code below, which compiles and runs fine (no error messages), but does not give the correct output for h(x,y). Instead, all values of h are unchanged (e.g. if initialized before attempting the convolution) except one "column" of the output, at x=0.

I've scoured the examples included with MKL libraries, but can't spot anything different between my code and the minimal working examples of this type.

Thanks,

Phil

int main(){

//......

int pts = pow(2,10);

    double f[pts*pts];
   double g[pts*pts];
   double h[pts*pts];

for(int i=0;i<pts;++i)

   {
       for(int j=0;j<pts;++j)
       {

//functions f[i*pts+j] and g[i*pts+j] are defined here

}
}

int n0 = 0.5*pts;

int start[2] = {n0,n0};
int status;

MKL_INT shape[2] = {nGrid,nGrid};
MKL_INT rank = 2;

int stride[2] = {1,1};

        VSLConvTaskPtr task;
        status = vsldConvNewTask(&task,VSL_CONV_MODE_AUTO,rank,shape,shape,shape);
        status = vslConvSetStart(task,start);
        status = vsldConvExec(task, f, stride, g, stride, h, stride);
        status = vslConvDeleteTask(&task);
   if (status!=VSL_STATUS_OK) {cout << "Error with 2d convolution."<<endl; exit (EXIT_FAILURE);}

//......

return 0;

}

Ying_H_Intel · ‎02-11-2015

Hi Phil D.

Could you please tell the value of nGrid and how you compile the code? Windows and Linux, 64bit or 32bit, threaded or sequential?

Best Regards,

Ying

Dmitry_B_Intel · ‎02-12-2015

Hi Phil,

May I suggest you to check the status after each call of convolution functions, not only the last one. It would also be helpful to read about how the data is supposed to be allocated for this function call, the documentation can be found here.

Thanks
Dima

Phil_D_1 · ‎02-12-2015

Hi Ying and Dima,

Thank you both for your quick replies!

Ying: I use a value of nGrid = pow(2,12) (i.e. 4096*4096 array size), and compile the code on a 32bit Linux box, using icc version 13.1.0 and the linkers -L$(MKLROOT)/lib/ia32 -lmkl_intel -lmkl_core -lmkl_intel_thread -lpthread -lm (from MKL link line advisor), such that the compile line is

icc [code name] -align -O3 -funroll-loops -ipo -align -ip -openmp -L($MKLROOT)/lib/ia32 -lmkl_intel -lmkl_core -lmkl_intel_thread -lpthread -lm -mkl=parallel -lboost_program_options -lboost_filesystem -I($MKLROOT)/include -o [executable name]

Dima: Thanks for your suggestion- I've aded extra status checks, and you're correct- there's an error while executing vsldConvExec. I had thought that once an operation on "task" had returned !VSL_STATUS_OK all subsequent operations would also, which is silly in hindsight! Thank you for the link to the data allocation documentation; using this and the test case in the examples library I've found the problem- as you suggested, one of data allocation. In the 1d convolution, one can get away with setting the output array to the same dimensions as the input dimensions (rather than size(z) = size(x) + size(y) - 1 as it should be), when using the code to calculate the (circular) convolution integral. This trick doesn't work in 2d due to the way the array is stored, so one needs the full shapez = {shapex[0]+shapey[0] -1, shapex[1]+shapey[1]-1}.

It seems that using the FT algorithms in "mkl_dfti.h" to do the circular convolution is a more appropriate solution.

Thanks again for all your help!

Regards,

Phil

Ying_H_Intel · ‎02-14-2015

Hi Phil,

Dima create a nice sample to show the problem, I attach here hope it can help.

You can use smaller shape h, but you need control the shape and stride so that the true memory access of z is not overflow, for example, the actual length of z must be at least 1+zstride*(zshape-1) elements or more.

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "mkl.h"

int main()
{
    VSLConvTaskPtr task;
    
    MKL_INT f_shape[] = { 4, 4 };
    MKL_INT g_shape[] = { 4, 4 };
    MKL_INT Rmin[] = { 0, 0 };
    MKL_INT Rmax[] = { f_shape[0] + g_shape[0] - 1, f_shape[1] + g_shape[1] - 1 };

#if 0 /* FULL RESULT */    
    MKL_INT h_shape[] = { Rmax[0], Rmax[1] };
    MKL_INT h_start[] = { 0, 0 };
#elif 1 /* A BOTTOM-LEFT TILE OF THE RESULT */
    MKL_INT size0 = 3;
    MKL_INT size1 = 2;
    MKL_INT h_shape[] = { size0, size1 };
    MKL_INT h_start[] = { Rmax[0]-size0, 0 };
#elif 1 /* A MIDDLE TILE OF THE RESULT */
    MKL_INT size0 = Rmax[0] / 2;
    MKL_INT size1 = Rmax[1] / 2;
    MKL_INT h_shape[] = { size0, size1 };
    MKL_INT h_start[] = { (Rmax[0] - size0) / 2, (Rmax[1] - size1) / 2 };
#endif

    MKL_INT f_stride[] = { f_shape[1], 1 };
    MKL_INT g_stride[] = { g_shape[1], 1 };
    MKL_INT h_stride[] = { h_shape[1], 1 };
    
    double *f = new double[ f_stride[0] * f_shape[0] ];
    double *g = new double[ g_stride[0] * g_shape[0] ];
    double *h = new double[ h_stride[0] * h_shape[0] ];

    for(int i=0; i < f_shape[0]; ++i)
        for(int j=0; j < f_shape[1]; ++j)
            f[ i * f_stride[0] + j ] = 1;

    for(int i=0; i < g_shape[0]; ++i)
        for(int j=0; j < g_shape[1]; ++j)
            g[ i * g_stride[0] + j ] = 1;

    memset( h, 0, sizeof(h[0]) * h_stride[0] * h_shape[0] );

    int status;
    status = vsldConvNewTask( &task, VSL_CONV_MODE_AUTO, 2, f_shape, g_shape, h_shape );
    assert(status == VSL_STATUS_OK);

    status = vslConvSetStart( task, h_start );
    assert(status == VSL_STATUS_OK);

    status = vsldConvExec( task, f, f_stride, g, g_stride, h, h_stride );
    assert(status == VSL_STATUS_OK);

    status = vslConvDeleteTask(&task);
    assert(status == VSL_STATUS_OK);
    
    for (int i=0; i< h_shape[0]; ++i)
    {
        printf("%3i: ",i);
        for (int j=0; j < h_shape[1]; ++j)
        {
            printf("%4.0f ",h[ i * h_stride[0] + j ]);
        }
        printf("\n");
    }
    return 0;
}

Best Regards,

Ying

Phil_D_1 · ‎02-17-2015

Hi Ying,

Thank you very much for this - it's so helpful to see the concepts in a working piece of code, to really get a concrete understanding. I see now how you need to be careful in setting the length of z and the start position.

Many thanks again to both you and Dima for your help in this!

Best wishes,

Phil

Sneha_P_ · ‎04-03-2015

HELLO

I am doing a project where I have to benchmark 1D and 2D convolution on Xeon E5 2670 and Xeon PHI 5110P. I have the entire code Phil has posted in a function called con2d_st(). In my main I have a nested loop. Outer loop is to generate data size and loop is to call the function 1000 times for each size. However when I run it I see segmentation error. I took out everything in the main and tried calling the function twice and I still see the error. However when I just call it once there is no error. I have included the code below please help.

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include<time.h>
#include<sys/time.h>
#include "mkl.h"

double timerval();
double con2d_st (int );
main()
{
int i;
int var = 2;

double timee = con2d_st(var);
printf("value is %f /n",timee);
timee = con2d_st(4);
printf("value is %f /n",timee);

}

double timerval()
{
   struct timeval st;
   gettimeofday(&st, NULL);
   return(st.tv_sec+st.tv_usec*1e-6);
}

double con2d_st(int n)
{
   double starttime, endtime, timee;
//instantiate input signal

   int i, j;
VSLConvTaskPtr task;

MKL_INT f_shape[2] = { n, n };
MKL_INT g_shape[2] = { 4, 4 };
MKL_INT Rmin[2] = { 0, 0 };
MKL_INT Rmax[2] = { f_shape[0] + g_shape[0] - 1, f_shape[1] + g_shape[1] - 1 };

/* FULL RESULT */
MKL_INT h_shape[2] = { Rmax[0], Rmax[1] };
MKL_INT h_start[2] = { 0, 0 };

MKL_INT f_stride[2] = { f_shape[1], 1 };
MKL_INT g_stride[2] = { g_shape[1], 1 };
MKL_INT h_stride[2] = { h_shape[1], 1 };

   double *f = (double*)malloc((f_stride[0] * f_shape[0])*sizeof(int));
   double *g = (double*)malloc((g_stride[0] * g_shape[0])*sizeof(int));
   double *h = (double*)malloc((h_stride[0] * h_shape[0])*sizeof(int));

for( i=0; i < f_shape[0]; ++i)
for( j=0; j < f_shape[1]; ++j)
f[ i * f_stride[0] + j ] = drand48()*1000;

for( i=0; i < g_shape[0]; ++i)
for( j=0; j < g_shape[1]; ++j)
g[ i * g_stride[0] + j ] = 1;

memset( h, 0, sizeof(h[0]) * h_stride[0] * h_shape[0] );

int status;
status = vsldConvNewTask( &task, VSL_CONV_MODE_AUTO, 2, f_shape, g_shape, h_shape );
assert(status == VSL_STATUS_OK);

status = vslConvSetStart( task, h_start );
assert(status == VSL_STATUS_OK);
starttime = timerval();
status = vsldConvExec( task, f, f_stride, g, g_stride, h, h_stride );
endtime = timerval();
assert(status == VSL_STATUS_OK);

status = vslConvDeleteTask(&task);
assert(status == VSL_STATUS_OK);

/* for ( i=0; i< h_shape[0]; ++i)
{
printf("%3i: ",i);
for ( j=0; j < h_shape[1]; ++j)
{
printf("%4.0f ",h[ i * h_stride[0] + j ]);
}
printf("\n");
} */
timee = endtime-starttime;
return timee;
}

Ying_H_Intel · ‎04-13-2015

Hi Sneha P.

Could it be the memory overflow problem? In most of case, the sizeof (double) =8, while the sizeof(int)=4, you may change the

double *f = (double*)malloc((f_stride[0] * f_shape[0])*sizeof(int));

double *g = (double*)malloc((g_stride[0] * g_shape[0])*sizeof(int));

double *h = (double*)malloc((h_stride[0] * h_shape[0])*sizeof(int));

to

double *f = (double*)malloc((f_stride[0] * f_shape[0])*sizeof(double));

double *g = (double*)malloc((g_stride[0] * g_shape[0])*sizeof(double));

double *h = (double*)malloc((h_stride[0] * h_shape[0])*sizeof(double));

and see if it can work.

Best Regards,

Ying