Hello,
I have a problem with low performance of the IPP library on an SMP multicore system (an i7-4700EQ processor with Hyper-Threading disabled). I want to use 4 cores in parallel. I compiled the code below and ran it. With only 1 core the measured time was 394 us, but with 4 cores I saw about 1191 us on each core. How is that possible? I should have seen about 394 us, right, since the tasks are independent and work on independent memory? And why are the "IPP BUF SIZES:" values different? I think these values should be the same.
Code:
float *fInR, *fInI, *fOutR, *fOutI;
InputStruct InputTest;
double dtime1[4], dtime2[4];
#define FORLOOP 1000

void cfrRSIAna(int iWeight, int iHeight, int iAffin)
{
    ippInit();
    int iFFTOrderWeight;
    IppsFFTSpec_C_32f *specWeight[4] = {NULL, NULL, NULL, NULL};
    Ipp8u *specbufWeight[4]  = {NULL, NULL, NULL, NULL};
    Ipp8u *specinitWeight[4] = {NULL, NULL, NULL, NULL};
    Ipp8u *workbufWeight[4]  = {NULL, NULL, NULL, NULL};
    int iSpecSizeWeight[4] = {0,0,0,0}, iSpecInitSizeWeight[4] = {0,0,0,0}, iWorkBufSizeWeight[4] = {0,0,0,0};
    TASK_ID tids[4];               /* task IDs */
    char taskIsmi[32];             /* task name */
    int i, cpuIx[] = {0,1,2,3};    /* core IDs */
    phys_cpuset_t affinity;

    libinfo();                     /* print IPP version */
    InputTest.iWeight = iWeight;
    InputTest.iHeight = iHeight;
    strcpy(cXmlDosyaAdi, cCfrRSIXml); /* otherwise a stale value could be read if the caller frees the file name */
    fInR  = memalign(128, 4*1024*1024*4);
    fInI  = memalign(128, 4*1024*1024*4);
    fOutR = memalign(128, 4*1024*1024*4);
    fOutI = memalign(128, 4*1024*1024*4);
    for(i = 0; i < iAffin*iWeight*iHeight; i++)
    {
        fInR[i] = rand()/(float)rand();
        fInI[i] = rand()*1.315/(float)rand();
    }
    InputTest.fpInputR  = fInR;
    InputTest.fpInputI  = fInI;
    InputTest.fpOutputR = fOutR;
    InputTest.fpOutputI = fOutI;
    iFFTOrderWeight = (int)(LOG2_X(iWeight));
    printf("fftOrder:%d\n", iFFTOrderWeight);
    for(i = 0; i < iAffin; i++)
    {
        ippsFFTGetSize_C_32f(iFFTOrderWeight, IPP_FFT_NODIV_BY_ANY, ippAlgHintNone,
                             &iSpecSizeWeight[i], &iSpecInitSizeWeight[i], &iWorkBufSizeWeight[i]);
        if(iSpecSizeWeight[i])     { specbufWeight[i]  = ippsMalloc_8u(iSpecSizeWeight[i]); }
        if(iSpecInitSizeWeight[i]) { specinitWeight[i] = ippsMalloc_8u(iSpecInitSizeWeight[i]); }
        if(iWorkBufSizeWeight[i])  { workbufWeight[i]  = ippsMalloc_8u(iWorkBufSizeWeight[i]); }
        ippsFFTInit_C_32f(&specWeight[i], iFFTOrderWeight, IPP_FFT_NODIV_BY_ANY, ippAlgHintNone,
                          specbufWeight[i], specinitWeight[i]);
        InputTest.specWeight[i]    = specWeight[i];
        InputTest.workbufWeight[i] = workbufWeight[i];
        /* the last two values are buffer addresses printed with %d, so they differ per allocation */
        printf("IPP BUF SIZES: specSize:%d InitSize:%d WorkSize:%d specbuffDop:%d workbuffDopp:%d\n",
               iSpecSizeWeight[i], iSpecInitSizeWeight[i], iWorkBufSizeWeight[i], specWeight[i], workbufWeight[i]);
    }
    /*************************************************************************/
    printf("Cores setting...\n");
    for(i = 0; i < iAffin; i++)
    {
        PHYS_CPUSET_ZERO(affinity);
        PHYS_CPUSET_SET(affinity, cpuIx[i]);
        sprintf(taskIsmi, "t%s%d", "testIPP", i);
        tids[i] = taskCreate(taskIsmi, 120, TASK_OPTIONS, 65536, (FUNCPTR)IPPTestFunc,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        printf("Task created:0x%08x\n", tids[i]);
        if (tids[i] == NULL)
        {
            /*return (ERROR);*/
            printf("Task create error:0x%08x\n", tids[i]);
        }
        if(iAffin != -1)
        {
            printf("Check %d\n", i);
            /* Clear the affinity CPU set and set the index for this CPU */
            if (taskCpuAffinitySet(tids[i], affinity) == ERROR)
            {
                /* Either CPUs are not enabled or we are in UP mode */
                printf("Affinity could not be set !!! test_tap\n");
                taskDelete(tids[i]);
                /*return (ERROR);*/
            }
            taskDelay(sysClkRateGet()/10);
            taskCpuAffinityGet(tids[i], &affinity);
            printf("Task Affinity:%d\n", affinity);
        }
    }
    for(i = 0; i < iAffin; i++)
    {
        taskActivate(tids[i]);
    }
    taskDelay(sysClkRateGet() * 4); /* wait for all tasks to finish */
    for(i = 0; i < iAffin; i++)
    {
        printf("\nStartTime[%d]=%f FinishTime[%d]=%f ExecutionTimeForCore[%d]=%f us\n",
               i, dtime1[i], i, dtime2[i], i, (dtime2[i]-dtime1[i])/FORLOOP);
    }
    for(i = 0; i < iAffin; i++)
    {
        taskDelete(tids[i]);
    }
    for(i = 0; i < iAffin; i++)
    {
        /* the spec lives inside specbufWeight[i], so only the user-allocated buffers are freed */
        ippFree(specbufWeight[i]);
        ippFree(specinitWeight[i]);
        ippFree(workbufWeight[i]);
    }
/* mem free*/
}
void IPPTestFunc(void)
{
    ippInit();
    int iCpuId = vxCpuPhysIndexGet();
    IppsFFTSpec_C_32f *specDop = NULL;
    unsigned char *workbufDop = NULL;
    int iWeight, iHeight, i;
    float *fInReal, *fInImag, *fOutReal, *fOutImag;

    iWeight = InputTest.iWeight;
    iHeight = InputTest.iHeight;
    /* each core works on its own slice of the shared input/output buffers */
    fInReal  = InputTest.fpInputR  + iCpuId*iWeight*iHeight;
    fInImag  = InputTest.fpInputI  + iCpuId*iWeight*iHeight;
    fOutReal = InputTest.fpOutputR + iCpuId*iWeight*iHeight;
    fOutImag = InputTest.fpOutputI + iCpuId*iWeight*iHeight;
    specDop    = InputTest.specWeight[iCpuId];
    workbufDop = InputTest.workbufWeight[iCpuId];

    dtime1[iCpuId] = getTimeDouble(2);
    for(i = 0; i < iHeight; i++)
    {
        ippsFFTFwd_CToC_32f(fInReal + i*iWeight, fInImag + i*iWeight,
                            fOutReal + i*iWeight, fOutImag + i*iWeight,
                            specDop, workbufDop);
    }
    dtime2[iCpuId] = getTimeDouble(2);
}
Screen output:
value = IPP Version---> -140737483331680ippSP AVX2 (l9) = 0xffff8000004ca7a0 8.1.0 (r41883) 8.1.0.41883
fftOrder:7
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:859210432 workbuffDopp:5102080
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:859825472 workbuffDopp:859827072
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:252225216 workbuffDopp:252226816
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:252228032 workbuffDopp:252229632
Cores setting...
Task created:0x0f034350
Check 0
Task Affinity:1
Task created:0x333279b0
Check 1
Task Affinity:2
Task created:0x0f081790
Check 2
Task Affinity:4
Task created:0x0f0b0010
Check 3
Task Affinity:8
StartTime[0]=1033823205.286667 FinishTime[0]=1034757013.596667 ExecutionTimeForCore[0]=933.808310 us
StartTime[1]=1033823224.323333 FinishTime[1]=1035062267.645000 ExecutionTimeForCore[1]=1239.043322 us
StartTime[2]=1033823204.383333 FinishTime[2]=1035014816.198333 ExecutionTimeForCore[2]=1191.611815 us
StartTime[3]=1033823206.416667 FinishTime[3]=1034973920.471667 ExecutionTimeForCore[3]=1150.714055 us
value = IPP Version---> -140737483331680ippSP AVX2 (l9) = 0xffff8000004ca7a0 8.1.0 (r41883) 8.1.0.41883
fftOrder:7
IPP BUF SIZES: specSize:1536 InitSize:0 WorkSize:1088 specbuffDop:5120064 workbuffDopp:251123968
Cores setting...
Task created:0x0ef7dd90
Check 0
Task Affinity:1
StartTime[0]=946258083.425000 FinishTime[0]=946652897.046667 ExecutionTimeForCore[0]=394.813622 us
Hello,
Thanks for your report. Could you attach your full code here? We want to reproduce this problem and check it further.
Regards,
Chao
I can't quite figure out what problem sizes you are using here...
If the transform sizes (input + output + temporary workspace) are larger than each core's L2 cache, then the four transforms will compete for bandwidth in the shared L3 and/or in the shared DRAM subsystem. For processors like the Core i7-4700EQ, any one of the cores can use all of the available memory bandwidth. If the execution time is dominated by memory traffic, then it is not unusual for four independent jobs to take four times as long. The ratio can be even worse if all the data for one transform fits in the L3 cache, but the data for four transforms overflows the L3 cache.
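To put rough numbers on that for this thread: a back-of-the-envelope sketch, assuming iWeight = 128 (from the fftOrder:7 output), iHeight = 1000 (matching FORLOOP), and split-complex Ipp32f data; the cache sizes are the i7-4700EQ's 256 KB L2 per core and 6 MB shared L3. These sizes are my inference from the posted output, not anything confirmed in the thread.

    #include <stdio.h>

    /* Working-set estimate for the FFT test above.
     * Assumed sizes: iWeight = 128 (fftOrder 7), iHeight = 1000 (FORLOOP). */
    int main(void)
    {
        const long iWeight = 128, iHeight = 1000, nCores = 4;
        /* per core: input Re + input Im + output Re + output Im, all Ipp32f */
        long perCore = 4L * iWeight * iHeight * sizeof(float);  /* ~2.0 MB */
        long total   = nCores * perCore;                        /* ~7.8 MB */
        printf("per-core working set: %ld KB (L2 per core: 256 KB)\n", perCore / 1024);
        printf("total working set:    %ld KB (shared L3: 6144 KB)\n", total / 1024);
        return 0;
    }

If those assumptions hold, each core's slice is about 8x its L2, and the four slices together overflow the 6 MB L3, which is exactly the bandwidth-bound scenario described above and would be consistent with the observed ~3x per-core slowdown.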
FFTs are extremely difficult to analyze on cached memory hierarchies, and running multiple independent copies makes this even worse, so an easy answer might not be obtainable.
I don't know if IPP includes multi-threaded FFTs, but MKL is capable of using multiple threads to perform single FFTs. For the LIGO gravity wave analysis, I found that using 8 cores on a Xeon E5-2680 to solve a *single* 2^20 (single precision) FFT gave 4x-5x better throughput than running 8 independent FFTs concurrently. (FFTW3 gave slightly better results than MKL for these large parallelized FFTs, but MKL was faster for the smaller FFTs and MKL was often faster for non-power-of-2 thread counts.)
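For illustration, here is a minimal sketch of the threaded-single-FFT approach using MKL's DFTI interface for one 2^20-point single-precision complex transform. The DFTI calls are MKL's documented API; the thread count of 8 and the dummy input data are placeholders of mine, not values from this thread.

    #include <stdio.h>
    #include "mkl_dfti.h"
    #include "mkl_service.h"

    int main(void)
    {
        const MKL_LONG n = 1 << 20;            /* a single 2^20-point complex FFT */
        MKL_LONG i, status;
        DFTI_DESCRIPTOR_HANDLE h = NULL;
        MKL_Complex8 *data = (MKL_Complex8 *)mkl_malloc(n * sizeof(MKL_Complex8), 64);

        for (i = 0; i < n; i++) { data[i].real = 1.0f; data[i].imag = 0.0f; } /* dummy input */

        mkl_set_num_threads(8);                /* let MKL parallelize this one transform */

        status = DftiCreateDescriptor(&h, DFTI_SINGLE, DFTI_COMPLEX, 1, n);
        if (status == DFTI_NO_ERROR) status = DftiCommitDescriptor(h);
        if (status == DFTI_NO_ERROR) status = DftiComputeForward(h, data); /* in place */
        if (status != DFTI_NO_ERROR) printf("DFTI error: %s\n", DftiErrorMessage(status));

        DftiFreeDescriptor(&h);
        mkl_free(data);
        return 0;
    }

The point of this structure is that the cores cooperate on one transform's memory traffic instead of four independent streams competing for the shared L3 and DRAM bandwidth.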