Intel® Integrated Performance Primitives
Deliberate problems developing high-performance vision, signal, security, and storage applications.

converting Intel ipp_resize_mt.cpp to support float source images (Ipp32f)

spence__ian
Beginner
854 Views

Hi,

I'm trying to make an Intel IPP example compatible with 32-bit float source images.  As the Intel example's read/write library does not support this, main() has been replaced by an external function, decimate_tbb(), which takes "float* srcData" (2D array) as its first parameter and returns "float *finalbuff" (2D array).

The code below works perfectly for float sources with 1 (YY), 3 (RGBRGB), and 4 (RGBARGBA) channels when run SINGLE-THREADED (iThreads = 1). The multi-threaded TBB version (iThreads = 0) gives a corrupted result on an 8-thread CPU. Any ideas would be most welcome. Thanks in advance.

Ian,

 

 Compiles : OK -
 clang++  -std=c++11 -O2 -DUSE_TBB -c src//ipp_resize_mt_tbb.cpp -o obj//ipp_resize_mt_tbb.o -I $TBBROOT/include -I $IPPROOT/include -I $IPPROOT/components/common/include
 
 Link : OK, with external main() not supplied here -
 clang++ -L $TBBROOT/lib -l tbb -l tbbmalloc -L $IPPROOT/lib -lcommon -lippi -lipps -lippcore  -stdlib=libc++ -o bin//ipp_resize_mt_tbb obj//ipp_resize_mt_tbb.o


/*

ipp_resize_mt.cpp -  REVISED

Works for iThreads = 1; iThreads > 1  image is corrupt

// *******************************************************************************
* Copyright 2012-2019 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
// *******************************************************************************

 */
#include <math.h>
#include <memory>
#include "base_ipp.h"

#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/ipp/include/ippcore.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/ipp/include/ipps.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/ipp/include/ippi.h"

#ifdef USE_TBB
//#define __TBB_NO_IMPLICIT_LINKAGE 0
#define TBB_PREVIEW_MEMORY_POOL 1
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/task_scheduler_init.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/parallel_for.h"    
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/blocked_range2d.h"  
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/memory_pool.h"    
using namespace tbb;
#endif

// added image source / destination specs
struct image_specsStruct{
    int SrcW;
    int SrcH;
    int DstW;
    int DstH;
    IppiSize srcSize;
    IppiSize dstSize;
    Ipp32s srcStep;
    Ipp32s dstStep;
    Ipp8u nChannels;
}header;

class Resize
{
public:
    Resize()
    {
        m_iThreads = 0;
        m_interpolation = ippCubic;  // default
        m_pSpec         = 0;
        m_pInitBuffer   = 0;
        
        m_fBVal   = 1;
        m_fCVal   = 0;
        m_iLobes  = 3;
    }
    
    virtual ~Resize()
    {
        Close();
    }
    
    void Close()
    {
        if(m_pSpec)
        {
            ippsFree(m_pSpec);
            m_pSpec = 0;
        }
        
        if(m_pInitBuffer)
        {
            ippsFree(m_pInitBuffer);
            m_pInitBuffer = 0;
        }
    }
    
    virtual Status Init(Ipp32f *pSrcImage, Ipp32f *pDstImage)
    {
        IppStatus       ippSts;
        IppiBorderSize  borderSize;
        Ipp32s          iSpecSize = 0;
        Ipp32s          iInitSize = 0;
        
        if(!pSrcImage || !pDstImage)return STS_ERR_NULL_PTR;
        
        Close();
        
        ippSts = ippiResizeGetSize_32f(header.srcSize, header.dstSize, m_interpolation, 0, &iSpecSize, &iInitSize);
        CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetSize_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
        
        // allocate internal buffer
        m_pSpec = (IppiResizeSpec_32f*)ippsMalloc_8u(iSpecSize);
        if(!m_pSpec){
            printf("\nCannot allocate memory for resize m_pSpec");
            return STS_ERR_ALLOC;
        }
        
        // init ipp resizer
        if(m_interpolation == ippCubic){
            // allocate initialization buffer. external buffer
            if(iInitSize){
                m_pInitBuffer = ippsMalloc_8u(iInitSize);
                if(!m_pInitBuffer){
                    printf("\nCannot allocate memory for resize m_pInitBuffer");
                    return STS_ERR_ALLOC;
                }
            }

            ippSts = ippiResizeCubicInit_32f(header.srcSize, header.dstSize, m_fBVal, m_fCVal, m_pSpec, m_pInitBuffer);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeCubicInit_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
            ippsFree(m_pInitBuffer);
            m_pInitBuffer = NULL;
        }

        ippSts = ippiResizeGetBorderSize_32f(m_pSpec, &borderSize);
        CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetBorderSize_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
        
        m_templ = *pSrcImage;
        
        return STS_OK;
    }
    
    Status ResizeBlock(Ipp32f *pSrcImage, Ipp32f *pDstImage, Rect roi, IppiBorderType border, unsigned char *pExtBuffer = 0)
    {
        Status      status;
        IppStatus   ippSts;
        IppiPoint   dstRoiOffset = {(int)roi.x, (int)roi.y};        // full image, no parallel dstOffset = {0, 0};
        IppiSize    dstRoiSize   = {(int)roi.width, (int)roi.height};
        IppiPoint   srcRoiOffset;
        IppiSize    srcRoiSize;
        
        Ipp32f          *pSrcPtr = 0;
        Ipp32f          *pDstPtr = 0;
        unsigned char   *pBuffer = 0;
        int             iBufferSize = 0;
        
        if(!pSrcImage || !pDstImage)return STS_ERR_NULL_PTR;
        if(!m_pSpec)return STS_ERR_NOT_INITIALIZED;
        
        // Zero size means full size
        if(!dstRoiSize.width)dstRoiSize.width   = (int)header.DstW;
        if(!dstRoiSize.height)dstRoiSize.height = (int)header.DstH;
        
         if(m_templ != *pSrcImage){
         status = Init(pSrcImage, pDstImage);
         CHECK_STATUS_PRINT_RS(status, "Resize::Init()", GetBaseStatusString(status));
         }

        ippSts = ippiResizeGetSrcRoi_32f(m_pSpec, dstRoiOffset, dstRoiSize, &srcRoiOffset, &srcRoiSize);
        CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetSrcRoi_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
        
        // adjust input and output buffers to current ROI
        pSrcPtr = (Ipp32f*)pSrcImage + ((srcRoiOffset.y * srcRoiOffset.x));  // seg fault: * header.srcStep
        
        pDstPtr = (Ipp32f*)pDstImage + (dstRoiOffset.y * dstRoiOffset.x);
        
        if(!pExtBuffer){
           
            ippSts = ippiResizeGetBufferSize_32f(m_pSpec, dstRoiSize, header.nChannels, &iBufferSize);
            CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetBufferSize_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
            
            pBuffer = ippsMalloc_8u(iBufferSize);
            if(!pBuffer){
                printf("\nCannot allocate memory for resize buffer");
                return STS_ERR_ALLOC;
            }
        }
        else pBuffer = pExtBuffer;
        
        // perform resize
        if(m_interpolation == ippCubic)
        {
            
            if(header.nChannels == 1)
                ippSts = ippiResizeCubic_32f_C1R(pSrcPtr, (int)header.srcStep, pDstPtr, (int)header.dstStep, dstRoiOffset, dstRoiSize, border, 0, m_pSpec, pBuffer);
            else if(header.nChannels == 3)
                ippSts = ippiResizeCubic_32f_C3R(pSrcPtr, (int)header.srcStep, pDstPtr, (int)header.dstStep, dstRoiOffset, dstRoiSize, border, 0, m_pSpec, pBuffer);
            else if(header.nChannels == 4)
                ippSts = ippiResizeCubic_32f_C4R(pSrcPtr, (int)header.srcStep, pDstPtr, (int)header.dstStep, dstRoiOffset, dstRoiSize, border, 0, m_pSpec, pBuffer);
            
        }
       
        CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeX_32f_CXR()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
        if(!pExtBuffer)ippsFree(pBuffer);
        
        return STS_OK;
    }
    
    virtual Status ResizeImage(Ipp32f *pSrcImage, Ipp32f *pDstImage)
    {
        
        Rect roi(header.DstW, header.DstH);
        
        return ResizeBlock(pSrcImage, pDstImage, roi, ippBorderRepl);
    }
    
    
public:
    unsigned int m_iThreads;
    float        m_fBVal;
    float        m_fCVal;
    IppiInterpolationType m_interpolation;
    
protected:
    Ipp32f m_templ;
    
     // initializes the IppiResizeSpec_32f structure for the resize operation
    IppiResizeSpec_32f *m_pSpec;
    
    unsigned char  *m_pInitBuffer;
    
};

#ifdef USE_TBB
class ResizeTBB : public Resize
{
public:
    ResizeTBB(): m_scheduler(task_scheduler_init::deferred)
    {
        m_iGrainX  = 0;
        m_iGrainY  = 0;
    }
    
    Status Init(Ipp32f *pSrcImage, Ipp32f *pDstImage)
    {
        Status status;
        
        status = Resize::Init(pSrcImage, pDstImage);
        CHECK_STATUS_PRINT_RS(status, "Resize::Init()", GetBaseStatusString(status));
        
        if(m_iThreads == 0)         // automatic threads number
        {
            m_iThreads = task_scheduler_init::default_num_threads();
            m_scheduler.initialize(m_iThreads);
        }
        else m_scheduler.initialize(m_iThreads);  // specific threads number
        
        if(!m_iGrainX)m_iGrainX = (unsigned int)(header.DstW + m_iThreads - 1)/m_iThreads;
        if(!m_iGrainY)m_iGrainY = (unsigned int)(header.DstH + m_iThreads - 1)/m_iThreads;
        
        m_task.m_pSrcData = pSrcImage;
        m_task.m_pDstData = pDstImage;
        m_task.m_pResize  = this;
        
        return STS_OK;
    }
    
    Status ResizeImage(Ipp32f *pSrcImage, Ipp32f *pDstImage)
    {
        
        blocked_range2d<unsigned int, unsigned int> tbbRange(0, (unsigned int)header.DstH, m_iGrainY, 0, (unsigned int)header.DstW, m_iGrainX);
        
        try
        {
            parallel_for(tbbRange, m_task, m_part_auto);
        }
        catch(Status status)
        {
            return status;
        }
        
        return STS_OK;
    }
    
private:
    class ResizeTBBTask
    {
    public:
        ResizeTBBTask() {};
        
        void operator()(blocked_range2d<unsigned int, unsigned int> &r) const
        {
            IppStatus ippSts;
            Status    status;
            Rect      roi( (int)r.cols().begin(), (int)r.rows().begin(), r.cols().end() - r.cols().begin(), r.rows().end() - r.rows().begin() );
            IppiSize  dstRoiSize  = {(int)roi.width, (int)roi.height};
            IppiBorderType border = ippBorderRepl;
            
            unsigned char *pBuffer = 0;

            int iBufferSize;

            ippSts = ippiResizeGetBufferSize_32f(m_pResize->m_pSpec, dstRoiSize, header.nChannels, &iBufferSize);  // 1, 3, 4

            CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetBufferSize_32f()", ippGetStatusString(ippSts), throw(STS_ERR_FAILED));
            
            pBuffer = (unsigned char*)m_pResize->m_memPool.malloc(iBufferSize);
            
            if(!pBuffer)
            {
                printf("\nCannot allocate memory for Resize pBuffer");
                throw(STS_ERR_ALLOC);
            }
            
            status = m_pResize->ResizeBlock(m_pSrcData, m_pDstData, roi, border, pBuffer);
            if(status != STS_OK)throw(status);
            
            m_pResize->m_memPool.free(pBuffer);
        }
        
        ResizeTBB  *m_pResize;
        Ipp32f     *m_pSrcData;
        Ipp32f     *m_pDstData;
    };
    
public:
    unsigned int m_iGrainX;
    unsigned int m_iGrainY;
    
private:
    ResizeTBBTask       m_task;
    task_scheduler_init m_scheduler;
    auto_partitioner    m_part_auto;
    memory_pool< scalable_allocator<unsigned char> > m_memPool;
    
};
#endif

float* decimate_tbb(float* srcData, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel, uint8_t interp)
{
    
    // Variables initialization
#if defined(USE_TBB)                // compile option yes
    unsigned int iThreads       = 0;
#else
    unsigned int iThreads       = 1;
#endif
    
    Status       status         = STS_OK;
    char*        sIppCpu        = 0;
    
    IppiInterpolationType interpolation;
    Resize *pResize = 0;
    
    header.SrcW = (int)nSrcW;
    header.SrcH = (int)nSrcH;
    header.DstW = (int)nDstW;
    header.DstH = (int)nDstH;
    
    IppiSize srcSizeM = {header.SrcW, header.SrcH};
    IppiSize dstSizeM = {header.DstW, header.DstH};
    
    header.srcSize = srcSizeM;
    header.dstSize = dstSizeM;
    header.nChannels = (Ipp8u)byteperpixel;
    
    Ipp32f *pSrc = NULL;
    Ipp32f *pDst = NULL;
    
    int srcStepM;
    int dstStepM;
    
    if (header.nChannels == 1){
        pSrc = ippiMalloc_32f_C1(header.SrcW, header.SrcH, &srcStepM);
        ippiCopy_32f_C1R(srcData, header.SrcW * header.nChannels * sizeof(Ipp32f), pSrc, srcStepM, header.srcSize);
        pDst = ippiMalloc_32f_C1(header.DstW, header.DstH, &dstStepM);
    }
    else if (header.nChannels == 3){
        pSrc = ippiMalloc_32f_C3(header.SrcW, header.SrcH, &srcStepM);
        ippiCopy_32f_C3R(srcData, header.SrcW * header.nChannels * sizeof(Ipp32f), pSrc, srcStepM, header.srcSize);
        pDst = ippiMalloc_32f_C3(header.DstW, header.DstH, &dstStepM);
    }
    else if (header.nChannels == 4){
        pSrc = ippiMalloc_32f_C4(header.SrcW, header.SrcH, &srcStepM);
        ippiCopy_32f_C4R(srcData, header.SrcW * header.nChannels * sizeof(Ipp32f), pSrc, srcStepM, header.srcSize);
        pDst = ippiMalloc_32f_C4(header.DstW, header.DstH, &dstStepM);
    }
    
    header.srcStep = srcStepM;
    header.dstStep = dstStepM;


    InitPreferredCpu(sIppCpu);
    
    for(;;)
    {
        
       //  iThreads = 0;   // TBB - IMAGE CORRUPTED
           iThreads = 1;       // SINGLE THREADED - IMAGE OK
        
        if(iThreads == 1)
        {
            printf("\nSequential resize\n");
            pResize = new Resize;
            if(!pResize){
                printf("\nFailed to allocate sequential resize class");
                exit(1);
            }
        }               // iThreads == 1
#ifdef USE_TBB
        else
        {
            printf("\nTBB resize\n");
            pResize = new ResizeTBB;
            if(!pResize){
                printf("\nFailed to allocate Intel TBB resize class");
                exit(1);
            }
            pResize->m_iThreads = iThreads;
        }           // USE_TBB
#endif
        
     
            interpolation = ippCubic;
            // Cubic specific values
            float  fBVal   = 1;
            float  fCVal   = 0;
            // pre-init
            pResize->m_fBVal         = fBVal;
            pResize->m_fCVal         = fCVal;
            printf("\nInterpolation : Cubic (B = %.2f; C = %.2f)\n", fBVal, fCVal);


        // pre-init
        pResize->m_interpolation = interpolation;
        
        status = pResize->Init(pSrc, pDst);
        CHECK_STATUS_PRINT_BR(status, "Resize::Init()", GetBaseStatusString(status));
        
        if(iThreads != 1){
            iThreads = pResize->m_iThreads;
            printf("Threads: %d\n", iThreads);
        }
        
        status = pResize->ResizeImage(pSrc, pDst);
        CHECK_STATUS_PRINT_BR(status, "Resize::ResizeImage()", GetBaseStatusString(status));
        if(status < 0) break;

        break;
    }
    
    uint64_t pixels = (uint64_t)nDstW * nDstH * header.nChannels;
    float                *finalbuff,*tmp_w;
    
    if ((finalbuff=(float *)malloc(pixels *(sizeof(float))))==NULL){
        printf("\nError : not enough memory to allocate pixels in texture_color - exiting...\n");
        exit(1);
    }
    
    tmp_w = finalbuff;
    int dstep = dstStepM >> 2;
    
    for (int h=0;h < nDstH;h++){
        for (int w=0; w < nDstW * header.nChannels; w++)*tmp_w++ = pDst[h * dstep + w];
    }
    
    delete pResize;
    ippFree(pSrc);
    ippFree(pDst);
    
    return finalbuff;
}

 

0 Kudos
2 Replies
Andrey_B_Intel
Employee
854 Views

Hi Ian!

Could you please replace these lines in code

pSrcPtr = (Ipp32f*)pSrcImage + ((srcRoiOffset.y * srcRoiOffset.x));  // seg fault: * header.srcStep
        
        pDstPtr = (Ipp32f*)pDstImage + (dstRoiOffset.y * dstRoiOffset.x); 

with

        pSrcPtr = (Ipp32f*)((Ipp8u*)pSrcImage + srcRoiOffset.y*header.srcStep+ srcRoiOffset.x*header.nChannels*sizeof(Ipp32f));  // seg fault: * header.srcStep
        pDstPtr = (Ipp32f*)((Ipp8u*)pDstImage + dstRoiOffset.y*header.dstStep+ dstRoiOffset.x*header.nChannels*sizeof(Ipp32f));

I hope it will help.

Thanks for your feedback!

 

0 Kudos
spence__ian
Beginner
854 Views

Hi Andrey,

Yes, you absolutely nailed it. Thank you very much for your help with this. I did try different things here, but ran into Seg 11 faults, clearly failing to attain your logic.

Thank you

Ian

0 Kudos
Reply