- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
I'm trying to make an Intel IPP example compatible with 32bit float source images. As the Intel example's read/write library does not support this, the main() has been replaced by an external function routine : decimate_tbb(), supplying "float* srcData" (2D array) as the first parameter, and returning float *finalbuff (2D array).
The code below works perfectly for float sources with 1 (YY), 3 (RGBRGB), and 4 (RGBARGBA) channels when run SINGLE-THREADED (iThreads = 1). The multi-threaded TBB version (iThreads = 0) gives a corrupted result on an 8-thread CPU. Any ideas would be most welcome. Thanks in advance.
Ian,
Compiles : OK -
clang++ -std=c++11 -O2 -DUSE_TBB -c src//ipp_resize_mt_tbb.cpp -o obj//ipp_resize_mt_tbb.o -I $TBBROOT/include -I $IPPROOT/include -I $IPPROOT/components/common/include
Link : OK, with external main() not supplied here -
clang++ -L $TBBROOT/lib -l tbb -l tbbmalloc -L $IPPROOT/lib -lcommon -lippi -lipps -lippcore -stdlib=libc++ -o bin//ipp_resize_mt_tbb obj//ipp_resize_mt_tbb.o
/*
ipp_resize_mt.cpp - REVISED
Works for iThreads = 1; iThreads > 1 image is corrupt
// *******************************************************************************
* Copyright 2012-2019 Intel Corporation.
*
* This software and the related documents are Intel copyrighted materials, and
* your use of them is governed by the express license under which they were
* provided to you (License). Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute, disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents are provided as is, with no express
* or implied warranties, other than those that are expressly stated in the
* License.
// *******************************************************************************
*/
#include <math.h>
#include <memory>
#include "base_ipp.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/ipp/include/ippcore.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/ipp/include/ipps.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/ipp/include/ippi.h"
#ifdef USE_TBB
//#define __TBB_NO_IMPLICIT_LINKAGE 0
#define TBB_PREVIEW_MEMORY_POOL 1
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/task_scheduler_init.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/parallel_for.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/blocked_range2d.h"
#include "/opt/intel/compilers_and_libraries_2019.5.281/mac/tbb/include/tbb/memory_pool.h"
using namespace tbb;
#endif
// Added image source / destination specs.
// A single file-scope instance, `header`, is filled in by decimate_tbb() and
// read by the Resize / ResizeTBB classes instead of a per-image descriptor.
struct image_specsStruct{
int SrcW; // source width, copied from decimate_tbb()'s nSrcW
int SrcH; // source height, copied from decimate_tbb()'s nSrcH
int DstW; // destination width, copied from decimate_tbb()'s nDstW
int DstH; // destination height, copied from decimate_tbb()'s nDstH
IppiSize srcSize; // {SrcW, SrcH}; passed to ippiResizeGetSize_32f / ippiResizeCubicInit_32f
IppiSize dstSize; // {DstW, DstH}; passed to ippiResizeGetSize_32f / ippiResizeCubicInit_32f
Ipp32s srcStep; // source row step as returned by ippiMalloc_32f_C*; passed as srcStep to ippiResizeCubic_32f_C*R
Ipp32s dstStep; // destination row step as returned by ippiMalloc_32f_C*; passed as dstStep to ippiResizeCubic_32f_C*R
Ipp8u nChannels; // interleaved channel count; the code in this file handles 1, 3 and 4
}header;
class Resize
{
public:
Resize()
{
m_iThreads = 0;
m_interpolation = ippCubic; // default
m_pSpec = 0;
m_pInitBuffer = 0;
m_fBVal = 1;
m_fCVal = 0;
m_iLobes = 3;
}
virtual ~Resize()
{
Close();
}
void Close()
{
if(m_pSpec)
{
ippsFree(m_pSpec);
m_pSpec = 0;
}
if(m_pInitBuffer)
{
ippsFree(m_pInitBuffer);
m_pInitBuffer = 0;
}
}
virtual Status Init(Ipp32f *pSrcImage, Ipp32f *pDstImage)
{
IppStatus ippSts;
IppiBorderSize borderSize;
Ipp32s iSpecSize = 0;
Ipp32s iInitSize = 0;
if(!pSrcImage || !pDstImage)return STS_ERR_NULL_PTR;
Close();
ippSts = ippiResizeGetSize_32f(header.srcSize, header.dstSize, m_interpolation, 0, &iSpecSize, &iInitSize);
CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetSize_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
// allocate internal buffer
m_pSpec = (IppiResizeSpec_32f*)ippsMalloc_8u(iSpecSize);
if(!m_pSpec){
printf("\nCannot allocate memory for resize m_pSpec");
return STS_ERR_ALLOC;
}
// init ipp resizer
if(m_interpolation == ippCubic){
// allocate initialization buffer. external buffer
if(iInitSize){
m_pInitBuffer = ippsMalloc_8u(iInitSize);
if(!m_pInitBuffer){
printf("\nCannot allocate memory for resize m_pInitBuffer");
return STS_ERR_ALLOC;
}
}
ippSts = ippiResizeCubicInit_32f(header.srcSize, header.dstSize, m_fBVal, m_fCVal, m_pSpec, m_pInitBuffer);
CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeCubicInit_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
ippsFree(m_pInitBuffer);
m_pInitBuffer = NULL;
}
ippSts = ippiResizeGetBorderSize_32f(m_pSpec, &borderSize);
CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetBorderSize_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
m_templ = *pSrcImage;
return STS_OK;
}
Status ResizeBlock(Ipp32f *pSrcImage, Ipp32f *pDstImage, Rect roi, IppiBorderType border, unsigned char *pExtBuffer = 0)
{
Status status;
IppStatus ippSts;
IppiPoint dstRoiOffset = {(int)roi.x, (int)roi.y}; // full image, no parallel dstOffset = {0, 0};
IppiSize dstRoiSize = {(int)roi.width, (int)roi.height};
IppiPoint srcRoiOffset;
IppiSize srcRoiSize;
Ipp32f *pSrcPtr = 0;
Ipp32f *pDstPtr = 0;
unsigned char *pBuffer = 0;
int iBufferSize = 0;
if(!pSrcImage || !pDstImage)return STS_ERR_NULL_PTR;
if(!m_pSpec)return STS_ERR_NOT_INITIALIZED;
// Zero size means full size
if(!dstRoiSize.width)dstRoiSize.width = (int)header.DstW;
if(!dstRoiSize.height)dstRoiSize.height = (int)header.DstH;
if(m_templ != *pSrcImage){
status = Init(pSrcImage, pDstImage);
CHECK_STATUS_PRINT_RS(status, "Resize::Init()", GetBaseStatusString(status));
}
ippSts = ippiResizeGetSrcRoi_32f(m_pSpec, dstRoiOffset, dstRoiSize, &srcRoiOffset, &srcRoiSize);
CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetSrcRoi_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
// adjust input and output buffers to current ROI
pSrcPtr = (Ipp32f*)pSrcImage + ((srcRoiOffset.y * srcRoiOffset.x)); // seg fault: * header.srcStep
pDstPtr = (Ipp32f*)pDstImage + (dstRoiOffset.y * dstRoiOffset.x);
if(!pExtBuffer){
ippSts = ippiResizeGetBufferSize_32f(m_pSpec, dstRoiSize, header.nChannels, &iBufferSize);
CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetBufferSize_32f()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
pBuffer = ippsMalloc_8u(iBufferSize);
if(!pBuffer){
printf("\nCannot allocate memory for resize buffer");
return STS_ERR_ALLOC;
}
}
else pBuffer = pExtBuffer;
// perform resize
if(m_interpolation == ippCubic)
{
if(header.nChannels == 1)
ippSts = ippiResizeCubic_32f_C1R(pSrcPtr, (int)header.srcStep, pDstPtr, (int)header.dstStep, dstRoiOffset, dstRoiSize, border, 0, m_pSpec, pBuffer);
else if(header.nChannels == 3)
ippSts = ippiResizeCubic_32f_C3R(pSrcPtr, (int)header.srcStep, pDstPtr, (int)header.dstStep, dstRoiOffset, dstRoiSize, border, 0, m_pSpec, pBuffer);
else if(header.nChannels == 4)
ippSts = ippiResizeCubic_32f_C4R(pSrcPtr, (int)header.srcStep, pDstPtr, (int)header.dstStep, dstRoiOffset, dstRoiSize, border, 0, m_pSpec, pBuffer);
}
CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeX_32f_CXR()", ippGetStatusString(ippSts), return STS_ERR_FAILED);
if(!pExtBuffer)ippsFree(pBuffer);
return STS_OK;
}
virtual Status ResizeImage(Ipp32f *pSrcImage, Ipp32f *pDstImage)
{
Rect roi(header.DstW, header.DstH);
return ResizeBlock(pSrcImage, pDstImage, roi, ippBorderRepl);
}
public:
unsigned int m_iThreads;
float m_fBVal;
float m_fCVal;
IppiInterpolationType m_interpolation;
protected:
Ipp32f m_templ;
// initializes the IppiResizeSpec_32f structure for the resize operation
IppiResizeSpec_32f *m_pSpec;
unsigned char *m_pInitBuffer;
};
#ifdef USE_TBB
// TBB-parallel variant of Resize: the destination image is split into 2D tiles
// and each tile is resized by Resize::ResizeBlock() on a TBB worker.
class ResizeTBB : public Resize
{
public:
    ResizeTBB(): m_scheduler(task_scheduler_init::deferred)
    {
        m_iGrainX = 0;
        m_iGrainY = 0;
    }

    // Initializes the base resizer, starts the TBB scheduler and derives default
    // tile sizes.
    //
    // m_iThreads == 0 selects task_scheduler_init::default_num_threads().
    // m_iGrainX / m_iGrainY are only computed when still 0, so callers may preset them.
    // Returns STS_OK or the error reported by Resize::Init().
    Status Init(Ipp32f *pSrcImage, Ipp32f *pDstImage) override
    {
        Status status;

        status = Resize::Init(pSrcImage, pDstImage);
        CHECK_STATUS_PRINT_RS(status, "Resize::Init()", GetBaseStatusString(status));

        if(m_iThreads == 0) // automatic threads number
            m_iThreads = task_scheduler_init::default_num_threads();

        // Init() can run more than once (ResizeBlock() re-invokes it virtually);
        // initializing an already active scheduler is an error, so do it only once.
        if(!m_scheduler.is_active())
            m_scheduler.initialize(m_iThreads);

        if(!m_iGrainX)m_iGrainX = (unsigned int)(header.DstW + m_iThreads - 1)/m_iThreads;
        if(!m_iGrainY)m_iGrainY = (unsigned int)(header.DstH + m_iThreads - 1)/m_iThreads;

        m_task.m_pSrcData = pSrcImage;
        m_task.m_pDstData = pDstImage;
        m_task.m_pResize = this;
        return STS_OK;
    }

    // Resizes the whole image by running the tile task over the destination range.
    //
    // Returns STS_OK on success, STS_ERR_NULL_PTR for null images,
    // STS_ERR_NOT_INITIALIZED when Init() has not succeeded, or the Status thrown
    // by a failing tile.
    Status ResizeImage(Ipp32f *pSrcImage, Ipp32f *pDstImage) override
    {
        if(!pSrcImage || !pDstImage)return STS_ERR_NULL_PTR;
        // A zero grain size is invalid for blocked_range2d, and the task needs the spec.
        if(!m_pSpec || !m_iGrainX || !m_iGrainY)return STS_ERR_NOT_INITIALIZED;

        // Use the images passed to this call rather than whatever Init() saw.
        m_task.m_pSrcData = pSrcImage;
        m_task.m_pDstData = pDstImage;
        m_task.m_pResize = this;

        blocked_range2d<unsigned int, unsigned int> tbbRange(0, (unsigned int)header.DstH, m_iGrainY, 0, (unsigned int)header.DstW, m_iGrainX);

        try
        {
            parallel_for(tbbRange, m_task, m_part_auto);
        }
        catch(Status status)
        {
            return status;
        }
        return STS_OK;
    }

private:
    // parallel_for body: resizes the destination tile described by the range.
    class ResizeTBBTask
    {
    public:
        ResizeTBBTask() {};

        // Throws a Status value on failure; ResizeTBB::ResizeImage() catches it.
        void operator()(blocked_range2d<unsigned int, unsigned int> &r) const
        {
            IppStatus ippSts;
            Status status;
            Rect roi( (int)r.cols().begin(), (int)r.rows().begin(), r.cols().end() - r.cols().begin(), r.rows().end() - r.rows().begin() );
            IppiSize dstRoiSize = {(int)roi.width, (int)roi.height};
            IppiBorderType border = ippBorderRepl;
            unsigned char *pBuffer = 0;
            int iBufferSize;

            ippSts = ippiResizeGetBufferSize_32f(m_pResize->m_pSpec, dstRoiSize, header.nChannels, &iBufferSize); // 1, 3, 4
            CHECK_STATUS_PRINT_AC(ippSts, "ippiResizeGetBufferSize_32f()", ippGetStatusString(ippSts), throw(STS_ERR_FAILED));

            pBuffer = (unsigned char*)m_pResize->m_memPool.malloc(iBufferSize);
            if(!pBuffer)
            {
                printf("\nCannot allocate memory for Resize pBuffer");
                throw(STS_ERR_ALLOC);
            }

            status = m_pResize->ResizeBlock(m_pSrcData, m_pDstData, roi, border, pBuffer);

            // Return the work buffer to the pool before reporting a failure so it
            // is not leaked when the tile throws.
            m_pResize->m_memPool.free(pBuffer);

            if(status != STS_OK)throw(status);
        }

        ResizeTBB *m_pResize;
        Ipp32f *m_pSrcData;
        Ipp32f *m_pDstData;
    };

public:
    unsigned int m_iGrainX; // tile width grain; 0 = derive from the thread count in Init()
    unsigned int m_iGrainY; // tile height grain; 0 = derive from the thread count in Init()

private:
    ResizeTBBTask m_task;
    task_scheduler_init m_scheduler;
    auto_partitioner m_part_auto;
    memory_pool< scalable_allocator<unsigned char> > m_memPool;
};
#endif
float* decimate_tbb(float* srcData, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel, uint8_t interp)
{
// Variables initialization
#if defined(USE_TBB) // compile option yes
unsigned int iThreads = 0;
#else
unsigned int iThreads = 1;
#endif
Status status = STS_OK;
char* sIppCpu = 0;
IppiInterpolationType interpolation;
Resize *pResize = 0;
header.SrcW = (int)nSrcW;
header.SrcH = (int)nSrcH;
header.DstW = (int)nDstW;
header.DstH = (int)nDstH;
IppiSize srcSizeM = {header.SrcW, header.SrcH};
IppiSize dstSizeM = {header.DstW, header.DstH};
header.srcSize = srcSizeM;
header.dstSize = dstSizeM;
header.nChannels = (Ipp8u)byteperpixel;
Ipp32f *pSrc = NULL;
Ipp32f *pDst = NULL;
int srcStepM;
int dstStepM;
if (header.nChannels == 1){
pSrc = ippiMalloc_32f_C1(header.SrcW, header.SrcH, &srcStepM);
ippiCopy_32f_C1R(srcData, header.SrcW * header.nChannels * sizeof(Ipp32f), pSrc, srcStepM, header.srcSize);
pDst = ippiMalloc_32f_C1(header.DstW, header.DstH, &dstStepM);
}
else if (header.nChannels == 3){
pSrc = ippiMalloc_32f_C3(header.SrcW, header.SrcH, &srcStepM);
ippiCopy_32f_C3R(srcData, header.SrcW * header.nChannels * sizeof(Ipp32f), pSrc, srcStepM, header.srcSize);
pDst = ippiMalloc_32f_C3(header.DstW, header.DstH, &dstStepM);
}
else if (header.nChannels == 4){
pSrc = ippiMalloc_32f_C4(header.SrcW, header.SrcH, &srcStepM);
ippiCopy_32f_C4R(srcData, header.SrcW * header.nChannels * sizeof(Ipp32f), pSrc, srcStepM, header.srcSize);
pDst = ippiMalloc_32f_C4(header.DstW, header.DstH, &dstStepM);
}
header.srcStep = srcStepM;
header.dstStep = dstStepM;
InitPreferredCpu(sIppCpu);
for(;;)
{
// iThreads = 0; // TBB - IMAGE CORRUPTED
iThreads = 1; // SINGLE THREADED - IMAGE OK
if(iThreads == 1)
{
printf("\nSequential resize\n");
pResize = new Resize;
if(!pResize){
printf("\nFailed to allocate sequential resize class");
exit(1);
}
} // iThreads == 1
#ifdef USE_TBB
else
{
printf("\nTBB resize\n");
pResize = new ResizeTBB;
if(!pResize){
printf("\nFailed to allocate Intel TBB resize class");
exit(1);
}
pResize->m_iThreads = iThreads;
} // USE_TBB
#endif
interpolation = ippCubic;
// Cubic specific values
float fBVal = 1;
float fCVal = 0;
// pre-init
pResize->m_fBVal = fBVal;
pResize->m_fCVal = fCVal;
printf("\nInterpolation : Cubic (B = %.2f; C = %.2f)\n", fBVal, fCVal);
// pre-init
pResize->m_interpolation = interpolation;
status = pResize->Init(pSrc, pDst);
CHECK_STATUS_PRINT_BR(status, "Resize::Init()", GetBaseStatusString(status));
if(iThreads != 1){
iThreads = pResize->m_iThreads;
printf("Threads: %d\n", iThreads);
}
status = pResize->ResizeImage(pSrc, pDst);
CHECK_STATUS_PRINT_BR(status, "Resize::ResizeImage()", GetBaseStatusString(status));
if(status < 0) break;
break;
}
uint64_t pixels = (uint64_t)nDstW * nDstH * header.nChannels;
float *finalbuff,*tmp_w;
if ((finalbuff=(float *)malloc(pixels *(sizeof(float))))==NULL){
printf("\nError : not enough memory to allocate pixels in texture_color - exiting...\n");
exit(1);
}
tmp_w = finalbuff;
int dstep = dstStepM >> 2;
for (int h=0;h < nDstH;h++){
for (int w=0; w < nDstW * header.nChannels; w++)*tmp_w++ = pDst[h * dstep + w];
}
delete pResize;
ippFree(pSrc);
ippFree(pDst);
return finalbuff;
}
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Ian!
Could you please replace these lines in code
pSrcPtr = (Ipp32f*)pSrcImage + ((srcRoiOffset.y * srcRoiOffset.x)); // seg fault: * header.srcStep
pDstPtr = (Ipp32f*)pDstImage + (dstRoiOffset.y * dstRoiOffset.x);
with
pSrcPtr = (Ipp32f*)((Ipp8u*)pSrcImage + srcRoiOffset.y*header.srcStep + srcRoiOffset.x*header.nChannels*sizeof(Ipp32f)); // seg fault: * header.srcStep
pDstPtr = (Ipp32f*)((Ipp8u*)pDstImage + dstRoiOffset.y*header.dstStep + dstRoiOffset.x*header.nChannels*sizeof(Ipp32f));
I hope it will help.
Thanks for your feedback!
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Andrey,
Yes, you absolutely nailed it. Thank you very much for your help with this. I did try different things here, but ran into Seg 11 faults, clearly failing to attain your logic.
Thank you
Ian
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page