Re: Clearly shows step size calculation issue. This code seems

noemata · ‎04-10-2009

// ippEx1.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

#include
#include
#include

#pragma comment(lib, "gdiplus")

#include "ipp.h"

#pragma comment(lib, "ippcore.lib")
#pragma comment(lib, "ippi.lib")

using namespace Gdiplus;

class GdiplusInit
{
public:
GdiplusInit()
{
GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, NULL);
}
~GdiplusInit()
{
GdiplusShutdown(gdiplusToken);
}
protected:
private:
GdiplusStartupInput gdiplusStartupInput;
ULONG_PTR gdiplusToken;
};

GdiplusInit InitGDIPlus;

int GetEncoderClsid(const WCHAR* format, CLSID* pClsid)
{
UINT num = 0; // number of image encoders
UINT size = 0; // size of the image encoder array
// in bytes
Gdiplus::ImageCodecInfo* pImageCodecInfo = NULL;

Gdiplus::GetImageEncodersSize(&num, &size);
if(size == 0)
return -1;

pImageCodecInfo = (Gdiplus::ImageCodecInfo*)(malloc(size));
if(pImageCodecInfo == NULL)
return -1;

GetImageEncoders(num, size, pImageCodecInfo);

for(UINT j = 0; j < num; ++j)
{
if( wcscmp(pImageCodecInfo.MimeType, format) == 0 )
{
*pClsid = pImageCodecInfo.Clsid;
free(pImageCodecInfo);
return j; // Success
}
}

free(pImageCodecInfo);
return -1;
}

// Note: this only works for 1 byte wide bitmaps.
void showimg(const Ipp8u *pSrc, IppiSize srcSize)
{
printf("\n");

for ( int y = 0; y < srcSize.height; y++ )
{
for ( int x = 0; x < srcSize.width; x++ )
printf("%03d ", pSrc[(y * srcSize.width) + x]);

printf("\n");
}
}

IppStatus GaussBmp(Ipp8u *pSrc, IppiSize srcSize, int nChannels, IppiMaskSize op)
{
int nPad, step;
IppiSize padSize;
Ipp8u *pad, *flt;
IppStatus status = ippStsNoMemErr;

// Set mask operation. Intel IPP docs do not provide an adequate explanation of border value condition handling.
// Overall, the docs are riddled with mistakes, poor grammar and incomplete explanations.
if ( op == ippMskSize3x3 )
{
nPad = 3;
}
else if ( op == ippMskSize5x5 )
{
nPad = 5; // Crashes when set to 5. Why???
}
else
return ippStsNotSupportedModeErr;

// Calculate padded and filtered bitmap sizes.
padSize.width = srcSize.width + (2 * nPad);
padSize.height = srcSize.height + (2 * nPad);

// Allocate padded bitmap that will contain replicated border of source bitmap and corresponding filter target bitmap.
if (nChannels == 1)
{
pad = ippiMalloc_8u_C1(padSize.width, padSize.height, &step);
if ( !pad )
goto Fault;
flt = ippiMalloc_8u_C1(padSize.width, padSize.height, &step);
if ( !flt )
goto Fault;
}
else if (nChannels == 3)
{
pad = ippiMalloc_8u_C3(padSize.width, padSize.height, &step);
if ( !pad )
goto Fault;
flt = ippiMalloc_8u_C3(padSize.width, padSize.height, &step);
if ( !flt )
goto Fault;
}
else if (nChannels == 4)
{
pad = ippiMalloc_8u_AC4(padSize.width, padSize.height, &step);
if ( !pad )
goto Fault;
flt = ippiMalloc_8u_AC4(padSize.width, padSize.height, &step);
if ( !flt )
goto Fault;
}
else
return ippStsNotSupportedModeErr;

// Note: Intel IPP seems to incorrectly caclulate the step value!!! Why???
step = padSize.width * nChannels;

if (nChannels == 1)
{
status = ippiCopyReplicateBorder_8u_C1R(pSrc, srcSize.width * nChannels, srcSize, pad, step, padSize, nPad, nPad);
}
else if (nChannels == 3)
{
status = ippiCopyReplicateBorder_8u_C3R(pSrc, srcSize.width * nChannels, srcSize, pad, step, padSize, nPad, nPad);
}
else if (nChannels == 4)
{
status = ippiCopyReplicateBorder_8u_AC4R(pSrc, srcSize.width * nChannels, srcSize, pad, step, padSize, nPad, nPad);
}

if ( status )
goto Fault;

// If we do not increase value of buffersize calculated by ippiResizeSqrPixelGetBufSize, we crash ...
// Depending on image size, Intel IPP crashes here!!! Why??? Crashes are also random, in other words, not every time for same operation!!!!!
if (nChannels == 1)
{
status = ippiFilterGauss_8u_C1R(pad, step, flt, step, padSize, op);
}
else if (nChannels == 3)
{
status = ippiFilterGauss_8u_C3R(pad, step, flt, step, padSize, op);
}
else if (nChannels == 4)
{
status = ippiFilterGauss_8u_AC4R(pad, step, flt, step, padSize, op);
}

if ( status )
goto Fault;

// Copy filtered bitmap back to source. We need to set the row and colomn of the filtered
// bitmap so that it starts at the offset that corresponds to our border padding value.
if (nChannels == 1)
{
status = ippiCopy_8u_C1R(&flt[(padSize.width * nPad * nChannels) + (nPad * nChannels)], step, pSrc, srcSize.width * nChannels, srcSize);
}
else if (nChannels == 3)
{
status = ippiCopy_8u_C3R(&flt[(padSize.width * nPad * nChannels) + (nPad * nChannels)], step, pSrc, srcSize.width * nChannels, srcSize);
}
else if (nChannels == 4)
{
status = ippiCopy_8u_AC4R(&flt[(padSize.width * nPad * nChannels) + (nPad * nChannels)], step, pSrc, srcSize.width * nChannels, srcSize);
}

Fault:
if ( pad )
ippiFree(pad);
if ( flt )
ippiFree(flt);

return status;
}

int _tmain(int argc, _TCHAR* argv[])
{
const IppLibraryVersion* ippVersion = ippiGetLibVersion();

printf("IPP: [ %s %s ]", ippVersion->Name, ippVersion->Version);

Ipp8u src1[8*4] =
{3, 3, 3, 3, 3, 8, 8, 8,
3, 2, 1, 2, 3, 8, 8, 8,
3, 2, 1, 2, 3, 8, 8, 8,
3, 3, 3, 3, 3, 8, 8, 8};

IppiSize src1Size = { 8, 4 };

Ipp8u src2[8*5] =
{1, 1, 1, 1, 1, 1, 1, 1,
1, 4, 4, 4, 4, 4, 4, 1,
1, 4, 6, 6, 6, 6, 4, 1,
1, 4, 4, 4, 4, 4, 4, 1,
1, 1, 1, 1, 1, 1, 1, 1};

IppiSize src2Size = { 8, 5 };

showimg(src1, src1Size);

GaussBmp(src1, src1Size, 1, ippMskSize3x3);

showimg(src1, src1Size);

GaussBmp(src1, src1Size, 1, ippMskSize5x5);

showimg(src1, src1Size);

showimg(src2, src2Size);

GaussBmp(src2, src2Size, 1, ippMskSize5x5);

showimg(src2, src2Size);

return 0;
}

Vladimir_Dudnik · ‎04-10-2009

Agree, there isan issue in step size calculation.The line step = padSize.width * nChannels is wrong in given context.

Please spent some time for product documentation which might be not ideal one but still captured the basic IPP concepts very well. For example, from ippiMalloc function description:

The function ippiMalloc is declared in the ippi.h file. This function allocates a memory block aligned to a 32-byte boundary for elements of different data types. Every line of the image is aligned by padding with zeros in accordance with the pStepBytes parameter, which is calculated by the ippiMalloc function and returned for further use.

Regards,
Vladimir

noemata · ‎04-11-2009

Quoting - Vladimir Dudnik (Intel)

Agree, there isan issue in step size calculation.The line step = padSize.width * nChannels is wrong in given context.

Please spent some time for product documentation which might be not ideal one but still captured the basic IPP concepts very well. For example, from ippiMalloc function description:

The function ippiMalloc is declared in the ippi.h file. This function allocates a memory block aligned to a 32-byte boundary for elements of different data types. Every line of the image is aligned by padding with zeros in accordance with the pStepBytes parameter, which is calculated by the ippiMalloc function and returned for further use.

Regards,
Vladimir

Had you taken the time to try out the code, you would have noticed that what you identified as the problem is what makes this code work. The real problem is the step size returned by IPP. Try it. See what happens. And don't use the beta ... use the product your customers are using. You'll be unpleasantly surprised by the result if you use the IPP calculated step value. And you'll realize the somewhat horrific ramifications for those trying to use the API. Small bitmaps will either crash your app, or be completely WRONG!

This sample, as is, works. Some of the code is incorrect for other use cases. But that's a different story that I addressed in other posts here.

Vladimir_Dudnik · ‎04-13-2009

Yes, I look through the code you provided. Below are comments:

[cpp]status = GaussBmp((Ipp8u*)growData.Scan0, growSize, nChannel, ippMskSize5x5);
[/cpp]

In GaussBmp function youassume that bitmap data in growData object have no paddingwhat iswrong. I'd better use growData.Stride as a step through bitmap image provided by Windows API.

The step you get from ippiMalloc function is only applicable to the memory buffer allocated by this IPP call. I think it is clear that ippiMalloc have no (and don't to have) any guess on what memory layout is in bitmap allocated by Windows API. So when you apply step provided by ippiMalloc to the data allocated by Windows API it is the second mistake.

Regards,
Vladimir

Ying_H_Intel · ‎04-14-2009

Hello noemata,

I try your code. You may have noticed that there is different valuebetween
pad=ippiMalloc_8u_C1
and
step = padSize.width * nChannels
for example, in your case, for image pad, width=8+2*3=14,
the step returned by ippiMalloc is32
step= padSize.width*nChannnels is 14.

Then you havethecode
step = padSize.width * nChannels;
if (nChannels == 1)
{
status = ippiCopyReplicateBorder_8u_C1R(pSrc, srcSize.width * nChannels, srcSize, pad, step, padSize, nPad, nPad);

Itseems to work, but actually the copy result is not expected. (It will cause more potiential error, e.g your mentioned 24 bit doesn't work).

in detial, if you use ippiMalloc toallocate memory to pad, then the copy resultin padshould be
i.e the first twoline
003 003 003 003 003 003 003 003 008 008 008 008 008 008 then 000 000 000 000 000 000 000 000 ...
003 003 003 003 003 003 003 003 008 008 008 008 008 008 then 000 000 000 000 000 000 000 000

(As Vladmirmentioned,pad is 32-byte aligned, where every line is 32 bytes. The first 18 byte has copy value, the 18~32 should be padded with zeros. )

But if you change the step=14, thenthe real result in pad (first64 bytes)is
003 003 003 003 003 003 003 003 008 008 008 008 008 008 003 003 003 003 003 003 003 003 008 008 008 008 008 008 003 003 003 003 ....
become consecutive value.

So when you use ippiMalloc,you can use stepBytes directly instead to compute the step by width*channel.But Of cause,if you perfer tosee the concecutive value inmemory, you caluse malloc() directly.
for example
Ipp8u*pad = (Ipp8u*)malloc(padSize.width * nChannels*padSize.height);
Then the step should be padSize.width * nChannels.

Considering the step, thereis another thing you may take care.
stepBytes, almost each ipp image processcing function requires the parameter.
It is the distance in bytes between lines.

As you know, image data is often 4-bytes aligned (i.e, bmp format file). So the distance between lines for such kind of image data is multiples of4, thus the stepBytesare not equal to width*sizeof(type)*channel in some case either.

So when you read 24bit bitmap image from file, it is better to use pSrc->Step() (show in ipp sample codeat <http://software.intel.com/en-us/articles/intel-integrated-performance-primitives-code-samples/>
instead of calculateimage_WIDTH*channel.

Best Regards,
Ying

noemata · ‎04-15-2009

Quoting - (Intel)

Hello noemata,

I try your code. You may have noticed that there is different valuebetween
pad=ippiMalloc_8u_C1
and
step = padSize.width * nChannels
for example, in your case, for image pad, width=8+2*3=14,
the step returned by ippiMalloc is32
step= padSize.width*nChannnels is 14.

Then you havethecode
step = padSize.width * nChannels;
if (nChannels == 1)
{
status = ippiCopyReplicateBorder_8u_C1R(pSrc, srcSize.width * nChannels, srcSize, pad, step, padSize, nPad, nPad);

Itseems to work, but actually the copy result is not expected. (It will cause more potiential error, e.g your mentioned 24 bit doesn't work).

in detial, if you use ippiMalloc toallocate memory to pad, then the copy resultin padshould be
i.e the first twoline
003 003 003 003 003 003 003 003 008 008 008 008 008 008 then 000 000 000 000 000 000 000 000 ...
003 003 003 003 003 003 003 003 008 008 008 008 008 008 then 000 000 000 000 000 000 000 000

(As Vladmirmentioned,pad is 32-byte aligned, where every line is 32 bytes. The first 18 byte has copy value, the 18~32 should be padded with zeros. )

But if you change the step=14, thenthe real result in pad (first64 bytes)is
003 003 003 003 003 003 003 003 008 008 008 008 008 008 003 003 003 003 003 003 003 003 008 008 008 008 008 008 003 003 003 003 ....
become consecutive value.

So when you use ippiMalloc,you can use stepBytes directly instead to compute the step by width*channel.But Of cause,if you perfer tosee the concecutive value inmemory, you caluse malloc() directly.
for example
Ipp8u*pad = (Ipp8u*)malloc(padSize.width * nChannels*padSize.height);
Then the step should be padSize.width * nChannels.

Considering the step, thereis another thing you may take care.
stepBytes, almost each ipp image processcing function requires the parameter.
It is the distance in bytes between lines.

As you know, image data is often 4-bytes aligned (i.e, bmp format file). So the distance between lines for such kind of image data is multiples of4, thus the stepBytesare not equal to width*sizeof(type)*channel in some case either.

So when you read 24bit bitmap image from file, it is better to use pSrc->Step() (show in ipp sample codeat <http://software.intel.com/en-us/articles/intel-integrated-performance-primitives-code-samples/>
instead of calculateimage_WIDTH*channel.

Best Regards,
Ying

Makes perfect sense. The step/stride value was something I was looking for further guidance on. It would have been very helpful to have examples that used either OpengGL textures, DirectX textures or GDI+ bitmaps.

The examples in the docs often make reference to hand coded bitmap buffers that do not incorporate padding per scan line which confused the issue for me when I was trying tofit some of the documentation sample elements into test cases I was building for myself.

Initially I didn't have a failure check for this api: ippiResizeSqrPixelGetBufSize(...)

Which furtherconfused things because Iended up using the previouscalls buffer size as a result. It was a surprise to see this call fail for what should have been a valid interpolation mode.

That's the problem withsample codethatisn't production grade.Thoughit may make the example simpler understand, it can hide potentially critical considerations.I think it would be prudent to include a couple production grade examples in the IPP docs that usecomplimentary technologies such as OpenGL, DirectX and GDI+ (and perhaps Mac and Linux related tech).

The ippiDemo code sample is so layered that youcan't really see how you would need to be usea set of APIsin sequence. Nor does it directly demonstrate how border pixels need to be initialized for a target bitmap size. Yet it was the best example I could find.

My two cents worth.

Vladimir_Dudnik · ‎04-15-2009

And that's the problem!

1. 'production grade' sample is too layered
2. simple samples in documentationare too simple
3.one may consider it is too boring to read the documentation carefully

I see no other way but carefully study product manual and play with code samples. May be these quotations from manual will help...

Regards,
Vladimir

noemata · ‎04-15-2009

I did read this section. The hand coded bitmaps lackingalignment considerationsdidn't help getting tothe followingline of code:

// Filter operation must start at upper left corner of first image pixel which is justpast the upperleft corner of the replicated border. Row addressing must acount for pixel row word/dword alignment. The row address must also account for individual pixel size (1C, 3C, A4C)when formulating the pixel row address value.
&pixelbuffer[nPaddingPixels *rowStepsize + (nPaddingPixels * nChannels)];

A couple of the other examples I've since seen resolve to this in a somwhat convoluted way. Anyhow, my questions are mostly answered. For now, I will assume that the border size is set to the size of the masking operation chosen. That's the last point that I'm not completely clear on.

What's missing from my codecomment aboveis the border padding criteria. What should it be Vladimir?

Thanks.

noemata · ‎04-15-2009

Quoting - noemata

I did read this section. The hand coded bitmaps lackingalignment considerationsdidn't help getting tothe followingline of code:

// Filter operation must start at upper left corner of first image pixel which is justpast the upperleft corner of the replicated border. Row addressing must acount for pixel row word/dword alignment. The row address must also account for individual pixel size (1C, 3C, A4C)when formulating the pixel row address value.
&pixelbuffer[nPaddingPixels *rowStepsize + (nPaddingPixels * nChannels)];

A couple of the other examples I've since seen resolve to this in a somwhat convoluted way. Anyhow, my questions are mostly answered. For now, I will assume that the border size is set to the size of the masking operation chosen. That's the last point that I'm not completely clear on.

What's missing from my codecomment aboveis the border padding criteria. What should it be Vladimir?

Thanks.

My comment isn't quite right. It should read:

// Filter operation must start at upper left corner of first image pixel
// which is just past the lower right corner of the replicated border.
// Row addressing must account for pixel row word/dword alignment.
// The pixel row address must also account for individual pixel size
// (C1, C3, AC4) when formulating the pixel row address value.

I haven't spotted a section in the docs that describes how to set the border size so as to avoid access violations because the border size isn't big enough. It suggests that this should have it's own API.

Clearly shows step size calculation issue. This code seems to work for single byte bitmaps, but bombs with 24 bit bitmaps.