Adventures with ippiFilterWiener (2)

Adriaan_van_Os · ‎04-27-2016

The documentation for ippiFilterWiener says the following about the noise parameter: If this parameter is not defined (noise = 0), then the function estimates the noise level by averaging through the image of all local variances σi,j, and stores the corresponding values in the noise for further use. However, when I pass noise=0.0, ippiFilterWiener crashes (with what looks like a corrupted stack). This is with IPP 9.0.2 on Mac OS X and an Intel 2.5 GHz Core 2 Duo MacBook Pro. It happens with all ippiFilterWiener variants I tried. Below, gdb info for ippiFilterWiener_32f_C1R. Regards, Adriaan van Os (gdb) bt #0 0x0044a680 in s8_ownippsSum_32fc_Accur () #1 0x0044a75f in s8_ownippsSum_32f_Fast () (gdb) info registers all eax 0x612a9c 6367900 ecx 0x1fe 510 edx 0x44a650 4499024 ebx 0x810 2064 esp 0xbfff8474 0xbfff8474 ebp 0xbfff848c 0xbfff848c esi 0x62f000 6483968 edi 0xb 11 eip 0x44a680 0x44a680 eflags 0x210216 2163222 cs 0x17 23 ss 0x1f 31 ds 0x1f 31 es 0x1f 31 fs 0x0 0 gs 0x37 55 st0 -nan(0xc000000000000000) (raw 0xffffc000000000000000) st1 1.8429914817468991589404685143008851e-37 (raw 0x3f84fadae16000000000) st2 1.7609456388294814816641033672841063e-37 (raw 0x3f84efb0000000000000) st3 7.3921886536716230873355441867864221e-38 (raw 0x3f83c93c094000000000) st4 1.2518971788368537901486625344444118e-37 (raw 0x3f84aa66400000000000) st5 1.2952924775761875792156629182813672e-38 (raw 0x3f818d0b7c0000000000) st6 6.9833720180014973328020631173089637e-38 (raw 0x3f83be1b000000000000) st7 3.6056194214217565132427279466616184e-38 (raw 0x3f82c44f000000000000) fctrl 0x137f 4991 fstat 0x3863 14435 ftag 0x8000 32768 fiseg 0x17 23 fioff 0x44a6a8 4499112 foseg 0x1f 31 fooff 0x0 0 fop 0x1f7 503 xmm0 { v4_float = {0, 0, 0, 0.187211409}, v2_double = {0, 5.1598354856965341e-315}, v16_int8 = '\0' , ">?\264Y", v8_int16 = {0, 0, 0, 0, 0, 0, 15935, -19367}, v4_int32 = {0, 0, 0, 1044362329}, v2_int64 = {0, 1044362329}, uint128 = 6463860900704026624 } (raw 0x59b43f3e000000000000000000000000) xmm1 
{ v4_float = {0, 0, 0, 0.067506507}, v2_double = {0, 5.1010832593468362e-315}, v16_int8 = '\0' , "=\212@\332", v8_int16 = {0, 0, 0, 0, 0, 0, 15754, 16602}, v4_int32 = {0, 0, 0, 1032470746}, v2_int64 = {0, 1032470746}, uint128 = 15726721893375410176 } (raw 0xda408a3d000000000000000000000000) xmm2 { v4_float = {0, 0, 0, 0.144226447}, v2_double = {0, 5.1455833123493324e-315}, v16_int8 = '\0' , ">\023\260\031", v8_int16 = {0, 0, 0, 0, 0, 0, 15891, -20455}, v4_int32 = {0, 0, 0, 1041477657}, v2_int64 = {0, 1041477657}, uint128 = 1851000603858173952 } (raw 0x19b0133e000000000000000000000000) xmm3 { v4_float = {-0.0061361026, -0.0371395648, 0.222022787, 0.15741688}, v2_double = {-1.0616635290980361e-20, 3.6044674592801975e-08}, v16_int8 = "\273\311\021\\\275\030\037\250>cY\361>!1\344", v8_int16 = {-17463, 4444, -17128, 8104, 15971, 23025, 15905, 12772}, v4_int32 = {-1144450724, -1122492504, 1046698481, 1042362852}, v2_int64 = {-4915378428291047512, 4495535745710240228}, uint128 = 0xbbc9115cbd181fa83e6359f13e2131e4 } (raw 0xe431213ef159633ea81f18bd5c11c9bb) xmm4 { v4_float = {0, 0, 0.248581812, 0.173564509}, v2_double = {0, 1.1379934465744102e-07}, v16_int8 = "\000\000\000\000\000\000\000\000>~\214;>1\272\345", v8_int16 = {0, 0, 0, 0, 15998, -29637, 15921, -17691}, v4_int32 = {0, 0, 1048480827, 1043446501}, v2_int64 = {0, 4503190863491480293}, uint128 = 16553597523710475838 } (raw 0xe5ba313e3b8c7e3e0000000000000000) xmm5 { v4_float = {0, -0.0061361026, -0.031003464, 0.00444444502}, v2_double = {1.5565620048787301e-314, -6.6569817986579441e-15}, v16_int8 = "\000\000\000\000\273\311\021\\\274\375\372\372;\221\242\265", v8_int16 = {0, 0, -17463, 4444, -17155, -1286, 15249, -23883}, v4_int32 = {0, -1144450724, -1124205830, 999400117}, v2_int64 = {3150516572, -4828427272823135563}, uint128 = 0x00000000bbc9115cbcfdfafa3b91a2b5 } (raw 0xb5a2913bfafafdbc5c11c9bb00000000) xmm6 { v4_float = {0.111111112, 0.111111112, 0.111111112, 0.111111112}, v2_double = {1.4228543251986994e-10, 
1.4228543251986994e-10}, v16_int8 = "=\343\2169=\343\2169=\343\2169=\343\2169", v8_int16 = {15843, -29127, 15843, -29127, 15843, -29127, 15843, -29127}, v4_int32 = {1038323257, 1038323257, 1038323257, 1038323257}, v2_int64 = {4459564432529526329, 4459564432529526329}, uint128 = 0x3de38e393de38e393de38e393de38e39 } (raw 0x398ee33d398ee33d398ee33d398ee33d) xmm7 { v4_float = {0, 0, 0, 0}, v2_double = {0, 0}, v16_int8 = '\0' , v8_int16 = {0, 0, 0, 0, 0, 0, 0, 0}, v4_int32 = {0, 0, 0, 0}, v2_int64 = {0, 0}, uint128 = 0 } (raw 0x00000000000000000000000000000000) mxcsr 0x1faf 8111 mm0 { uint64 = -4301219119115534336, v2_int32 = {0, -1001455616}, v4_int16 = {0, 0, 0, -15281}, v8_int8 = "\000\000\000\000\000\000O\304" } (raw 0xc44f000000000000) mm1 { uint64 = -4611686018427387904, v2_int32 = {0, -1073741824}, v4_int16 = {0, 0, 0, -16384}, v8_int8 = "\000\000\000\000\000\000\000\300" } (raw 0xc000000000000000) mm2 { uint64 = -370736216871534592, v2_int32 = {0, -86318752}, v4_int16 = {0, 0, -7840, -1318}, v8_int8 = "\000\000\000\000`\341\332\372" } (raw 0xfadae16000000000) mm3 { uint64 = -1175439502743699456, v2_int32 = {0, -273678336}, v4_int16 = {0, 0, 0, -4176}, v8_int8 = "\000\000\000\000\000\000\260\357" } (raw 0xefb0000000000000) mm4 { uint64 = -3946269003000840192, v2_int32 = {0, -918812352}, v4_int16 = {0, 0, 2368, -14020}, v8_int8 = "\000\000\000\000@\t<\311" } (raw 0xc93c094000000000) mm5 { uint64 = -6168172270893137920, v2_int32 = {0, -1436139520}, v4_int16 = {0, 0, 16384, -21914}, v8_int8 = "\000\000\000\000\000@f\252" } (raw 0xaa66400000000000) mm6 { uint64 = -8283390750176051200, v2_int32 = {0, -1928627200}, v4_int16 = {0, 0, 31744, -29429}, v8_int8 = "\000\000\000\000\000|\v\215" } (raw 0x8d0b7c0000000000) mm7 { uint64 = -4748201382132056064, v2_int32 = {0, -1105526784}, v4_int16 = {0, 0, 0, -16869}, v8_int8 = "\000\000\000\000\000\000\033\276" } (raw 0xbe1b000000000000)

Igor_A_Intel · ‎04-27-2016

hi Adriaan van Os,

could you provide a reproducer?(size of buffers you allocated, how you passed pointers and steps, did you perform your src pointer shift to ROI, etc.) GDB output can't help with understanding the root of your issue. Also it is always better (and recommended) to provide an output from ippiGetLibVersion() in order to understand which optimization, ia32 or x64, static or dynamic, threaded or not, which version was working.

/* Fetch the ippIP version record and print name, version string and the
 * four numeric version fields on a single line. */
const IppLibraryVersion *lib = ippiGetLibVersion();

printf("%s %s %d.%d.%d.%d\n", lib->Name, lib->Version, lib->major, lib->minor, lib->majorBuild, lib->build);

regards, Igor

Adriaan_van_Os · ‎04-27-2016

Thanks for looking into this. The ippiGetLibVersion output is: ippIP Atom (s8 threaded) 9.0.2 (r49912) Dec 28 2015 9.0.2.49912. IPP is statically linked into the application executable with ld, namely libippcore.a libippi.a libippcv.a libippcc.a libipps.a libiomp5.a. I do offset the ROI and the data pointers by the border radius to keep the ippiFilterWiener filter inside the image. Also, I do call ippiFilterWienerGetBufferSize and ippMalloc to allocate the buffer. The filter works fine with any value for noise between (but not including) 0.0 and 1.0. This makes it unlikely that this is an application error. Regards, Adriaan van Os

Igor_A_Intel · ‎04-27-2016

Could you try the same (noise==0) with the single-threaded libraries? I'll try to create a reproducer on my side - but there is no guarantee that my test code will also crash. If you can't provide a reproducer - could you at least provide the full function name and size parameters - width, height, kernel size, step in bytes - so that we are guaranteed to be on the same page. (Otherwise it is like looking for a black cat in a dark room, when nobody knows whether the cat is in the room or not...)

regards, Igor

Adriaan_van_Os · ‎04-27-2016

Would it help if I send you (by private email) a Mac OS X GUI app that reproduces the crash and links to dynamic libraries? Then, you can use your own source-debuggable libraries and a debugger, to see all parameters, etcetera. What must be the path to the dynamic libraries?

Adriaan_van_Os · ‎04-27-2016

Tried it, but single-threaded crashes too. Oh, I forgot to mention, this is IA-32. Regards, Adriaan van Os

Igor_A_Intel · ‎04-27-2016

Ok, I've taken a look at the sources - there is no internal threading for the C1R Wiener functions, nor for ippsMean_32f that is used internally, therefore it doesn't matter whether threaded or non-threaded libraries are used (C3, C4, AC4 - have an OMP implementation). I don't think that sharing your GUI code will help - I guess we'd spend more time building it and making it workable on our side. I'll make a small command-line reproducer and share it with you.

regards, Igor

Adriaan_van_Os · ‎04-27-2016

I meant a GUI executable, not its sources, so no need to build anything. On Mac OS X it is easy to debug a dynamic lib, even when you have access to the sources of the library only. So, that would still be an option if your command-line executable doesn't reproduce it. Thanks for looking into this. Regards, Adriaan van Os

Igor_A_Intel · ‎04-27-2016

Having your GUI executable I can't see how you use FilterWiener and parameters - I don't think there is any issue in the library - this is a very old function that has not been changed for years, and it (like all other IPP functions) has several types of tests that are executed with different seeds on a regular basis: algorithmic (of course for all ranges of sizes for roi, mask and noise zero/non-zero), bad-arg, misalignment, mem-bound, thread-safe, special conditions, performance, - on all OSes we support and on all HW we support. Please try the reproducer code below - if you can make this code crash with some legal parameters - please report these parameters back to me:

#include <stdio.h>

#include "ipp.h"

/*
 * Print a short banner followed by the fields of the version record
 * returned by ippiGetLibVersion(): target CPU code, library name,
 * version string and build date.
 */
static void libInfo( void )
{
    const IppLibraryVersion *const ver = ippiGetLibVersion();

    /* Constant banner lines: no formatting needed. */
    fputs( "Intel(R) Integrated Performance Primitives\n", stdout );
    fputs( "Wiener filter demo\n", stdout );
    fputs( "Library IppIP\n", stdout );

    /* Fields taken from the library's version record. */
    printf( "CPU : %s\n", ver->targetCpu );
    printf( "Name : %s\n", ver->Name );
    printf( "Version : %s\n", ver->Version );
    printf( "Build date: %s\n", ver->BuildDate );
}

#define MASK 5

#define WIDTH (640+MASK-1)

#define HEIGHT (480+MASK-1)

/*
 * Reproducer for ippiFilterWiener_32f_C1R with noise == 0.
 *
 * Builds a WIDTH x HEIGHT test image (Jaehne pattern plus uniform noise),
 * runs the Wiener filter over the interior ROI with a MASK x MASK window and
 * an initial noise value of 0, and prints the noise value left in `noise`
 * after the call.
 *
 * Returns 0 when the filter call succeeds, 1 on any allocation or IPP error.
 */
int main( void ){
    int bufSize = 0, srcStep = 0, dstStep = 0;
    int exitCode = 1;
    unsigned int seed = 0; /* ippiAddRandUniform takes unsigned int* */
    float noise = 0.0f;    /* 0 asks the filter to estimate the noise itself */
    IppStatus status = ippStsNoErr;
    IppiSize roi, mask, image = {WIDTH,HEIGHT};
    IppiPoint anchor;
    Ipp32f *pSrc = NULL, *pDst = NULL, *pImage = NULL;
    Ipp8u *pBuf = NULL;

    ippInit();
    ippSetCpuFeatures( 0x3F ); /* Atom - S8 code */

    mask.width = mask.height = MASK;
    anchor.x = mask.width >> 1;
    anchor.y = mask.height >> 1;

    /* The ROI is the image minus the border the mask needs around it. */
    roi.width = WIDTH - mask.width + 1;
    roi.height = HEIGHT - mask.height + 1;

    pImage = ippiMalloc_32f_C1( WIDTH, HEIGHT, &srcStep );
    pDst = ippiMalloc_32f_C1( roi.width, roi.height, &dstStep );
    if( pImage == NULL || pDst == NULL ){
        fprintf( stderr, "image allocation failed\n" );
        goto cleanup;
    }

    /* Shift the source pointer into the image so that the mask, placed at
       the anchor, never reads outside the allocated WIDTH x HEIGHT area. */
    pSrc = (Ipp32f*)((Ipp8u*)pImage + ( mask.height - anchor.y - 1 ) * srcStep + sizeof(Ipp32f) * ( mask.width - anchor.x - 1 ));

    libInfo();

    status = ippiImageJaehne_32f_C1R( pImage, srcStep, image );
    if( status == ippStsNoErr ){
        status = ippiAddRandUniform_32f_C1IR( pImage, srcStep, image, 0.0f, 0.1f, &seed );
    }
    if( status != ippStsNoErr ){
        fprintf( stderr, "test image generation failed:\n %s\n", ippGetStatusString( status ));
        goto cleanup;
    }

    status = ippiFilterWienerGetBufferSize( roi, mask, 1, &bufSize );
    if( status != ippStsNoErr ){
        fprintf( stderr, "ippiFilterWienerGetBufferSize failed:\n %s\n", ippGetStatusString( status ));
        goto cleanup;
    }

    pBuf = ippsMalloc_8u( bufSize );
    if( pBuf == NULL ){
        fprintf( stderr, "work buffer allocation failed (%d bytes)\n", bufSize );
        goto cleanup;
    }

    status = ippiFilterWiener_32f_C1R( pSrc, srcStep, pDst, dstStep, roi, mask, anchor, &noise, pBuf );
    if( status != ippStsNoErr ){
        printf( "\nippiWiener function return error status:\n %s\n", ippGetStatusString( status ));
    } else {
        printf("\nippiWiener detected noise = %f\n", noise );
        exitCode = 0;
    }

cleanup:
    /* Guarded frees: only release what was actually allocated. */
    if( pImage != NULL ) ippiFree( pImage );
    if( pDst != NULL ) ippiFree( pDst );
    if( pBuf != NULL ) ippsFree( pBuf );

    return exitCode;
}

how to build: copy this source and ipp headers + ipp static 32-bit libs to some folder:

:wiener iastakh$ gcc -m32 -c wiener_tst.c -o wien.o

:wiener iastakh$ gcc -m32 wien.o libippi.a libipps.a libippvm.a libippcore.a -o wien

:wiener iastakh$ ./wien
Intel(R) Integrated Performance Primitives
Wiener filter demo
Library IppIP
CPU       : s8
Name      : ippIP Atom (s8)
Version   : 9.0.2 (r49912)
Build date: Dec 28 2015

ippiWiener detected noise = 0.092751

uname -a

Darwin xxxxxxxxxx 13.4.0 Darwin Kernel Version 13.4.0: Sun Aug 17 19:50:11 PDT 2014; root:xnu-2422.115.4~1/RELEASE_X86_64 x86_64

regards, Igor

Adriaan_van_Os · ‎04-27-2016

> Having your GUI executable I can't see how you use FilterWiener and parameters With all due respect, this is not true. If you debug a dynamic library on Mac OS X, you can see all calls with the values of all parameters used, It is really as simple as it could be - and a very powerful tool. Regards, Adriaan van Os

Adriaan_van_Os · ‎04-28-2016

I found out that the crash has to do with smart-linking. That is, without ld being passed -dead_strip -no_dead_strip_inits_and_terms it doesn't crash. So, there must be some hidden init code somewhere. Must some compiler glue be referenced at startup, so that ld links it in? I noticed that there are a lot of NOPs above the instruction where it crashes. Is this intended or modified at runtime? 0x0044a633 : jmp 0x44a640 0x0044a635 : nop 0x0044a636 : nop 0x0044a637 : nop 0x0044a638 : nop 0x0044a639 : nop 0x0044a63a : nop 0x0044a63b : nop 0x0044a63c : nop 0x0044a63d : nop 0x0044a63e : nop 0x0044a63f : nop crashes here ----> 0x0044a640 : fadds (%esi)

Adriaan_van_Os · ‎05-14-2016

I did some more debugging and found the following. The crash disappears when adding dummy references to the following calls (or by instructing ld not to dead-strip those calls)

ippsSum_32s_Sfs, ippsSum_32f, ippsSum_32fc, ippsSum_64f and ippsSum_64fc

So, the bug is that ippiFilterWiener is calling variants (or internal code) of those functions (when the noise level is 0.0) in a way that is not compatible with ld's -dead_strip linker option. I assume ippiFilterWiener is using a nasty trick to call ownippsSum, otherwise ld would recognize the call. Anyway, as of the Xcode June 2004 release, library code must be compatible with -dead_strip.

Regards,

Adriaan van Os

Adriaan_van_Os · ‎05-31-2016

Since nobody seems interested in fixing bugs, I will no longer spend time debugging and reporting them.

Regards,

Adriaan van Os

Igor_A_Intel · ‎06-01-2016

Hi Adriaan van Os,

IPP works correctly in all other environments - different kinds of Linux, Yocto, QNX, VxWorks, Android, Windows, etc. Therefore I guess that it is a bug in Xcode ld's -dead_strip linker option, not in IPP. There are no tricks in calling internal functions. For these particular calls:

IPPFUN( IppStatus, ippiFilterWiener_32f_C1R,( const Ipp32f* pSrc, int srcStep,
             Ipp32f* pDst, int dstStep, IppiSize dstRoiSize, IppiSize maskSize,
                           IppiPoint anchor, Ipp32f noise[1], Ipp8u* pBuffer ))
{
OWN_WIENER_BADARG_TST1( pSrc, srcStep, pDst, dstStep, dstRoiSize, maskSize,
                                                       anchor, noise, pBuffer )
{
    int width = dstRoiSize.width, height = dstRoiSize.height;
    int mw = maskSize.width, mh = maskSize.height, bufStep, h;
    Ipp32f shum, *pBuf = NULL, *pLocMean, *pLocVar;
    Ipp32f *pV, *pM, min, max;
    Ipp32f mpy = 1.f / ( mw * mh ), tmp;
    Ipp32f *pS, *pD, *strtSrc, *pS1;
    Ipp64f variance = 0;
    IppiSize tmpRoi;
    int    bufSize;

    strtSrc = (Ipp32f*)( (Ipp8u*)pSrc - ( mh - anchor.y - 1 ) * srcStep -
                                       sizeof(Ipp32f) * ( mw - anchor.x - 1 ));
    tmpRoi.width = width + mw - 1; tmpRoi.height = height + mh - 1;
    ippiMinMax_32f_C1R( strtSrc, srcStep, tmpRoi, &min, &max );
    max = max - min; max *= max;
    bufSize = ALIGN_ON_4_FLOAT( dstRoiSize.width + BUF_JUNK + maskSize.width );
    bufSize = bufSize * 4 * sizeof(Ipp32f) + ALIGN_MAGIC;
    ippsZero_8u( pBuffer, bufSize );
    pBuf = (Ipp32f*)ALIGN_PTR( pBuffer );
    bufStep = ALIGN_ON_4_FLOAT( width + mw + BUF_JUNK ) * sizeof(Ipp32f);
    pLocMean = pBuf;
    pLocVar = (Ipp32f*)( (Ipp8u*)pBuf + 2 * bufStep );
    if( wabs( noise[0] ) < IPP_EPS_32F ){/* if the noise variance is not given */
        pM = pLocMean; pV = pLocVar;
        for( h = 0; h < height; h++ ){
            pS = (Ipp32f*)( (Ipp8u*)strtSrc + h * srcStep );
            owniLocalVarMean_32f_C1L( pS, srcStep, maskSize, pM, pV,
                                            bufStep, dstRoiSize, h, mpy, 0.f );
            ippsMean_32f( pV, width, &tmp, ippAlgHintFast );
            pV = (Ipp32f*)( (Ipp8u*)pV + bufStep );
            pM = (Ipp32f*)( (Ipp8u*)pM + bufStep );
            variance += tmp;
            bufStep = -bufStep;
        }
        if( bufStep < 0 ) bufStep = - bufStep;
        shum = (Ipp32f)( variance / height );
        noise[0] = shum * mpy / max;
    } else {.................

where Sum functions are called from ippsMean:

/* Internal helper: returns the result of ownippsSum_32f( pSrc, len, hint )
 * divided by len (converted to Ipp64f), narrowed to Ipp32f.
 * No check on len is made here - presumably the caller guarantees len > 0;
 * verify against callers. */
Ipp32f ownippsMean_32f( const Ipp32f *pSrc, int len, IppHintAlgorithm hint )
{
return (Ipp32f)( ownippsSum_32f( pSrc, len, hint ) / (Ipp64f)len );
}

/* Public entry point: stores ownippsMean_32f( pSrc, len, hint ) in *pMean
 * and returns ippStsNoErr.
 * The IPP_BAD_PTR2_RET / IPP_BAD_SIZE_RET macros are defined elsewhere;
 * presumably they return an error status early for NULL pointers and a
 * non-positive len - TODO confirm against the macro definitions. */
IPPFUN(IppStatus,ippsMean_32f,(const Ipp32f* __RESTRICT pSrc,int len,Ipp32f* __RESTRICT pMean,
IppHintAlgorithm hint))
{

IPP_BAD_PTR2_RET(pSrc,pMean)
IPP_BAD_SIZE_RET(len)

*pMean = ownippsMean_32f( pSrc, len, hint );
return ippStsNoErr;
}

Regards, Igor

Adriaan_van_Os · ‎06-18-2016

Thanks for your reply.

So, we are looking at what seems to be a linker, assembler or compiler bug (in order of increasing probability). I bet it has to do with the compiler trying to inline Ipp32f ownippsMean_32f

/* Small wrapper: the value of ownippsSum_32f( pSrc, len, hint ) divided by
 * len (as Ipp64f), cast to Ipp32f. Being this small, it is the inlining
 * candidate under suspicion in this post. */
Ipp32f ownippsMean_32f( const Ipp32f *pSrc, int len, IppHintAlgorithm hint )
{
return (Ipp32f)( ownippsSum_32f( pSrc, len, hint ) / (Ipp64f)len );
}

which can be checked by adding __attribute__ ((noinline)) to it.

Regards,

Adriaan van Os