bad floating point result on a PIII but works on a P4

duncanh · ‎04-30-2003

I have the following code:

// Vector copy between pao->apb->position and aobj->base.pos
// (presumably source first, destination second - TODO confirm against mathCopyVector).
mathCopyVector(&pao->apb->position, &aobj->base.pos);

// Read the orientation (presumably into 'rotation'), ease its y component
// toward zero, then write it back.
ActorMgr_GetOrientation(aobj, &rotation);
rotation.y += (- rotation.y) / 8.0f; // y loses one eighth of its value, i.e. becomes 7/8 of what it was
ActorMgr_SetOrientation(aobj, &rotation, 1);

which the Intel C++ Compiler compiles to:

165: mathCopyVector(&pao->apb->position, &aobj->base.pos);
004F32FD mov eax,dword ptr [ebp+48h]
004F3300 movaps xmm4,xmmword ptr [eax+160h]
004F3307 movaps xmmword ptr [edi+20h],xmm4
166:
167: ActorMgr_GetOrientation(aobj, &rotation);
004F330B lea edx,[esp+10h]
004F330F push edx
004F3310 push edi
004F3311 call _ActorMgr_GetOrientation (004e9d50)
004F3316 add esp,8
168: rotation.y += (- rotation.y) / 8.0f;
004F3319 fld dword ptr [esp+14h]
004F331D fld st(0)
004F331F fchs
004F3321 fmul dword ptr [string "CanShootAt"+10h (008377e0)]
004F3327 faddp st(1),st
169: ActorMgr_SetOrientation(aobj, &rotation, 1);
004F3329 lea eax,[esp+10h]
168: rotation.y += (- rotation.y) / 8.0f;
004F332D fstp dword ptr [esp+14h]
169: ActorMgr_SetOrientation(aobj, &rotation, 1);
004F3331 push 1
004F3333 push eax
004F3334 push edi
004F3335 call _ActorMgr_SetOrientation (004e75e8)

Now the problem is that on Pentium 3 and Athlon processors the 'fld st(0)' at 004F331D pushes a 1#IND onto the floating-point stack, even though st(0) is already set to 0.

But a Pentium4 works fine!

Does anybody know how to fix this problem?

Ta.

bronx · ‎05-01-2003

The first thing I check in such cases (any strange x87 behavior) is whether an EMMS is missing before the culprit code sequence. So, provided that you use MMX opcodes in other areas, try it again with an _mm_empty() right before the decrement of "rotation.y".

Another workaround would be to force the use of SSE to compute the rotation step, typically with MULSS for the 0.125f scaling and SUBSS for the decrement.

Ganesh_R_Intel · ‎05-01-2003

Duncanh,
Welcome!
Would it be possible for you to isolate a testcase and log this on premier.intel.com?

Cheers,
Ganesh

duncanh · ‎05-03-2003

I have finally managed to track down the problem.
I had written some SSE intrinsics code into which the compiler had inserted a 'movdqa' MMX instruction for me. This then trashed the x87 registers. Doh.

For your reference, the offending code is below.
Compile with '/QxK'.
If you uncomment the '__m128i ssei' member of '__m128_cpp4', then

const __m128_cpp4 rect_ps = { rect.left, rect.top, rect.right, rect.bottom};

is compiled to a 'movdqa'.

Code:

#include "emmintrin.h"
#include "xmmintrin.h"

// DLH - Use these to initialise SSE variables when using the Intel Compiler
// Union overlaying four floats with one __m128. A brace initialiser fills the
// first member (m128_f32); the same 16 bytes are then read as a vector via .sse.
typedef union
{
float m128_f32[ 4];
__m128 sse;
// Deliberately left disabled: with this member present the Intel compiler
// (/QxK) initialised a __m128_cpp4 using 'movdqa', which corrupted x87 state
// on Pentium III / Athlon processors.
//__m128i ssei;
}__m128_cpp4;

// Union overlaying sixteen int8 values, two int64 values and one __m128.
// A brace initialiser fills the first member (m128i_i8), so constants are
// written byte by byte; .sse exposes the same storage as a vector and
// m128i_i64 as two integer halves.
// NOTE(review): assumes int8 / int64 are 1-byte / 8-byte project typedefs, so
// all three members span 16 bytes - confirm.
typedef union
{
int8 m128i_i8[ 16];
int64 m128i_i64[ 2];
__m128 sse;
}__m128_cpp16;

// Clip-plane bit flags. Every enumerator occupies its own bit, so any
// combination can be OR-ed together into a single mask.
// (T/B/L/R/N/F presumably stand for top, bottom, left, right, near and far.)
enum CLIP_PLANE
{
	CPF_T = 1 << 0,
	CPF_B = 1 << 1,
	CPF_L = 1 << 2,
	CPF_R = 1 << 3,
	CPF_N = 1 << 4,
	CPF_F = 1 << 5
};

// Transforms the point (x, y, z) by *m and AND-accumulates its clip flags into
// outsideflags_ps, so a flag bit survives only while every point processed so
// far has had it set.
//
//   rect_ps         - four bounds, one per lane; each is scaled by the
//                     transformed w and compared against the transformed
//                     (x, y, x, y). The lane order pairs with the L, T, R, B
//                     flags below.
//   m               - matrix read through m128_m0 / m128_m1 / m128_m2 and the
//                     four floats starting at m30 (loaded with an aligned load).
//   outsideflags_ps - in/out flag accumulator, one 32-bit lane per test.
__forceinline void ClippedOBB_SSE_Vertex( const __m128_cpp4& rect_ps, const MATRIX* m, const float x, const float y, const float z, __m128_cpp16& outsideflags_ps)
{
	static const __m128_cpp4  kZero       = {0, 0, 0, 0};
	static const __m128_cpp16 kNearFlags  = { CPF_N,0,0,0, CPF_N,0,0,0, CPF_N,0,0,0, CPF_N,0,0,0};
	static const __m128_cpp16 kRectFlags  = { CPF_L,0,0,0, CPF_T,0,0,0, CPF_R,0,0,0, CPF_B,0,0,0};
	// All-ones in the two upper lanes only: flips the comparison result there.
	static const __m128_cpp16 kInvertHigh = { 0,0,0,0, 0,0,0,0, -1,-1,-1,-1, -1,-1,-1,-1};

	const __m128_cpp4 point = { x, y, z, 0};

	// Broadcast each input coordinate across all four lanes.
	const __m128 xxxx = _mm_shuffle_ps( point.sse, point.sse, _MM_SHUFFLE( 0, 0, 0, 0));
	const __m128 yyyy = _mm_shuffle_ps( point.sse, point.sse, _MM_SHUFFLE( 1, 1, 1, 1));
	const __m128 zzzz = _mm_shuffle_ps( point.sse, point.sse, _MM_SHUFFLE( 2, 2, 2, 2));

	// transformed = m30.. + m0 * x + m1 * y + m2 * z (accumulated in this order).
	__m128 transformed = _mm_load_ps( &m->m30);
	transformed = _mm_add_ps( transformed, _mm_mul_ps( m->m128_m0, xxxx));
	transformed = _mm_add_ps( transformed, _mm_mul_ps( m->m128_m1, yyyy));
	transformed = _mm_add_ps( transformed, _mm_mul_ps( m->m128_m2, zzzz));

	const __m128 wwww = _mm_shuffle_ps( transformed, transformed, _MM_SHUFFLE( 3, 3, 3, 3));
	const __m128 xyxy = _mm_shuffle_ps( transformed, transformed, _MM_SHUFFLE( 1, 0, 1, 0));

	// CPF_N in every lane when 0 > w.
	const __m128 near_mask = _mm_cmpgt_ps( kZero.sse, wwww);
	const __m128 near_bits = _mm_and_ps( kNearFlags.sse, near_mask);

	// Lanes 0-1 keep their flag when (w * bound) > coordinate; lanes 2-3 keep
	// theirs when that comparison is false, because of the inversion.
	const __m128 bound_gt   = _mm_cmpgt_ps( _mm_mul_ps( wwww, rect_ps.sse), xyxy);
	const __m128 bound_mask = _mm_xor_ps( kInvertHigh.sse, bound_gt);
	const __m128 rect_bits  = _mm_and_ps( kRectFlags.sse, bound_mask);

	const __m128 point_flags = _mm_or_ps( near_bits, rect_bits);
	outsideflags_ps.sse = _mm_and_ps( outsideflags_ps.sse, point_flags);
}

bool ClippedOBB_SSE( const MATRIX& matrix, const RECTF& rect, const VECTOR& bbmin, const VECTOR& bbmax)
{
const __m128_cpp4 rect_ps = { rect.left, rect.top, rect.right, rect.bottom};
static const uint8 init_flags = CPF_T | CPF_B | CPF_L | CPF_R | CPF_F | CPF_N;
static const __m128_cpp16 init_outsideflags_ps = { init_flags,0,0,0, init_flags,0,0,0, init_flags,0,0,0, init_flags,0,0,0};
__m128_cpp16 outsideflags_ps = init_outsideflags_ps;
const MATRIX* m = &matrix;

ClippedOBB_SSE_Vertex( rect_ps, m, bbmin.x, bbmin.y, bbmin.z, outsideflags_ps);
ClippedOBB_SSE_Vertex( rect_ps, m, bbmax.x, bbmin.y, bbmin.z, outsideflags_ps);
ClippedOBB_SSE_Vertex( rect_ps, m, bbmin.x, bbmax.y, bbmin.z, outsideflags_ps);
ClippedOBB_SSE_Vertex( rect_ps, m, bbmax.x, bbmax.y, bbmin.z, outsideflags_ps);
ClippedOBB_SSE_Vertex( rect_ps, m, bbmin.x, bbmin.y, bbmax.z, outsideflags_ps);
ClippedOBB_SSE_Vertex( rect_ps, m, bbmax.x, bbmin.y, bbmax.z, outsideflags_ps);
ClippedOBB_SSE_Vertex( rect_ps, m, bbmin.x, bbmax.y, bbmax.z, outsideflags_ps);
ClippedOBB_SSE_Vertex( rect_ps, m, bbmax.x, bbmax.y, bbmax.z, outsideflags_ps);

return( (outsideflags_ps.m128i_i64[ 0] | outsideflags_ps.m128i_i64[ 1]) != 0);
}