- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi:
I have written a simple piece of code to practice SSE:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h> /*SSE*/
/* Allocate `size` bytes via _aligned_malloc with a requested alignment of 16. */
#define MALLOC_ALIGN_16BYTE(size) _aligned_malloc((size), 16)
/* Release a buffer through _aligned_free. */
#define FREE_ALIGN_16BYTE(buf) _aligned_free(buf)
/*
 * SSE practice program: fills an array of 16-bit integers with 0..inN-1,
 * converts it to floats four elements at a time with _mm_cvtpi16_ps, converts
 * the leftover elements with a scalar loop, and prints every result.
 *
 * NOTE: this is the reproduction case for the problem discussed in this
 * thread. _mm_cvtpi16_ps takes an __m64 (MMX) operand and no _mm_empty() is
 * issued after the SIMD loop, so the floating-point code that follows runs
 * with the MMX state still active. The replies further down identify this as
 * the cause of the "-1.#IO" values in the posted output.
 *
 * Returns 0 on success, 1 if either buffer could not be allocated.
 */
int main(void)
{
    int inN;
    short *input;
    float *yIn;

    inN = 10;
    input = (short*)MALLOC_ALIGN_16BYTE(inN*sizeof(short));
    yIn = (float*)MALLOC_ALIGN_16BYTE(inN*sizeof(float));
    if (!input || !yIn) {
        /* _aligned_free accepts a null pointer, so release both unconditionally. */
        FREE_ALIGN_16BYTE(input);
        FREE_ALIGN_16BYTE(yIn);
        return 1;
    }

    for (int i = 0; i < inN; i++) {
        input[i] = (short)i;
    }/*for i*/

    /* View the buffers as packed vectors: 4 shorts per __m64, 4 floats per __m128. */
    __m64 *pShort = (__m64*)input;
    __m128 *pFloat = (__m128*)yIn;
    int m = inN/4; /* number of complete groups of four elements */

#if(1)
    /* SIMD path: convert one group of four shorts per iteration. */
    for (int i = 0; i < m; i++) {
        pFloat[i] = _mm_cvtpi16_ps(pShort[i]);
    }/*for i*/

    /* Scalar tail: the inN % 4 elements that do not fill a whole vector. */
    for (int i = m*4; i < inN; i++) {
        yIn[i] = (float)input[i];
    }

    for (int i = 0; i < inN; i++) {
        printf("i = %d, yout = %4.3f\n", i, yIn[i]);
    }/*for i*/
#else
    /* Reference path: plain scalar conversion of every element. */
    for (int i = 0; i < inN; i++) {
        yIn[i] = (float)input[i];
    }

    for (int i = 0; i < inN; i++) {
        printf("i = %d, yout = %4.3f\n", i, yIn[i]);
    }/*for i*/
#endif

    FREE_ALIGN_16BYTE(input);
    FREE_ALIGN_16BYTE(yIn);
    return 0;
}/*main*/
the printf result is :
i = 0, yout = 0.000
i = 1, yout = 1.000
i = 2, yout = 2.000
i = 3, yout = 3.000
i = 4, yout = 4.000
i = 5, yout = 5.000
i = 6, yout = 6.000
i = 7, yout = 7.000
i = 8, yout = -1.#IO
i = 9, yout = 9.000
if I set inN = 12; the result is
i = 0, yout = -1.#IO
i = 1, yout = 1.000
i = 2, yout = 2.000
i = 3, yout = 3.000
i = 4, yout = 4.000
i = 5, yout = 5.000
i = 6, yout = 6.000
i = 7, yout = 7.000
i = 8, yout = 8.000
i = 9, yout = 9.000
i = 10, yout = 10.000
i = 11, yout = 11.000
The results are the same with both VC8 and ICC 10.1.
I do not know what mistake I am making... I am a newbie at using SSE intrinsics.
Could someone help me?
Thank you.
I have written a simple piece of code to practice SSE:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h> /*SSE*/
/* Allocate `size` bytes via _aligned_malloc with a requested alignment of 16. */
#define MALLOC_ALIGN_16BYTE(size) _aligned_malloc((size), 16)
/* Release a buffer through _aligned_free. */
#define FREE_ALIGN_16BYTE(buf) _aligned_free(buf)
/*
 * SSE practice program: fills an array of 16-bit integers with 0..inN-1,
 * converts it to floats four elements at a time with _mm_cvtpi16_ps, converts
 * the leftover elements with a scalar loop, and prints every result.
 *
 * NOTE: this is the reproduction case for the problem discussed in this
 * thread. _mm_cvtpi16_ps takes an __m64 (MMX) operand and no _mm_empty() is
 * issued after the SIMD loop, so the floating-point code that follows runs
 * with the MMX state still active. The replies further down identify this as
 * the cause of the "-1.#IO" values in the posted output.
 *
 * Returns 0 on success, 1 if either buffer could not be allocated.
 */
int main(void)
{
    int inN;
    short *input;
    float *yIn;

    inN = 10;
    input = (short*)MALLOC_ALIGN_16BYTE(inN*sizeof(short));
    yIn = (float*)MALLOC_ALIGN_16BYTE(inN*sizeof(float));
    if (!input || !yIn) {
        /* _aligned_free accepts a null pointer, so release both unconditionally. */
        FREE_ALIGN_16BYTE(input);
        FREE_ALIGN_16BYTE(yIn);
        return 1;
    }

    for (int i = 0; i < inN; i++) {
        input[i] = (short)i;
    }/*for i*/

    /* View the buffers as packed vectors: 4 shorts per __m64, 4 floats per __m128. */
    __m64 *pShort = (__m64*)input;
    __m128 *pFloat = (__m128*)yIn;
    int m = inN/4; /* number of complete groups of four elements */

#if(1)
    /* SIMD path: convert one group of four shorts per iteration. */
    for (int i = 0; i < m; i++) {
        pFloat[i] = _mm_cvtpi16_ps(pShort[i]);
    }/*for i*/

    /* Scalar tail: the inN % 4 elements that do not fill a whole vector. */
    for (int i = m*4; i < inN; i++) {
        yIn[i] = (float)input[i];
    }

    for (int i = 0; i < inN; i++) {
        printf("i = %d, yout = %4.3f\n", i, yIn[i]);
    }/*for i*/
#else
    /* Reference path: plain scalar conversion of every element. */
    for (int i = 0; i < inN; i++) {
        yIn[i] = (float)input[i];
    }

    for (int i = 0; i < inN; i++) {
        printf("i = %d, yout = %4.3f\n", i, yIn[i]);
    }/*for i*/
#endif

    FREE_ALIGN_16BYTE(input);
    FREE_ALIGN_16BYTE(yIn);
    return 0;
}/*main*/
the printf result is :
i = 0, yout = 0.000
i = 1, yout = 1.000
i = 2, yout = 2.000
i = 3, yout = 3.000
i = 4, yout = 4.000
i = 5, yout = 5.000
i = 6, yout = 6.000
i = 7, yout = 7.000
i = 8, yout = -1.#IO
i = 9, yout = 9.000
if I set inN = 12; the result is
i = 0, yout = -1.#IO
i = 1, yout = 1.000
i = 2, yout = 2.000
i = 3, yout = 3.000
i = 4, yout = 4.000
i = 5, yout = 5.000
i = 6, yout = 6.000
i = 7, yout = 7.000
i = 8, yout = 8.000
i = 9, yout = 9.000
i = 10, yout = 10.000
i = 11, yout = 11.000
The results are the same with both VC8 and ICC 10.1.
I do not know what mistake I am making... I am a newbie at using SSE intrinsics.
Could someone help me?
Thank you.
Link Copied
3 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
On my end your code runs OK.
I'll take the inN=12 case (the 10 one does not make much sense to me).
Essentially you transform each group of 4 short values into 4 single-precision float values (in 128-bit format), which is OK.
Now the next step would be to apply some sort of arithmetic/logical operation on your single-precision floats (e.g. _mm_add_ps).
For the inN=12 case this loop does not make sense (maybe you need to rethink your code):
for(int i = m*4; i< inN;i++){
yIn[i] = (float)input[i];
}
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I think I found the error:
I use the Intel 10.1 compiler, and it reports warnings #965 and #964.
They explain that I should insert _mm_empty() after using the SSE intrinsics:
for(int i = 0; i< m; i++){
pFloat[i] = _mm_cvtpi16_ps(pShort[i]);
}/*for i*/
_mm_empty();
for(int i = m*4 ; i< inN;i++){
yIn[i] = (float)input[i];
}/*for i*/
With that change the output is correct.
I use the Intel 10.1 compiler, and it reports warnings #965 and #964.
They explain that I should insert _mm_empty() after using the SSE intrinsics:
for(int i = 0; i< m; i++){
pFloat[i] = _mm_cvtpi16_ps(pShort[i]);
}/*for i*/
_mm_empty();
for(int i = m*4 ; i< inN;i++){
yIn[i] = (float)input[i];
}/*for i*/
With that change the output is correct.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
A missing emms often causes really strange results. I can't imagine how much time I've spent debugging such code :(
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page