- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I am writing a method for a P3 that uses SIMD to process a large block of data, so it reads and writes more than a cache-full of data. The reads are no problem since I am using prefetches, etc., but the writes are slowing me down. I am writing 64 bits of data at a time with movntq, and it is averaging 22 cycles of stall per call. I am using ICC on a Linux machine, with the following assembly inlined. For comparison, I also tried movq, and it is twice as slow. Below is part of my code. I am especially worried because I may have to change the writes to 32-bit writes due to a change in the algorithm. Any ideas on how to optimize this would be appreciated.
// Compiled with: icc -use_msasm -O3 -xK
// Excerpt of the inner loop, written in Intel operand order (dst, src).
// "..." marks code elided by the author. mm0 and mm1 are read below without
// being loaded in this excerpt; presumably the elided code loads pixels 1-2
// of the two rows into them -- TODO confirm against the full source.
asm(
...
LOOP1:
// Fetch 5 loops in advance (author's figure): prefetch both source rows
// 6*32 bytes past esi; the second row is addressed at esi+2880.
prefetchnta [esi+6*32];
prefetchnta [esi+2880+6*32];
/**********************************************************************
*
* For optimal cache use, we process one cache line at a time, which is
* 32 bytes, or 8 pixels. So we will process the average of the first 8.
* First do pixels 1-3.
*
**********************************************************************/
movq mm2, [esi+8]; // pixels 3-4
movq mm3, [esi+2888]; // pixels 3-4 of the row at esi+2880 (2888 = 2880+8)
// Keep a copy of mm2 in mm7 so it can be used to calculate pixel 4.
// mm3 will not be changed.
movq mm7, mm2;
// For pixel 2 we want the high dword of mm0 and the low dword of mm2.
// Writing registers as high:low dwords, with mm0 = AB and mm2 = CD:
// mm4 = D0 after the shift, and punpckhdq of D0 with AB gives AD.
movq mm4, mm2;
psllq mm4, 32; // Move the low dword of mm2 into the high dword; low dword becomes 0.
punpckhdq mm4, mm0; // mm4 = high dword of mm0 : low dword of mm2
// Now repeat for the bottom-row registers: high dword of mm1, low dword of mm3.
movq mm5, mm3;
psllq mm5, 32;
punpckhdq mm5, mm1;
// Byte-wise average of the two rows.
pavgb mm0,mm1; // For pixel 1
pavgb mm2,mm3; // For pixel 3
pavgb mm4,mm5; // For pixel 2
// Now average the two dwords inside each register: copy the register's
// low dword into the high dword of mm6, then pavgb, so that the high dword
// of the destination becomes avg(high, low). The low dword of mm6 is
// whatever it held before (the "x" in the notes below).
punpckldq mm6,mm0; // xx ab = bx : pixel 1
pavgb mm0,mm6;
punpckldq mm6,mm2; // xx cd = dx : pixel 3
pavgb mm2,mm6;
punpckldq mm6,mm4; // xx ef = fx : pixel 2
pavgb mm4,mm6;
// Now the high dwords of mm0, mm2 and mm4 hold the averaged pixels.
// Combine mm0 and mm4 to store pixels 1 and 2.
// With mm0 = ab and mm4 = cd, punpckhdq leaves mm0 = ca.
punpckhdq mm0,mm4;
// The source of the stalls (per the author's measurement).
// NOTE(review): movntq is a non-temporal store; whether issuing the stores
// for a whole 32-byte line back-to-back would reduce the stall depends on
// code not shown here -- verify against Intel's optimization guidance.
movntq [ebx],mm0; // Store pixels 1 and 2.
...
// Compiled with: icc -use_msasm -O3 -xK
// Excerpt of the inner loop, written in Intel operand order (dst, src).
// "..." marks code elided by the author. mm0 and mm1 are read below without
// being loaded in this excerpt; presumably the elided code loads pixels 1-2
// of the two rows into them -- TODO confirm against the full source.
asm(
...
LOOP1:
// Fetch 5 loops in advance (author's figure): prefetch both source rows
// 6*32 bytes past esi; the second row is addressed at esi+2880.
prefetchnta [esi+6*32];
prefetchnta [esi+2880+6*32];
/**********************************************************************
*
* For optimal cache use, we process one cache line at a time, which is
* 32 bytes, or 8 pixels. So we will process the average of the first 8.
* First do pixels 1-3.
*
**********************************************************************/
movq mm2, [esi+8]; // pixels 3-4
movq mm3, [esi+2888]; // pixels 3-4 of the row at esi+2880 (2888 = 2880+8)
// Keep a copy of mm2 in mm7 so it can be used to calculate pixel 4.
// mm3 will not be changed.
movq mm7, mm2;
// For pixel 2 we want the high dword of mm0 and the low dword of mm2.
// Writing registers as high:low dwords, with mm0 = AB and mm2 = CD:
// mm4 = D0 after the shift, and punpckhdq of D0 with AB gives AD.
movq mm4, mm2;
psllq mm4, 32; // Move the low dword of mm2 into the high dword; low dword becomes 0.
punpckhdq mm4, mm0; // mm4 = high dword of mm0 : low dword of mm2
// Now repeat for the bottom-row registers: high dword of mm1, low dword of mm3.
movq mm5, mm3;
psllq mm5, 32;
punpckhdq mm5, mm1;
// Byte-wise average of the two rows.
pavgb mm0,mm1; // For pixel 1
pavgb mm2,mm3; // For pixel 3
pavgb mm4,mm5; // For pixel 2
// Now average the two dwords inside each register: copy the register's
// low dword into the high dword of mm6, then pavgb, so that the high dword
// of the destination becomes avg(high, low). The low dword of mm6 is
// whatever it held before (the "x" in the notes below).
punpckldq mm6,mm0; // xx ab = bx : pixel 1
pavgb mm0,mm6;
punpckldq mm6,mm2; // xx cd = dx : pixel 3
pavgb mm2,mm6;
punpckldq mm6,mm4; // xx ef = fx : pixel 2
pavgb mm4,mm6;
// Now the high dwords of mm0, mm2 and mm4 hold the averaged pixels.
// Combine mm0 and mm4 to store pixels 1 and 2.
// With mm0 = ab and mm4 = cd, punpckhdq leaves mm0 = ca.
punpckhdq mm0,mm4;
// The source of the stalls (per the author's measurement).
// NOTE(review): movntq is a non-temporal store; whether issuing the stores
// for a whole 32-byte line back-to-back would reduce the stall depends on
// code not shown here -- verify against Intel's optimization guidance.
movntq [ebx],mm0; // Store pixels 1 and 2.
...
Link Copied
0 Replies
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page