- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
This also similar with my previous question about bus transaction.
I implement cache blocking. Is it right?
I implement cache blocking. Is it right?
system spec: H/W : IBM Xseries 225
OS : Redhat Linux 9
compiler : icc 8.0
From IA-32 Intel Architecture Optimization Chapter 7"
Cache Blocking Technique
Loop blocking is useful for reducing cache misses and improving
memory access performance. The selection of a suitable block size is
critical when applying the loop blocking technique. Loop blocking is
applicable to single-threaded applications as well as to multithreaded
applications running on processors with or without Hyper-Threading
Technology. The technique transforms the memory access pattern into
blocks that efficiently fit in the target cache size.
When targeting IA-32 processors with Hyper-Threading Technology,
the loop blocking technique should select a block size that is no more
than one half of the target cache size. The upper limit of the block size
for loop blocking should be determined by dividing the target cache size
by the number of logical processors available in a physical processor
package. Typically, some cache lines are needed to access data that are
not part of the source or destination buffers used in cache blocking, so
the block size can be chosen between one quarter to one half of the
target cache (see also, Chapter 3).
User/Source Coding Rule 30. (H impact, H generality) Use cache blocking
to improve locality of data access. Target one quarter to one half of the cache
size when targeting IA-32 processors with Hyper-Threading Technology.
Loop blocking is useful for reducing cache misses and improving
memory access performance. The selection of a suitable block size is
critical when applying the loop blocking technique. Loop blocking is
applicable to single-threaded applications as well as to multithreaded
applications running on processors with or without Hyper-Threading
Technology. The technique transforms the memory access pattern into
blocks that efficiently fit in the target cache size.
When targeting IA-32 processors with Hyper-Threading Technology,
the loop blocking technique should select a block size that is no more
than one half of the target cache size. The upper limit of the block size
for loop blocking should be determined by dividing the target cache size
by the number of logical processors available in a physical processor
package. Typically, some cache lines are needed to access data that are
not part of the source or destination buffers used in cache blocking, so
the block size can be chosen between one quarter to one half of the
target cache (see also, Chapter 3).
User/Source Coding Rule 30. (H impact, H generality) Use cache blocking
to improve locality of data access. Target one quarter to one half of the cache
size when targeting IA-32 processors with Hyper-Threading Technology.
Source for cache blocking
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include
#include
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include
#include
void CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint
ITERATIONS);
void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS);
void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS);
void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS);
void timersubb( struct timeval *, struct timeval *, struct timeval *);
void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS);
void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS);
void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS);
void timersubb( struct timeval *, struct timeval *, struct timeval *);
int main(int argc, char* argv[])
{
int i, j;
uint ITERATIONS = 1000;
uint ARRAY_SZ = 4096000;
uint* array = (uint*) malloc(sizeof(uint) * ARRAY_SZ);
{
int i, j;
uint ITERATIONS = 1000;
uint ARRAY_SZ = 4096000;
uint* array = (uint*) malloc(sizeof(uint) * ARRAY_SZ);
for(i = 0; i < ITERATIONS; i++)
for(j = 0; j < ARRAY_SZ; j++)
array
printf("NO Cache Blocking ");
TimeNonCacheBlocking(array, ARRAY_SZ, ITERATIONS);
printf(" Single Threaded Cache Blocking ");
TimeCacheBlocking(array, ARRAY_SZ, 204800, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 136534, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 117029, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 102400, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 68267, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 34134, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 25600, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 12800, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 6400, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 3200, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 1600, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 136534, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 117029, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 102400, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 68267, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 34134, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 25600, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 12800, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 6400, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 3200, ITERATIONS);
TimeCacheBlocking(array, ARRAY_SZ, 1600, ITERATIONS);
return 0;
}
void
CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint
ITERATIONS)
{
uint sum =0;
uint index = 0, i, j;
{
uint sum =0;
uint index = 0, i, j;
for(index = 0; index < ARRAY_SZ;) {
uint* data = &array[index];
index += BLOCK_SZ;
uint* data = &array[index];
index += BLOCK_SZ;
if(index > ARRAY_SZ)
BLOCK_SZ = ARRAY_SZ - (index - BLOCK_SZ);
BLOCK_SZ = ARRAY_SZ - (index - BLOCK_SZ);
for(i=0; i< ITERATIONS; i++)
for(j=0; j < BLOCK_SZ; j++)
sum += data + data + ITERATIONS;
}
for(j=0; j < BLOCK_SZ; j++)
sum += data
}
*result = sum;
}
void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS)
{
uint sum =0;
uint i, j;
{
uint sum =0;
uint i, j;
for(i=0; i< ITERATIONS; i++)
for(j=0; j < ARRAY_SZ; j++)
sum += array
*result = sum;
}
void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS)
{
uint sum = 0;
struct timeval start, end, result;
{
uint sum = 0;
struct timeval start, end, result;
gettimeofday(&start, NULL);
CacheBlocking (∑, array, ARRAY_SZ, BLOCK_SZ, ITERATIONS);
gettimeofday(&end, NULL);
timersubb(&start, &end, &result);
printf("%ld sec, %ld usec ", result.tv_sec, result.tv_usec);
printf("Block Size: %u K ", BLOCK_SZ * sizeof(uint) /1024);
printf("Results: %u ", sum);
}
timersubb(&start, &end, &result);
printf("%ld sec, %ld usec ", result.tv_sec, result.tv_usec);
printf("Block Size: %u K ", BLOCK_SZ * sizeof(uint) /1024);
printf("Results: %u ", sum);
}
void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS)
{
uint sum = 0;
struct timeval start, end, result;
{
uint sum = 0;
struct timeval start, end, result;
gettimeofday(&start, NULL);
NonCacheBlocking (∑, array, ARRAY_SZ, ITERATIONS);
gettimeofday(&end, NULL);
timersubb(&start, &end, &result);
printf("%ld sec, %ld usec ", result.tv_sec, result.tv_usec);
printf("Block Size: 0 K ");
printf("Results: %u ", sum);
}
timersubb(&start, &end, &result);
printf("%ld sec, %ld usec ", result.tv_sec, result.tv_usec);
printf("Block Size: 0 K ");
printf("Results: %u ", sum);
}
void timersubb( struct timeval *start, struct timeval *end, struct timeval *result)
{
result->tv_sec = end->tv_sec - start->tv_sec;
result->tv_usec = end->tv_usec - start->tv_usec;
{
result->tv_sec = end->tv_sec - start->tv_sec;
result->tv_usec = end->tv_usec - start->tv_usec;
if(result->tv_usec < 0) {
--result->tv_sec;
result->tv_usec += 1000000;
}
return;
}
--result->tv_sec;
result->tv_usec += 1000000;
}
return;
}
Link Copied
1 Reply
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Persepone -
I'm not familiar with the text that you are citing from the IA-32 Architecture manual, but your example looks to be correct. Are you not seeing any improvement on your system between the different blocking sizes that you're using?
-- clay
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page