<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic About Cache Blocking in Intel® Moderncode for Parallel Architectures</title>
    <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/About-Cache-Blocking/m-p/965738#M5397</link>
    <description>&lt;DIV&gt;This also similar with my previous question about bus transaction.&lt;BR /&gt;I implement cache blocking. Is it right?&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;system spec: H/W : IBM Xseries 225&lt;BR /&gt; OS : Redhat Linux 9&lt;BR /&gt; compiler : icc 8.0&lt;/DIV&gt;
&lt;DIV&gt;From "IA-32 Intel Architecture Optimization", Chapter 7:&lt;/DIV&gt;
&lt;DIV&gt;Cache Blocking Technique&lt;BR /&gt;Loop blocking is useful for reducing cache misses and improving&lt;BR /&gt;memory access performance. The selection of a suitable block size is&lt;BR /&gt;critical when applying the loop blocking technique. Loop blocking is&lt;BR /&gt;applicable to single-threaded applications as well as to multithreaded&lt;BR /&gt;applications running on processors with or without Hyper-Threading&lt;BR /&gt;Technology. The technique transforms the memory access pattern into&lt;BR /&gt;blocks that efficiently fit in the target cache size.&lt;BR /&gt;When targeting IA-32 processors with Hyper-Threading Technology,&lt;BR /&gt;the loop blocking technique should select a block size that is no more&lt;BR /&gt;than one half of the target cache size. The upper limit of the block size&lt;BR /&gt;for loop blocking should be determined by dividing the target cache size&lt;BR /&gt;by the number of logical processors available in a physical processor&lt;BR /&gt;package. Typically, some cache lines are needed to access data that are&lt;BR /&gt;not part of the source or destination buffers used in cache blocking, so&lt;BR /&gt;the block size can be chosen between one quarter to one half of the&lt;BR /&gt;target cache (see also, Chapter 3).&lt;BR /&gt;User/Source Coding Rule 30. (H impact, H generality) Use cache blocking&lt;BR /&gt;to improve locality of data access. Target one quarter to one half of the cache&lt;BR /&gt;size when targeting IA-32 processors with Hyper-Threading Technology.&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;Source for cache blocking&lt;BR /&gt;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++&lt;BR /&gt;#include &amp;lt;stdlib.h&amp;gt;&lt;BR /&gt;#include &amp;lt;sys/time.h&amp;gt;&lt;/DIV&gt;
&lt;DIV&gt;void CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint &lt;/DIV&gt;
&lt;DIV&gt;ITERATIONS);&lt;BR /&gt;void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS);&lt;BR /&gt;void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS);&lt;BR /&gt;void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS);&lt;BR /&gt;void timersubb( struct timeval *, struct timeval *, struct timeval *);&lt;/DIV&gt;
&lt;DIV&gt;int main(int argc, char* argv[])&lt;BR /&gt;{ &lt;BR /&gt;int i, j;&lt;BR /&gt;uint ITERATIONS = 1000;&lt;BR /&gt;uint ARRAY_SZ = 4096000;&lt;BR /&gt;uint* array = (uint*) malloc(sizeof(uint) * ARRAY_SZ);&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;for(i = 0; i &amp;lt; ITERATIONS; i++)&lt;BR /&gt; for(j = 0; j &amp;lt; ARRAY_SZ; j++)&lt;BR /&gt;array[j] = 3;&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;printf("NO Cache Blocking \n");&lt;/DIV&gt;
&lt;DIV&gt;TimeNonCacheBlocking(array, ARRAY_SZ, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;printf("\n\n Single Threaded Cache Blocking \n");&lt;/DIV&gt;
&lt;DIV&gt;TimeCacheBlocking(array, ARRAY_SZ, 204800, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 136534, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 117029, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 102400, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 68267, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 34134, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 25600, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 12800, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 6400, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 3200, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 1600, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt; return 0;&lt;/DIV&gt;
&lt;DIV&gt;}&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;void 
CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint &lt;/DIV&gt;
&lt;DIV&gt;ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt;uint sum =0;&lt;BR /&gt;uint index = 0, i, j;&lt;/DIV&gt;
&lt;DIV&gt;for(index = 0; index &amp;lt; ARRAY_SZ;) {&lt;BR /&gt;uint* data = &amp;amp;array[index];&lt;BR /&gt;index += BLOCK_SZ;&lt;/DIV&gt;
&lt;DIV&gt;if(index &amp;gt; ARRAY_SZ) &lt;BR /&gt;BLOCK_SZ = ARRAY_SZ - (index - BLOCK_SZ);&lt;/DIV&gt;
&lt;DIV&gt;for(i=0; i&amp;lt; ITERATIONS; i++)&lt;BR /&gt; for(j=0; j &amp;lt; BLOCK_SZ; j++)&lt;BR /&gt;sum += data[j] + data[j] + ITERATIONS;&lt;BR /&gt;}&lt;/DIV&gt;
&lt;DIV&gt;*result = sum;&lt;/DIV&gt;
&lt;DIV&gt;}&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt; uint sum =0;&lt;BR /&gt;uint i, j;&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt; for(i=0; i&amp;lt; ITERATIONS; i++)&lt;BR /&gt; for(j=0; j &amp;lt; ARRAY_SZ; j++)&lt;BR /&gt; sum += array[j] + array[j] + ITERATIONS;&lt;/DIV&gt;
&lt;DIV&gt; *result = sum;&lt;/DIV&gt;
&lt;DIV&gt;}&lt;/DIV&gt;
&lt;DIV&gt;void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt;uint sum = 0;&lt;BR /&gt;struct timeval start, end, result;&lt;/DIV&gt;
&lt;DIV&gt; gettimeofday(&amp;amp;start, NULL);&lt;/DIV&gt;
&lt;DIV&gt;CacheBlocking (&amp;amp;sum, array, ARRAY_SZ, BLOCK_SZ, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt;gettimeofday(&amp;amp;end, NULL);&lt;BR /&gt; timersubb(&amp;amp;start, &amp;amp;end, &amp;amp;result);&lt;BR /&gt; printf("%ld sec, %ld usec\n", result.tv_sec, result.tv_usec);&lt;BR /&gt;printf("Block Size: %u K\t", BLOCK_SZ * sizeof(uint) /1024);&lt;BR /&gt;printf("Results: %u\n", sum);&lt;BR /&gt;}&lt;/DIV&gt;
&lt;DIV&gt;void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt; uint sum = 0;&lt;BR /&gt; struct timeval start, end, result;&lt;/DIV&gt;
&lt;DIV&gt; gettimeofday(&amp;amp;start, NULL);&lt;/DIV&gt;
&lt;DIV&gt; NonCacheBlocking (&amp;amp;sum, array, ARRAY_SZ, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt; gettimeofday(&amp;amp;end, NULL);&lt;BR /&gt; timersubb(&amp;amp;start, &amp;amp;end, &amp;amp;result);&lt;BR /&gt; printf("%ld sec, %ld usec\n", result.tv_sec, result.tv_usec);&lt;BR /&gt; printf("Block Size: 0 K\t\t");&lt;BR /&gt; printf("Results: %u\n", sum);&lt;BR /&gt;}&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;void timersubb( struct timeval *start, struct timeval *end, struct timeval *result)&lt;BR /&gt;{&lt;BR /&gt; result-&amp;gt;tv_sec = end-&amp;gt;tv_sec - start-&amp;gt;tv_sec;&lt;BR /&gt; result-&amp;gt;tv_usec = end-&amp;gt;tv_usec - start-&amp;gt;tv_usec;&lt;/DIV&gt;
&lt;DIV&gt; if(result-&amp;gt;tv_usec &amp;lt; 0) {&lt;BR /&gt; --result-&amp;gt;tv_sec;&lt;BR /&gt; result-&amp;gt;tv_usec += 1000000;&lt;BR /&gt; }&lt;BR /&gt;return;&lt;BR /&gt;}&lt;/DIV&gt;</description>
    <pubDate>Fri, 26 Mar 2004 21:18:23 GMT</pubDate>
    <dc:creator>icicle</dc:creator>
    <dc:date>2004-03-26T21:18:23Z</dc:date>
    <item>
      <title>About Cache Blocking</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/About-Cache-Blocking/m-p/965738#M5397</link>
      <description>&lt;DIV&gt;This also similar with my previous question about bus transaction.&lt;BR /&gt;I implement cache blocking. Is it right?&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;system spec: H/W : IBM Xseries 225&lt;BR /&gt; OS : Redhat Linux 9&lt;BR /&gt; compiler : icc 8.0&lt;/DIV&gt;
&lt;DIV&gt;From "IA-32 Intel Architecture Optimization", Chapter 7:&lt;/DIV&gt;
&lt;DIV&gt;Cache Blocking Technique&lt;BR /&gt;Loop blocking is useful for reducing cache misses and improving&lt;BR /&gt;memory access performance. The selection of a suitable block size is&lt;BR /&gt;critical when applying the loop blocking technique. Loop blocking is&lt;BR /&gt;applicable to single-threaded applications as well as to multithreaded&lt;BR /&gt;applications running on processors with or without Hyper-Threading&lt;BR /&gt;Technology. The technique transforms the memory access pattern into&lt;BR /&gt;blocks that efficiently fit in the target cache size.&lt;BR /&gt;When targeting IA-32 processors with Hyper-Threading Technology,&lt;BR /&gt;the loop blocking technique should select a block size that is no more&lt;BR /&gt;than one half of the target cache size. The upper limit of the block size&lt;BR /&gt;for loop blocking should be determined by dividing the target cache size&lt;BR /&gt;by the number of logical processors available in a physical processor&lt;BR /&gt;package. Typically, some cache lines are needed to access data that are&lt;BR /&gt;not part of the source or destination buffers used in cache blocking, so&lt;BR /&gt;the block size can be chosen between one quarter to one half of the&lt;BR /&gt;target cache (see also, Chapter 3).&lt;BR /&gt;User/Source Coding Rule 30. (H impact, H generality) Use cache blocking&lt;BR /&gt;to improve locality of data access. Target one quarter to one half of the cache&lt;BR /&gt;size when targeting IA-32 processors with Hyper-Threading Technology.&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;Source for cache blocking&lt;BR /&gt;+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++&lt;BR /&gt;#include &amp;lt;stdlib.h&amp;gt;&lt;BR /&gt;#include &amp;lt;sys/time.h&amp;gt;&lt;/DIV&gt;
&lt;DIV&gt;void CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint &lt;/DIV&gt;
&lt;DIV&gt;ITERATIONS);&lt;BR /&gt;void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS);&lt;BR /&gt;void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS);&lt;BR /&gt;void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS);&lt;BR /&gt;void timersubb( struct timeval *, struct timeval *, struct timeval *);&lt;/DIV&gt;
&lt;DIV&gt;int main(int argc, char* argv[])&lt;BR /&gt;{ &lt;BR /&gt;int i, j;&lt;BR /&gt;uint ITERATIONS = 1000;&lt;BR /&gt;uint ARRAY_SZ = 4096000;&lt;BR /&gt;uint* array = (uint*) malloc(sizeof(uint) * ARRAY_SZ);&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;for(i = 0; i &amp;lt; ITERATIONS; i++)&lt;BR /&gt; for(j = 0; j &amp;lt; ARRAY_SZ; j++)&lt;BR /&gt;array[j] = 3;&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;printf("NO Cache Blocking \n");&lt;/DIV&gt;
&lt;DIV&gt;TimeNonCacheBlocking(array, ARRAY_SZ, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt;printf("\n\n Single Threaded Cache Blocking \n");&lt;/DIV&gt;
&lt;DIV&gt;TimeCacheBlocking(array, ARRAY_SZ, 204800, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 136534, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 117029, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 102400, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 68267, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 34134, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 25600, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 12800, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 6400, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 3200, ITERATIONS);&lt;BR /&gt;TimeCacheBlocking(array, ARRAY_SZ, 1600, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt; return 0;&lt;/DIV&gt;
&lt;DIV&gt;}&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;void 
CacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint &lt;/DIV&gt;
&lt;DIV&gt;ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt;uint sum =0;&lt;BR /&gt;uint index = 0, i, j;&lt;/DIV&gt;
&lt;DIV&gt;for(index = 0; index &amp;lt; ARRAY_SZ;) {&lt;BR /&gt;uint* data = &amp;amp;array[index];&lt;BR /&gt;index += BLOCK_SZ;&lt;/DIV&gt;
&lt;DIV&gt;if(index &amp;gt; ARRAY_SZ) &lt;BR /&gt;BLOCK_SZ = ARRAY_SZ - (index - BLOCK_SZ);&lt;/DIV&gt;
&lt;DIV&gt;for(i=0; i&amp;lt; ITERATIONS; i++)&lt;BR /&gt; for(j=0; j &amp;lt; BLOCK_SZ; j++)&lt;BR /&gt;sum += data[j] + data[j] + ITERATIONS;&lt;BR /&gt;}&lt;/DIV&gt;
&lt;DIV&gt;*result = sum;&lt;/DIV&gt;
&lt;DIV&gt;}&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;void NonCacheBlocking(uint* result, uint* array, uint ARRAY_SZ, uint ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt; uint sum =0;&lt;BR /&gt;uint i, j;&lt;/DIV&gt;
&lt;DIV&gt;&lt;BR /&gt; for(i=0; i&amp;lt; ITERATIONS; i++)&lt;BR /&gt; for(j=0; j &amp;lt; ARRAY_SZ; j++)&lt;BR /&gt; sum += array[j] + array[j] + ITERATIONS;&lt;/DIV&gt;
&lt;DIV&gt; *result = sum;&lt;/DIV&gt;
&lt;DIV&gt;}&lt;/DIV&gt;
&lt;DIV&gt;void TimeCacheBlocking(uint* array, uint ARRAY_SZ, uint BLOCK_SZ, uint ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt;uint sum = 0;&lt;BR /&gt;struct timeval start, end, result;&lt;/DIV&gt;
&lt;DIV&gt; gettimeofday(&amp;amp;start, NULL);&lt;/DIV&gt;
&lt;DIV&gt;CacheBlocking (&amp;amp;sum, array, ARRAY_SZ, BLOCK_SZ, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt;gettimeofday(&amp;amp;end, NULL);&lt;BR /&gt; timersubb(&amp;amp;start, &amp;amp;end, &amp;amp;result);&lt;BR /&gt; printf("%ld sec, %ld usec\n", result.tv_sec, result.tv_usec);&lt;BR /&gt;printf("Block Size: %u K\t", BLOCK_SZ * sizeof(uint) /1024);&lt;BR /&gt;printf("Results: %u\n", sum);&lt;BR /&gt;}&lt;/DIV&gt;
&lt;DIV&gt;void TimeNonCacheBlocking(uint* array, uint ARRAY_SZ, uint ITERATIONS)&lt;BR /&gt;{&lt;BR /&gt; uint sum = 0;&lt;BR /&gt; struct timeval start, end, result;&lt;/DIV&gt;
&lt;DIV&gt; gettimeofday(&amp;amp;start, NULL);&lt;/DIV&gt;
&lt;DIV&gt; NonCacheBlocking (&amp;amp;sum, array, ARRAY_SZ, ITERATIONS);&lt;/DIV&gt;
&lt;DIV&gt; gettimeofday(&amp;amp;end, NULL);&lt;BR /&gt; timersubb(&amp;amp;start, &amp;amp;end, &amp;amp;result);&lt;BR /&gt; printf("%ld sec, %ld usec\n", result.tv_sec, result.tv_usec);&lt;BR /&gt; printf("Block Size: 0 K\t\t");&lt;BR /&gt; printf("Results: %u\n", sum);&lt;BR /&gt;}&lt;/DIV&gt;
&lt;DIV&gt;&lt;/DIV&gt;
&lt;DIV&gt;void timersubb( struct timeval *start, struct timeval *end, struct timeval *result)&lt;BR /&gt;{&lt;BR /&gt; result-&amp;gt;tv_sec = end-&amp;gt;tv_sec - start-&amp;gt;tv_sec;&lt;BR /&gt; result-&amp;gt;tv_usec = end-&amp;gt;tv_usec - start-&amp;gt;tv_usec;&lt;/DIV&gt;
&lt;DIV&gt; if(result-&amp;gt;tv_usec &amp;lt; 0) {&lt;BR /&gt; --result-&amp;gt;tv_sec;&lt;BR /&gt; result-&amp;gt;tv_usec += 1000000;&lt;BR /&gt; }&lt;BR /&gt;return;&lt;BR /&gt;}&lt;/DIV&gt;</description>
      <pubDate>Fri, 26 Mar 2004 21:18:23 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/About-Cache-Blocking/m-p/965738#M5397</guid>
      <dc:creator>icicle</dc:creator>
      <dc:date>2004-03-26T21:18:23Z</dc:date>
    </item>
    <item>
      <title>Re: About Cache Blocking</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/About-Cache-Blocking/m-p/965739#M5398</link>
      <description>&lt;DIV&gt;&lt;/DIV&gt;
&lt;P&gt;Persepone -&lt;/P&gt;
&lt;P&gt;I'm not familiar with the text that you are citing from the IA-32 Architecture manual, but your example looks to be correct. Are you not seeing any improvement on your system between the different blocking sizes that you're using?&lt;/P&gt;
&lt;P&gt;-- clay&lt;/P&gt;</description>
      <pubDate>Tue, 06 Apr 2004 21:28:07 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/About-Cache-Blocking/m-p/965739#M5398</guid>
      <dc:creator>ClayB</dc:creator>
      <dc:date>2004-04-06T21:28:07Z</dc:date>
    </item>
  </channel>
</rss>

