<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Hi Jim, i did a terrible in Intel® Moderncode for Parallel Architectures</title>
    <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108534#M7423</link>
    <description>&lt;P&gt;Hi Jim, I made a terrible mistake a few posts ago. I posted the wrong code, without an if condition that could explain the lack of vectorization. This is the right code.&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;for(n = *begin_ntrace; n &amp;lt; *end_ntrace; n++) {
      r_idx   = tt[n] * inv_sampling + 1.0f;

      i_idx1  = (int32)r_idx;
      i_idx2  = i_idx1 + dms - low;

      if(i_idx1 &amp;gt; low &amp;amp;&amp;amp; i_idx2 &amp;lt; ns) {
          ntr++;

          up_interp = r_idx  - (float32)i_idx1;

          i_idx1  -= low + 1;
 
          {
  
              float * restrict num_rp = &amp;amp;num_r[0];
              float * restrict num_ip = &amp;amp;num_i[0];
              float * restrict traces_rp = &amp;amp;traces[n].r[i_idx1];
              float * restrict traces_ip = &amp;amp;traces[n].i[i_idx1];
              int i_iter = i_idx2 - i_idx1 + 1; // # iterations
              // *** use loop control variable and index that is scoped inside the for loop
              int k;
              for(k = 0; k &amp;lt; i_iter; k++) {
                  num_rp[k] = num_rp[k] + (1.0f-up_interp)*(traces_rp[k]) + up_interp*traces_rp[k+1];
                  num_ip[k] = num_ip[k] + (1.0f-up_interp)*(traces_ip[k]) + up_interp*traces_ip[k+1];
              }

          }

&amp;nbsp;         denom += traces[n].r[i_idx2] * traces[n].r[i_idx2] + traces[n].i[i_idx2] * traces[n].i[i_idx2];
 
      }

}&lt;/PRE&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;And this is the Assembly generated:&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;Address	Source Line	Assembly	
0x3eee		Block 12:				
0x3eee	1,385	movsxdl  (%rcx), %r10			
0x3ef1	1,385	xor %ecx, %ecx				
0x3ef3	1,385	movsxdl  (%rsi), %rsi				
0x3ef6	1,385	xor %eax, %eax			
0x3ef8	1,385	cmp %r10, %rsi			
0x3efb	1,385	jnl 0x433a &amp;lt;Block 48&amp;gt;				
0x3f01		Block 13:				
0x3f01	1,391	movl  0x204234(%rip), %r13d				
0x3f08	1,386	lea (%r9,%rsi,4), %r15				
0x3f0c	1,389	movl  0x204221(%rip), %r11d				
0x3f13	1,389	mov %r12d, %r14d				
0x3f16	1,391	movl  %r13d, -0x60(%rbp)				
0x3f1a	1,425	mov %r8, %r9				
0x3f1d	1,396	movsxd %r11d, %r13				
0x3f20	1,385	sub %rsi, %r10				
0x3f23	1,403	shl $0x4, %rsi				
0x3f27	1,389	sub %r11d, %r14d				
0x3f2a	1,425	sub %r13, %r9				
0x3f2d	1,386	vmovssl  0x2041f7(%rip), %xmm1				
0x3f35	1,386	vmovssl  0x1ba7(%rip), %xmm0				
0x3f3d	1,403	addq  0x2041bc(%rip), %rsi				
0x3f44	1,425	movq  %r8, -0xc8(%rbp)				
0x3f4b	1,425	movq  %r9, -0x80(%rbp)				
0x3f4f	1,425	movq  %r13, -0x70(%rbp)				
0x3f53	1,425	movl  %r14d, -0x58(%rbp)				
0x3f57	1,425	movq  %r10, -0x50(%rbp)				
0x3f5b	1,425	movq  %r15, -0x40(%rbp)				
0x3f5f	1,425	movl  %r11d, -0x48(%rbp)				
0x3f63	1,425	movq  %rdx, -0x90(%rbp)				
0x3f6a	1,425	movq  %rdi, -0x98(%rbp)				
0x3f71	1,425	movl  %r12d, -0xd0(%rbp)				
0x3f78		Block 14:				
0x3f78	1,386	vmovaps %xmm0, %xmm4	
0x3f7c	1,386	movq  -0x40(%rbp), %rdx	
0x3f80	1,386	vfmadd231ssl  (%rdx,%rcx,4), %xmm1, %xmm4		
0x3f86	1,388	vcvttss2si %xmm4, %r13d	0.4%	
0x3f8a	1,391	cmpl  -0x48(%rbp), %r13d	
0x3f8e	1,391	jle 0x430d &amp;lt;Block 46&amp;gt;				
0x3f94		Block 15:				
0x3f94	1,389	movl  -0x58(%rbp), %edx	0.1%	
0x3f97	1,389	lea (%rdx,%r13,1), %r15d	
0x3f9b	1,391	cmpl  -0x60(%rbp), %r15d	
0x3f9f	1,391	jnl 0x430d &amp;lt;Block 46&amp;gt;				
0x3fa5		Block 16:				
0x3fa5	1,394	vxorps %xmm5, %xmm5, %xmm5	
0x3fa9	1,394	vcvtsi2ss %r13d, %xmm5, %xmm5		
0x3fae	1,396	movsxd %r13d, %r13	
0x3fb1	1,394	vsubss %xmm5, %xmm4, %xmm6		
0x3fb5	1,396	mov %r13, %r14	
0x3fb8	1,396	subq  -0x70(%rbp), %r14		
0x3fbc	1,403	movq  (%rax,%rsi,1), %rdx	
0x3fc0	1,404	movq  0x8(%rax,%rsi,1), %r12	
0x3fc5	1,392	incl  -0x78(%rbp)	
0x3fc8	1,396	lea -0x1(%r14), %r11	
0x3fcc	1,405	mov %r11d, %r9d	0.1%	
0x3fcf	1,403	lea -0x4(%rdx,%r14,4), %r10				
0x3fd4	1,405	neg %r9d	0.1%	
0x3fd7	1,404	lea -0x4(%r12,%r14,4), %r8	
0x3fdc	1,405	add %r15d, %r9d		
0x3fdf	1,403	movq  %rdx, -0x68(%rbp)		
0x3fe3	1,408	test %r9d, %r9d		
0x3fe6	1,408	jle 0x40fe &amp;lt;Block 25&amp;gt;				
0x3fec		Block 17:				
0x3fec	1,408	movsxd %r9d, %rdi		
0x3fef	1,408	cmp $0x8, %rdi		
0x3ff3	1,408	jl 0x4501 &amp;lt;Block 69&amp;gt;				
0x3ff9		Block 18:				
0x3ff9	1,408	mov %r9d, %edx		
0x3ffc	1,409	vsubss %xmm6, %xmm0, %xmm5	
0x4000	1,368	vbroadcastss %xmm6, %ymm4		
0x4005	1,408	movq  $0x0, -0x88(%rbp)		
0x4010	1,408	and $0xfffffff8, %edx		
0x4013	1,368	movq  %rsi, -0xb0(%rbp)	
0x401a	1,368	movq  %rax, -0xa8(%rbp)		
0x4021	1,368	movq  %rcx, -0xa0(%rbp)		
0x4028	1,408	movsxd %edx, %rdx		
0x402b	1,409	vbroadcastss %xmm5, %ymm5	
0x4030	1,368	movq  -0x88(%rbp), %rax		
0x4037	1,368	movq  -0x90(%rbp), %rcx		
0x403e	1,368	movq  -0x98(%rbp), %rsi		
0x4045		Block 19:				
0x4045	1,409	vmovupsy  (%r10,%rax,4), %ymm7	
0x404b	1,410	vmovupsy  (%r8,%rax,4), %ymm8	
0x4051	1,409	vfmadd213psy  (%rsi,%rax,4), %ymm5, %ymm7	
0x4057	1,410	vfmadd213psy  (%rcx,%rax,4), %ymm5, %ymm8	
0x405d	1,409	vfmadd231psy  0x4(%r10,%rax,4), %ymm4, %ymm7	
0x4064	1,410	vfmadd231psy  0x4(%r8,%rax,4), %ymm4, %ymm8	
0x406b	1,409	vmovupsy  %ymm7, (%rsi,%rax,4)	
0x4070	1,410	vmovupsy  %ymm8, (%rcx,%rax,4)		
0x4075	1,408	add $0x8, %rax	
0x4079	1,408	cmp %rdx, %rax		
0x407c	1,408	jb 0x4045 &amp;lt;Block 19&amp;gt;				
0x407e		Block 20:				
0x407e	1,408	movq  -0xb0(%rbp), %rsi	
0x4085	1,408	movq  -0xa8(%rbp), %rax		
0x408c	1,408	movq  -0xa0(%rbp), %rcx	
0x4093		Block 21:				
0x4093	1,408	cmp %rdi, %rdx		
0x4096	1,408	jnb 0x40fe &amp;lt;Block 25&amp;gt;				
0x4098		Block 22:				
0x4098	1,409	movq  %rax, -0xa8(%rbp)		
0x409f	1,409	vsubss %xmm6, %xmm0, %xmm4		
0x40a3	1,409	movq  %rcx, -0xa0(%rbp)	
0x40aa	1,409	movq  -0x90(%rbp), %rax				
0x40b1	1,409	movq  -0x98(%rbp), %rcx	
0x40b8		Block 23:				
0x40b8	1,409	vmovssl  (%r10,%rdx,4), %xmm5	
0x40be	1,410	vmovssl  (%r8,%rdx,4), %xmm7	
0x40c4	1,409	vfmadd213ssl  (%rcx,%rdx,4), %xmm4, %xmm5		
0x40ca	1,410	vfmadd213ssl  (%rax,%rdx,4), %xmm4, %xmm7	
0x40d0	1,409	vfmadd231ssl  0x4(%r10,%rdx,4), %xmm6, %xmm5	
0x40d7	1,410	vfmadd231ssl  0x4(%r8,%rdx,4), %xmm6, %xmm7	
0x40de	1,409	vmovssl  %xmm5, (%rcx,%rdx,4)	
0x40e3	1,410	vmovssl  %xmm7, (%rax,%rdx,4)	
0x40e8	1,408	inc %rdx	
0x40eb	1,408	cmp %rdi, %rdx	
0x40ee	1,408	jb 0x40b8 &amp;lt;Block 23&amp;gt;				
0x40f0		Block 24:				
0x40f0	1,408	movq  -0xa8(%rbp), %rax		
0x40f7	1,408	movq  -0xa0(%rbp), %rcx	
&lt;/PRE&gt;

&lt;P&gt;Sorry for the error, but I have two very similar pieces of code and I confused the two sources. My apologies.&lt;/P&gt;</description>
    <pubDate>Tue, 22 Mar 2016 09:15:12 GMT</pubDate>
    <dc:creator>unrue</dc:creator>
    <dc:date>2016-03-22T09:15:12Z</dc:date>
    <item>
      <title>Shifted load</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108509#M7398</link>
      <description>&lt;P&gt;Dear Intel developers,&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;I'm using AVX with the Intel 15.0.1 compiler. I need to load some floats shifted by one in order to do some AVX operations:&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;sample = traces[n].r[j] + traces[n].r[j+1];&lt;/PRE&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;At the moment, I do two distinct AVX load:&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;intr_1= _mm256_load_ps(&amp;amp;traces[n].r[j]);
intr_2= _mm256_load_ps(&amp;amp;traces[n].r[j+1]);&lt;/PRE&gt;

&lt;P&gt;In this way, I calculate sample in two distinct vectorized phases, the first part for j and the second part for j+1. It works, but it is quite slow. In fact, with the second load, I load seven elements I already have and just one new element. So maybe I can do a sort of load and left shift, and finally a second load of just one element. What is the best strategy? Thanks in advance.&lt;/P&gt;</description>
      <pubDate>Fri, 11 Mar 2016 10:29:35 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108509#M7398</guid>
      <dc:creator>unrue</dc:creator>
      <dc:date>2016-03-11T10:29:35Z</dc:date>
    </item>
    <item>
      <title>Your method with unaligned</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108510#M7399</link>
      <description>&lt;P&gt;Your method with unaligned loads is probably satisfactory for current CPU architecture (Haswell or newer).&amp;nbsp; For Sandy Bridge, unaligned load will be intolerably slow, so you may want to consider permitting your compiler to choose the method, if you don't like to write in _mm128 loads to be combined by &lt;SPAN class="sig"&gt;&lt;SPAN class="name"&gt;_mm256_insertf128_ps.&amp;nbsp;&amp;nbsp; AVX2 permutes appear better suited to your suggestion about shifts, but then you shouldn't be seeing such poor performance of unaligned load.&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN class="sig"&gt;&lt;SPAN class="name"&gt;Ivy Bridge was designed specifically to alleviate the penalty incurred by unaligned mm256 loads.&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN class="sig"&gt;&lt;SPAN class="name"&gt;The question seems more topical for &lt;/SPAN&gt;&lt;/SPAN&gt;&lt;A href="https://software.intel.com/en-us/forums/intel-isa-extensions" target="_blank"&gt;https://software.intel.com/en-us/forums/intel-isa-extensions&lt;/A&gt;, but there is limited appeal in struggling with low level intrinsics code to optimize for an old ISA.&amp;nbsp; After all, there is "moderncode" in the title of this forum.&lt;/P&gt;</description>
      <pubDate>Fri, 11 Mar 2016 11:06:01 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108510#M7399</guid>
      <dc:creator>TimP</dc:creator>
      <dc:date>2016-03-11T11:06:01Z</dc:date>
    </item>
    <item>
      <title>Hi Tim,</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108511#M7400</link>
      <description>&lt;P&gt;Hi Tim,&lt;/P&gt;

&lt;P&gt;I'm using Sandy Bridge and my loads are aligned (traces is 32-byte aligned by using mm_malloc). But I've noticed no difference between aligned and unaligned loads.&lt;/P&gt;</description>
      <pubDate>Fri, 11 Mar 2016 11:42:39 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108511#M7400</guid>
      <dc:creator>unrue</dc:creator>
      <dc:date>2016-03-11T11:42:39Z</dc:date>
    </item>
    <item>
      <title>If traces[n].r[0] is aligned</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108512#M7401</link>
      <description>&lt;P&gt;If traces[n].r[0] is aligned to 32-bytes, then&lt;/P&gt;

&lt;P&gt;_mm256_load_ps(&amp;amp;traces[n].r[0]) is aligned load of r[0], r[1],... r[7] and&lt;BR /&gt;
	_mm256_load_ps(&amp;amp;traces[n].r[1]) is unaligned, a split load of r[1],... r[7] and then r[8].&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Fri, 11 Mar 2016 17:54:36 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108512#M7401</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2016-03-11T17:54:36Z</dc:date>
    </item>
    <item>
      <title>It is almost always better to</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108513#M7402</link>
      <description>&lt;P&gt;It is almost always better to reload the data than to use the permute functions, even with the cache-line-crossing and page-crossing penalties on Sandy Bridge.&amp;nbsp; It is possible to make the page-crossing cases run faster by using permute, but it is a lot of effort to generate special versions for all possible alignments.&lt;/P&gt;</description>
      <pubDate>Fri, 11 Mar 2016 18:08:38 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108513#M7402</guid>
      <dc:creator>McCalpinJohn</dc:creator>
      <dc:date>2016-03-11T18:08:38Z</dc:date>
    </item>
    <item>
      <title>With avx load, performance</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108514#M7403</link>
      <description>&lt;P&gt;With avx load, performance varies with data alignment but not whether aligned or unaligned intrinsic is issued. mm128 loads may not lose performance with misalignment.&lt;/P&gt;</description>
      <pubDate>Fri, 11 Mar 2016 19:30:30 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108514#M7403</guid>
      <dc:creator>TimP</dc:creator>
      <dc:date>2016-03-11T19:30:30Z</dc:date>
    </item>
    <item>
      <title>Quote:jimdempseyatthecove</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108515#M7404</link>
      <description>&lt;P&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;jimdempseyatthecove wrote:&lt;BR /&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;If traces[n].r[0] is aligned to 32-bytes, then&lt;/P&gt;

&lt;P&gt;_mm256_load_ps(&amp;amp;traces[n].r[0]) is aligned load of r[0], r[1],... r[7] and&lt;BR /&gt;
	_mm256_load_ps(&amp;amp;traces[n].r[1]) is unaligned, a split load of r[1],... r[7] and then r[8].&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;

&lt;P&gt;&lt;/P&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;Hi Jim, this is true, in fact, I do a manual unroll of step 8:&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;for(j,........, j+=8) {

_mm256_load_ps(&amp;amp;traces[n].r[j]) 
_mm256_load_ps(&amp;amp;traces[n].r[j+1]) 

}&lt;/PRE&gt;

&lt;P&gt;In each iteration I have one aligned load and one unaligned load. How can I make all the loads aligned?&lt;/P&gt;</description>
      <pubDate>Sat, 12 Mar 2016 08:59:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108515#M7404</guid>
      <dc:creator>unrue</dc:creator>
      <dc:date>2016-03-12T08:59:00Z</dc:date>
    </item>
    <item>
      <title>I think it would be more</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108516#M7405</link>
      <description>&lt;P&gt;I think it would be more productive for you and the responders to see a larger picture of what you are trying to do.&lt;/P&gt;

&lt;P&gt;From the little you have shown, your last post contains too little of what you are trying to do for any of us to offer productive advice. We do not see the outputs. IOW, are you producing a vector, one element shorter than the input, where each output element is the sum of two adjacent input elements? Or are you intending to perform a sum reduction (sum of all elements in the input vector)? Or something entirely different? In the first case, is the output vector the same as the input vector, and if so, can it be made a different output vector?&lt;/P&gt;

&lt;P&gt;To answer your question, sketch out what you are asking:&lt;/P&gt;

&lt;PRE class="brush:plain;"&gt;cache lines |0123456789ABCDEF|0123456789ABCDEF|0123456789ABCDEF|...
j=0 _mm256&amp;nbsp;&amp;nbsp; 01234567        |                |
j+1 _mm256&amp;nbsp;&amp;nbsp;&amp;nbsp; 01234567       |                |
j=8 _mm256&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 01234567|                |
j+1 _mm256&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 0123456|7               |
j=16 _mm256&amp;nbsp;&amp;nbsp;                |01234567&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; |
j+1  _mm256&amp;nbsp;&amp;nbsp;&amp;nbsp;               | 01234567&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; |
...
&lt;/PRE&gt;

&lt;P&gt;Where | is the cache line interval. When mod(j,16)==0 then j+1 is within the same cache line as j, however, when mod(j,16)==8, then j+1 will cross cache line.&lt;/P&gt;

&lt;P&gt;Good advice for the narrow scope is likely to provide bad advice for the larger scope.&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Sat, 12 Mar 2016 13:13:38 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108516#M7405</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2016-03-12T13:13:38Z</dc:date>
    </item>
    <item>
      <title>Hi Jim, thanks for the</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108517#M7406</link>
      <description>&lt;P&gt;Hi Jim,&lt;/P&gt;

&lt;P&gt;thanks for the explanations.&amp;nbsp;&lt;SPAN style="font-size: 1em; line-height: 1.5;"&gt;I post more source code to explain better my goal:&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;for(n = *begin_ntrace; n &amp;lt; *end_ntrace; n++) {

    r_idx   = tt[n];
    i_idx1  = (int)r_idx;
    i_idx2  = i_idx1 + dms;

    for(j = i_idx1, k = 0; j &amp;lt; i_idx2; j++, k++) {
        sample.r = traces[n].r[j] + traces[n].r[j+1];
        sample.i = traces[n].i[j] + traces[n].i[j+1];
   
        num[k].r += sample.r;
        num[k].i += sample.i;

    }

}&lt;/PRE&gt;

&lt;P&gt;where the j and j+1 memory accesses appear.&lt;/P&gt;</description>
      <pubDate>Mon, 14 Mar 2016 08:08:46 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108517#M7406</guid>
      <dc:creator>unrue</dc:creator>
      <dc:date>2016-03-14T08:08:46Z</dc:date>
    </item>
    <item>
      <title>Now you have changed the</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108518#M7407</link>
      <description>&lt;P&gt;Sorry, I misread the posted code as nested loops, seemingly implying a sum reduction.&amp;nbsp; I'm concerned about writing source code with multiple variables in the for() field not only due to this possibility of misreading, but also on account of needing to always read compiler reports to see the effect.&amp;nbsp; If you happen to be compiling in 32-bit mode, it's of utmost importance that the compiler combine multiple pointer references into a single indexing register with offsets.&amp;nbsp; A confirmation of vectorization would be sufficient to indicate this has happened, but use of simd intrinsics may obscure it.&lt;/P&gt;

&lt;P&gt;As you have mentioned the hope that keeping the duplicate memory references at register level might boost performance, I would like to mention that this seems most likely to work for data in L1 cache, when you might be coming up against the Sandy Bridge limit of 2 128-bit reads and 1 128-bit store to L1 cache per clock cycle.&amp;nbsp; If your data runs are long enough to go to memory, the combining effect of last level cache and read stream buffers should prevent repeated memory references from absorbing memory bandwidth.&lt;/P&gt;</description>
      <pubDate>Mon, 14 Mar 2016 14:12:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108518#M7407</guid>
      <dc:creator>TimP</dc:creator>
      <dc:date>2016-03-14T14:12:00Z</dc:date>
    </item>
    <item>
      <title>TimP,</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108519#M7408</link>
      <description>&lt;P&gt;TimP,&lt;/P&gt;

&lt;P&gt;This isn't a sum reduction, it is more like a raster (or bin) accumulation. The output isn't a scalar sum, rather it is a vector of sums of vectors (a vector and its +1 neighbor).&lt;/P&gt;

&lt;P&gt;What is a representative value for dms? IOW what is a representative trip count for the inner loop?&lt;/P&gt;

&lt;P&gt;Do you have a pattern for i_idx1 for each n? IOW is it always a multiple of&amp;nbsp;some number&amp;nbsp;or is it somewhat random?&lt;/P&gt;

&lt;P&gt;A major inhibitor of vectorization for the above code is the output array num as being a structure. It would be better if you made two arrays, and accumulated as&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;for(n = *begin_ntrace; n &amp;lt; *end_ntrace; n++) {
&amp;nbsp;&amp;nbsp;&amp;nbsp; r_idx&amp;nbsp;&amp;nbsp; = tt[n];
&amp;nbsp;&amp;nbsp;&amp;nbsp; i_idx1&amp;nbsp; = (int)r_idx;
&amp;nbsp;&amp;nbsp;&amp;nbsp; i_idx2&amp;nbsp; = i_idx1 + dms;
&amp;nbsp;&amp;nbsp;&amp;nbsp; for(j = i_idx1, k = 0; j &amp;lt; i_idx2; j++, k++) {
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; num_r[k] += traces[n].r[j] + traces[n].r[j+1]; // vector of r to vector of r
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; num_i[k] += traces[n].i[j] + traces[n].i[j+1]; // vector of i to vector of i
&amp;nbsp;&amp;nbsp;&amp;nbsp; }
}
&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Mon, 14 Mar 2016 15:27:26 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108519#M7408</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2016-03-14T15:27:26Z</dc:date>
    </item>
    <item>
      <title>And if need be, follow the</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108520#M7409</link>
      <description>&lt;P&gt;And if need be, follow the above with:&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;for(k=0; k&amp;lt;dms; ++k) {
&amp;nbsp; num[k].r = num_r[k];
&amp;nbsp; num[k].i = num_i[k];
}
&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Mon, 14 Mar 2016 15:32:16 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108520#M7409</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2016-03-14T15:32:16Z</dc:date>
    </item>
    <item>
      <title>Quote:jimdempseyatthecove</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108521#M7410</link>
      <description>&lt;P&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;jimdempseyatthecove wrote:&lt;BR /&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;TimP,&lt;/P&gt;

&lt;P&gt;This isn't a sum reduction, it is more like a raster (or bin) accumulation. The output isn't a scalar sum, rather it is a vector of sums of vectors (a vector and its +1 neighbor).&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;jimdempseyatthecove wrote:&lt;BR /&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;What is a representative value for dms? IOW what is a representative trip count for the inner loop?&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;Typical value of dms is from 10 to 30 and around this.&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;jimdempseyatthecove wrote:&lt;BR /&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;Do you have a pattern for i_idx1 for each n? IOW is it always a multiple of&amp;nbsp;some number&amp;nbsp;or is it somewhat random?&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;Unfortunately not; no pattern is associated with&amp;nbsp;i_idx1.&amp;nbsp;&amp;nbsp;Typical trip count is about 20. It is a very small loop called many, many times inside a function.&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;jimdempseyatthecove wrote:&lt;BR /&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;A major inhibitor of vectorization for the above code is the output array num as being a structure. It would be better if you made two arrays, and accumulated as&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;for(n = *begin_ntrace; n &amp;lt; *end_ntrace; n++) {
&amp;nbsp;&amp;nbsp;&amp;nbsp; r_idx&amp;nbsp;&amp;nbsp; = tt[n];
&amp;nbsp;&amp;nbsp;&amp;nbsp; i_idx1&amp;nbsp; = (int)r_idx;
&amp;nbsp;&amp;nbsp;&amp;nbsp; i_idx2&amp;nbsp; = i_idx1 + dms;
&amp;nbsp;&amp;nbsp;&amp;nbsp; for(j = i_idx1, k = 0; j &amp;lt; i_idx2; j++, k++) {
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; num_r[k] += traces[n].r[j] + traces[n].r[j+1]; // vector of r to vector of r
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; num_i[k] += traces[n].i[j] + traces[n].i[j+1]; // vector of i to vector of i
&amp;nbsp;&amp;nbsp;&amp;nbsp; }
}
&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;

&lt;P&gt;&lt;/P&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 1em; line-height: 1.5;"&gt;Thanks Jim, I'll try it.&lt;/SPAN&gt;&lt;/P&gt;&lt;/BLOCKQUOTE&gt;</description>
      <pubDate>Mon, 14 Mar 2016 15:33:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108521#M7410</guid>
      <dc:creator>unrue</dc:creator>
      <dc:date>2016-03-14T15:33:00Z</dc:date>
    </item>
    <item>
      <title>I don't have all of the</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108522#M7411</link>
      <description>&lt;P&gt;I don't have all of the numbers handy, but if I recall correctly the Sandy Bridge core pays a 3 cycle penalty for 256-bit loads that cross a cache line boundary and a 1-cycle penalty for 128-bit loads that cross a cache line boundary.&amp;nbsp; So you definitely don't want 256-bit loads in this case.&lt;/P&gt;

&lt;P&gt;It is not obvious to me whether you will be better off with (128-bit loads/stores and 128-bit arithmetic) or with (128-bit loads/stores and 256-bit arithmetic).&amp;nbsp; There will probably not be a lot of difference in this case -- the execution time should be limited by the loads and stores, with plenty of time to fully overlap the arithmetic instruction execution.&lt;/P&gt;</description>
      <pubDate>Mon, 14 Mar 2016 19:29:46 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108522#M7411</guid>
      <dc:creator>McCalpinJohn</dc:creator>
      <dc:date>2016-03-14T19:29:46Z</dc:date>
    </item>
    <item>
      <title>Quote:jimdempseyatthecove</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108523#M7412</link>
      <description>&lt;P&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;jimdempseyatthecove wrote:&lt;BR /&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;And if need be, follow the above with:&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;for(k=0; k&amp;lt;dms; ++k) {
&amp;nbsp; num[k].r = num_r[k];
&amp;nbsp; num[k].i = num_i[k];
}
&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;

&lt;P&gt;&lt;/P&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;Hi Jim,&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 1em; line-height: 1.5;"&gt;using your suggestion, code is about 15% faster.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 15 Mar 2016 12:56:31 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108523#M7412</guid>
      <dc:creator>unrue</dc:creator>
      <dc:date>2016-03-15T12:56:31Z</dc:date>
    </item>
    <item>
      <title>That is a good start.</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108524#M7413</link>
      <description>&lt;P&gt;That is a good start.&lt;/P&gt;

&lt;P&gt;Next adventure for you is to run the release code under VTune, not necessarily to find the bottlenecks (at this time). Instead, use Bottom Up listing, find the function, then open the Assembly window.&lt;/P&gt;

&lt;P&gt;You do not need to fully understand the Assembly instruction syntax. You just need to understand some simple rules:&lt;/P&gt;

&lt;P&gt;a) In the source window, highlight the lines of interest (e.g. the for(j= loop in post #11, select/highlight lines 5:8)&lt;BR /&gt;
	b) Locate the highlighted lines in the Assembly window (you may need to sort by source line number by clicking on the "Line" header in the Assembly window).&lt;BR /&gt;
	c) Assess the number of assembly instructions; after a few of these exercises you will get a feel for "too many" or "about right". One of the issues that can arise is loop invariant code not getting moved out of the loop. For this loop, you would expect the compiler to locate "num_r", "num_i",&amp;nbsp;"traces.r" and "traces.i" into registers. If you see memory fetches (those with "ptr [...]") then these addresses are not registerized. If the inner loop does not have the array addresses registerized, then you may need to add within the&amp;nbsp;"for(n=" loop,&amp;nbsp;scoped restrict pointers to those arrays, change the code to use them, then run VTune and examine the Assembly again.&lt;BR /&gt;
	d) While one can use the vectorization report, I prefer to see the Assembly to see if there is "efficient" vectorization. In this case you are looking for the xmm or ymm instructions (xmm is 128 bit, ymm is 256 bit). Note, John McCalpin&amp;nbsp;pointed out that on some CPU's 128 bit can be faster. And more importantly the instructions end in ...ps or ...pd for floating point (packed single, packed double), or are of the form v... or p... for integer.&lt;/P&gt;

&lt;P&gt;After you get the instruction count down to "about right", then you can look at the metrics offered by VTune to see if there is something else you can learn. For example, if you are building x32 applications, you have a limited number of registers available. If the for(j= loop runs out of registers, it may have to load the array addresses from memory. In this situation, making two loops, one for r and one for i, may be faster. On x64, the register usage on the for(j= should not be an issue (verify this assumption by looking at the assembly code).&lt;/P&gt;

&lt;P&gt;You do not need to do the Assembly checkout for all code, just your few hot spots.&lt;/P&gt;

&lt;P&gt;Last note:&lt;/P&gt;

&lt;P&gt;It has been my experience that when you manage to wipe out a hot spot (making the loops run multiples of times faster), you almost always find a new hot spot that didn't seem all that important before. Therefore, expect to iterate working away at a series of new hot spots.&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Tue, 15 Mar 2016 15:33:23 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108524#M7413</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2016-03-15T15:33:23Z</dc:date>
    </item>
    <item>
      <title>BTW for</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108525#M7414</link>
      <description>&lt;P&gt;BTW for&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; for(j = i_idx1, k = 0; j &amp;lt; i_idx2; j++, k++) {
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; num_r[k] += traces[n].r[j] + traces[n].r[j+1]; // vector of r to vector of r
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; num_i[k] += traces[n].i[j] + traces[n].i[j+1]; // vector of i to vector of i
&amp;nbsp;&amp;nbsp;&amp;nbsp; }
&lt;/PRE&gt;

&lt;P&gt;You would expect to see 6 "... ptr[...]" instructions (4 reads, 2 writes). Though you may see double this if the compiler unrolled it once (or multiples with higher unroll count).&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Tue, 15 Mar 2016 15:41:21 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108525#M7414</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2016-03-15T15:41:21Z</dc:date>
    </item>
    <item>
      <title>Quote:jimdempseyatthecove</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108526#M7415</link>
      <description>&lt;P&gt;&lt;/P&gt;&lt;BLOCKQUOTE&gt;jimdempseyatthecove wrote:&lt;BR /&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;That is a good start.&lt;/P&gt;

&lt;P&gt;Next adventure for you is to run the release code under VTune, not necessarily to find the bottlenecks (at this time). Instead, use Bottom Up listing, find the function, then open the Assembly window.&lt;/P&gt;

&lt;P&gt;You do not need to fully understand the Assembly instruction syntax. You just need to understand some simple rules:&lt;/P&gt;

&lt;P&gt;a) In the source window, highlight the lines of interest (e.g. the for(j= loop in post #11, select/highlight lines 5:8)&lt;BR /&gt;
	b) Locate the highlighted lines in the Assembly window (you may need to sort by source line number by clicking on the "Line" header in the Assembly window).&lt;BR /&gt;
	c) Assess the number of assembly instructions; after a few of these exercises you will get a feel for "too many" or "about right". One of the issues that can arise is loop-invariant code not getting moved out of the loop. For this loop, you would expect the compiler to locate "num_r", "num_i",&amp;nbsp;"traces.r" and "traces.i" into registers. If you see memory fetches (those with "ptr [...]") then these addresses are not registerized. If the inner loop does not have the array addresses registerized, then you may need to add within the&amp;nbsp;"for(n=" loop,&amp;nbsp;scoped restrict pointers to those arrays, code to use them, VTune and examine Assembly again.&lt;BR /&gt;
	d) While one can use the vectorization report, I prefer to see the Assembly to see if there is "efficient" vectorization. In this case you are looking for the xmm or ymm instructions (xmm is 128 bit, ymm is 256 bit). Note, John McCalpin&amp;nbsp;pointed out that on some CPU's 128 bit can be faster. And more importantly the instructions end in ...ps or ...pd for floating point (packed single, packed double), or are of the form v... or p... for integer.&lt;/P&gt;

&lt;P&gt;After you get the instruction count down to "about right", then you can look at the metrics offered by VTune to see if there is something else you can learn. For example, if you are building x32 applications, you have a limited number of registers available. If the for(j= loop runs out of registers, it may have to load the array addresses from memory. In this situation, making two loops, one for r and one for i, may be faster. On x64, the register usage on the for(j= should not be an issue (verify this assumption by looking at the assembly code).&lt;/P&gt;

&lt;P&gt;You do not need to do the Assembly checkout for all code, just your few hot spots.&lt;/P&gt;

&lt;P&gt;Last note:&lt;/P&gt;

&lt;P&gt;It has been my experience that when you manage to wipe out a hot spot (making the loops run multiples of times faster), you almost always find a new hot spot that didn't seem all that important before. Therefore, expect to iterate working away at a series of new hot spots.&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;

&lt;P&gt;&lt;/P&gt;&lt;/BLOCKQUOTE&gt;&lt;P&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 1em; line-height: 1.5;"&gt;Hi Jim&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 1em; line-height: 1.5;"&gt;I attach the assembly from VTune. From what I understand, there are some values not &lt;/SPAN&gt;&lt;SPAN style="font-size: 13.008px; line-height: 15.6096px;"&gt;registerized so&lt;/SPAN&gt;&lt;SPAN style="font-size: 1em; line-height: 1.5;"&gt; I tried this code by using restrict pointers:&lt;/SPAN&gt;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;         float * restrict num_rp = &amp;amp;num_r[0];
         float * restrict num_ip = &amp;amp;num_i[0];
         float * restrict traces_rp = &amp;amp;traces[n].r[i_idx1];
         float * restrict traces_ip = &amp;amp;traces[n].i[i_idx1];

         for(j = i_idx1, k = 0; j &amp;lt; i_idx2; j++, k++) {
             *num_rp = *num_rp + (1.0f-up_interp)*(*traces_rp) + up_interp*(*(traces_rp+1));
             *num_ip = *num_ip + (1.0f-up_interp)*(*traces_ip) + up_interp*(*(traces_ip+1));
              num_rp++;
              num_ip++;
              traces_rp++;
              traces_ip++;
         }
         &lt;/PRE&gt;

&lt;P&gt;but I obtained no performance gain.&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 16 Mar 2016 08:52:49 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108526#M7415</guid>
      <dc:creator>unrue</dc:creator>
      <dc:date>2016-03-16T08:52:49Z</dc:date>
    </item>
    <item>
      <title>It's hard to read from your</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108527#M7416</link>
      <description>&lt;P&gt;It's hard to read from your screen shots; I would think that saving asm code and finding the hot loops might be easier.&amp;nbsp; From what I can see, it does seem that the compiler may not have combined your pointers into loop carried register variables; as you appear to need 6 integer or pointer registers, failure to perform such combination would prevent optimization for 32-bit mode, where at most 3 such registers are available.&amp;nbsp; Avoiding this trap appears to be among the advantages, besides readability, of array indexing notation num_rp[k] ....&lt;/P&gt;

&lt;P&gt;Intel C++ has a habit anyway of failing to optimize where there are too many of those post-increments at the bottom of the loop.&lt;/P&gt;</description>
      <pubDate>Wed, 16 Mar 2016 17:15:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108527#M7416</guid>
      <dc:creator>TimP</dc:creator>
      <dc:date>2016-03-16T17:15:00Z</dc:date>
    </item>
    <item>
      <title>The compiler usually</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108528#M7417</link>
      <description>&lt;P&gt;The compiler usually generates much better code using subscripted restrict pointers as opposed to pointer++, besides, incrementing one index register is faster than indexing 4 pointers. Use this:&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;float * restrict num_rp = &amp;amp;num_r[0];
float * restrict num_ip = &amp;amp;num_i[0];
float * restrict traces_rp = &amp;amp;traces[n].r[i_idx1];
float * restrict traces_ip = &amp;amp;traces[n].i[i_idx1];
int i_iter = i_idx2 - i_idx1 + 1; // # iterations
// *** use loop control variable and index that is scoped inside the for loop
for(int k = 0; k &amp;lt; i_iter; k++) {
&amp;nbsp;&amp;nbsp;&amp;nbsp; num_rp[k] = num_rp[k] + (1.0f-up_interp)*(traces_rp[k]) + up_interp*traces_rp[k+1];
&amp;nbsp;&amp;nbsp;&amp;nbsp; num_ip[k] = num_ip[k] + (1.0f-up_interp)*(traces_ip[k]) + up_interp*traces_ip[k+1];
}&lt;/PRE&gt;

&lt;P&gt;On 32-bit system, the above should require 4 registers for your float*'s and 1 register each for i_iter and k for a total of 6 GP registers.&lt;/P&gt;

&lt;P&gt;If the above does not fully registerize (usually due to the compiler thinking something outside the scope of what is shown above is more important) then enclose the above in {...}. Yes, this would appear to be&amp;nbsp;meaningless; however, the meaning is that the 4 float*'s and the variable i_iter are disposable (not used outside the scope). This can improve register usage, especially on x32 builds.&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Wed, 16 Mar 2016 19:59:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/Shifted-load/m-p/1108528#M7417</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2016-03-16T19:59:00Z</dc:date>
    </item>
  </channel>
</rss>

