Vectorization Problems

brice_videau · ‎03-26-2012

I have problems trying to translate code written with intrinsics to OpenCL vector code. I am working with double precision. The code is doing vectorized filtering of data using a filter of length 16.

The original code is (sorry but I could not reduce it further):
[bash]const double filter[] __attribute__ ((aligned (16))) = { 8.4334247333529341094733325815816e-7, -0.1290557201342060969516786758559028e-4, 0.8762984476210559564689161894116397e-4, -0.30158038132690463167163703826169879e-3, 0.174723713672993903449447812749852942e-2, -0.942047030201080385922711540948195075e-2, 0.2373821463724942397566389712597274535e-1, 0.612625895831207982195380597e-1, 0.9940415697834003993178616713, -0.604895289196983516002834636e-1, -0.2103025160930381434955489412839065067e-1, 0.1337263414854794752733423467013220997e-1, -0.344128144493493857280881509686821861e-2, 0.49443227688689919192282259476750972e-3, -0.5185986881173432922848639136911487e-4, 2.72734492911979659657715313017228e-6}; const double filter_u[] __attribute__ ((aligned (16))) = { 2.72734492911979659657715313017228e-6, 8.4334247333529341094733325815816e-7, -0.1290557201342060969516786758559028e-4, 0.8762984476210559564689161894116397e-4, -0.30158038132690463167163703826169879e-3, 0.174723713672993903449447812749852942e-2, -0.942047030201080385922711540948195075e-2, 0.2373821463724942397566389712597274535e-1, 0.612625895831207982195380597e-1, 0.9940415697834003993178616713, -0.604895289196983516002834636e-1, -0.2103025160930381434955489412839065067e-1, 0.1337263414854794752733423467013220997e-1, -0.344128144493493857280881509686821861e-2, 0.49443227688689919192282259476750972e-3, -0.5185986881173432922848639136911487e-4, 2.72734492911979659657715313017228e-6}; #define conv_4x2_block_fused(offset_filter,offset_source,d00,d10,d20,d30) \\ FA = _mm_load_pd(filter+offset_filter);\\ d00 = _mm_load_pd(source0+offset_source);\\ S00 = _mm_add_pd(S00,_mm_mul_pd(d00,FA));\\ FU = _mm_load_pd(filter_u+offset_filter);\\ d10 = _mm_load_pd(source1+offset_source);\\ S01 = _mm_add_pd(S01,_mm_mul_pd(d00,FU));\\ S11 = _mm_add_pd(S11,_mm_mul_pd(d10,FU));\\ d20 = _mm_load_pd(source2+offset_source);\\ S10 = _mm_add_pd(S10,_mm_mul_pd(d10,FA));\\ d30 = _mm_load_pd(source3+offset_source);\\ S20 = _mm_add_pd(S20,_mm_mul_pd(d20,FA));\\ S30 = _mm_add_pd(S30,_mm_mul_pd(d30,FA));\\ S31 = _mm_add_pd(S31,_mm_mul_pd(d30,FU));\\ S21 = _mm_add_pd(S21,_mm_mul_pd(d20,FU)); void conv_4x2_fused(size_t ndat, double const * source0, double const * source1, double const * source2, double const * source3, double * dest){ __m128d S00,S01,S10,S11,S20,S21,S30,S31; __m128d FA,FU; __m128d D00,D10,D20,D30; FA = _mm_load_pd(filter); D00 = _mm_load_pd(source0); S00 = _mm_mul_pd(D00,FA); D10 = _mm_load_pd(source1); S10 = _mm_mul_pd(D10,FA); D20 = _mm_load_pd(source2); S20 = _mm_mul_pd(D20,FA); D30 = _mm_load_pd(source3); S30 = _mm_mul_pd(D30,FA); FU = _mm_load_pd(filter_u); S01 = _mm_loadl_pd(D00,source0+16); S01 = _mm_mul_pd(S01,FU); S11 = _mm_loadl_pd(D10,source1+16); S11 = _mm_mul_pd(S11,FU); S21 = _mm_loadl_pd(D20,source2+16); S21 = _mm_mul_pd(S21,FU); S31 = _mm_loadl_pd(D30,source3+16); S31 = _mm_mul_pd(S31,FU); conv_4x2_block_fused(2,2,D00,D10,D20,D30); conv_4x2_block_fused(4,4,D00,D10,D20,D30); conv_4x2_block_fused(6,6,D00,D10,D20,D30); conv_4x2_block_fused(8,8,D00,D10,D20,D30); conv_4x2_block_fused(10,10,D00,D10,D20,D30); conv_4x2_block_fused(12,12,D00,D10,D20,D30); conv_4x2_block_fused(14,14,D00,D10,D20,D30); _mm_store_pd(dest,_mm_hadd_pd(S00,S10)); _mm_store_pd(dest+2,_mm_hadd_pd(S20,S30)); _mm_store_pd(dest+ndat,_mm_hadd_pd(S01,S11)); _mm_store_pd(dest+2+ndat,_mm_hadd_pd(S21,S31)); } [/bash]
Here is the generated assembly (gcc-4.6, but icc generates similar code):
[bash]0000000000408970 : 408970: 66 0f 28 0e movapd (%rsi),%xmm1 408974: 66 0f 28 1a movapd (%rdx),%xmm3 408978: 66 0f 28 01 movapd (%rcx),%xmm0 40897c: 66 0f 28 e9 movapd %xmm1,%xmm5 408980: 66 0f 12 8e 80 00 00 movlpd 0x80(%rsi),%xmm1 408987: 00 408988: 66 0f 28 35 80 ee 00 movapd 0xee80(%rip),%xmm6 # 417810 40898f: 00 408990: 66 0f 28 fb movapd %xmm3,%xmm7 408994: 66 0f 12 9a 80 00 00 movlpd 0x80(%rdx),%xmm3 40899b: 00 40899c: 66 41 0f 28 10 movapd (%r8),%xmm2 4089a1: 66 0f 28 e0 movapd %xmm0,%xmm4 4089a5: 66 0f 12 81 80 00 00 movlpd 0x80(%rcx),%xmm0 4089ac: 00 4089ad: 66 44 0f 28 05 ca ed movapd 0xedca(%rip),%xmm8 # 417780 4089b4: 00 00 4089b6: 66 0f 59 ee mulpd %xmm6,%xmm5 4089ba: 66 44 0f 28 56 10 movapd 0x10(%rsi),%xmm10 4089c0: 66 0f 59 fe mulpd %xmm6,%xmm7 4089c4: 66 44 0f 28 0d 53 ee movapd 0xee53(%rip),%xmm9 # 417820 4089cb: 00 00 4089cd: 66 0f 59 e6 mulpd %xmm6,%xmm4 4089d1: 66 0f 59 f2 mulpd %xmm2,%xmm6 4089d5: 66 44 0f 28 5a 10 movapd 0x10(%rdx),%xmm11 4089db: 66 41 0f 12 90 80 00 movlpd 0x80(%r8),%xmm2 4089e2: 00 00 4089e4: 66 41 0f 59 c8 mulpd %xmm8,%xmm1 4089e9: 66 41 0f 59 d8 mulpd %xmm8,%xmm3 4089ee: 66 41 0f 59 c0 mulpd %xmm8,%xmm0 4089f3: 66 41 0f 59 d0 mulpd %xmm8,%xmm2 4089f8: 66 45 0f 28 c2 movapd %xmm10,%xmm8 4089fd: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408a02: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408a07: 66 44 0f 28 05 80 ed movapd 0xed80(%rip),%xmm8 # 417790 408a0e: 00 00 408a10: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408a15: 66 41 0f 58 ca addpd %xmm10,%xmm1 408a1a: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408a1f: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408a24: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408a29: 66 41 0f 58 da addpd %xmm10,%xmm3 408a2e: 66 44 0f 28 51 10 movapd 0x10(%rcx),%xmm10 408a34: 66 41 0f 58 fb addpd %xmm11,%xmm7 408a39: 66 45 0f 28 58 10 movapd 0x10(%r8),%xmm11 408a3f: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408a44: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408a49: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408a4e: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408a53: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408a58: 66 44 0f 28 56 20 movapd 0x20(%rsi),%xmm10 408a5e: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408a63: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408a68: 66 44 0f 28 0d bf ed movapd 0xedbf(%rip),%xmm9 # 417830 408a6f: 00 00 408a71: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408a76: 66 44 0f 28 5a 20 movapd 0x20(%rdx),%xmm11 408a7c: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408a81: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408a86: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408a8b: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408a90: 66 44 0f 28 05 07 ed movapd 0xed07(%rip),%xmm8 # 4177a0 408a97: 00 00 408a99: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408a9e: 66 41 0f 58 ca addpd %xmm10,%xmm1 408aa3: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408aa8: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408aad: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408ab2: 66 41 0f 58 da addpd %xmm10,%xmm3 408ab7: 66 44 0f 28 51 20 movapd 0x20(%rcx),%xmm10 408abd: 66 41 0f 58 fb addpd %xmm11,%xmm7 408ac2: 66 45 0f 28 58 20 movapd 0x20(%r8),%xmm11 408ac8: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408acd: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408ad2: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408ad7: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408adc: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408ae1: 66 44 0f 28 56 30 movapd 0x30(%rsi),%xmm10 408ae7: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408aec: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408af1: 66 44 0f 28 0d 46 ed movapd 0xed46(%rip),%xmm9 # 417840 408af8: 00 00 408afa: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408aff: 66 44 0f 28 5a 30 movapd 0x30(%rdx),%xmm11 408b05: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408b0a: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408b0f: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408b14: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408b19: 66 44 0f 28 05 8e ec movapd 0xec8e(%rip),%xmm8 # 4177b0 408b20: 00 00 408b22: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408b27: 66 41 0f 58 ca addpd %xmm10,%xmm1 408b2c: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408b31: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408b36: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408b3b: 66 41 0f 58 da addpd %xmm10,%xmm3 408b40: 66 44 0f 28 51 30 movapd 0x30(%rcx),%xmm10 408b46: 66 41 0f 58 fb addpd %xmm11,%xmm7 408b4b: 66 45 0f 28 58 30 movapd 0x30(%r8),%xmm11 408b51: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408b56: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408b5b: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408b60: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408b65: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408b6a: 66 44 0f 28 56 40 movapd 0x40(%rsi),%xmm10 408b70: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408b75: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408b7a: 66 44 0f 28 0d cd ec movapd 0xeccd(%rip),%xmm9 # 417850 408b81: 00 00 408b83: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408b88: 66 44 0f 28 5a 40 movapd 0x40(%rdx),%xmm11 408b8e: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408b93: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408b98: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408b9d: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408ba2: 66 44 0f 28 05 15 ec movapd 0xec15(%rip),%xmm8 # 4177c0 408ba9: 00 00 408bab: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408bb0: 66 41 0f 58 ca addpd %xmm10,%xmm1 408bb5: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408bba: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408bbf: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408bc4: 66 41 0f 58 da addpd %xmm10,%xmm3 408bc9: 66 44 0f 28 51 40 movapd 0x40(%rcx),%xmm10 408bcf: 66 41 0f 58 fb addpd %xmm11,%xmm7 408bd4: 66 45 0f 28 58 40 movapd 0x40(%r8),%xmm11 408bda: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408bdf: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408be4: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408be9: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408bee: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408bf3: 66 44 0f 28 56 50 movapd 0x50(%rsi),%xmm10 408bf9: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408bfe: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408c03: 66 44 0f 28 0d 54 ec movapd 0xec54(%rip),%xmm9 # 417860 408c0a: 00 00 408c0c: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408c11: 66 44 0f 28 5a 50 movapd 0x50(%rdx),%xmm11 408c17: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408c1c: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408c21: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408c26: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408c2b: 66 44 0f 28 05 9c eb movapd 0xeb9c(%rip),%xmm8 # 4177d0 408c32: 00 00 408c34: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408c39: 66 41 0f 58 ca addpd %xmm10,%xmm1 408c3e: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408c43: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408c48: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408c4d: 66 41 0f 58 da addpd %xmm10,%xmm3 408c52: 66 44 0f 28 51 50 movapd 0x50(%rcx),%xmm10 408c58: 66 41 0f 58 fb addpd %xmm11,%xmm7 408c5d: 66 45 0f 28 58 50 movapd 0x50(%r8),%xmm11 408c63: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408c68: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408c6d: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408c72: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408c77: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408c7c: 66 44 0f 28 56 60 movapd 0x60(%rsi),%xmm10 408c82: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408c87: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408c8c: 66 44 0f 28 0d db eb movapd 0xebdb(%rip),%xmm9 # 417870 408c93: 00 00 408c95: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408c9a: 66 44 0f 28 5a 60 movapd 0x60(%rdx),%xmm11 408ca0: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408ca5: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408caa: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408caf: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408cb4: 66 44 0f 28 05 23 eb movapd 0xeb23(%rip),%xmm8 # 4177e0 408cbb: 00 00 408cbd: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408cc2: 66 41 0f 58 ca addpd %xmm10,%xmm1 408cc7: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408ccc: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408cd1: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408cd6: 66 41 0f 58 da addpd %xmm10,%xmm3 408cdb: 66 44 0f 28 51 60 movapd 0x60(%rcx),%xmm10 408ce1: 66 41 0f 58 fb addpd %xmm11,%xmm7 408ce6: 66 45 0f 28 58 60 movapd 0x60(%r8),%xmm11 408cec: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408cf1: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408cf6: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408cfb: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408d00: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408d05: 66 44 0f 28 56 70 movapd 0x70(%rsi),%xmm10 408d0b: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408d10: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408d15: 66 44 0f 28 0d 62 eb movapd 0xeb62(%rip),%xmm9 # 417880 408d1c: 00 00 408d1e: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408d23: 66 44 0f 28 5a 70 movapd 0x70(%rdx),%xmm11 408d29: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408d2e: 66 45 0f 28 c2 movapd %xmm10,%xmm8 408d33: 66 45 0f 59 c1 mulpd %xmm9,%xmm8 408d38: 66 41 0f 58 e8 addpd %xmm8,%xmm5 408d3d: 66 44 0f 28 05 aa ea movapd 0xeaaa(%rip),%xmm8 # 4177f0 408d44: 00 00 408d46: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408d4b: 66 41 0f 58 ca addpd %xmm10,%xmm1 408d50: 66 45 0f 28 d3 movapd %xmm11,%xmm10 408d55: 66 45 0f 59 d0 mulpd %xmm8,%xmm10 408d5a: 66 45 0f 59 d9 mulpd %xmm9,%xmm11 408d5f: 66 41 0f 58 da addpd %xmm10,%xmm3 408d64: 66 44 0f 28 51 70 movapd 0x70(%rcx),%xmm10 408d6a: 66 41 0f 58 fb addpd %xmm11,%xmm7 408d6f: 66 45 0f 28 58 70 movapd 0x70(%r8),%xmm11 408d75: 66 45 0f 28 e2 movapd %xmm10,%xmm12 408d7a: 66 45 0f 59 e1 mulpd %xmm9,%xmm12 408d7f: 66 0f 7c cb haddpd %xmm3,%xmm1 408d83: 66 45 0f 59 cb mulpd %xmm11,%xmm9 408d88: 66 0f 7c ef haddpd %xmm7,%xmm5 408d8c: 66 45 0f 59 d8 mulpd %xmm8,%xmm11 408d91: 66 45 0f 59 c2 mulpd %xmm10,%xmm8 408d96: 66 41 0f 29 29 movapd %xmm5,(%r9) 408d9b: 66 41 0f 58 e4 addpd %xmm12,%xmm4 408da0: 66 41 0f 58 f1 addpd %xmm9,%xmm6 408da5: 66 41 0f 58 d3 addpd %xmm11,%xmm2 408daa: 66 41 0f 58 c0 addpd %xmm8,%xmm0 408daf: 66 0f 7c e6 haddpd %xmm6,%xmm4 408db3: 66 0f 7c c2 haddpd %xmm2,%xmm0 408db7: 66 41 0f 29 61 10 movapd %xmm4,0x10(%r9) 408dbd: 66 41 0f 29 0c f9 movapd %xmm1,(%r9,%rdi,8) 408dc3: 66 41 0f 29 44 f9 10 movapd %xmm0,0x10(%r9,%rdi,8) 408dca: c3 retq 408dcb: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) [/bash]
Here is the OpenCL equivalent code:

[bash]#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif const double2 FA[8] = { (double2)(8.4334247333529341094733325815816e-7, -0.1290557201342060969516786758559028e-4), (double2)(0.8762984476210559564689161894116397e-4, -0.30158038132690463167163703826169879e-3), (double2)(0.174723713672993903449447812749852942e-2, -0.942047030201080385922711540948195075e-2), (double2)(0.2373821463724942397566389712597274535e-1, 0.612625895831207982195380597e-1), (double2)(0.9940415697834003993178616713, -0.604895289196983516002834636e-1), (double2)(-0.2103025160930381434955489412839065067e-1, 0.1337263414854794752733423467013220997e-1), (double2)(-0.344128144493493857280881509686821861e-2, 0.49443227688689919192282259476750972e-3), (double2)(-0.5185986881173432922848639136911487e-4, 2.72734492911979659657715313017228e-6)}; const double2 FU[8] = { (double2)(2.72734492911979659657715313017228e-6, 8.4334247333529341094733325815816e-7), (double2)(-0.1290557201342060969516786758559028e-4, 0.8762984476210559564689161894116397e-4), (double2)(-0.30158038132690463167163703826169879e-3, 0.174723713672993903449447812749852942e-2), (double2)(-0.942047030201080385922711540948195075e-2, 0.2373821463724942397566389712597274535e-1), (double2)(0.612625895831207982195380597e-1, 0.9940415697834003993178616713), (double2)(-0.604895289196983516002834636e-1, -0.2103025160930381434955489412839065067e-1), (double2)(0.1337263414854794752733423467013220997e-1, -0.344128144493493857280881509686821861e-2), (double2)(0.49443227688689919192282259476750972e-3, -0.5185986881173432922848639136911487e-4)}; #define conv_4x2_block_fused(o_f,o_s,d00,d10,d20,d30) \ d00 = in0[o_s];\ A = FA[o_f];\ S00 += d00 * A;\ U = FU[o_f];\ d10 = in1[o_s];\ S01 += d00 * U;\ S11 += d10 * U;\ d20 = in2[o_s];\ S10 += d10 * A;\ d30 = in3[o_s];\ S20 += d20 * A;\ S30 += d30 * A;\ S31 += d30 * U;\ S21 += d20 * U; __kernel void magic_filter5(const ulong n, const ulong ndat, __global const double2 *in, __global double2 *out) { const unsigned int n2 = n/2; const unsigned int ndat2 = ndat/2; __global const double2 *in0 = in; __global const double2 *in1 = in0 + n2; __global const double2 *in2 = in1 + n2; __global const double2 *in3 = in2 + n2; __global double2 *out = out + 4; double2 D00,D10,D20,D30; double2 S00,S01,S10,S11,S20,S21,S30,S31; double2 A, U; A = FA[0]; D00 = in0[0]; S00 = D00 * A; D10 = in1[0]; S10 = D10 * A; D20 = in2[0]; S20 = D20 * A; D30 = in3[0]; S30 = D30 * A; U = FU[0]; D00.x = in0[8].x; S01 = D00 * U; D10.x = in1[8].x; S11 = D10 * U; D20.x = in2[8].x; S21 = D20 * U; D30.x = in3[8].x; S31 = D30 * U; conv_4x2_block_fused(1,1,D00,D10,D20,D30); conv_4x2_block_fused(2,2,D00,D10,D20,D30); conv_4x2_block_fused(3,3,D00,D10,D20,D30); conv_4x2_block_fused(4,4,D00,D10,D20,D30); conv_4x2_block_fused(5,5,D00,D10,D20,D30); conv_4x2_block_fused(6,6,D00,D10,D20,D30); conv_4x2_block_fused(7,7,D00,D10,D20,D30); out[0] = (double2)(S00.x + S00.y, S10.x + S10.y); out[1] = (double2)(S20.x + S20.y, S30.x + S30.y); out[ndat2 + 0] = (double2)(S01.x + S01.y, S11.x + S11.y); out[ndat2 + 1] = (double2)(S21.x + S21.y, S31.x + S31.y); }[/bash]
And here is the generated assembly:

[bash] .section .rodata.cst16,"aM",@progbits,16 .align 16 .LCPI2_0: # constant pool <2 x double> .quad 4546094365667641806 # double 8.762984e-05 .quad -4669173237098585703 # double -3.015804e-04 .LCPI2_1: # constant pool <2 x double> .quad 4516068371457184450 # double 8.433425e-07 .quad -4689636306257622577 # double -1.290557e-05 .LCPI2_2: # constant pool <2 x double> .quad 4565700531973276514 # double 1.747237e-03 .quad -4646787883676373161 # double -9.420470e-03 .LCPI2_3: # constant pool <2 x double> .quad 4582499295942488407 # double 2.373821e-02 .quad 4588989690655579855 # double 6.126259e-02 .LCPI2_4: # constant pool <2 x double> .quad 4607128750031811026 # double 9.940416e-01 .quad -4634493755982099125 # double -6.048953e-02 .LCPI2_5: # constant pool <2 x double> .quad -4641653258114437092 # double -2.103025e-02 .quad 4578862420522603841 # double 1.337263e-02 .LCPI2_6: # constant pool <2 x double> .quad -4653290559565923332 # double -3.441281e-03 .quad 4557699556108328520 # double 4.944323e-04 .LCPI2_7: # constant pool <2 x double> .quad -4680594046271356616 # double -5.185987e-05 .quad 4523550187392857396 # double 2.727345e-06 .LCPI2_8: # constant pool <2 x double> .quad -4689636306257622577 # double -1.290557e-05 .quad 4546094365667641806 # double 8.762984e-05 .LCPI2_9: # constant pool <2 x double> .quad 4523550187392857396 # double 2.727345e-06 .quad 4516068371457184450 # double 8.433425e-07 .LCPI2_10: # constant pool <2 x double> .quad -4669173237098585703 # double -3.015804e-04 .quad 4565700531973276514 # double 1.747237e-03 .LCPI2_11: # constant pool <2 x double> .quad -4646787883676373161 # double -9.420470e-03 .quad 4582499295942488407 # double 2.373821e-02 .LCPI2_12: # constant pool <2 x double> .quad 4588989690655579855 # double 6.126259e-02 .quad 4607128750031811026 # double 9.940416e-01 .LCPI2_13: # constant pool <2 x double> .quad -4634493755982099125 # double -6.048953e-02 .quad -4641653258114437092 # double -2.103025e-02 .LCPI2_14: # constant pool <2 x double> .quad 4578862420522603841 # double 1.337263e-02 .quad -4653290559565923332 # double -3.441281e-03 .LCPI2_15: # constant pool <2 x double> .quad 4557699556108328520 # double 4.944323e-04 .quad -4680594046271356616 # double -5.185987e-05 .text .globl magic_filter5 .align 16, 0x90 .type magic_filter5,@function magic_filter5: # @magic_filter5 # BB#0: # %FirstBB push RBP push R15 push R14 push R13 push R12 push RBX sub RSP, 344 mov RAX, QWORD PTR [RSP + 400] shr RAX mov EAX, EAX lea RCX, QWORD PTR [RAX + 2*RAX] mov RDX, RAX shl RDX, 5 mov RSI, QWORD PTR [RSP + 416] lea RDI, QWORD PTR [RDX + RSI + 128] mov QWORD PTR [RSP + 8], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RDX + 16] mov QWORD PTR [RSP], RDI # 8-byte Spill lea RDI, QWORD PTR [RDX + RSI + 32] mov QWORD PTR [RSP - 8], RDI # 8-byte Spill lea RDI, QWORD PTR [RDX + RSI + 48] mov QWORD PTR [RSP - 16], RDI # 8-byte Spill lea RDI, QWORD PTR [RDX + RSI + 64] mov QWORD PTR [RSP - 24], RDI # 8-byte Spill lea RDI, QWORD PTR [RDX + RSI + 80] mov QWORD PTR [RSP - 32], RDI # 8-byte Spill lea RDI, QWORD PTR [RDX + RSI + 96] mov QWORD PTR [RSP - 40], RDI # 8-byte Spill lea RDI, QWORD PTR [RDX + RSI + 112] mov QWORD PTR [RSP - 48], RDI # 8-byte Spill add RDX, RSI shl RAX, 4 lea RDI, QWORD PTR [RSI + RAX + 128] mov QWORD PTR [RSP - 56], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RAX + 16] mov QWORD PTR [RSP - 64], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RAX + 32] mov QWORD PTR [RSP - 72], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RAX + 48] mov QWORD PTR [RSP - 80], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RAX + 64] mov QWORD PTR [RSP - 88], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RAX + 80] mov QWORD PTR [RSP - 96], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RAX + 96] mov QWORD PTR [RSP - 104], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RAX + 112] mov QWORD PTR [RSP - 112], RDI # 8-byte Spill shl RCX, 4 lea RDI, QWORD PTR [RSI + RCX + 128] mov QWORD PTR [RSP - 120], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RCX + 16] mov QWORD PTR [RSP - 128], RDI # 8-byte Spill lea RDI, QWORD PTR [RSI + RCX + 32] lea R8, QWORD PTR [RSI + RCX + 48] lea R9, QWORD PTR [RSI + RCX + 64] lea R10, QWORD PTR [RSI + RCX + 80] lea R11, QWORD PTR [RSI + RCX + 96] lea RBX, QWORD PTR [RSI + RCX + 112] mov R14, QWORD PTR [RSP + 408] movabs R15, 8589934590 and R15, R14 mov R12, QWORD PTR [RSP + 424] lea R15, QWORD PTR [R12 + 8*R15 + 64] shr R14 inc R14D shl R14, 4 lea R14, QWORD PTR [R14 + R12 + 64] add RCX, RSI add RAX, RSI mov R13, -1 .align 16, 0x90 .LBB2_1: # %SyncBB # =>This Inner Loop Header: Depth=1 mov RBP, QWORD PTR [RSP - 64] # 8-byte Reload movapd XMM0, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 304], XMM0 # 16-byte Spill movapd XMM1, XMM0 mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_0] movapd XMM2, XMMWORD PTR [RAX] movapd XMM3, XMM2 mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_1] addpd XMM3, XMM1 mov RBP, QWORD PTR [RSP - 72] # 8-byte Reload movapd XMM0, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 64], XMM0 # 16-byte Spill movapd XMM1, XMM0 mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_2] addpd XMM1, XMM3 mov RBP, QWORD PTR [RSP - 80] # 8-byte Reload movapd XMM0, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 288], XMM0 # 16-byte Spill movapd XMM3, XMM0 mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_3] addpd XMM3, XMM1 mov RBP, QWORD PTR [RSP - 88] # 8-byte Reload movapd XMM0, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 256], XMM0 # 16-byte Spill movapd XMM1, XMM0 mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_4] addpd XMM1, XMM3 mov RBP, QWORD PTR [RSP - 96] # 8-byte Reload movapd XMM0, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 144], XMM0 # 16-byte Spill movapd XMM3, XMM0 mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_5] addpd XMM3, XMM1 mov RBP, QWORD PTR [RSP - 104] # 8-byte Reload movapd XMM0, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 128], XMM0 # 16-byte Spill movapd XMM1, XMM0 mulpd XMM1, XMMWORD PTR [RIP + .LCPI2_6] addpd XMM1, XMM3 mov RBP, QWORD PTR [RSP - 112] # 8-byte Reload movapd XMM3, XMMWORD PTR [RBP] movapd XMM4, XMM3 mulpd XMM4, XMMWORD PTR [RIP + .LCPI2_7] addpd XMM4, XMM1 movapd XMM1, XMM4 unpckhpd XMM1, XMM1 # xmm1 = xmm1[1,1] addsd XMM1, XMM4 movapd XMM4, XMMWORD PTR [RSI] movapd XMM5, XMMWORD PTR [RSI + 16] movapd XMMWORD PTR [RSP + 320], XMM5 # 16-byte Spill movapd XMM0, XMMWORD PTR [RSI + 32] movapd XMMWORD PTR [RSP + 112], XMM0 # 16-byte Spill movapd XMM5, XMMWORD PTR [RSI + 48] movapd XMMWORD PTR [RSP + 272], XMM5 # 16-byte Spill movapd XMM6, XMMWORD PTR [RSP + 320] # 16-byte Reload mulpd XMM6, XMMWORD PTR [RIP + .LCPI2_0] movapd XMM7, XMM4 mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_1] addpd XMM7, XMM6 movapd XMM6, XMM0 mulpd XMM6, XMMWORD PTR [RIP + .LCPI2_2] addpd XMM6, XMM7 movapd XMM7, XMM5 mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_3] addpd XMM7, XMM6 movapd XMM0, XMMWORD PTR [RSI + 64] movapd XMMWORD PTR [RSP + 96], XMM0 # 16-byte Spill movapd XMM5, XMM0 mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_4] addpd XMM5, XMM7 movapd XMM0, XMMWORD PTR [RSI + 80] movapd XMMWORD PTR [RSP + 80], XMM0 # 16-byte Spill movapd XMM6, XMM0 mulpd XMM6, XMMWORD PTR [RIP + .LCPI2_5] addpd XMM6, XMM5 movapd XMM0, XMMWORD PTR [RSI + 96] movapd XMMWORD PTR [RSP + 16], XMM0 # 16-byte Spill movapd XMM5, XMM0 mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_6] addpd XMM5, XMM6 movapd XMM6, XMMWORD PTR [RSI + 112] movapd XMM7, XMM6 mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_7] addpd XMM7, XMM5 movapd XMM5, XMM7 unpckhpd XMM5, XMM5 # xmm5 = xmm5[1,1] addsd XMM5, XMM7 unpcklpd XMM5, XMM1 # xmm5 = xmm5[0],xmm1[0] movapd XMM1, XMMWORD PTR [RDX] movapd XMM7, XMM1 mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_1] mov RBP, QWORD PTR [RSP + 8] # 8-byte Reload movlpd XMM1, QWORD PTR [RBP] movapd XMM8, XMMWORD PTR [RCX] movapd XMM9, XMM8 mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_1] mov RBP, QWORD PTR [RSP - 120] # 8-byte Reload movlpd XMM8, QWORD PTR [RBP] movapd XMM10, XMMWORD PTR [RSP + 320] # 16-byte Reload mulpd XMM10, XMMWORD PTR [RIP + .LCPI2_8] movapd XMMWORD PTR [RSP + 320], XMM10 # 16-byte Spill movlpd XMM4, QWORD PTR [RSI + 128] mov RBP, QWORD PTR [RSP - 56] # 8-byte Reload movlpd XMM2, QWORD PTR [RBP] mov RBP, QWORD PTR [RSP - 48] # 8-byte Reload movapd XMM10, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 48], XMM10 # 16-byte Spill mov RBP, QWORD PTR [RSP - 40] # 8-byte Reload movapd XMM11, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill mov RBP, QWORD PTR [RSP - 32] # 8-byte Reload movapd XMM12, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill mov RBP, QWORD PTR [RSP - 24] # 8-byte Reload movapd XMM13, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill mov RBP, QWORD PTR [RSP - 16] # 8-byte Reload movapd XMM14, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill mov RBP, QWORD PTR [RSP - 8] # 8-byte Reload movapd XMM15, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill mov RBP, QWORD PTR [RSP] # 8-byte Reload movapd XMM0, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill movapd XMM0, XMMWORD PTR [RBX] movapd XMM10, XMMWORD PTR [R11] movapd XMM11, XMMWORD PTR [R10] movapd XMM12, XMMWORD PTR [R9] movapd XMM13, XMMWORD PTR [R8] movapd XMM14, XMMWORD PTR [RDI] mov RBP, QWORD PTR [RSP - 128] # 8-byte Reload movapd XMM15, XMMWORD PTR [RBP] movapd XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill movapd XMMWORD PTR [R12 + 64], XMM5 movapd XMM5, XMM15 mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_0] addpd XMM5, XMM9 movapd XMM9, XMM14 mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_2] addpd XMM9, XMM5 movapd XMM5, XMM13 mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_3] addpd XMM5, XMM9 movapd XMM9, XMM12 mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_4] addpd XMM9, XMM5 movapd XMM5, XMM11 mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_5] addpd XMM5, XMM9 movapd XMM9, XMM10 mulpd XMM9, XMMWORD PTR [RIP + .LCPI2_6] addpd XMM9, XMM5 movapd XMM5, XMM0 mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_7] addpd XMM5, XMM9 movapd XMM9, XMM5 unpckhpd XMM9, XMM9 # xmm9 = xmm9[1,1] addsd XMM9, XMM5 movapd XMM5, XMMWORD PTR [RSP + 176] # 16-byte Reload mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_0] addpd XMM5, XMM7 movapd XMM7, XMMWORD PTR [RSP + 160] # 16-byte Reload mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_2] addpd XMM7, XMM5 movapd XMM5, XMMWORD PTR [RSP + 192] # 16-byte Reload mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_3] addpd XMM5, XMM7 movapd XMM7, XMMWORD PTR [RSP + 208] # 16-byte Reload mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_4] addpd XMM7, XMM5 movapd XMM5, XMMWORD PTR [RSP + 224] # 16-byte Reload mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_5] addpd XMM5, XMM7 movapd XMM7, XMMWORD PTR [RSP + 240] # 16-byte Reload mulpd XMM7, XMMWORD PTR [RIP + .LCPI2_6] addpd XMM7, XMM5 movapd XMM5, XMMWORD PTR [RSP + 48] # 16-byte Reload mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_7] addpd XMM5, XMM7 movapd XMM7, XMM5 unpckhpd XMM7, XMM7 # xmm7 = xmm7[1,1] addsd XMM7, XMM5 unpcklpd XMM7, XMM9 # xmm7 = xmm7[0],xmm9[0] movapd XMMWORD PTR [R12 + 80], XMM7 movapd XMM5, XMMWORD PTR [RSP + 304] # 16-byte Reload mulpd XMM5, XMMWORD PTR [RIP + .LCPI2_8] movapd XMMWORD PTR [RSP + 304], XMM5 # 16-byte Spill movapd XMM5, XMMWORD PTR [RIP + .LCPI2_9] mulpd XMM2, XMM5 addpd XMM2, XMMWORD PTR [RSP + 304] # 16-byte Folded Reload movapd XMM7, XMMWORD PTR [RIP + .LCPI2_10] movapd XMM9, XMMWORD PTR [RSP + 64] # 16-byte Reload mulpd XMM9, XMM7 addpd XMM9, XMM2 movapd XMMWORD PTR [RSP + 64], XMM9 # 16-byte Spill movapd XMM2, XMMWORD PTR [RSP + 288] # 16-byte Reload mulpd XMM2, XMMWORD PTR [RIP + .LCPI2_11] addpd XMM2, XMM9 movapd XMMWORD PTR [RSP + 288], XMM2 # 16-byte Spill movapd XMM2, XMMWORD PTR [RSP + 256] # 16-byte Reload mulpd XMM2, XMMWORD PTR [RIP + .LCPI2_12] addpd XMM2, XMMWORD PTR [RSP + 288] # 16-byte Folded Reload movapd XMMWORD PTR [RSP + 256], XMM2 # 16-byte Spill movapd XMM2, XMMWORD PTR [RSP + 144] # 16-byte Reload mulpd XMM2, XMMWORD PTR [RIP + .LCPI2_13] addpd XMM2, XMMWORD PTR [RSP + 256] # 16-byte Folded Reload movapd XMMWORD PTR [RSP + 144], XMM2 # 16-byte Spill movapd XMM2, XMMWORD PTR [RIP + .LCPI2_14] movapd XMM9, XMMWORD PTR [RSP + 128] # 16-byte Reload mulpd XMM9, XMM2 addpd XMM9, XMMWORD PTR [RSP + 144] # 16-byte Folded Reload movapd XMMWORD PTR [RSP + 128], XMM9 # 16-byte Spill movapd XMM9, XMMWORD PTR [RIP + .LCPI2_15] mulpd XMM3, XMM9 addpd XMM3, XMMWORD PTR [RSP + 128] # 16-byte Folded Reload movapd XMM15, XMM3 unpckhpd XMM15, XMM15 # xmm15 = xmm15[1,1] addsd XMM15, XMM3 mulpd XMM4, XMM5 addpd XMM4, XMMWORD PTR [RSP + 320] # 16-byte Folded Reload movapd XMM3, XMMWORD PTR [RSP + 112] # 16-byte Reload mulpd XMM3, XMM7 addpd XMM3, XMM4 movapd XMMWORD PTR [RSP + 112], XMM3 # 16-byte Spill movapd XMM3, XMMWORD PTR [RSP + 272] # 16-byte Reload mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_11] addpd XMM3, XMMWORD PTR [RSP + 112] # 16-byte Folded Reload movapd XMMWORD PTR [RSP + 272], XMM3 # 16-byte Spill movapd XMM3, XMMWORD PTR [RSP + 96] # 16-byte Reload mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_12] addpd XMM3, XMMWORD PTR [RSP + 272] # 16-byte Folded Reload movapd XMMWORD PTR [RSP + 96], XMM3 # 16-byte Spill movapd XMM3, XMMWORD PTR [RSP + 80] # 16-byte Reload mulpd XMM3, XMMWORD PTR [RIP + .LCPI2_13] addpd XMM3, XMMWORD PTR [RSP + 96] # 16-byte Folded Reload movapd XMMWORD PTR [RSP + 80], XMM3 # 16-byte Spill movapd XMM3, XMMWORD PTR [RSP + 16] # 16-byte Reload mulpd XMM3, XMM2 addpd XMM3, XMMWORD PTR [RSP + 80] # 16-byte Folded Reload movapd XMMWORD PTR [RSP + 16], XMM3 # 16-byte Spill mulpd XMM6, XMM9 addpd XMM6, XMM3 movapd XMM3, XMM6 unpckhpd XMM3, XMM3 # xmm3 = xmm3[1,1] addsd XMM3, XMM6 unpcklpd XMM3, XMM15 # xmm3 = xmm3[0],xmm15[0] movapd XMMWORD PTR [R15], XMM3 movapd XMM15, XMMWORD PTR [RSP + 32] # 16-byte Reload mulpd XMM15, XMMWORD PTR [RIP + .LCPI2_8] movapd XMMWORD PTR [RSP + 32], XMM15 # 16-byte Spill mulpd XMM8, XMM5 addpd XMM8, XMM15 mulpd XMM14, XMM7 addpd XMM14, XMM8 mulpd XMM13, XMMWORD PTR [RIP + .LCPI2_11] addpd XMM13, XMM14 mulpd XMM12, XMMWORD PTR [RIP + .LCPI2_12] addpd XMM12, XMM13 mulpd XMM11, XMMWORD PTR [RIP + .LCPI2_13] addpd XMM11, XMM12 mulpd XMM10, XMM2 addpd XMM10, XMM11 mulpd XMM0, XMM9 addpd XMM0, XMM10 movapd XMM3, XMM0 unpckhpd XMM3, XMM3 # xmm3 = xmm3[1,1] addsd XMM3, XMM0 movapd XMM0, XMMWORD PTR [RSP + 176] # 16-byte Reload mulpd XMM0, XMMWORD PTR [RIP + .LCPI2_8] movapd XMMWORD PTR [RSP + 176], XMM0 # 16-byte Spill mulpd XMM1, XMM5 addpd XMM1, XMM0 movapd XMM15, XMMWORD PTR [RSP + 160] # 16-byte Reload mulpd XMM15, XMM7 addpd XMM15, XMM1 movapd XMMWORD PTR [RSP + 160], XMM15 # 16-byte Spill movapd XMM14, XMMWORD PTR [RSP + 192] # 16-byte Reload mulpd XMM14, XMMWORD PTR [RIP + .LCPI2_11] addpd XMM14, XMM15 movapd XMMWORD PTR [RSP + 192], XMM14 # 16-byte Spill movapd XMM13, XMMWORD PTR [RSP + 208] # 16-byte Reload mulpd XMM13, XMMWORD PTR [RIP + .LCPI2_12] addpd XMM13, XMM14 movapd XMMWORD PTR [RSP + 208], XMM13 # 16-byte Spill movapd XMM12, XMMWORD PTR [RSP + 224] # 16-byte Reload mulpd XMM12, XMMWORD PTR [RIP + .LCPI2_13] addpd XMM12, XMM13 movapd XMMWORD PTR [RSP + 224], XMM12 # 16-byte Spill movapd XMM11, XMMWORD PTR [RSP + 240] # 16-byte Reload mulpd XMM11, XMM2 addpd XMM11, XMM12 movapd XMMWORD PTR [RSP + 240], XMM11 # 16-byte Spill movapd XMM10, XMMWORD PTR [RSP + 48] # 16-byte Reload mulpd XMM10, XMM9 addpd XMM10, XMM11 movapd XMM0, XMM10 unpckhpd XMM0, XMM0 # xmm0 = xmm0[1,1] addsd XMM0, XMM10 unpcklpd XMM0, XMM3 # xmm0 = xmm0[0],xmm3[0] movapd XMMWORD PTR [R14], XMM0 inc R13 cmp R13, QWORD PTR [RSP + 480] jb .LBB2_1 # BB#2: # %SyncBB29 add RSP, 344 pop RBX pop R12 pop R13 pop R14 pop R15 pop RBP ret .Ltmp2: .size magic_filter5, .Ltmp2-magic_filter5[/bash]

As can be seen, the OpenCL version is spilling a lot of registers. This effect halves the performances.
Am I doing something wrong? I would have expected code written in intrinsics to port quite straightforwardly to OpenCL vectorized language. Even though I was curious to see if the compiler would optimize the transposition using haddpd, I did not expect such differences.

Sorry for the long post with lots of code but I could not reduce further.

brice_videau · ‎04-23-2012

I am still struggling with this problem. Would there be a way in the compiler to bypass most of the LLVM optimizations and go straight to the code generation phase? This way the compiler would just be used to allocate registers, the way it happens when using intrinsics and a standard compiler.

Maxim_S_Intel · ‎04-23-2012

One specific optimization you might want to disable first when dealing with explicitly vectorized code is OCL vectorizer itself.

Consider prefixing your kernel withvec_type_hint:

__kernel __attribute__((vec_type_hint(double2))) foo(...)

This is unlikely to be related to the register allocation though. If you are seeing too much spiils/fills it might indicate that you (for example) are having too much constants in use. Native (not OpenCL, which is just JIT) compiler might handle this situation somewhat better.