Community
cancel
Showing results for
Did you mean:
Highlighted
Beginner
6 Views

## Bad performance in templated function

I have the following templated function:

```template<int dim>
void advecu(double * restrict ut, double * restrict u, double * restrict v, double * restrict w, double * restrict dzi4)
{
// Subtracts up to three stencil terms from ut[ijk] for every (i,j,k) in the
// grid's [start, end) index ranges. dim is a compile-time template argument;
// the v-term is only applied when dim == 3.
// All pointer parameters are restrict-qualified (caller promises no aliasing).
// NOTE(review): ijk and the cg*/ci*/ii*/jj*/kk*/dxi/dyi names are not declared in
// the visible code; presumably they come from the elided initialization below.
// Here comes the initialization of the constants...
// Loop
for(int k=grid->kstart; k<grid->kend; k++)
for(int j=grid->jstart; j<grid->jend; j++)
// ivdep asks the compiler to ignore assumed (unproven) dependencies in the i-loop.
#pragma ivdep
for(int i=grid->istart; i<grid->iend; i++)
{
// Flattened 3-D index: i has coefficient 1, jj1 and kk1 are the j- and k-strides.
ijk = i + j*jj1 + k*kk1;
// Term 1: squares of weighted 4-point sums of u at ii-offsets, scaled by cgi*dxi.
ut[ijk] -= ( cg0*((ci0*u[ijk-ii3] + ci1*u[ijk-ii2] + ci2*u[ijk-ii1] + ci3*u[ijk    ]) * (ci0*u[ijk-ii3] + ci1*u[ijk-ii2] + ci2*u[ijk-ii1] + ci3*u[ijk    ]))
+ cg1*((ci0*u[ijk-ii2] + ci1*u[ijk-ii1] + ci2*u[ijk    ] + ci3*u[ijk+ii1]) * (ci0*u[ijk-ii2] + ci1*u[ijk-ii1] + ci2*u[ijk    ] + ci3*u[ijk+ii1]))
+ cg2*((ci0*u[ijk-ii1] + ci1*u[ijk    ] + ci2*u[ijk+ii1] + ci3*u[ijk+ii2]) * (ci0*u[ijk-ii1] + ci1*u[ijk    ] + ci2*u[ijk+ii1] + ci3*u[ijk+ii2]))
+ cg3*((ci0*u[ijk    ] + ci1*u[ijk+ii1] + ci2*u[ijk+ii2] + ci3*u[ijk+ii3]) * (ci0*u[ijk    ] + ci1*u[ijk+ii1] + ci2*u[ijk+ii2] + ci3*u[ijk+ii3])) ) * cgi*dxi;

// Term 2 (only for dim == 3): weighted sums of v (ii/jj offsets) times weighted
// sums of u (jj offsets), scaled by cgi*dyi. The condition depends only on the
// template argument, so it is a constant for each instantiation.
if(dim == 3)
{
ut[ijk] -= ( cg0*((ci0*v[ijk-ii2-jj1] + ci1*v[ijk-ii1-jj1] + ci2*v[ijk-jj1] + ci3*v[ijk+ii1-jj1]) * (ci0*u[ijk-jj3] + ci1*u[ijk-jj2] + ci2*u[ijk-jj1] + ci3*u[ijk    ]))
+ cg1*((ci0*v[ijk-ii2    ] + ci1*v[ijk-ii1    ] + ci2*v[ijk    ] + ci3*v[ijk+ii1    ]) * (ci0*u[ijk-jj2] + ci1*u[ijk-jj1] + ci2*u[ijk    ] + ci3*u[ijk+jj1]))
+ cg2*((ci0*v[ijk-ii2+jj1] + ci1*v[ijk-ii1+jj1] + ci2*v[ijk+jj1] + ci3*v[ijk+ii1+jj1]) * (ci0*u[ijk-jj1] + ci1*u[ijk    ] + ci2*u[ijk+jj1] + ci3*u[ijk+jj2]))
+ cg3*((ci0*v[ijk-ii2+jj2] + ci1*v[ijk-ii1+jj2] + ci2*v[ijk+jj2] + ci3*v[ijk+ii1+jj2]) * (ci0*u[ijk    ] + ci1*u[ijk+jj1] + ci2*u[ijk+jj2] + ci3*u[ijk+jj3])) ) * cgi*dyi;
}

// Term 3: weighted sums of w (ii/kk offsets) times weighted sums of u (kk offsets).
// NOTE(review): dzi4 is declared as double* above, so "* dzi4" multiplies by a
// pointer and would not compile as written; presumably an index (e.g. dzi4[k])
// was lost when the code was posted - confirm against the real source.
ut[ijk] -= ( cg0*((ci0*w[ijk-ii2-kk1] + ci1*w[ijk-ii1-kk1] + ci2*w[ijk-kk1] + ci3*w[ijk+ii1-kk1]) * (ci0*u[ijk-kk3] + ci1*u[ijk-kk2] + ci2*u[ijk-kk1] + ci3*u[ijk    ]))
+ cg1*((ci0*w[ijk-ii2    ] + ci1*w[ijk-ii1    ] + ci2*w[ijk    ] + ci3*w[ijk+ii1    ]) * (ci0*u[ijk-kk2] + ci1*u[ijk-kk1] + ci2*u[ijk    ] + ci3*u[ijk+kk1]))
+ cg2*((ci0*w[ijk-ii2+kk1] + ci1*w[ijk-ii1+kk1] + ci2*w[ijk+kk1] + ci3*w[ijk+ii1+kk1]) * (ci0*u[ijk-kk1] + ci1*u[ijk    ] + ci2*u[ijk+kk1] + ci3*u[ijk+kk2]))
+ cg3*((ci0*w[ijk-ii2+kk2] + ci1*w[ijk-ii1+kk2] + ci2*w[ijk+kk2] + ci3*w[ijk+ii1+kk2]) * (ci0*u[ijk    ] + ci1*u[ijk+kk1] + ci2*u[ijk+kk2] + ci3*u[ijk+kk3])) )
* dzi4;
}
}
```

When the template argument dim equals 3, this function is almost 50 per cent slower than the old version, which did not use templates. Why do templates make this function so slow? I expected identical performance, because after the template is instantiated my old and new versions are identical.

9 Replies
Highlighted
Employee
6 Views

Could you please post your "old version" of the code if possible? It would be even better if you could post complete code that can be built. :)

I can see in your code, you have "if(dim == 3)" and the same code will be executed twice, will that be an issue?

Thanks,

Shenghong

Highlighted
Beginner
6 Views

@Shenghong, the piece of code in between the brackets is different than the other part, since it contains a different variable. The original code is just the same, but without the template. I put the template to merge two-dimensional and three-dimensional routines as they largely overlap. The old code looks like this:

```void advecu(double * restrict ut, double * restrict u, double * restrict v, double * restrict w, double * restrict dzi4)
{
// Non-template version: subtracts three stencil terms from ut[ijk] for every
// (i,j,k) in the grid's [start, end) index ranges; all three terms are applied
// unconditionally.
// All pointer parameters are restrict-qualified (caller promises no aliasing).
// NOTE(review): ijk and the cg*/ci*/ii*/jj*/kk*/dxi/dyi names are not declared in
// the visible code; presumably they come from the elided initialization below.
// Here comes the initialization of the constants...
// Loop
for(int k=grid->kstart; k<grid->kend; k++)
for(int j=grid->jstart; j<grid->jend; j++)
// ivdep asks the compiler to ignore assumed (unproven) dependencies in the i-loop.
#pragma ivdep
for(int i=grid->istart; i<grid->iend; i++)
{
// Flattened 3-D index: i has coefficient 1, jj1 and kk1 are the j- and k-strides.
ijk = i + j*jj1 + k*kk1;
// Term 1: squares of weighted 4-point sums of u at ii-offsets, scaled by cgi*dxi.
ut[ijk] -= ( cg0*((ci0*u[ijk-ii3] + ci1*u[ijk-ii2] + ci2*u[ijk-ii1] + ci3*u[ijk    ]) * (ci0*u[ijk-ii3] + ci1*u[ijk-ii2] + ci2*u[ijk-ii1] + ci3*u[ijk    ]))
+ cg1*((ci0*u[ijk-ii2] + ci1*u[ijk-ii1] + ci2*u[ijk    ] + ci3*u[ijk+ii1]) * (ci0*u[ijk-ii2] + ci1*u[ijk-ii1] + ci2*u[ijk    ] + ci3*u[ijk+ii1]))
+ cg2*((ci0*u[ijk-ii1] + ci1*u[ijk    ] + ci2*u[ijk+ii1] + ci3*u[ijk+ii2]) * (ci0*u[ijk-ii1] + ci1*u[ijk    ] + ci2*u[ijk+ii1] + ci3*u[ijk+ii2]))
+ cg3*((ci0*u[ijk    ] + ci1*u[ijk+ii1] + ci2*u[ijk+ii2] + ci3*u[ijk+ii3]) * (ci0*u[ijk    ] + ci1*u[ijk+ii1] + ci2*u[ijk+ii2] + ci3*u[ijk+ii3])) ) * cgi*dxi;

// Term 2: weighted sums of v (ii/jj offsets) times weighted sums of u (jj offsets),
// scaled by cgi*dyi.
ut[ijk] -= ( cg0*((ci0*v[ijk-ii2-jj1] + ci1*v[ijk-ii1-jj1] + ci2*v[ijk-jj1] + ci3*v[ijk+ii1-jj1]) * (ci0*u[ijk-jj3] + ci1*u[ijk-jj2] + ci2*u[ijk-jj1] + ci3*u[ijk    ]))
+ cg1*((ci0*v[ijk-ii2    ] + ci1*v[ijk-ii1    ] + ci2*v[ijk    ] + ci3*v[ijk+ii1    ]) * (ci0*u[ijk-jj2] + ci1*u[ijk-jj1] + ci2*u[ijk    ] + ci3*u[ijk+jj1]))
+ cg2*((ci0*v[ijk-ii2+jj1] + ci1*v[ijk-ii1+jj1] + ci2*v[ijk+jj1] + ci3*v[ijk+ii1+jj1]) * (ci0*u[ijk-jj1] + ci1*u[ijk    ] + ci2*u[ijk+jj1] + ci3*u[ijk+jj2]))
+ cg3*((ci0*v[ijk-ii2+jj2] + ci1*v[ijk-ii1+jj2] + ci2*v[ijk+jj2] + ci3*v[ijk+ii1+jj2]) * (ci0*u[ijk    ] + ci1*u[ijk+jj1] + ci2*u[ijk+jj2] + ci3*u[ijk+jj3])) ) * cgi*dyi;

// Term 3: weighted sums of w (ii/kk offsets) times weighted sums of u (kk offsets).
// NOTE(review): dzi4 is declared as double* above, so "* dzi4" multiplies by a
// pointer and would not compile as written; presumably an index (e.g. dzi4[k])
// was lost when the code was posted - confirm against the real source.
ut[ijk] -= ( cg0*((ci0*w[ijk-ii2-kk1] + ci1*w[ijk-ii1-kk1] + ci2*w[ijk-kk1] + ci3*w[ijk+ii1-kk1]) * (ci0*u[ijk-kk3] + ci1*u[ijk-kk2] + ci2*u[ijk-kk1] + ci3*u[ijk    ]))
+ cg1*((ci0*w[ijk-ii2    ] + ci1*w[ijk-ii1    ] + ci2*w[ijk    ] + ci3*w[ijk+ii1    ]) * (ci0*u[ijk-kk2] + ci1*u[ijk-kk1] + ci2*u[ijk    ] + ci3*u[ijk+kk1]))
+ cg2*((ci0*w[ijk-ii2+kk1] + ci1*w[ijk-ii1+kk1] + ci2*w[ijk+kk1] + ci3*w[ijk+ii1+kk1]) * (ci0*u[ijk-kk1] + ci1*u[ijk    ] + ci2*u[ijk+kk1] + ci3*u[ijk+kk2]))
+ cg3*((ci0*w[ijk-ii2+kk2] + ci1*w[ijk-ii1+kk2] + ci2*w[ijk+kk2] + ci3*w[ijk+ii1+kk2]) * (ci0*u[ijk    ] + ci1*u[ijk+kk1] + ci2*u[ijk+kk2] + ci3*u[ijk+kk3])) )
* dzi4;
}
}
```
Highlighted
Employee
6 Views

Hi Chiel,

Thank you for providing the old version of the code. I agree that the template should not affect performance; after template instantiation the two versions should be exactly the same code.

I have experimented with some simple code to see how ICC handles template instantiation, and I do see some strange behavior (as I understand it, this may affect performance in your case); see below:

```#include <stdio.h>

// Minimal test case: both conditions depend only on the template argument dim,
// so for any given instantiation each test is a compile-time constant.
template <int dim>
void foo() {
// Taken only by the foo<3> instantiation.
if(dim==3) {
printf("dim is 3\n");
}
// Taken only by a foo<4> instantiation; dead code for foo<3>.
if(dim==4) {
printf("dim is 4\n");
}
}

// Explicit instantiation: requests that code for foo<3> be generated in this
// translation unit.
template void foo<3>();

// Calls the dim == 3 instantiation once and exits with status 0.
int main() {
foo<3>();
return 0;
}
```

I build with:

```\$ g++ foo.cpp -g -O0 -o gcc.out
\$ icc foo.cpp -g -O0 -o icc.out
\$
```

Now, if I check the results with objdump (objdump -d gcc.out -C), the generated code for "foo" is quite different:

GCC version (there is no "if(dim==3)" logic left after template replacement):

```0000000000400600 <void foo<3>()>:
400600:       55                      push   %rbp
400601:       48 89 e5                mov    %rsp,%rbp
400604:       bf fc 06 40 00          mov    \$0x4006fc,%edi
400609:       e8 5a fe ff ff          callq  400468 <puts@plt>
40060e:       5d                      pop    %rbp
40060f:       c3                      retq
```

ICC version (it treats dim as a variable instead of substituting the template argument, as we would expect):

```00000000004006b0 <void foo<3>()>:
4006b0:       55                      push   %rbp
4006b1:       48 89 e5                mov    %rsp,%rbp
4006b4:       48 83 ec 10             sub    \$0x10,%rsp
4006b8:       b8 00 00 00 00          mov    \$0x0,%eax
4006bd:       83 f8 01                cmp    \$0x1,%eax
4006c0:       74 15                   je     4006d7 <void foo<3>()+0x27>
4006c2:       b8 ec 07 40 00          mov    \$0x4007ec,%eax
4006c7:       48 89 c7                mov    %rax,%rdi
4006ca:       b8 00 00 00 00          mov    \$0x0,%eax
4006cf:       e8 3c fe ff ff          callq  400510 <printf@plt>
4006d4:       89 45 f0                mov    %eax,-0x10(%rbp)
4006d7:       b8 00 00 00 00          mov    \$0x0,%eax
4006dc:       85 c0                   test   %eax,%eax
4006de:       74 15                   je     4006f5 <void foo<3>()+0x45>
4006e0:       b8 f8 07 40 00          mov    \$0x4007f8,%eax
4006e5:       48 89 c7                mov    %rax,%rdi
4006e8:       b8 00 00 00 00          mov    \$0x0,%eax
4006ed:       e8 1e fe ff ff          callq  400510 <printf@plt>
4006f2:       89 45 f4                mov    %eax,-0xc(%rbp)
4006f5:       c9                      leaveq
4006f6:       c3                      retq```

In this case, the compiler front end just treats "dim" as a variable and passes it to the compiler back end. For this very simple test case, the code will still be well optimized with -O2/-O3. But for your code, if "dim==3" is not resolved by the front end, it may prevent the loop from being vectorized during back-end optimization. You can check the vectorization report to analyze your code and see whether the loop is still vectorized when the template is used.

I will submit my findings to the developers to verify whether this is a bug. If you can send me a complete test case (which can be built), it will be much easier for me to convince the developers that this is unexpected, and they can also validate their fix against your test case (it should be very easy for you: you can just malloc some memory and invoke your "advecu" function). I will also see whether I can construct a test with loops that shows the issue. :)

I hope the above analysis helps you to understand the "bug" and why it would affect performance.

Thanks,

Shenghong

Highlighted
Beginner
6 Views

Dear Shenghong,

I have made a simple test program and found that the templating works properly there. In my own code I have removed the template from its class and made it a global function, and then the code optimizes fine. I have attached my working example, for what it is worth, since the error seems to be more complex.

Chiel

Highlighted
Employee
6 Views

Thank you, Chiel, for the information. Does this mean the template is not the root cause of the performance issue?

I also tried writing some code, and even when the template is not resolved in the compiler front end, it is always well optimized in the back end and performance is not affected... :(

I also tested your code just now, and I get the same performance with and without the template (about 6.6 s).

You may submit a new post or reply here if you figured out more on the root cause of your performance degradation. :)

Thanks,

Shenghong

Highlighted
Beginner
6 Views

Dear Shenghong, I have figured out the cause in the meantime. Our template is a class member function. What I have noticed is that if I take my kernels, which are private functions of the class, out of the class, the templates work as expected.

If I take my old version and make the functions templates, but never use the template argument (dim), then I already lose the performance. It therefore seems that the compiler does not handle template class members well.

Highlighted
Employee
6 Views

Hi Chiel,

Thank you for the clarification.

Could you please send me your test case to reproduce this? Would writing a simple class containing the template member function be enough to show the issue?

Thanks,

Shenghong

Highlighted
Beginner
6 Views

Dear Shenghong,

This turns out to be one of those nasty problems. I have not managed to produce a simple test program that reproduces it, because they all work perfectly, but neither can I figure out why making a function a template, even though nothing happens with the template arguments, would slow it down. An interesting detail is that the Clang compiler shows the same behavior, whereas GCC works well. I am puzzled. Our complete code can be found at https://github.com/microhh/microhh, but it is a bit too big to easily pinpoint the problem.

Cheers,

Chiel

Highlighted
Beginner
6 Views

Dear Shenghong,

After searching for a long time, I found the problem. As soon as you make the function a template, the compiler can no longer vectorize it if the declaration in the header does not contain the restrict keyword:

```// declaration leading to fast code
// (parameter types carry restrict, matching the definition below)
template<int>
void advecu(double * restrict, double * restrict, double * restrict, double * restrict, double * restrict);

// declaration leading to slow code
// (same signature but without restrict on the parameters; an alternative to the
// declaration above, as it would appear in the header)
template<int>
void advecu(double *, double *, double *, double *, double *);

// identical implementation for both
// (the definition itself always has restrict on every parameter)
template<int dim3>
void advecu(double * restrict ut, double * restrict u, double * restrict v, double * restrict w, double * restrict dzi4)
{
// Implementation...
}```

Cheers,

Chiel