- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I'm trying to speed up some code using auto vectorization from the Intel Compiler and using SSE. All computations transform one struct node_t into another struct w_t (functions tr() and gen_tr()). When I try to vectorize the function gen_tr(), it does not produce any effect.
If I change the data storage format so that each struct component is stored in a separate array of floats, then auto vectorization works well — see function genv_tr().
The function that uses SSE is called ssev_tr() (N should be evenly divisible by 4).
transform.c:
#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>

/*
 * Read the x86 time-stamp counter (rdtsc) for cycle-accurate timing.
 * NOTE(review): assumes a 64-bit unsigned long (x86_64) — confirm target.
 */
static __inline__ unsigned long getCC(void)
{
    unsigned a, d;
    __asm__ __volatile__("rdtsc" : "=a" (a), "=d" (d));
    return ((unsigned long)a) | (((unsigned long)d) << 32);
}

/* Array-of-structures input element. */
typedef struct {
    float x1, x2, x3, x4, x5;
} node_t;

/* Array-of-structures output element. */
typedef struct {
    float w1, w2, w3, w4;
} w_t;

/*
 * Transform one node into one w_t using coefficients c1 and c2:
 *   w1/w2 = x1 -/+ x3*c1,  w3/w4 = x2 -/+ x4*c2.
 */
void tr(node_t *n, float c1, float c2, w_t *w)
{
    const float nv   = n->x1;
    const float N00T = n->x3 * c1;
    const float n1v  = n->x2;
    const float N01T = n->x4 * c2;

    w->w1 = nv - N00T;
    w->w2 = nv + N00T;
    w->w3 = n1v - N01T;
    w->w4 = n1v + N01T;
}

/* AoS loop: apply tr() to each of the N elements. */
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int i;
#pragma vector aligned
#pragma ivdep
    for (i = 0; i < N; i++) {
        tr(n + i, c1, c2, w + i);
    }
}

/*
 * SoA loop: each struct component lives in its own float array.
 * The forum software stripped the [i] subscripts from the original
 * post; they are restored here (without them the code multiplies
 * pointers by floats and does not compile).
 */
__attribute__ ((noinline))
void genv_tr(float *x1, float *x2, float *x3, float *x4, float *x5,
             float *w1, float *w2, float *w3, float *w4,
             const int N, float c1, float c2)
{
    int i;
    (void)x5;  /* unused; kept so the signature matches ssev_tr() */
#pragma vector aligned
#pragma ivdep
    for (i = 0; i < N; i++) {
        const float N00T = x3[i] * c1;
        const float N01T = x4[i] * c2;
        w1[i] = x1[i] - N00T;
        w2[i] = x1[i] + N00T;
        w3[i] = x2[i] - N01T;
        w4[i] = x2[i] + N01T;
    }
}

/*
 * Hand-written SSE version of genv_tr(). N must be evenly divisible
 * by 4 and all arrays must be 16-byte aligned ([i] subscripts restored
 * here as well).
 */
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5,
             float *w1, float *w2, float *w3, float *w4,
             const int N, float c1, float c2)
{
    __m128 *ws1 = (__m128*)w1;
    __m128 *ws2 = (__m128*)w2;
    __m128 *ws3 = (__m128*)w3;
    __m128 *ws4 = (__m128*)w4;
    __m128 *xs1 = (__m128*)x1;
    __m128 *xs2 = (__m128*)x2;
    __m128 *xs3 = (__m128*)x3;
    __m128 *xs4 = (__m128*)x4;
    const __m128 cs1 = _mm_set1_ps(c1);
    const __m128 cs2 = _mm_set1_ps(c2);
    int i;
    (void)x5;  /* unused; kept for signature parity */
#pragma vector aligned
#pragma ivdep
    for (i = 0; i < N / 4; i++) {
        const __m128 N00T = _mm_mul_ps(xs3[i], cs1);
        const __m128 N01T = _mm_mul_ps(xs4[i], cs2);
        ws1[i] = _mm_sub_ps(xs1[i], N00T);
        ws2[i] = _mm_add_ps(xs1[i], N00T);
        ws3[i] = _mm_sub_ps(xs2[i], N01T);
        ws4[i] = _mm_add_ps(xs2[i], N01T);
    }
}

/* Benchmark the AoS function: init, time rep calls, print cycles/element. */
#define test(func) \
    for (i = 0; i < n; i++) { \
        x[i].x1 = 1.0; \
        x[i].x2 = 2.0; \
        x[i].x3 = 2.0; \
        x[i].x4 = 2.0; \
        x[i].x5 = 2.0; \
    } \
    \
    t1 = getCC(); \
    for (i = 0; i < rep; i++) { \
        func(x, w, n, c1, c2); \
    } \
    t2 = getCC(); \
    printf("\t%f", ((double)(t2 - t1)) / n / rep);

/* Benchmark an SoA function with the same protocol. */
#define test1(func) \
    for (i = 0; i < n; i++) { \
        x1[i] = 1.0; \
        x2[i] = 2.0; \
        x3[i] = 2.0; \
        x4[i] = 2.0; \
        x5[i] = 2.0; \
    } \
    \
    t1 = getCC(); \
    for (i = 0; i < rep; i++) { \
        func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
    } \
    t2 = getCC(); \
    printf("\t%f", ((double)(t2 - t1)) / n / rep);

int main(int argc, char *argv[])
{
    if (argc < 2) {
        printf("Usage %s vector_size\n", argv[0]);
        return 1;  /* bug fix: original fell through and read argv[1] */
    }
    int n = atoi(argv[1]);
    printf("%d", n);
    int rep = 100000000 / n;
    int i;
    float c1 = 2.0, c2 = 1.0;
    unsigned long t1, t2;

    node_t *x = (node_t*)malloc(n * sizeof(node_t));
    w_t    *w = (w_t*)malloc(n * sizeof(w_t));

    float *x1 = (float*)malloc(n * sizeof(float));
    float *x2 = (float*)malloc(n * sizeof(float));
    float *x3 = (float*)malloc(n * sizeof(float));
    float *x4 = (float*)malloc(n * sizeof(float));
    float *x5 = (float*)malloc(n * sizeof(float));
    float *w1 = (float*)malloc(n * sizeof(float));
    float *w2 = (float*)malloc(n * sizeof(float));
    float *w3 = (float*)malloc(n * sizeof(float));
    float *w4 = (float*)malloc(n * sizeof(float));

    test(gen_tr);
    test1(genv_tr);
    test1(ssev_tr);
    printf("\n");

    /* release the benchmark buffers (original leaked them) */
    free(x);  free(w);
    free(x1); free(x2); free(x3); free(x4); free(x5);
    free(w1); free(w2); free(w3); free(w4);
    return 0;
}
Compile options: icc -O3 -Wall -W -vec-report6 transform.c -o transform
Version of icc - 12.1.2, OS - Fedora 16 x86_64, CPU - Intel Core2 Quad CPU Q8200.
Then I ran it with different sizes from 16 to 3000 in steps of 64; here is the script:
[bash]#!/bin/bash echo "" > run.log for ((c=16;c<3000;c+=64)) do ./transform $c | tee -a run.log done[/bash]Here are some results from running this script (size, gen_tr, genv_tr, ssev_tr); all times are shown per array element:
[plain]16 7.710743 3.168577 3.253829 272 7.166493 1.983918 2.618569 528 7.121866 1.920195 2.567109 784 7.115007 1.899451 2.549645 1040 8.104026 2.481062 2.944317 1296 8.137537 5.105032 5.104614 1552 8.118534 5.068812 5.064211 1808 8.138309 5.077831 5.085015 2064 8.149699 5.107503 5.069958 2320 8.164556 5.080981 5.099313 2576 8.151524 5.086056 5.089294 2832 8.212946 5.061927 5.072261[/plain]Why is there such a significant change around size 1000 when using the vectorized versions of the function? Is it because of cache misses? Is it possible to maintain the same speed across all data ranges?
Link Copied
2 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
It seems likely that you are seeing a cache or DTLB miss effect. Remember that hardware prefetch stops at a 4KB page boundary. Did you try adjusting the opt-prefetch level?
Did you check whether the compiler is splitting the loops, and use #pragma distribute_point to control it? Such effects occur when the loop is split between uses of the same array section.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Also, check to verify your allocations are vector aligned. Some heap managers allocate to 8-byte granularity.
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page