- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Everyone,
I am trying to speed up a roll up / cumulative sum of an array using SIMD. Below is the code ...
I need a shift operator to achieve this. However, it gives a casting error. I looked up the Intel Intrinsics Guide,
but all shift operators seem to work on integer vectors only (i.e. __m128i, not __m128).
Please let me know how to resolve this.
Much appreciated.
#include <iostream>
#include <emmintrin.h>
#include <immintrin.h>
// Original (non-compiling) attempt at a cumulative sum of `a` with SSE.
// Kept exactly as posted: the compiler errors quoted further down refer to
// the C-style casts on the shift line inside the inner loop.
int main(){
// Two buffers of 2^20 floats, aligned to 512/8 = 64 bytes.
// NOTE: neither buffer is written before it is read below.
float *a = (float*) _mm_malloc((1 << 20)*sizeof(float), 512/8);
float *as = (float*) _mm_malloc((1 << 20)*sizeof(float), 512/8);
// Only seri_sum is assigned in this version; vec_sum and vec_parallel_sum are unused.
float seri_sum, vec_sum, vec_parallel_sum;
// Scalar reference: in-place running sum over `as`.
for (int i = 1; i < (1 << 20); i++){
as[i] += as[i-1];
}
// After the loop the last element holds the total of the whole array.
seri_sum = as[(1 << 20)-1];
std::cout << "seri_sum" << " = " << seri_sum << std::endl;
// Running total carried from one 128-bit chunk of `a` to the next.
float sum = 0;
// The step is 128/(sizeof(float)*8), i.e. the number of floats in one __m128.
for(int i = 0; i < (1 << 20); i+=((128)/(sizeof(float)*8))){
__m128 vsum;
// Start every lane of the accumulator at the carried-in total.
vsum = _mm_load_ps1(&sum);
__m128 vec;
// Aligned load of the current chunk.
vec = _mm_load_ps(&a[i]);
vsum = _mm_add_ps(vsum,vec);
// NOTE: the condition tests j but the increment is i++, so j never changes.
for(int j = 1; j < (1 << 20); i++){
// The C-style casts between __m128 and __m128i are what the compiler rejects
// (see the quoted errors). Also, _mm_slli_epi32 shifts bits inside each 32-bit
// element; it does not move elements from one lane to the next.
vec = (__m128) _mm_slli_epi32((__m128i) vec,1);
vsum = _mm_add_ps(vsum,vec);
}
// Write the chunk back over the input, then carry its last element forward.
_mm_store_ps(&a[i],vsum);
sum = a[i+((128)/(sizeof(float)*8))-1];
}
_mm_free(a);
_mm_free(as);
return 0;
}
Below is the error message:
C:\tmp\simdtests>make clean test_vec
rm -f *.exp *.lib *.supp *.exe *.obj *.optrpt
icl.exe -Qopt-report:5 -Qopt-report-phase:all /Ob0 -Qopenmp -Qsimd -Qopenmp-simd -arch:avx -Qdiag-error-limit:5 -c test_vec.cpp -Fo:test_vec.obj
Intel(R) C++ Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.2.254 Build 20200623
Copyright (C) 1985-2020 Intel Corporation. All rights reserved.
icl: remark #10397: optimization reports are generated in *.optrpt files in the output location
test_vec.cpp
test_vec.cpp(32): error: no suitable user-defined conversion from "__m128" to "__m128i" exists
vec = (__m128) _mm_slli_epi32((__m128i) vec,1);
^
test_vec.cpp(32): error: no suitable user-defined conversion from "__m128i" to "__m128" exists
vec = (__m128) _mm_slli_epi32((__m128i) vec,1);
^
compilation aborted for test_vec.cpp (code 2)
make: *** [Makefile:32: test_vec.obj] Error 2
C:\tmp\simdtests>
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for reaching out to us.
We see that you are trying to do typecasting over here
>>vec = (__m128) _mm_slli_epi32((__m128i) vec,1);
The Intel Intrinsics Guide provides dedicated cast intrinsics for this.
For example, to cast from __m128 to __m128i you can use the intrinsic below:
__m128i _mm_castps_si128 (__m128 a)
Please refer:
Please try these cast intrinsics and let us know if you face any issues.
Thanks & Regards
Noorjahan.
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for reaching out to us.
We see that you are trying to do typecasting over here
>>vec = (__m128) _mm_slli_epi32((__m128i) vec,1);
The Intel Intrinsics Guide provides dedicated cast intrinsics for this.
For example, to cast from __m128 to __m128i you can use the intrinsic below:
__m128i _mm_castps_si128 (__m128 a)
Please refer:
Please try these cast intrinsics and let us know if you face any issues.
Thanks & Regards
Noorjahan.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thanks for the hint. I got it working as shown below, in case others are interested:
#include <iostream>
#include <emmintrin.h>
#include <immintrin.h>
// Computes an inclusive prefix sum (running total) of a small float array
// twice -- once with a plain scalar loop and once with SSE -- and prints the
// final total from each so the two can be compared.
//
// Returns 0 on success, 1 if either buffer could not be allocated.
int main(){
    // Problem size and SIMD geometry, named once instead of being repeated.
    const int kCount = 1 << 4;                                   // number of elements
    const int kLanes = static_cast<int>(128 / (sizeof(float) * 8)); // floats per __m128
    const int kAlignment = 512 / 8;                              // buffer alignment in bytes

    float *a = static_cast<float*>(_mm_malloc(kCount * sizeof(float), kAlignment));
    float *as = static_cast<float*>(_mm_malloc(kCount * sizeof(float), kAlignment));
    // Bail out cleanly instead of dereferencing a null pointer below.
    if (!a || !as) {
        std::cerr << "allocation failed" << '\n';
        if (a) _mm_free(a);
        if (as) _mm_free(as);
        return 1;
    }

    // Same input in both buffers: a is scanned with SSE, as with scalar code.
    for (int i = 0; i < kCount; ++i) {
        a[i] = i;
        as[i] = i;
    }

    // Scalar reference: in-place running sum; the last element is the total.
    for (int i = 1; i < kCount; ++i) {
        as[i] += as[i - 1];
    }
    const float seri_sum = as[kCount - 1];
    std::cout << "seri_sum" << " = " << seri_sum << '\n';

    // SSE version: process kLanes floats per iteration. kCount is a multiple
    // of kLanes, so there is no scalar tail to handle.
    float sum = 0;  // running total carried from the previous chunk
    for (int i = 0; i < kCount; i += kLanes) {
        __m128 vec = _mm_load_ps(&a[i]);
        // Every lane starts at the carried-in total plus its own element.
        __m128 vsum = _mm_add_ps(_mm_load_ps1(&sum), vec);
        for (int j = 1; j < kLanes; ++j) {
            // Move every element up by one lane (zero enters lane 0). The byte
            // shift only exists for integer vectors, so the register is
            // reinterpreted with the cast intrinsics; they convert no values.
            vec = _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128(vec), sizeof(float)));
            // After j shifts lane l holds a[i + l - j], so the accumulated
            // lane l ends up as sum + a[i] + ... + a[i + l].
            vsum = _mm_add_ps(vsum, vec);
        }
        _mm_store_ps(&a[i], vsum);
        // The top lane is the total so far; carry it into the next chunk.
        sum = a[i + kLanes - 1];
    }
    const float vec_sum = sum;
    std::cout << "vec_sum" << " = " << vec_sum << '\n';

    _mm_free(a);
    _mm_free(as);
    return 0;
}
Issue resolved; feel free to close this thread.
Much appreciated.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thank you for accepting this as a solution.
We will no longer respond to this thread.
If you require any additional assistance from Intel, please start a new thread.
Any further interaction in this thread will be considered community only.
Thanks & Regards
Noorjahan

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page