- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello,
I'm having problem with the std::sort() function called with a device_execution_policy.
The sort function is applied to a vector of tuples<int,int,int>; the problem is that in the end my vector
is not sorted.
Here, the code executed on DevCloud:
int main(int argc, char **argv) {
auto asyncHandler = [&](cl::sycl::exception_list eL) {
for (auto& e : eL) {
try {
std::rethrow_exception(e);
}catch (cl::sycl::exception& e) {
std::cout << e.what() << std::endl;
std::cout << "fail" << std::endl;
// std::terminate() will exit the process, return non-zero, and output a
// message to the user about the exception
std::terminate();
}
}
};
cl::sycl::queue q(cpu_selector{});
const int n = 100000000;
vector<std::tuple<int,int,int>> keys(n);
for_each(keys.begin(), keys.end(),[](std::tuple<int,int,int>& value){ get<0>(value)=(int)(std::rand()%10000);get<1>(value)=(int)(std::rand()%10000);get<2>(value)=(int)(std::rand()%10000);});
vector<int> res_val(n);
vector<std::tuple<int,int,int>> res_key(n);
auto start=std::chrono::system_clock::now();
auto policy1 = device_policy<parallel_unsequenced_policy, class Policy1>{q};
sort(policy1, keys.begin(), keys.end(), [](tuple<int,int,int> t1, tuple<int,int,int> t2){
return ( get<0>(t1)<get<0>(t2)
|| (get<0>(t1)==get<0>(t2) && get<1>(t1)<get<1>(t2) )
|| (get<0>(t1)==get<0>(t2) && get<1>(t1)==get<1>(t2) && get<2>(t1)<get<2>(t2)) );
});
auto end=std::chrono::system_clock::now();
auto red_time=std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count();
std::cout<<(float)red_time/1000<<std::endl;
for(int i=0; i<10; i++){
cout<<"("<< get<0>(keys[i])<<","<<get<1>(keys[i])<<","<<get<2>(keys[i])<<")"<<std::endl;
}
return 0;
}
Thank you.
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Eugenio,
We have tried your code but got syntax errors in lots of places in your code. So we tried to rectify your code and it is working now and giving correct results. Please check the below code and try it on your end and update us if it solved your problem.
#include <CL/sycl.hpp>
#include <iostream>
#include <chrono>
#include <vector>
#include <dpstd/iterator>
#include <dpstd/algorithm>
#include <dpstd/execution>
int main(int argc, char **argv) {
auto asyncHandler = [&](cl::sycl::exception_list eL) {
for (auto& e : eL) {
try {
std::rethrow_exception(e);
}catch (cl::sycl::exception& e) {
std::cout << e.what() << std::endl;
std::cout << "fail" << std::endl;
// std::terminate() will exit the process, return non-zero, and output a
// message to the user about the exception
std::terminate();
}
}
};
sycl::queue q(sycl::default_selector{}, asyncHandler);
const int n = 100000000;
std::vector<std::tuple<int,int,int> > keys(n);
int i=1;
for_each(keys.begin(), keys.end(),[&](std::tuple<int,int,int>& value){
get<0>(value)=n-i;/*(int)(std::rand()%10000);*/
get<1>(value)=i;/*(int)(std::rand()%10000);*/
get<2>(value)=n-i-i;/*(int)(std::rand()%10000);*/
i++;
});
for(int i=0; i<10; i++){
std::cout<<"("<< get<0>(keys[i])<<","<<get<1>(keys[i])<<","<<get<2>(keys[i])<<")"<<std::endl;
}
std::vector<int> res_val(n);
std::vector<std::tuple<int,int,int> > res_key(n);
auto start=std::chrono::system_clock::now();
//auto policy1 = device_policy<std::execution::parallel_unsequenced_policy, class Policy1>{q};
auto policy = dpstd::execution::make_device_policy(q);
std::cout << "Runnning on "
<< q.get_device().get_info<sycl::info::device::name>()
<< "\n";
std::sort(policy, keys.begin(), keys.end(), [](std::tuple<int,int,int> t1, std::tuple<int,int,int> t2){
return ( get<0>(t1) < get<0>(t2)
|| ( get<0>(t1)==get<0>(t2) && get<1>(t1) < get<1>(t2) )
|| ( get<0>(t1)==get<0>(t2) && get<1>(t1)==get<1>(t2) && get<2>(t1)<get<2>(t2)) );
});
auto end=std::chrono::system_clock::now();
auto red_time=std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count();
std::cout<<(float)red_time/1000<<std::endl;
for(int i=0; i<10; i++){
std::cout<<"("<< get<0>(keys[i])<<","<<get<1>(keys[i])<<","<<get<2>(keys[i])<<")"<<std::endl;
}
return 0;
}
Warm Regards,
Abhishek
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Yes, I'm sorry, I didn't copy the "include file".
Anyway, I tried your version of code, and also if it works, the result seem to be wrong.
This is the output, and the vector is not sorted correctly, as you can see. It is as it was before the sorting; instead it should have values in a different order.
Maybe this is different from the output you obtained...
Thank you.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Eugenio,
We are also getting the wrong results using CPU whereas with GPU it's giving correct results. We don't the exact reason for this behavior yet but we are debugging it meanwhile you can use GPU to execute your code.
We will get back to you as soon as we get the exact reason for this behavior.
Warm Regards,
Abhishek
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Ok. In the meanwhile I post another related question:
is it normal that the sort on GPU is much slower than the sequential one, for sorting a vector<int>?
I'm talking about the same sort function, executed with an execution policy (on GPU) and without policy.
Here the output of the program, and the code:
#include <CL/sycl.hpp>
#include <iostream>
#include <chrono>
#include <vector>
#include <dpstd/iterator>
#include <dpstd/algorithm>
#include <dpstd/execution>
using namespace std;
int main(int argc, char **argv) {
auto asyncHandler = [&](cl::sycl::exception_list eL) {
for (auto& e : eL) {
try {
std::rethrow_exception(e);
}catch (cl::sycl::exception& e) {
std::cout << e.what() << std::endl;
std::cout << "fail" << std::endl;
// std::terminate() will exit the process, return non-zero, and output a
// message to the user about the exception std::terminate();
}
}
};
auto start=std::chrono::system_clock::now();
auto end=std::chrono::system_clock::now();
long time=0;
sycl::queue q(sycl::default_selector{}, asyncHandler);
const int n = 500000000;
{// TEST simple integer
std::vector<int> keys(n);
int i=1;
// Initialization
for_each(keys.begin(), keys.end(),[&](int& value){
value=n-i;
i++;
});
cout<<"Input array dimension: "<<n<<std::endl;
cout<<"First 10 values of input array: "<<std::endl;
for_each(keys.begin(), keys.begin()+10,[&](int& value){
std::cout<<value<<std::endl;
});
cout<<"-------------------------------------------------- "<<std::endl<<std::endl;
cout<<"Sequential sorting... "<<std::endl;
// Sequential std::sort
start=std::chrono::system_clock::now();
std::sort(keys.begin(), keys.end());
end=std::chrono::system_clock::now();
time=std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count();
for_each(keys.begin(), keys.begin()+10,[&](int& value){
std::cout<<value<<std::endl;
});
std::cout<<"Time: "<<(float)time/1000<<std::endl;
cout<<"-------------------------------------------------- "<<std::endl<<std::endl;
// Parallel std::sort
auto policy = dpstd::execution::make_device_policy(q);
std::cout << "Parallel sort running on: " << q.get_device().get_info<sycl::info::device::name>() << "\n";
start=std::chrono::system_clock::now();
std::sort(policy, keys.begin(), keys.end(), [](int i1, int i2){
return i1 < i2;
});
end=std::chrono::system_clock::now();
time=std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count();
for_each(keys.begin(), keys.begin()+10,[&](int& value){
std::cout<<value<<std::endl;
});
std::cout<<"Time: "<<(float)time/1000<<std::endl;
}
return 0;
}
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for finding out this problem we are also facing the same issue, so we will escalate this to the concerned team they will definitely help you on this.
Meanwhile, you can try with buffers rather than using vectors, buffers are giving correct results on all device selectors(CPU's, GPU's) and also takes less time compared to that of vectors. The below code will help you.
#include <CL/sycl.hpp>
#include <iostream>
#include <chrono>
#include <vector>
#include <dpstd/iterator>
#include <dpstd/algorithm>
#include <dpstd/execution>
int main(int argc, char **argv) {
auto asyncHandler = [&](cl::sycl::exception_list eL) {
for (auto& e : eL) {
try {
std::rethrow_exception(e);
}catch (cl::sycl::exception& e) {
std::cout << e.what() << std::endl;
std::cout << "fail" << std::endl;
// std::terminate() will exit the process, return non-zero, and output a
// message to the user about the exception
std::terminate();
}
}
};
sycl::queue q(sycl::default_selector{}, asyncHandler);
const int n = 100000000;
sycl::buffer<int> t1_buf{n};
sycl::buffer<int> t2_buf{n};
sycl::buffer<int> t3_buf{n};
auto t1_begin = dpstd::begin(t1_buf);
auto t2_begin = dpstd::begin(t2_buf);
auto t3_begin = dpstd::begin(t3_buf);
auto policy = dpstd::execution::make_device_policy(q);
std::transform(policy, dpstd::counting_iterator<int>{0}, dpstd::counting_iterator<int>{0} + n, t1_begin, [n](int i) { return n - i; });
std::transform(policy, dpstd::counting_iterator<int>{0}, dpstd::counting_iterator<int>{0} + n, t2_begin, [n](int i) { return i; });
std::transform(policy, dpstd::counting_iterator<int>{0}, dpstd::counting_iterator<int>{0} + n, t3_begin, [n](int i) { return n - i -1; });
auto zipped_begin = dpstd::make_zip_iterator(t1_begin, t2_begin, t3_begin);
std::cout << "Runnning on "
<< q.get_device().get_info<sycl::info::device::name>()
<< "\n";
auto start=std::chrono::system_clock::now();
std::sort(policy, zipped_begin, zipped_begin + n, [](auto t1, auto t2){
using std::get;
return ( get<0>(t1) < get<0>(t2) || ( get<0>(t1)==get<0>(t2) && get<1>(t1) < get<1>(t2) )
|| ( get<0>(t1)==get<0>(t2) && get<1>(t1)==get<1>(t2) && get<2>(t1)<get<2>(t2)) );
});
auto end=std::chrono::system_clock::now();
auto red_time=std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count();
std::cout<<(float)red_time/1000<<std::endl;
auto host_t11 = t1_buf.get_access<sycl::access::mode::read>();
auto host_t21 = t2_buf.get_access<sycl::access::mode::read>();
auto host_t31 = t3_buf.get_access<sycl::access::mode::read>();
for(int i=0; i<10; i++){
std::cout<<"("<< host_t11[i]<<","<< host_t21[i]<<","<< host_t31[i]<<")"<<std::endl;
}
return 0;
}
And we are escalating this issue to the concerned team.
Warm Regards,
Abhishek
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
This issue has been resolved in the oneAPI Gold release 2021.1.0

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page