Hi, I'm new to Intel DPC++ programming. I recently received this warning:
warning: reduction.hpp:2814:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
I'm wondering what causes this warning. Is it due to the parallel_for with reduction objects in my code?
I'm using the Intel DPC++ compiler 2024.2. There are multiple parallel reductions in my code; one of them is shown below.
Thanks!
sycl::buffer buf_Nd_coords(Nd_coords_cref);
sycl::buffer buf_tet4_enTags_Vi(Vents_cref[i].Get_ALLtet4_enTags_cref());
sycl::buffer<float> buf_xlen_sumVi(&xlen_sumVi, 1);
sycl::buffer<float> buf_ylen_sumVi(&ylen_sumVi, 1);
sycl::buffer<float> buf_zlen_sumVi(&zlen_sumVi, 1);

Q.submit([&](sycl::handler& cgh) { //command group lambda
    sycl::range<1> tot_workers(num_of_wkgrps * wkgrp_sz);
    sycl::range<1> workers_in_grp(wkgrp_sz);
    sycl::nd_range<1> nd_rg(tot_workers, workers_in_grp);

    sycl::accessor acc_Nd_coords(buf_Nd_coords, cgh, sycl::read_only);
    sycl::accessor acc_tet4_enTags_Vi(buf_tet4_enTags_Vi, cgh,
                                      sycl::read_only);

    auto Ruc_xlen_sumVi = sycl::reduction(buf_xlen_sumVi, cgh, sycl::plus<>());
    auto Ruc_ylen_sumVi = sycl::reduction(buf_ylen_sumVi, cgh, sycl::plus<>());
    auto Ruc_zlen_sumVi = sycl::reduction(buf_zlen_sumVi, cgh, sycl::plus<>());

    cgh.parallel_for(nd_rg, Ruc_xlen_sumVi, Ruc_ylen_sumVi, Ruc_zlen_sumVi,
        [=](sycl::nd_item<1> it,
            auto& reducer_xlen_sumVi,
            auto& reducer_ylen_sumVi,
            auto& reducer_zlen_sumVi) { //kernel lambda
            //Distribute total work to current work group
            uint_t wkgrp_st, wkgrp_end;
            PC::dpcpp_common::Distribute_range_to_wkgrps(wkgrp_st, wkgrp_end,
                it.get_group_linear_id(), num_of_wkgrps, num_tet4_Vi);

            //partial sum for an individual worker thread
            float partial_xlen_sum_thd = 0.f;
            float partial_ylen_sum_thd = 0.f;
            float partial_zlen_sum_thd = 0.f;

            for (uint_t idx = wkgrp_st + it.get_local_linear_id();
                 idx < wkgrp_end; idx += it.get_local_range(0)) {
                const tet4_enTags& tet4_elm = acc_tet4_enTags_Vi[idx];
                tet4_cont tet4c(tet4_elm, 0, 0);
                tet4_Ndcoords XI;
                tet4c.Fetch_Nd_coords(acc_Nd_coords, XI);
                sycl::float3 contBx_sz_tet4_i;
                tet4c.Get_contBox_sz(contBx_sz_tet4_i, XI);
                //accumulate partial sum of current thread
                partial_xlen_sum_thd += contBx_sz_tet4_i[0];
                partial_ylen_sum_thd += contBx_sz_tet4_i[1];
                partial_zlen_sum_thd += contBx_sz_tet4_i[2];
            }
            reducer_xlen_sumVi.combine(partial_xlen_sum_thd);
            reducer_ylen_sumVi.combine(partial_ylen_sum_thd);
            reducer_zlen_sumVi.combine(partial_zlen_sum_thd);
        } //end of kernel lambda
    ); //end of handler::parallel_for
} //end of cgh lambda
); //end of queue::submit
Q.wait_and_throw();
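
(The helper PC::dpcpp_common::Distribute_range_to_wkgrps is not shown in the post. Purely as an assumption about its behavior, a block-partitioning helper with that signature might look like the sketch below: it splits [0, total) into one contiguous block per work-group, handing the leading groups one extra element when the count does not divide evenly.)

#include <algorithm>

using uint_t = unsigned int; // assumption: uint_t is an unsigned-integer alias

// Hypothetical sketch, not the original implementation.
inline void Distribute_range_to_wkgrps(uint_t& wkgrp_st, uint_t& wkgrp_end,
                                       uint_t grp_id, uint_t num_of_wkgrps,
                                       uint_t total) {
    const uint_t base = total / num_of_wkgrps; // elements every group gets
    const uint_t rem  = total % num_of_wkgrps; // leftover elements
    wkgrp_st  = grp_id * base + std::min(grp_id, rem);
    wkgrp_end = wkgrp_st + base + (grp_id < rem ? 1u : 0u);
}

Each work-group then walks its contiguous block with a stride of it.get_local_range(0), which matches the loop in the kernel above.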
1 Reply
Please provide a runnable reproducer and the exact commands used, to demonstrate your issue.
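
For anyone preparing such a reproducer, a minimal self-contained sketch of the same buffer-plus-reduction pattern might look like the following (the file name reduce_repro.cpp and the icpx invocation in the comment are assumptions, not taken from the original post):

// reduce_repro.cpp - build with e.g.: icpx -fsycl -O2 reduce_repro.cpp
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>

int main() {
    constexpr size_t N = 1 << 20;   // number of elements to sum
    constexpr size_t wg_size = 256; // work-group size (divides N evenly)

    std::vector<float> data(N, 1.0f);
    float sum = 0.0f;

    sycl::queue Q;
    {
        sycl::buffer<float> buf_data(data.data(), sycl::range<1>(N));
        sycl::buffer<float> buf_sum(&sum, 1);

        Q.submit([&](sycl::handler& cgh) {
            sycl::accessor acc(buf_data, cgh, sycl::read_only);
            auto red = sycl::reduction(buf_sum, cgh, sycl::plus<float>());

            cgh.parallel_for(
                sycl::nd_range<1>(sycl::range<1>(N), sycl::range<1>(wg_size)),
                red,
                [=](sycl::nd_item<1> it, auto& r) {
                    r.combine(acc[it.get_global_linear_id()]);
                });
        });
        Q.wait_and_throw();
    } // buffer destruction writes the reduced result back to 'sum'

    std::cout << "sum = " << sum << '\n'; // expect 1048576
    return 0;
}

If the warning reproduces with such a reduced case, the usual Clang remark flags (e.g. -Rpass-missed=loop-vectorize; icpx is Clang-based, so it is assumed to accept them) can show exactly which loop the optimizer gave up on.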
