Intel® oneAPI DPC++/C++ Compiler
Talk to fellow users of Intel® oneAPI DPC++/C++ Compiler and companion tools like Intel® oneAPI DPC++ Library, Intel® DPC++ Compatibility Tool, and Intel® Distribution for GDB*

Question about a warning “warning: reduction.hpp:2814:0:”

PC-1
Beginner

Hi, I'm new to Intel DPC++ programming.

I recently received the following warning:

warning: reduction.hpp:2814:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering

What is the cause of this warning? Is it due to the parallel_for calls with reduction objects in my code?

I'm using the Intel DPC++ compiler 2024.2. There are multiple parallel reductions in my code; one of them is shown below.

Thanks!!

sycl::buffer buf_Nd_coords(Nd_coords_cref);
sycl::buffer buf_tet4_enTags_Vi(Vents_cref[i].Get_ALLtet4_enTags_cref());

sycl::buffer<float> buf_xlen_sumVi(&xlen_sumVi, 1);
sycl::buffer<float> buf_ylen_sumVi(&ylen_sumVi, 1);
sycl::buffer<float> buf_zlen_sumVi(&zlen_sumVi, 1);

Q.submit([&](sycl::handler& cgh) { //command group lambda

    sycl::range<1> tot_workers(num_of_wkgrps * wkgrp_sz);
    sycl::range<1> workers_in_grp(wkgrp_sz);
    sycl::nd_range<1> nd_rg(tot_workers, workers_in_grp);

    sycl::accessor acc_Nd_coords(buf_Nd_coords, cgh, sycl::read_only);
    sycl::accessor acc_tet4_enTags_Vi(buf_tet4_enTags_Vi, cgh, sycl::read_only);

    auto Ruc_xlen_sumVi = sycl::reduction(buf_xlen_sumVi, cgh, sycl::plus<>());
    auto Ruc_ylen_sumVi = sycl::reduction(buf_ylen_sumVi, cgh, sycl::plus<>());
    auto Ruc_zlen_sumVi = sycl::reduction(buf_zlen_sumVi, cgh, sycl::plus<>());

    cgh.parallel_for(nd_rg, Ruc_xlen_sumVi, Ruc_ylen_sumVi, Ruc_zlen_sumVi,
        [=](sycl::nd_item<1> it,
            auto& reducer_xlen_sumVi,
            auto& reducer_ylen_sumVi,
            auto& reducer_zlen_sumVi) { //kernel lambda

            //Distribute total work to current work group
            uint_t wkgrp_st, wkgrp_end;
            PC::dpcpp_common::Distribute_range_to_wkgrps(wkgrp_st, wkgrp_end,
                it.get_group_linear_id(), num_of_wkgrps, num_tet4_Vi);

            //partial sum for an individual worker thread
            float partial_xlen_sum_thd = 0.f;
            float partial_ylen_sum_thd = 0.f;
            float partial_zlen_sum_thd = 0.f;

            for (uint_t idx = wkgrp_st + it.get_local_linear_id();
                 idx < wkgrp_end; idx += it.get_local_range(0)) {

                const tet4_enTags& tet4_elm = acc_tet4_enTags_Vi[idx];
                tet4_cont tet4c(tet4_elm, 0, 0);

                tet4_Ndcoords XI;
                tet4c.Fetch_Nd_coords(acc_Nd_coords, XI);

                sycl::float3 contBx_sz_tet4_i;
                tet4c.Get_contBox_sz(contBx_sz_tet4_i, XI);

                //accumulate partial sum of current thread
                partial_xlen_sum_thd += contBx_sz_tet4_i[0];
                partial_ylen_sum_thd += contBx_sz_tet4_i[1];
                partial_zlen_sum_thd += contBx_sz_tet4_i[2];
            }

            reducer_xlen_sumVi.combine(partial_xlen_sum_thd);
            reducer_ylen_sumVi.combine(partial_ylen_sum_thd);
            reducer_zlen_sumVi.combine(partial_zlen_sum_thd);
        } //end of kernel lambda
    ); //end of handler::parallel_for
} //end of cgh lambda
); //end of queue::submit

Q.wait_and_throw();
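For reference, here is a stripped-down, self-contained sketch of the same reduction pattern. It is not my actual code: the application-specific mesh types are replaced by a plain sycl::float3 array, the sizes are placeholders, and a simple grid-stride loop stands in for my Distribute_range_to_wkgrps helper. It only shows the parallel_for-with-multiple-reductions structure in question.

#include <sycl/sycl.hpp>
#include <vector>

int main() {
    // placeholder sizes, chosen only for illustration
    constexpr size_t num_of_wkgrps = 64;
    constexpr size_t wkgrp_sz      = 256;
    constexpr size_t num_elems     = 100000;

    std::vector<sycl::float3> data(num_elems, sycl::float3{1.f, 2.f, 3.f});
    float xsum = 0.f, ysum = 0.f, zsum = 0.f;

    sycl::queue Q;
    {
        sycl::buffer<sycl::float3> buf_data(data.data(), sycl::range<1>(num_elems));
        sycl::buffer<float> buf_x(&xsum, 1), buf_y(&ysum, 1), buf_z(&zsum, 1);

        Q.submit([&](sycl::handler& cgh) {
            sycl::nd_range<1> nd_rg(num_of_wkgrps * wkgrp_sz, wkgrp_sz);

            sycl::accessor acc_data(buf_data, cgh, sycl::read_only);

            // three reduction objects attached to one parallel_for, as in the real code
            auto red_x = sycl::reduction(buf_x, cgh, sycl::plus<>());
            auto red_y = sycl::reduction(buf_y, cgh, sycl::plus<>());
            auto red_z = sycl::reduction(buf_z, cgh, sycl::plus<>());

            cgh.parallel_for(nd_rg, red_x, red_y, red_z,
                [=](sycl::nd_item<1> it, auto& rx, auto& ry, auto& rz) {
                    float px = 0.f, py = 0.f, pz = 0.f;

                    // grid-stride loop: each work-item accumulates private partial sums
                    for (size_t idx = it.get_global_linear_id();
                         idx < num_elems;
                         idx += it.get_global_range(0)) {
                        px += acc_data[idx][0];
                        py += acc_data[idx][1];
                        pz += acc_data[idx][2];
                    }

                    // fold each work-item's partial sums into the reductions
                    rx.combine(px);
                    ry.combine(py);
                    rz.combine(pz);
                });
        });
        Q.wait_and_throw();
    } // buffers go out of scope here; results are copied back to xsum/ysum/zsum
    return 0;
}

The real code differs only in how each work group's index range is computed and in the per-element geometry work done inside the loop.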
1 Reply
Alex_Y_Intel
Moderator

Please provide a runnable reproducer and the exact commands you used, so that we can reproduce your issue.
