- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
I receive an error: "inclusive_scan directive must be closely nested inside a simd loop directive" when I compile the following code.
// Prefix Scan Example - Worksharing Scan
// scan.h
// after McCool et al. (2012)
#include <omp.h>
// Inclusive prefix sum computed with an OpenMP worksharing loop and an
// inscan reduction (the scan directive was introduced in OpenMP 5.0).
//
// in      - source data, at least `size` elements
// out     - output data, out[i] receives the running sum through in[i]
// size    - number of elements to scan
// initial - initial value of the running sum
//
// Returns the number of threads in the parallel team, or 1 when the code is
// built without OpenMP.
template <typename T>
int incl_scan(
    const T* in,   // source data
    T* out,        // output data
    int size,      // size of data sets
    T initial      // initial value
) {
    int nt{ 1 };
    #pragma omp parallel
    {
#ifdef _OPENMP
        // nt is shared by the team: let exactly one thread store the team
        // size instead of having every thread write it concurrently.
        #pragma omp single nowait
        nt = omp_get_num_threads();
#endif
        #pragma omp for reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            // input phase: fold the current element into the running sum
            initial += in[i];
            // icpc flags the following directive with the reported error
            #pragma omp scan inclusive(initial)
            // scan phase: initial now holds the inclusive prefix sum
            out[i] = initial;
        }
    }
    return nt;
}
// Exclusive prefix sum computed with an OpenMP worksharing loop and an
// inscan reduction (the scan directive was introduced in OpenMP 5.0).
//
// in      - source data, at least `size` elements
// out     - output data, out[i] receives the running sum of the elements
//           that precede in[i]
// size    - number of elements to scan; nothing is written when size <= 0
// initial - initial value of the running sum
//
// Returns the number of threads in the parallel team, or 1 when size <= 0
// or when the code is built without OpenMP.
template <typename T>
int excl_scan(
    const T* in,   // source data
    T* out,        // output data
    int size,      // size of data sets
    T initial      // initial value
) {
    int nt{ 1 };
    // Empty input: without this guard the final store would hit out[-1].
    if (size <= 0) return nt;

    #pragma omp parallel
    {
#ifdef _OPENMP
        // nt is shared by the team: let exactly one thread store the team
        // size instead of having every thread write it concurrently.
        #pragma omp single nowait
        nt = omp_get_num_threads();
#endif
        #pragma omp for reduction(inscan, +:initial)
        for (int i = 0; i < size - 1; i++) {
            // scan phase: initial holds the sum of the preceding elements
            out[i] = initial;
            // icpc flags the following directive with the reported error
            #pragma omp scan exclusive(initial)
            // input phase: fold the current element into the running sum
            initial += in[i];
        }
    }
    // Written once, after the parallel region has ended, rather than by
    // every thread of the team.
    out[size - 1] = initial;
    return nt;
}
// Dispatches to the supplied scan implementation.
//
// in      - source data
// out     - output data
// size    - size of source, output data sets
// scan_fn - scan function (inclusive or exclusive), called as
//           scan_fn(in, out, size, initial)
// initial - initial value, forwarded to scan_fn unchanged
//
// Returns whatever scan_fn returns.
template <typename T, typename S>
int scan(
    const T* in,   // source data
    T* out,        // output data
    int size,      // size of source, output data sets
    S scan_fn,     // scan function (inclusive or exclusive)
    T initial      // initial value
)
{
    // Forward the caller's initial value; it used to be replaced by T(0).
    return scan_fn(in, out, size, initial);
}
// Workshop 4 - Prefix Scan - Serial
// scan.cpp
// 2020.10.12
// Chris Szalwinski
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "scan.h"
// report elapsed time
//
// Writes "<msg> - took - <count> microseconds" to standard output, where
// <count> is span converted to whole microseconds, then ends the line and
// flushes the stream.
void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    using std::chrono::microseconds;
    const auto elapsed = std::chrono::duration_cast<microseconds>(span).count();
    std::cout << msg << " - took - " << elapsed << " microseconds" << std::endl;
}
// Driver: runs the inclusive and exclusive prefix scans on a small test
// vector (no argument) or on an array of 2^power_of_2 elements (one
// argument), printing the thread count, the leading results, the last
// result and the elapsed time of each run.
int main(int argc, char** argv) {
    if (argc > 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << "\n";
        std::cerr << "Usage: " << argv[0] << " power_of_2\n";
        return 1;
    }
    std::cout << "Worksharing Prefix Scan" << std::endl;

    // initial values for testing
    const int N = 9;
    const int in_[N]{ 3, 1, 7, 0, 1, 4, 5, 9, 2 };

    // command line arguments - none for testing, 1 for large arrays
    int n = N;
    int nt{ 1 };
    if (argc == 2) {
        const int exponent = std::atoi(argv[1]);
        // 1 << exponent is only defined for a 32-bit int when 0 <= exponent <= 30
        if (exponent < 0 || exponent > 30) {
            std::cerr << argv[0] << ": power_of_2 must be between 0 and 30\n";
            return 1;
        }
        n = 1 << exponent;
        if (n < N) n = N;
    }

    // initialize: test values first, every remaining element is 1
    std::vector<int> in(n, 1);
    std::vector<int> out(n);
    for (int i = 0; i < N; i++)
        in[i] = in_[i];

    std::chrono::steady_clock::time_point ts, te;

    // Inclusive Prefix Scan - Remove Startup Cost
    scan(in.data(), out.data(), n, incl_scan<int>, 0);

    // Inclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in.data(), out.data(), n, incl_scan<int>, 0);
    te = std::chrono::steady_clock::now();
    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Inclusive Scan", te - ts);

    // Exclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in.data(), out.data(), n, excl_scan<int>, 0);
    te = std::chrono::steady_clock::now();
    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Exclusive Scan", te - ts);

    // Exclusive Prefix Scan derived from an inclusive scan: print a leading
    // 0 and shift the inclusive results one position to the right
    ts = std::chrono::steady_clock::now();
    nt = scan(in.data(), out.data(), n, incl_scan<int>, 0);
    te = std::chrono::steady_clock::now();
    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    std::cout << 0 << ' ';
    for (int i = 0; i < N - 1; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 2] << std::endl;
    reportTime("Exclusive Scan", te - ts);
}
My code without worksharing (SIMD only) compiles successfully:
// Prefix Scan Example - SIMD Scan
// scan.cpp
// after McCool et al. (2012)
#include <omp.h>
// Inclusive prefix sum using an OpenMP simd loop with an inscan reduction.
//
// in      - source data, at least `size` elements
// out     - output data, out[k] receives the running sum through in[k]
// size    - number of elements to scan
// initial - initial value of the running sum
//
// Always returns 1: this variant does not open a parallel region.
template <typename T>
int incl_scan(const T* in, T* out, int size, T initial) {
    constexpr int kThreadCount = 1;

    #pragma omp simd reduction(inscan, +:initial)
    for (int k = 0; k < size; ++k) {
        // input phase: accumulate the current element
        initial += in[k];
        #pragma omp scan inclusive(initial)
        // scan phase: publish the inclusive prefix sum
        out[k] = initial;
    }

    return kThreadCount;
}
// Exclusive prefix sum using an OpenMP simd loop with an inscan reduction.
//
// in      - source data, at least `size` elements
// out     - output data, out[k] receives the running sum of the elements
//           that precede in[k]
// size    - number of elements to scan; nothing is written when size <= 0
// initial - initial value of the running sum
//
// Always returns 1: this variant does not open a parallel region.
template <typename T>
int excl_scan(const T* in, T* out, int size, T initial) {
    constexpr int kThreadCount = 1;
    if (size <= 0)
        return kThreadCount;

    const int last = size - 1;
    #pragma omp simd reduction(inscan, +:initial)
    for (int k = 0; k < last; ++k) {
        // scan phase: publish the sum of the preceding elements
        out[k] = initial;
        #pragma omp scan exclusive(initial)
        // input phase: accumulate the current element
        initial += in[k];
    }
    // the last slot is filled outside the loop, so in[last] is never read
    out[last] = initial;

    return kThreadCount;
}
// Dispatches to the supplied scan implementation.
//
// in      - source data
// out     - output data
// size    - size of source, output data sets
// scan_fn - scan function (exclusive or inclusive), called as
//           scan_fn(in, out, size, initial)
// initial - initial value, forwarded to scan_fn unchanged
//
// Returns whatever scan_fn returns.
template <typename T, typename S>
int scan(
    const T* in,   // source data
    T* out,        // output data
    int size,      // size of source, output data sets
    S scan_fn,     // scan function (exclusive or inclusive)
    T initial      // initial value
)
{
    // Forward the caller's initial value; it used to be replaced by T(0).
    return scan_fn(in, out, size, initial);
}
I don't understand why the error message mentions 'simd' when my worksharing code contains no simd directive.
Any thoughts?
Chris
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Try adding simd to your parallel for
#pragma omp for simd reduction(inscan, +:initial)
etc...
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I've tried adding simd but no joy:
// Inclusive prefix sum using a combined OpenMP `for simd` loop with an
// inscan reduction (the scan directive was introduced in OpenMP 5.0).
//
// in      - source data, at least `size` elements
// out     - output data, out[i] receives the running sum through in[i]
// size    - number of elements to scan
// initial - initial value of the running sum
//
// Returns the number of threads in the parallel team, or 1 when the code is
// built without OpenMP.
template <typename T>
int incl_scan(
    const T* in,   // source data
    T* out,        // output data
    int size,      // size of data sets
    T initial      // initial value
) {
    int nt = 1;
    #pragma omp parallel
    {
#ifdef _OPENMP
        // nt is shared by the team: let exactly one thread store the team
        // size instead of having every thread write it concurrently.
        #pragma omp single nowait
        nt = omp_get_num_threads();
#endif
        #pragma omp for simd reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            // input phase: fold the current element into the running sum
            initial += in[i];
            #pragma omp scan inclusive(initial)
            // scan phase: initial now holds the inclusive prefix sum
            out[i] = initial;
        }
    }
    return nt; // size of the parallel team
}
// Exclusive prefix sum using a combined OpenMP `for simd` loop with an
// inscan reduction (the scan directive was introduced in OpenMP 5.0).
//
// in      - source data, at least `size` elements
// out     - output data, out[i] receives the running sum of the elements
//           that precede in[i]
// size    - number of elements to scan; nothing is written when size <= 0
// initial - initial value of the running sum
//
// Returns the number of threads in the parallel team, or 1 when size <= 0
// or when the code is built without OpenMP.
template <typename T>
int excl_scan(
    const T* in,   // source data
    T* out,        // output data
    int size,      // size of data sets
    T initial      // initial value
) {
    int nt = 1;
    if (size > 0) {
        #pragma omp parallel
        {
#ifdef _OPENMP
            // nt is shared by the team: let exactly one thread store the
            // team size instead of having every thread write it concurrently.
            #pragma omp single nowait
            nt = omp_get_num_threads();
#endif
            #pragma omp for simd reduction(inscan, +:initial)
            for (int i = 0; i < size - 1; i++) {
                // scan phase: initial holds the sum of the preceding elements
                out[i] = initial;
                #pragma omp scan exclusive(initial)
                // input phase: fold the current element into the running sum
                initial += in[i];
            }
        }
        // the last slot is filled after the parallel region has ended
        out[size - 1] = initial;
    }
    return nt;
}
Same error message at the scan directives: "[in|ex]clusive_scan directive must be closely nested inside a simd loop directive"
Chris
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
While I cannot address your compiler issue regarding the reduction(inscan...), I can address the topic of a parallel inclusive scan:
https://software.intel.com/content/www/us/en/develop/articles/elusive-algorithms-parallel-scan.html
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Thanks for reaching out to us. We are forwarding this issue to the SME. They will get back to you.
Warm Regards,
Abhishek
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Seems like an issue with icpc. Let me work with our Compiler Developer.
Here is a smaller test case; compiled with g++ (8.1.0)
$ g++ -fopenmp scan.cpp
$ icpc -fopenmp scan.cpp
scan.cpp(7): error: inclusive_scan directive must be closely nested inside a simd loop directive
#pragma omp scan inclusive(initial)
^
detected during instantiation of "int incl_scan(const T *, T *, int, T) [with T=int]" at line 21
compilation aborted for scan.cpp (code 2)
$ cat scan.cpp
// Reduced test case used to reproduce the icpc diagnostic; it is not a
// working scan.
// NOTE(review): the loop below has no termination condition and no
// `omp for` / `omp simd reduction(inscan, ...)` directive, `in` and `size`
// are unused, and the function has no return statement.
template <typename T>
int incl_scan( const T* in, T* out, int size, T initial) {
#pragma omp parallel
{
for (int i = 0;; i++) {
// the diagnostic is reported on this directive
#pragma omp scan inclusive(initial)
out[i] = initial;
}
};
}
// Stub for the reduced test case: ignores in, out, scan_fn and initial and
// simply returns size.
template <typename T, typename S>
int scan(const T* in, T* out,int size,S scan_fn, T initial)
{
return size ;
}
// Driver for the reduced test case: naming incl_scan<int> as an argument is
// what instantiates the template.
// NOTE(review): n is read without ever being assigned, and the two arrays
// are never released.
int main(int argc, char** argv) {
int n;
int* in = new int[n];
int* out = new int[n];
scan(in, out, n, incl_scan<int>, (int)0);
}
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
This simplification removes the for construct that was present in the original question
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Here is the simplified version, which encounters the same error : inclusive_scan directive must be closely nested inside a simd loop directive
// scan.h
#include <omp.h>
// Inclusive prefix sum using a combined OpenMP `for simd` loop with an
// inscan reduction (the scan directive was introduced in OpenMP 5.0).
//
// in      - source data, at least `size` elements
// out     - output data, out[i] receives the running sum through in[i]
// size    - number of elements to scan
// initial - initial value of the running sum
//
// Returns the number of threads in the parallel team, or 1 when the code is
// built without OpenMP.
template <typename T>
int incl_scan(
    const T* in,   // source data
    T* out,        // output data
    int size,      // size of data sets
    T initial      // initial value
) {
    int nt{ 1 };
    #pragma omp parallel
    {
#ifdef _OPENMP
        // nt is shared by the team: let exactly one thread store the team
        // size instead of having every thread write it concurrently.
        #pragma omp single nowait
        nt = omp_get_num_threads();
#endif
        #pragma omp for simd reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            // input phase: fold the current element into the running sum
            initial += in[i];
            // icpc flags the following directive with the reported error
            #pragma omp scan inclusive(initial)
            // scan phase: initial now holds the inclusive prefix sum
            out[i] = initial;
        }
    }
    return nt;
}
// scan.cpp
// 2020.10.12
// Chris Szalwinski
#include <iostream>
#include "scan.h"
// Minimal driver: runs the inclusive scan once on a fixed 9-element vector.
// The arrays have automatic storage, so nothing needs to be released by hand.
int main(int argc, char** argv) {
    std::cout << "Worksharing Prefix Scan" << std::endl;

    const int n = 9;
    const int in[n]{ 3, 1, 7, 0, 1, 4, 5, 9, 2 };
    int out[n]{};   // zero-initialized output buffer

    incl_scan(in, out, n, 0);
}
Chris
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
This is a coding issue. The test case doesn't compile with gcc10 or clang11.
You can try those compilers on https://godbolt.org/, and will see these error messages.
<source>: In function 'int incl_scan(const T*, T*, int, T)':
<source>:7:10: error: '#pragma omp scan' may only be used in a loop construct with 'inscan' 'reduction' clause
7 | #pragma omp scan inclusive(initial)
| ^~~
<source>: In function 'int incl_scan(const T*, T*, int, T) [with T = int]':
<source>:11:1: warning: control reaches end of non-void function [-Wreturn-type]
11 | }
| ^
Compiler returned: 1
Our Developer said: "It looks like gcc was fixed to issue this message in 2019. The pragma over the for loop isn't correct. Also the for loop shouldn't be nested in a block."
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Thanks for the test case. We'll look into it.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi Chris,
Is it possible to use icpx instead of icpc?
Thanks,
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi,
Not sure if you are aware of this, but the Intel Classic Compiler will enter "Legacy Product Support" mode, signaling the end of regular updates. For that reason, we won't fix this issue in icpc. Please migrate your code to icx/icpx.
I am going to close this thread as "won't fix".
Sorry for the inconvenience.

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page