Intel® C++ Compiler
Community support and assistance for creating C++ code that runs on platforms based on Intel® processors.
7954 Discussions

Scan Directive with Worksharing

Chris_Szalwinski
New Contributor I
1,961 Views

Hi,

I receive an error: "inclusive_scan directive must be closely nested inside a simd loop directive"  when I compile the following code.  

// Prefix Scan Example - Worksharing Scan
// scan.h
// after McCool et al. (2012)

#include <omp.h>

// Inclusive prefix scan (running sum) workshared across an OpenMP team.
//
// After the call, out[i] holds the running sum through in[i].
//
//   in      - source data (size elements)
//   out     - output data (size elements)
//   size    - number of elements in each data set
//   initial - reduction variable the running sum accumulates into
//             NOTE(review): the callers shown with this listing pass 0;
//             how a non-zero starting value is combined under the inscan
//             reduction should be confirmed against the OpenMP
//             specification.
//
// Returns the number of threads in the team that ran the scan.
template <typename T>
int incl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {

    int nt{ 1 };
#pragma omp parallel
    {
        // only one thread records the team size: letting every thread
        // store to the shared nt would be a data race
#pragma omp single nowait
        nt = omp_get_num_threads();
#pragma omp for reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            initial += in[i]; // input phase: fold in this element
#pragma omp scan inclusive(initial) // error flagged here
            out[i] = initial; // scan phase: sum through in[i]
        }
    }
    return nt;
}

// Exclusive prefix scan (running sum) workshared across an OpenMP team.
//
// After the call, out[i] holds the running sum of the elements that
// precede in[i]. Nothing is written when size <= 0.
//
//   in      - source data (size elements)
//   out     - output data (size elements)
//   size    - number of elements in each data set
//   initial - reduction variable the running sum accumulates into
//             NOTE(review): the callers shown with this listing pass 0;
//             how a non-zero starting value is combined under the inscan
//             reduction should be confirmed against the OpenMP
//             specification.
//
// Returns the number of threads in the team that ran the scan
// (1 when size <= 0, because no parallel region is entered).
template <typename T>
int excl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {

    int nt{ 1 };
    // guard: an empty data set would otherwise store to out[-1] below
    if (size > 0) {
#pragma omp parallel
        {
            // only one thread records the team size: letting every thread
            // store to the shared nt would be a data race
#pragma omp single nowait
            nt = omp_get_num_threads();
#pragma omp for reduction(inscan, +:initial)
            for (int i = 0; i < size - 1; i++) {
                out[i] = initial; // scan phase: sum of elements before in[i]
#pragma omp scan exclusive(initial) // error flagged here
                initial += in[i]; // input phase: fold in this element
            }
        }
        // the last element is stored once, after the team has joined,
        // instead of by every thread inside the parallel region
        out[size - 1] = initial;
    }
    return nt;
}

// Runs the supplied scan implementation over a data set.
//
//   in      - source data
//   out     - output data
//   size    - size of source, output data sets
//   scan_fn - scan function (inclusive or exclusive), callable as
//             scan_fn(in, out, size, initial)
//   initial - initial value, forwarded unchanged to scan_fn
//
// Returns whatever scan_fn returns (the scan functions in this listing
// return the number of threads used).
template <typename T, typename S>
int scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of source, output data sets
    S scan_fn, // scan function (inclusive or exclusive)
    T initial // initial value
)
{
    // forward the caller's initial value rather than a hard-coded T(0)
    return scan_fn(in, out, size, initial);
}

// Workshop 4 - Prefix Scan - Serial
// scan.cpp
// 2020.10.12
// Chris Szalwinski

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>
#include "scan.h"

// report elapsed time
//
// Writes "<msg> - took - <count> microseconds" to standard output, where
// <count> is span converted to whole microseconds.
//
//   msg  - label printed in front of the duration
//   span - elapsed time to report
void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    const auto elapsed =
        std::chrono::duration_cast<std::chrono::microseconds>(span);
    const auto micros = elapsed.count();
    std::cout << msg << " - took - " << micros << " microseconds" << std::endl;
}

// Runs and times the inclusive and exclusive prefix scans.
//
// With no argument the 9-element test set is scanned. With one argument p
// the data set holds 2^p elements (never fewer than the test set); the
// elements beyond the test set are 1s.
//
// Returns 0 on success, 1 on a usage error.
int main(int argc, char** argv) {
    if (argc > 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << "\n";
        std::cerr << "Usage: " << argv[0] << " power_of_2\n";
        return 1;
    }
    std::cout << "Worksharing Prefix Scan" << std::endl;

    // initial values for testing
    const int N = 9;
    const int in_[N]{ 3, 1, 7, 0, 1, 4, 5, 9, 2 };

    // command line arguments - none for testing, 1 for large arrays
    int n = N;
    if (argc == 2) {
        // 1 << p is only defined while the result still fits in an int,
        // so reject anything that is not a whole number in [0, maxPower]
        const long maxPower = std::numeric_limits<int>::digits - 1;
        char* end = nullptr;
        const long power = std::strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || power < 0 || power > maxPower) {
            std::cerr << argv[0] << ": power_of_2 must be an integer in [0, "
                << maxPower << "]\n";
            return 1;
        }
        n = 1 << power;
        if (n < N) n = N;
    }

    // initialize: test values first, every remaining element is 1
    const auto count = static_cast<std::size_t>(n);
    std::vector<int> in(count, 1);
    std::vector<int> out(count);
    for (int i = 0; i < N; i++)
        in[i] = in_[i];

    std::chrono::steady_clock::time_point ts, te;
    int nt{ 1 };

    // Inclusive Prefix Scan - Remove Startup Cost
    scan(in.data(), out.data(), n, incl_scan<int>, 0);

    // Inclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in.data(), out.data(), n, incl_scan<int>, 0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Inclusive Scan", te - ts);

    // Exclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in.data(), out.data(), n, excl_scan<int>, 0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Exclusive Scan", te - ts);

    // Exclusive Prefix Scan derived from the inclusive scan: the inclusive
    // results are printed shifted one place to the right behind a leading 0
    ts = std::chrono::steady_clock::now();
    nt = scan(in.data(), out.data(), n, incl_scan<int>, 0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    std::cout << 0 << ' ';
    for (int i = 0; i < N - 1; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 2] << std::endl;
    reportTime("Exclusive Scan", te - ts);
}

 

My code for no worksharing compiles successfully

// Prefix Scan Example - SIMD Scan
// scan.cpp
// after McCool et al. (2012)

#include <omp.h>

// Inclusive prefix scan (running sum) using an OpenMP simd loop.
//
// After the call, out[k] holds the running sum through in[k].
//
//   in      - source data (size elements)
//   out     - output data (size elements)
//   size    - number of elements in each data set
//   initial - reduction variable the running sum accumulates into
//
// Always returns 1: there is no parallel region, so one thread does the work.
template <typename T>
int incl_scan(const T* in, T* out, int size, T initial) {
#pragma omp simd reduction(inscan, +:initial)
    for (int k = 0; k < size; ++k) {
        // input phase: fold in the current element
        initial += in[k];
#pragma omp scan inclusive(initial)
        // scan phase: publish the sum through in[k]
        out[k] = initial;
    }

    const int threads = 1;
    return threads;
}

// Exclusive prefix scan (running sum) using an OpenMP simd loop.
//
// After the call, out[k] holds the running sum of the elements that
// precede in[k]. Nothing is written when size <= 0.
//
//   in      - source data (size elements)
//   out     - output data (size elements)
//   size    - number of elements in each data set
//   initial - reduction variable the running sum accumulates into
//
// Always returns 1: there is no parallel region, so one thread does the work.
template <typename T>
int excl_scan(const T* in, T* out, int size, T initial) {
    // empty data set: nothing to scan
    if (size <= 0)
        return 1; // 1 thread

    const int last = size - 1;
#pragma omp simd reduction(inscan, +:initial)
    for (int k = 0; k < last; ++k) {
        // scan phase: publish the sum of everything before in[k]
        out[k] = initial;
#pragma omp scan exclusive(initial)
        // input phase: fold in the current element
        initial += in[k];
    }
    // the loop stops one short; the final slot takes the accumulated total
    out[last] = initial;
    return 1; // 1 thread
}

// Runs the supplied scan implementation over a data set.
//
//   in      - source data
//   out     - output data
//   size    - size of source, output data sets
//   scan_fn - scan function (exclusive or inclusive), callable as
//             scan_fn(in, out, size, initial)
//   initial - initial value, forwarded unchanged to scan_fn
//
// Returns whatever scan_fn returns (the scan functions in this listing
// return the number of threads used).
template <typename T, typename S>
int scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of source, output data sets
    S scan_fn, // scan function (exclusive or inclusive)
    T initial // initial value
)
{
    // forward the caller's initial value rather than a hard-coded T(0)
    return scan_fn(in, out, size, initial);
}

I don't understand why the error message refers to 'simd' when the worksharing code contains no simd directive.

Any thoughts?

Chris

0 Kudos
11 Replies
jimdempseyatthecove
Honored Contributor III
1,940 Views

Try adding simd to your parallel for

#pragma omp for simd reduction(inscan, +:initial)

etc...

Jim Dempsey

0 Kudos
Chris_Szalwinski
New Contributor I
1,926 Views

I've tried adding simd but no joy:

// Inclusive prefix scan (running sum) using a combined OpenMP
// worksharing + simd loop.
//
// After the call, out[i] holds the running sum through in[i].
//
//   in      - source data (size elements)
//   out     - output data (size elements)
//   size    - number of elements in each data set
//   initial - reduction variable the running sum accumulates into
//
// Returns the number of threads in the team that ran the scan.
template <typename T>
int incl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {
    int nt = 1;
#pragma omp parallel
    {
        // only one thread records the team size: letting every thread
        // store to the shared nt would be a data race
#pragma omp single nowait
        nt = omp_get_num_threads();
#pragma omp for simd reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            initial += in[i]; // input phase: fold in this element
#pragma omp scan inclusive(initial)
            out[i] = initial; // scan phase: sum through in[i]
        }
    }
    return nt; // number of threads in the team
}

// Exclusive prefix scan (running sum) using a combined OpenMP
// worksharing + simd loop.
//
// After the call, out[i] holds the running sum of the elements that
// precede in[i]. Nothing is written when size <= 0.
//
//   in      - source data (size elements)
//   out     - output data (size elements)
//   size    - number of elements in each data set
//   initial - reduction variable the running sum accumulates into
//
// Returns the number of threads in the team that ran the scan
// (1 when size <= 0, because no parallel region is entered).
template <typename T>
int excl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {

    int nt = 1;
    if (size > 0) {
#pragma omp parallel
        {
            // only one thread records the team size: letting every thread
            // store to the shared nt would be a data race
#pragma omp single nowait
            nt = omp_get_num_threads();
#pragma omp for simd reduction(inscan, +:initial)
            for (int i = 0; i < size - 1; i++) {
                out[i] = initial; // scan phase: sum of elements before in[i]
#pragma omp scan exclusive(initial)
                initial += in[i]; // input phase: fold in this element
            }
        }
        // the loop stops one short; the final slot takes the total
        out[size - 1] = initial;
    }
    return nt;
}

Same error message at the scan directives: "inclusive_scan (or exclusive_scan) directive must be closely nested inside a simd loop directive"

Chris

0 Kudos
jimdempseyatthecove
Honored Contributor III
1,908 Views

While I cannot address your compiler issue regarding the reduction(inscan...), I can address the topic of a parallel inclusive scan:

https://software.intel.com/content/www/us/en/develop/articles/elusive-algorithms-parallel-scan.html

Jim Dempsey

0 Kudos
AbhishekD_Intel
Moderator
1,893 Views

Hi,


Thanks for reaching out to us. We are forwarding this issue to the SME. They will get back to you.



Warm Regards,

Abhishek


0 Kudos
Viet_H_Intel
Moderator
1,883 Views

Seems like an issue with icpc. Let me work with our Compiler Developer.

Here is a smaller test case; compiled with g++ (8.1.0)

$ g++ -fopenmp scan.cpp

$ icpc -fopenmp scan.cpp

scan.cpp(7): error: inclusive_scan directive must be closely nested inside a simd loop directive

  #pragma omp scan inclusive(initial)

                ^

     detected during instantiation of "int incl_scan(const T *, T *, int, T) [with T=int]" at line 21


compilation aborted for scan.cpp (code 2)

$ cat scan.cpp

// Reduced test case for the icpc diagnostic quoted above: a scan directive
// placed in a plain for loop inside a parallel region, with no loop
// directive and no inscan reduction clause.
// NOTE(review): compile-only reproducer - the loop has no end condition and
// the function has no return statement, so it is not meant to be executed.
template <typename T>

int incl_scan( const T* in, T* out, int size, T initial) {


#pragma omp parallel

{

 for (int i = 0;; i++) {

 #pragma omp scan inclusive(initial)

 out[i] = initial;

 }

};

}

// Stub for the reduced test case: never calls scan_fn and ignores in, out
// and initial; it only returns size.
template <typename T, typename S>

int scan(const T* in, T* out,int size,S scan_fn, T initial)

{

  return size ;

}

// Driver for the reduced test case: naming incl_scan<int> as an argument is
// what makes the compiler instantiate the template.
// NOTE(review): n is read without being initialized and the arrays are never
// freed - acceptable only because this case is meant to be compiled, not run.
int main(int argc, char** argv) {

  int n;

  int* in = new int[n];

  int* out = new int[n];

  scan(in, out, n, incl_scan<int>, (int)0);

}



0 Kudos
Chris_Szalwinski
New Contributor I
1,862 Views

This simplification removes the for construct that was present in the original question

0 Kudos
Chris_Szalwinski
New Contributor I
1,852 Views

Here is the simplified version, which encounters the same error : inclusive_scan directive must be closely nested inside a simd loop directive

 

// scan.h

#include <omp.h>

template <typename T>
int incl_scan(
  const T* in, // source data
  T* out, // output data
  int size, // size of data sets
  T initial // initial value
) {

  int nt{ 1 };
#pragma omp parallel
  {
    nt = omp_get_num_threads();
#pragma omp for simd reduction(inscan, +:initial)
    for (int i = 0; i < size; i++) {
      initial += in[i];
#pragma omp scan inclusive(initial) // error flagged here
      out[i] = initial;
    }
  }
  return nt;
}

// scan.cpp
// 2020.10.12
// Chris Szalwinski

#include <iostream>
#include "scan.h"

// Driver for the simplified test case: runs one inclusive scan over a
// fixed 9-element data set. Command-line arguments are not used.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    std::cout << "Worksharing Prefix Scan" << std::endl;

    // fixed test data; automatic arrays need no new[]/delete[] pairing
    const int n = 9;
    const int in[n] = { 3, 1, 7, 0, 1, 4, 5, 9, 2 };
    int out[n] = {};

    incl_scan(in, out, n, 0);
}

Chris

0 Kudos
Viet_H_Intel
Moderator
1,876 Views

This is a coding issue. The test case doesn't compile with gcc10 or clang11.

You can try those compilers on https://godbolt.org/, and you will see these error messages.


<source>: In function 'int incl_scan(const T*, T*, int, T)':

<source>:7:10: error: '#pragma omp scan' may only be used in a loop construct with 'inscan' 'reduction' clause

7 | #pragma omp scan inclusive(initial)

| ^~~

<source>: In function 'int incl_scan(const T*, T*, int, T) [with T = int]':

<source>:11:1: warning: control reaches end of non-void function [-Wreturn-type]

11 | }

| ^

Compiler returned: 1


Our Developer said: "It looks like gcc was fixed to issue this message in 2019. The pragma over the for loop isn't correct. Also the for loop shouldn't be nested in a block."



0 Kudos
Viet_H_Intel
Moderator
1,847 Views

Thanks for the test case. We'll look into it.


0 Kudos
Viet_H_Intel
Moderator
1,525 Views

Hi Chris,


Is it possible to use icpx instead of icpc?


Thanks,



0 Kudos
Viet_H_Intel
Moderator
1,143 Views

Hi,


Not sure if you are aware, but the Intel Classic Compiler will enter "Legacy Product Support" mode, signaling the end of regular updates. For that reason, we won't fix this issue in icpc. Please migrate your code to icx/icpx.

I am going to close this thread as "wont fix".

https://www.intel.com/content/www/us/en/developer/articles/technical/adoption-of-llvm-complete-icx.h...


Sorry for the inconvenience.



0 Kudos
Reply