Community
cancel
Showing results for 
Search instead for 
Did you mean: 
Highlighted
New Contributor I
192 Views

Scan Directive with Worksharing

Hi,

I receive an error: "inclusive_scan directive must be closely nested inside a simd loop directive"  when I compile the following code.  

// Prefix Scan Example - Worksharing Scan
// scan.h
// after McCool et al. (2012)

#include <omp.h>

// Inclusive prefix scan of in[0..size) using an OpenMP worksharing loop with
// an inscan reduction. On return out[i] holds the running total accumulated
// onto initial through element i.
// Returns the number of threads in the team that executed the parallel region.
template <typename T>
int incl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {

    int nt{ 1 };
#pragma omp parallel
    {
        // nt is shared: let exactly one thread record the team size so the
        // threads do not race on the same variable
#pragma omp single nowait
        nt = omp_get_num_threads();
#pragma omp for reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            // input phase: fold the current element into the running total
            initial += in[i];
#pragma omp scan inclusive(initial) // error flagged here
            // scan phase: the total already includes in[i]
            out[i] = initial;
        }
    }
    return nt;
}

// Exclusive prefix scan of in[0..size) using an OpenMP worksharing loop with
// an inscan reduction. On return out[i] holds the running total of the
// elements before index i accumulated onto initial (out[0] is initial).
// Nothing is written when size <= 0.
// Returns the number of threads in the team that executed the parallel region
// (1 when the region is skipped).
template <typename T>
int excl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {

    int nt{ 1 };
    // guard: with size <= 0 the final store below would write before out[0]
    if (size > 0) {
#pragma omp parallel
        {
            // nt is shared: let exactly one thread record the team size so
            // the threads do not race on the same variable
#pragma omp single nowait
            nt = omp_get_num_threads();
#pragma omp for reduction(inscan, +:initial)
            for (int i = 0; i < size - 1; i++) {
                // scan phase: the total excludes in[i]
                out[i] = initial;
#pragma omp scan exclusive(initial) // error flagged here
                // input phase: fold the current element into the running total
                initial += in[i];
            }
        }
        // the loop stops one element short; store the last value once, after
        // the parallel region, instead of once per thread inside it
        out[size - 1] = initial;
    }
    return nt;
}

// Dispatches a prefix scan to the supplied scan function.
// scan_fn is called as scan_fn(in, out, size, initial) and its return value
// is passed back to the caller unchanged.
template <typename T, typename S>
int scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of source, output data sets
    S scan_fn, // scan function (inclusive or exclusive)
    T initial // initial value
)
{
    // forward the caller's initial value rather than a hard-coded T(0)
    return scan_fn(in, out, size, initial);
}

// Workshop 4 - Prefix Scan - Serial
// scan.cpp
// 2020.10.12
// Chris Szalwinski

#include <iostream>
#include <chrono>
#include "scan.h"

// print a labelled elapsed time, converted to whole microseconds
//
void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    using std::chrono::microseconds;
    const auto elapsed = std::chrono::duration_cast<microseconds>(span);
    const auto ticks = elapsed.count();
    std::cout << msg << " - took - ";
    std::cout << ticks << " microseconds" << std::endl;
}

// Driver for the prefix-scan examples.
// With no argument the scans run on a fixed nine-element test array; with one
// argument (an exponent) they run on an array of 2^exponent elements whose
// first nine values are the test data and whose remaining values are 1.
// Prints the thread count, the first nine results, the last result and the
// elapsed time for each scan. Returns 1 on invalid arguments.
int main(int argc, char** argv) {
    if (argc > 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << "\n";
        std::cerr << "Usage: " << argv[0] << " power_of_2\n";
        return 1;
    }
    std::cout << "Worksharing Prefix Scan" << std::endl;

    // initial values for testing
    const int N = 9;
    const int in_[N]{ 3, 1, 7, 0, 1, 4, 5, 9, 2 };

    // command line arguments - none for testing, 1 for large arrays
    int n = N;
    int nt{ 1 };
    if (argc == 2) {
        // 1 << exponent is only defined while the result fits in an int, so
        // reject negative exponents and exponents that reach the sign bit
        // (assumes 8-bit bytes)
        const int maxExponent = static_cast<int>(sizeof(int)) * 8 - 2;
        const int exponent = std::atoi(argv[1]);
        if (exponent < 0 || exponent > maxExponent) {
            std::cerr << argv[0] << ": power_of_2 must be between 0 and "
                      << maxExponent << "\n";
            return 1;
        }
        n = 1 << exponent;
        if (n < N) n = N;
    }
    int* in = new int[n];
    int* out = new int[n];

    // initialize: test data first, then pad with ones
    for (int i = 0; i < N; i++)
        in[i] = in_[i];
    for (int i = N; i < n; i++)
        in[i] = 1;

    std::chrono::steady_clock::time_point ts, te;

    // Inclusive Prefix Scan - Remove Startup Cost
    scan(in, out, n, incl_scan<int>, 0);

    // Inclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in, out, n, incl_scan<int>, 0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Inclusive Scan", te - ts);

    // Exclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in, out, n, excl_scan<int>, 0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Exclusive Scan", te - ts);

    // Exclusive Prefix Scan - obtained from an inclusive scan by printing a
    // leading 0 and shifting the results right by one position
    ts = std::chrono::steady_clock::now();
    nt = scan(in, out, n, incl_scan<int>, 0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    std::cout << 0 << ' ';
    for (int i = 0; i < N - 1; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 2] << std::endl;
    reportTime("Exclusive Scan", te - ts);

    delete[] in;
    delete[] out;
}

 

My code without worksharing compiles successfully:

// Prefix Scan Example - SIMD Scan
// scan.cpp
// after McCool et al. (2012)

#include <omp.h>

// Inclusive prefix scan of in[0..size) using an OpenMP simd loop with an
// inscan reduction. On return out[i] holds the running total accumulated
// onto initial through element i.
// Always returns 1, the thread count reported to the caller.
template <typename T>
int incl_scan(
const T* in, // source data
T* out, // output data
int size, // size of data sets
T initial // initial value
) {

// the scan directive inside the loop splits the body into two phases
#pragma omp simd reduction(inscan, +:initial)
for (int i = 0; i < size; i++) {
// input phase: fold the current element into the running total
initial += in[i];
#pragma omp scan inclusive(initial)
// scan phase: the total already includes in[i]
out[i] = initial;
}
return 1; // 1 thread
}

// Exclusive prefix scan of in[0..size) using an OpenMP simd loop with an
// inscan reduction. On return out[i] holds the running total of the elements
// before index i accumulated onto initial (out[0] is initial).
// Nothing is written when size <= 0.
// Always returns 1, the thread count reported to the caller.
template <typename T>
int excl_scan(
const T* in, // source data
T* out, // output data
int size, // size of data sets
T initial // initial value
) {

// guard: the store to out[size - 1] below needs at least one element
if (size > 0) {
#pragma omp simd reduction(inscan, +:initial)
for (int i = 0; i < size - 1; i++) {
// scan phase: the total excludes in[i]
out[i] = initial;
#pragma omp scan exclusive(initial)
// input phase: fold the current element into the running total
initial += in[i];
}
// the loop stops one element short; store the last value here
out[size - 1] = initial;
}
return 1; // 1 thread
}

// Dispatches a prefix scan to the supplied scan function.
// scan_fn is called as scan_fn(in, out, size, initial) and its return value
// is passed back to the caller unchanged.
template <typename T, typename S>
int scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of source, output data sets
    S scan_fn, // scan function (exclusive or inclusive)
    T initial // initial value
)
{
    // forward the caller's initial value rather than a hard-coded T(0)
    return scan_fn(in, out, size, initial);
}

I don't understand why the error message mentions 'simd' when there is no simd directive in my code.

Any thoughts?

Chris

0 Kudos
9 Replies
Highlighted
171 Views

Try adding simd to your parallel for

#pragma omp for simd reduction(inscan, +:initial)

etc...

Jim Dempsey

0 Kudos
Highlighted
New Contributor I
157 Views

I've tried adding simd but no joy:

// Inclusive prefix scan of in[0..size) using a combined OpenMP for simd loop
// with an inscan reduction. On return out[i] holds the running total
// accumulated onto initial through element i.
// Returns the number of threads in the team that executed the parallel region.
template <typename T>
int incl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {
    int nt = 1;
#pragma omp parallel
    {
        // nt is shared: let exactly one thread record the team size so the
        // threads do not race on the same variable
#pragma omp single nowait
        nt = omp_get_num_threads();
#pragma omp for simd reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            // input phase: fold the current element into the running total
            initial += in[i];
#pragma omp scan inclusive(initial)
            // scan phase: the total already includes in[i]
            out[i] = initial;
        }
    }
    return nt; // number of threads in the team
}

// Exclusive prefix scan of in[0..size) using a combined OpenMP for simd loop
// with an inscan reduction. On return out[i] holds the running total of the
// elements before index i accumulated onto initial (out[0] is initial).
// Nothing is written when size <= 0.
// Returns the number of threads in the team that executed the parallel region
// (1 when the region is skipped).
template <typename T>
int excl_scan(
    const T* in, // source data
    T* out, // output data
    int size, // size of data sets
    T initial // initial value
) {

    int nt = 1;
    // guard: the store to out[size - 1] below needs at least one element
    if (size > 0) {
#pragma omp parallel
        {
            // nt is shared: let exactly one thread record the team size so
            // the threads do not race on the same variable
#pragma omp single nowait
            nt = omp_get_num_threads();
#pragma omp for simd reduction(inscan, +:initial)
            for (int i = 0; i < size - 1; i++) {
                // scan phase: the total excludes in[i]
                out[i] = initial;
#pragma omp scan exclusive(initial)
                // input phase: fold the current element into the running total
                initial += in[i];
            }
        }
        // the loop stops one element short; store the last value here
        out[size - 1] = initial;
    }
    return nt;
}

Same error message at the scan directives: ??clusive_scan directive must be closely nested inside a simd loop directive 

Chris

0 Kudos
Highlighted
139 Views

While I cannot address your compiler issue regarding the reduction(inscan...), I can address the topic of a parallel inclusive scan:

https://software.intel.com/content/www/us/en/develop/articles/elusive-algorithms-parallel-scan.html

Jim Dempsey

0 Kudos
Highlighted
Moderator
124 Views

Hi,


Thanks for reaching out to us. We are forwarding this issue to the SME. They will get back to you.



Warm Regards,

Abhishek


0 Kudos
Highlighted
Moderator
114 Views

Seems like an issue with icpc. Let me work with our Compiler Developer.

Here is a smaller test case; it compiles with g++ (8.1.0) but not with icpc:

$ g++ -fopenmp scan.cpp

$ icpc -fopenmp scan.cpp

scan.cpp(7): error: inclusive_scan directive must be closely nested inside a simd loop directive

  #pragma omp scan inclusive(initial)

                ^

     detected during instantiation of "int incl_scan(const T *, T *, int, T) [with T=int]" at line 21


compilation aborted for scan.cpp (code 2)

$ cat scan.cpp

template <typename T>

int incl_scan( const T* in, T* out, int size, T initial) {


#pragma omp parallel

{

 for (int i = 0;; i++) {

 #pragma omp scan inclusive(initial)

 out[i] = initial;

 }

};

}

template <typename T, typename S>

int scan(const T* in, T* out,int size,S scan_fn, T initial)

{

  return size ;

}

int main(int argc, char** argv) {

  int n;

  int* in = new int[n];

  int* out = new int[n];

  scan(in, out, n, incl_scan<int>, (int)0);

}



0 Kudos
Highlighted
Moderator
107 Views

This is a coding issue. The test case doesn't compile with gcc 10 or clang 11.

You can try those compilers on https://godbolt.org/ and you will see these error messages:


<source>: In function 'int incl_scan(const T*, T*, int, T)':

<source>:7:10: error: '#pragma omp scan' may only be used in a loop construct with 'inscan' 'reduction' clause

7 | #pragma omp scan inclusive(initial)

| ^~~

<source>: In function 'int incl_scan(const T*, T*, int, T) [with T = int]':

<source>:11:1: warning: control reaches end of non-void function [-Wreturn-type]

11 | }

| ^

Compiler returned: 1


Our Developer said: "It looks like gcc was fixed to issue this message in 2019. The pragma over the for loop isn't correct. Also the for loop shouldn't be nested in a block."



0 Kudos
Highlighted
New Contributor I
93 Views

This simplification removes the `omp for` construct that was present in the original question.

0 Kudos
Highlighted
New Contributor I
83 Views

Here is the simplified version, which encounters the same error: inclusive_scan directive must be closely nested inside a simd loop directive

 

// scan.h

#include <omp.h>

template <typename T>
int incl_scan(
  const T* in, // source data
  T* out, // output data
  int size, // size of data sets
  T initial // initial value
) {

  int nt{ 1 };
#pragma omp parallel
  {
    nt = omp_get_num_threads();
#pragma omp for simd reduction(inscan, +:initial)
    for (int i = 0; i < size; i++) {
      initial += in[i];
#pragma omp scan inclusive(initial) // error flagged here
      out[i] = initial;
    }
  }
  return nt;
}

// scan.cpp
// 2020.10.12
// Chris Szalwinski

#include <iostream>
#include "scan.h"

// Minimal driver: announces the program, then runs the inclusive scan once
// over a fixed nine-element array and releases the buffers.
int main(int argc, char** argv) {
  std::cout << "Worksharing Prefix Scan" << std::endl;

  // fixed test data and a result buffer of the same length
  const int n = 9;
  int* source = new int[n] { 3, 1, 7, 0, 1, 4, 5, 9, 2 };
  int* result = new int[n];

  incl_scan(source, result, n, 0);

  delete[] source;
  delete[] result;
}

Chris

0 Kudos
Highlighted
Moderator
78 Views

Thanks for the test case. We'll look into it.


0 Kudos