Intel® C++ Compiler
Support and discussions for creating C++ code that runs on platforms based on Intel® processors.
Announcements
Welcome to the Intel Community. If you get an answer you like, please mark it as an Accepted Solution to help others. Thank you!
For the latest information on Intel’s response to the Log4j/Log4Shell vulnerability, please see Intel-SA-00646

Scan Directive with Worksharing

Chris_Szalwinski
New Contributor I
727 Views

Hi,

I receive an error: "inclusive_scan directive must be closely nested inside a simd loop directive"  when I compile the following code.  

// Prefix Scan Example - Worksharing Scan
// scan.h
// after McCool et al. (2012)

#include <omp.h>

// Inclusive prefix scan using an OpenMP worksharing loop with an inscan
// reduction: each iteration accumulates in[i] into initial and stores the
// running value in out[i].
//
// in      - source data (size elements are read)
// out     - output data (size elements are written)
// size    - number of elements to process
// initial - accumulator named in the inscan reduction clause
//
// Returns the number of threads in the parallel team.
template <typename T>
int incl_scan(
    const T* in, // source data
    T* out,      // output data
    int size,    // size of data sets
    T initial    // initial value
) {

    int nt{ 1 };
#pragma omp parallel
    {
        // nt is shared by the team: let exactly one thread store the team
        // size instead of having every thread write it concurrently.
#pragma omp single nowait
        nt = omp_get_num_threads();

#pragma omp for reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            initial += in[i];
#pragma omp scan inclusive(initial) // error flagged here
            out[i] = initial;
        }
    }
    return nt;
}

// Exclusive prefix scan using an OpenMP worksharing loop with an inscan
// reduction: out[i] receives the accumulator before in[i] is added to it.
// The loop covers the first size - 1 elements; the last output element is
// stored after the loop from the final accumulator value.
//
// in      - source data
// out     - output data (size elements are written when size > 0)
// size    - number of elements to process; nothing is written when size <= 0
// initial - accumulator named in the inscan reduction clause
//
// Returns the number of threads in the parallel team, or 1 when size <= 0
// and no parallel region is entered.
template <typename T>
int excl_scan(
    const T* in, // source data
    T* out,      // output data
    int size,    // size of data sets
    T initial    // initial value
) {

    int nt{ 1 };
    // Guard against empty input: out[size - 1] would otherwise be out[-1].
    if (size > 0) {
#pragma omp parallel
        {
            // nt is shared by the team: only one thread stores the team size.
#pragma omp single nowait
            nt = omp_get_num_threads();

#pragma omp for reduction(inscan, +:initial)
            for (int i = 0; i < size - 1; i++) {
                out[i] = initial;
#pragma omp scan exclusive(initial) // error flagged here
                initial += in[i];
            }
        }
        // Store the last element once, after the parallel region has ended,
        // rather than from every thread of the team.
        out[size - 1] = initial;
    }
    return nt;
}

// Dispatches a prefix scan to the supplied scan function.
//
// in      - source data
// out     - output data
// size    - size of source and output data sets
// scan_fn - callable invoked as scan_fn(in, out, size, initial)
//           (inclusive or exclusive scan)
// initial - initial value, forwarded unchanged to scan_fn
//
// Returns whatever scan_fn returns.
template <typename T, typename S>
int scan(
    const T* in, // source data
    T* out,      // output data
    int size,    // size of source, output data sets
    S scan_fn,   // scan function (inclusive or exclusive)
    T initial    // initial value
)
{
    // Forward the caller's initial value instead of a hard-coded T(0).
    return scan_fn(in, out, size, initial);
}

// Workshop 4 - Prefix Scan - Serial
// scan.cpp
// 2020.10.12
// Chris Szalwinski

#include <chrono>
#include <cstdlib>
#include <iostream>
#include "scan.h"

// Print a labelled duration to std::cout, converted to whole microseconds.
//
void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    const auto elapsed =
        std::chrono::duration_cast<std::chrono::microseconds>(span);
    std::cout << msg << " - took - " << elapsed.count()
              << " microseconds" << std::endl;
}

// Driver: runs the inclusive and exclusive scans on a small test vector and
// prints the thread count, the first N outputs, the last output and the
// elapsed time for each run.
//
// With no argument the nine test values are used as-is. With one argument
// (power_of_2) the data set holds 2^power_of_2 elements (at least N), the
// elements after the test values being set to 1.
int main(int argc, char** argv) {
    if (argc > 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << "\n";
        std::cerr << "Usage: " << argv[0] << " power_of_2\n";
        return 1;
    }
    std::cout << "Worksharing Prefix Scan" << std::endl;

    // initial values for testing
    const int N = 9;
    const int in_[N]{ 3, 1, 7, 0, 1, 4, 5, 9, 2 };

    // command line arguments - none for testing, 1 for large arrays
    int n = N;
    int nt{ 1 };
    if (argc == 2) {
        // Shifting a 32-bit int by a negative amount or by 31 or more is
        // undefined, so reject exponents outside the representable range.
        const int exponent = std::atoi(argv[1]);
        if (exponent < 0 || exponent > 30) {
            std::cerr << argv[0] << ": power_of_2 must be between 0 and 30\n";
            return 1;
        }
        n = 1 << exponent;
        if (n < N) n = N;
    }
    int* in = new int[n];
    int* out = new int[n];

    // initialize: test values first, then pad with 1s
    for (int i = 0; i < N; i++)
        in[i] = in_[i];
    for (int i = N; i < n; i++)
        in[i] = 1;

    std::chrono::steady_clock::time_point ts, te;

    // Inclusive Prefix Scan - Remove Startup Cost
    scan(in, out, n, incl_scan<int>, (int)0);

    // Inclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in, out, n, incl_scan<int>, (int)0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Inclusive Scan", te - ts);

    // Exclusive Prefix Scan
    ts = std::chrono::steady_clock::now();
    nt = scan(in, out, n, excl_scan<int>, (int)0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    for (int i = 0; i < N; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 1] << std::endl;
    reportTime("Exclusive Scan", te - ts);

    // Exclusive Prefix Scan derived from the inclusive scan: the output is
    // printed shifted right by one position behind a leading 0.
    ts = std::chrono::steady_clock::now();
    nt = scan(in, out, n, incl_scan<int>, (int)0);
    te = std::chrono::steady_clock::now();

    std::cout << nt << " thread" << (nt > 1 ? "s" : "") << std::endl;
    std::cout << 0 << ' ';
    for (int i = 0; i < N - 1; i++)
        std::cout << out[i] << ' ';
    std::cout << out[n - 2] << std::endl;
    reportTime("Exclusive Scan", te - ts);

    delete[] in;
    delete[] out;
}

 

My code without worksharing compiles successfully:

// Prefix Scan Example - SIMD Scan
// scan.cpp
// after McCool et al. (2012)

#include <omp.h>

// Inclusive prefix scan written as an OpenMP simd loop with an inscan
// reduction: every iteration adds in[idx] to initial and then stores the
// running value in out[idx].
//
// in      - source data (size elements are read)
// out     - output data (size elements are written)
// size    - number of elements to process
// initial - accumulator named in the inscan reduction clause
//
// Always returns 1, the thread count reported for this variant.
template <typename T>
int incl_scan(
    const T* in,
    T* out,
    int size,
    T initial
) {
#pragma omp simd reduction(inscan, +:initial)
    for (int idx = 0; idx < size; ++idx) {
        initial += in[idx];
#pragma omp scan inclusive(initial)
        out[idx] = initial;
    }

    return 1;
}

// Exclusive prefix scan written as an OpenMP simd loop with an inscan
// reduction: out[idx] receives the accumulator before in[idx] is added.
// The loop handles the first size - 1 elements and the final element is
// stored from the accumulator once the loop has finished.
//
// in      - source data
// out     - output data (size elements are written when size > 0)
// size    - number of elements to process; nothing is written when size <= 0
// initial - accumulator named in the inscan reduction clause
//
// Always returns 1, the thread count reported for this variant.
template <typename T>
int excl_scan(
    const T* in,
    T* out,
    int size,
    T initial
) {
    // Empty input: leave out untouched.
    if (size <= 0)
        return 1;

#pragma omp simd reduction(inscan, +:initial)
    for (int idx = 0; idx < size - 1; ++idx) {
        out[idx] = initial;
#pragma omp scan exclusive(initial)
        initial += in[idx];
    }
    out[size - 1] = initial;

    return 1;
}

// Dispatches a prefix scan to the supplied scan function.
//
// in      - source data
// out     - output data
// size    - size of source and output data sets
// scan_fn - callable invoked as scan_fn(in, out, size, initial)
//           (exclusive or inclusive scan)
// initial - initial value, forwarded unchanged to scan_fn
//
// Returns whatever scan_fn returns.
template <typename T, typename S>
int scan(
    const T* in, // source data
    T* out,      // output data
    int size,    // size of source, output data sets
    S scan_fn,   // scan function (exclusive or inclusive)
    T initial    // initial value
)
{
    // Forward the caller's initial value instead of a hard-coded T(0).
    return scan_fn(in, out, size, initial);
}

I don't understand why the error message mentions 'simd' when the worksharing code contains no simd directive.

Any thoughts?

Chris

0 Kudos
10 Replies
jimdempseyatthecove
Black Belt
706 Views

Try adding simd to your parallel for

#pragma omp for simd reduction(inscan, +:initial)

etc...

Jim Dempsey

Chris_Szalwinski
New Contributor I
692 Views

I've tried adding simd but no joy:

// Inclusive prefix scan using a combined OpenMP `for simd` loop with an
// inscan reduction: each iteration accumulates in[i] into initial and stores
// the running value in out[i].
//
// in      - source data (size elements are read)
// out     - output data (size elements are written)
// size    - number of elements to process
// initial - accumulator named in the inscan reduction clause
//
// Returns the number of threads in the parallel team.
template <typename T>
int incl_scan(
    const T* in, // source data
    T* out,      // output data
    int size,    // size of data sets
    T initial    // initial value
) {
    int nt = 1;
#pragma omp parallel
    {
        // nt is shared by the team: let exactly one thread store the team
        // size instead of having every thread write it concurrently.
#pragma omp single nowait
        nt = omp_get_num_threads();

#pragma omp for simd reduction(inscan, +:initial)
        for (int i = 0; i < size; i++) {
            initial += in[i];
#pragma omp scan inclusive(initial)
            out[i] = initial;
        }
    }
    return nt; // team size recorded inside the parallel region
}

// Exclusive prefix scan using a combined OpenMP `for simd` loop with an
// inscan reduction: out[i] receives the accumulator before in[i] is added.
// The loop covers the first size - 1 elements; the last output element is
// stored after the parallel region from the final accumulator value.
//
// in      - source data
// out     - output data (size elements are written when size > 0)
// size    - number of elements to process; nothing is written when size <= 0
// initial - accumulator named in the inscan reduction clause
//
// Returns the number of threads in the parallel team, or 1 when size <= 0
// and no parallel region is entered.
template <typename T>
int excl_scan(
    const T* in, // source data
    T* out,      // output data
    int size,    // size of data sets
    T initial    // initial value
) {

    int nt = 1;
    if (size > 0) {
#pragma omp parallel
        {
            // nt is shared by the team: only one thread stores the team size.
#pragma omp single nowait
            nt = omp_get_num_threads();

#pragma omp for simd reduction(inscan, +:initial)
            for (int i = 0; i < size - 1; i++) {
                out[i] = initial;
#pragma omp scan exclusive(initial)
                initial += in[i];
            }
        }
        out[size - 1] = initial;
    }
    return nt;
}

Same error message at the scan directives: ??clusive_scan directive must be closely nested inside a simd loop directive 

Chris

jimdempseyatthecove
Black Belt
674 Views

While I cannot address your compiler issue regarding the reduction(inscan...), I can address the topic of a parallel inclusive scan:

https://software.intel.com/content/www/us/en/develop/articles/elusive-algorithms-parallel-scan.html

Jim Dempsey

AbhishekD_Intel
Moderator
659 Views

Hi,


Thanks for reaching out to us. We are forwarding this issue to the SME. They will get back to you.



Warm Regards,

Abhishek


Viet_H_Intel
Moderator
649 Views

Seems like an issue with icpc. Let me work with our Compiler Developer.

Here is a smaller test case; compiled with g++ (8.1.0)

$ g++ -fopenmp scan.cpp

$ icpc -fopenmp scan.cpp

scan.cpp(7): error: inclusive_scan directive must be closely nested inside a simd loop directive

  #pragma omp scan inclusive(initial)

                ^

     detected during instantiation of "int incl_scan(const T *, T *, int, T) [with T=int]" at line 21


compilation aborted for scan.cpp (code 2)

$ cat scan.cpp

// Reduced reproducer for the compiler diagnostic; it is meant to be compiled,
// not run. The scan directive sits in a plain for loop inside a parallel
// region: no loop directive and no reduction(inscan, ...) clause is present.
// NOTE(review): the loop has no termination condition and the function has no
// return statement, so actually calling it would be undefined behaviour.
template <typename T>

int incl_scan( const T* in, T* out, int size, T initial) {


#pragma omp parallel

{

 for (int i = 0;; i++) {

 #pragma omp scan inclusive(initial)

 out[i] = initial;

 }

};

}

// Stub of the scan wrapper for the reduced reproducer: in, out, scan_fn and
// initial are not used, and size is returned unchanged.
template <typename T, typename S>

int scan(const T* in, T* out,int size,S scan_fn, T initial)

{

  return size ;

}

// Entry point of the reduced reproducer: names incl_scan<int> as an argument
// to scan() so that the template is instantiated.
// NOTE(review): n is read uninitialized when sizing the arrays, and the two
// arrays are never freed; this listing only exists to be compiled.
int main(int argc, char** argv) {

  int n;

  int* in = new int[n];

  int* out = new int[n];

  scan(in, out, n, incl_scan<int>, (int)0);

}



Chris_Szalwinski
New Contributor I
628 Views

This simplification removes the for construct that was present in the original question

Chris_Szalwinski
New Contributor I
618 Views

Here is the simplified version, which encounters the same error: "inclusive_scan directive must be closely nested inside a simd loop directive".

 

// scan.h

#include <omp.h>

template <typename T>
int incl_scan(
  const T* in, // source data
  T* out, // output data
  int size, // size of data sets
  T initial // initial value
) {

  int nt{ 1 };
#pragma omp parallel
  {
    nt = omp_get_num_threads();
#pragma omp for simd reduction(inscan, +:initial)
    for (int i = 0; i < size; i++) {
      initial += in[i];
#pragma omp scan inclusive(initial) // error flagged here
      out[i] = initial;
    }
  }
  return nt;
}

// scan.cpp
// 2020.10.12
// Chris Szalwinski

#include <iostream>
#include "scan.h"

// Minimal driver: prints the banner and runs the inclusive scan once over a
// fixed nine-element test vector. The scan output is not printed.
int main(int argc, char** argv) {
    std::cout << "Worksharing Prefix Scan" << std::endl;

    // fixed test data
    const int n = 9;
    int* source = new int[n] { 3, 1, 7, 0, 1, 4, 5, 9, 2 };
    int* result = new int[n];

    incl_scan(source, result, n, static_cast<int>(0));

    delete[] source;
    delete[] result;
}

Chris

Viet_H_Intel
Moderator
642 Views

This is a coding issue. The test case doesn't compile with gcc10 or clang11 either.

You can try those compilers on https://godbolt.org/ and you will see these error messages.


<source>: In function 'int incl_scan(const T*, T*, int, T)':

<source>:7:10: error: '#pragma omp scan' may only be used in a loop construct with 'inscan' 'reduction' clause

7 | #pragma omp scan inclusive(initial)

| ^~~

<source>: In function 'int incl_scan(const T*, T*, int, T) [with T = int]':

<source>:11:1: warning: control reaches end of non-void function [-Wreturn-type]

11 | }

| ^

Compiler returned: 1


Our Developer said: "It looks like gcc was fixed to issue this message in 2019. The pragma over the for loop isn't correct. Also the for loop shouldn't be nested in a block."



Viet_H_Intel
Moderator
613 Views

Thanks for the test case. We'll look into it.


Viet_H_Intel
Moderator
291 Views

Hi Chris,


Is it possible to use icpx instead of icpc?


Thanks,



Reply