Sobel Filter (OpenMP implementation for Knights Landing)

Madhumeta_G_ · ‎06-08-2017

I am trying to implement a parallelized + vectorized version of Sobel Filter in C with OpenMP pragmas for the paralleization and #pragma simd for vectorization. My input is a .pgm image of 1024 by 1024. I am compiling this using Intel Compiler on a Xeon Knights Landing processor using the following command:

icc -qopenmp -O3 -qopt-report3 xeon.c -o xeon

So problems I am facing with the code in general are:

a) when do I parallelize and when do I vectorize. I have a nested for loop made up of four for loops -> should I parallelize or vectorize this piece of code

b) My 'min' and 'max' values are wrong. They are both shared variables and hence prone to race conditions, so I have added a #pragma omp critical around them. However, the values printed out for these two variables are still wrong and I have no idea why. I have even added a barrier before the print statement to make sure all threads pass through that critical section before the min and max values get printed out

c) the #pragma omp critical is making my program very very slow. In fact the execution time is even longer than the sequential runtime. Is there any way to avoid it?

Code:

**mypgm.h**

/* pgm file IO headerfile ------ mypgm.h */

/* Constant declaration */

#define MAX_IMAGEWIDTH  1024
#define MAX_IMAGEHEIGHT  1024
#define MAX_BRIGHTNESS  255 /* Maximum gray level */
#define GRAYLEVEL       256 /* No. of gray levels */
#define MAX_FILENAME    256 /* Filename length limit */
#define MAX_BUFFERSIZE  256

/* Global constant declaration */
/* Image storage arrays */
float image1[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT] __attribute__((aligned(64))),
image2[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT] __attribute__((aligned(64)));
int x_size1, y_size1, /* width & height of image1*/
x_size2, y_size2; /* width & height of image2 */

/* Prototype declaration of functions */
void load_image_data( ); /* image input */
void save_image_data( ); /* image output*/
void load_image_file(char *); /* image input */
void save_image_file(char *); /* image output*/


/* Main body of functions */

void load_image_data( )
/* Input of header & body information of pgm file */
/* for image1[ ][ ]，x_size1，y_size1 */
{
  char file_name[MAX_FILENAME];
  char buffer[MAX_BUFFERSIZE];
  FILE *fp; /* File pointer */
  int max_gray; /* Maximum gray level */
  int x, y; /* Loop variable */

  /* Input file open */
  printf("\n-----------------------------------------------------\n");
  printf("Monochromatic image file input routine \n");
  printf("-----------------------------------------------------\n\n");
  printf("     Only pgm binary file is acceptable\n\n");
  printf("Name of input image file? (*.pgm) : ");
  scanf("%s", file_name);
  fp = fopen(file_name, "rb");
  if (NULL == fp) {
    printf("     The file doesn't exist!\n\n");
    exit(1);
  }
  /* Check of file-type ---P5 */
  fgets(buffer, MAX_BUFFERSIZE, fp);
  if (buffer[0] != 'P' || buffer[1] != '5') {
     printf("     Mistaken file format, not P5!\n\n");
     exit(1);
  }
  /* input of x_size1, y_size1 */
  x_size1 = 0;
  y_size1 = 0;
  while (x_size1 == 0 || y_size1 == 0) {
    fgets(buffer, MAX_BUFFERSIZE, fp);
    if (buffer[0] != '#') {
      sscanf(buffer, "%d %d", &x_size1, &y_size1);
    }
  }
  /* input of max_gray */
  max_gray = 0;
  while (max_gray == 0) {
  fgets(buffer, MAX_BUFFERSIZE, fp);
  if (buffer[0] != '#') {
    sscanf(buffer, "%d", &max_gray);
  }
}
  /* Display of parameters */
  printf("\n     Image width = %d, Image height = %d\n", x_size1, y_size1);
  printf("     Maximum gray level = %d\n\n",max_gray);
  if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
     printf("     Image size exceeds %d x %d\n\n", 
      MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
     printf("     Please use smaller images!\n\n");
     exit(1);
  }
  if (max_gray != MAX_BRIGHTNESS) {
  printf("     Invalid value of maximum gray level!\n\n");
  exit(1);
}
/* Input of image data*/
#pragma simd
for (y = 0; y < y_size1; y++) {
  #pragma simd
  for (x = 0; x < x_size1; x++) {
     image1[y][x] = (unsigned char)fgetc(fp);
  }
}
printf("-----Image data input OK-----\n\n");
printf("-----------------------------------------------------\n\n");
fclose(fp);
}

void save_image_data( )
/* Output of image2[ ][ ], x_size2, y_size2 in pgm format*/

{
  char file_name[MAX_FILENAME];
  FILE *fp; /* File pointer */
  int x, y; /* Loop variable */

  /* Output file open */
  printf("-----------------------------------------------------\n");
  printf("Monochromatic image file output routine\n");
  printf("-----------------------------------------------------\n\n");
  printf("Name of output image file? (*.pgm) : ");
  scanf("%s",file_name);
  fp = fopen(file_name, "wb");
  /* output of pgm file header information */
  fputs("P5\n", fp);
  fputs("# Created by Image Processing\n", fp);
  fprintf(fp, "%d %d\n", x_size2, y_size2);
  fprintf(fp, "%d\n", MAX_BRIGHTNESS);
  /* Output of image data */
 #pragma simd
  for (y = 0; y < y_size2; y++) {
    #pragma simd
    for (x = 0; x < x_size2; x++) {
       fputc(image2[y][x], fp);
     }
   }
   printf("\n-----Image data output OK-----\n\n");
   printf("-----------------------------------------------------\n\n");
   fclose(fp);
}

void load_image_file(char *filename)
/* Input of header & body information of pgm file */
/* for image1[ ][ ]，x_size1，y_size1 */
{
   char buffer[MAX_BUFFERSIZE];
   FILE *fp; /* File pointer */
   int max_gray; /* Maximum gray level */
   int x, y; /* Loop variable */

   /* Input file open */
   fp = fopen(filename, "rb");
   if (NULL == fp) {
     printf("     The file doesn't exist!\n\n");
     exit(1);
   }
   /* Check of file-type ---P5 */
   fgets(buffer, MAX_BUFFERSIZE, fp);
   if (buffer[0] != 'P' || buffer[1] != '5') {
     printf("     Mistaken file format, not P5!\n\n");
     exit(1);
   }
   /* input of x_size1, y_size1 */
   x_size1 = 0;
   y_size1 = 0;
   while (x_size1 == 0 || y_size1 == 0) {
     fgets(buffer, MAX_BUFFERSIZE, fp);
     if (buffer[0] != '#') {
       sscanf(buffer, "%d %d", &x_size1, &y_size1);
     }
   }
   /* input of max_gray */
   max_gray = 0;
   while (max_gray == 0) {
     fgets(buffer, MAX_BUFFERSIZE, fp);
     if (buffer[0] != '#') {
        sscanf(buffer, "%d", &max_gray);
     }
   }
   if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
     printf("     Image size exceeds %d x %d\n\n", 
        MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
     printf("     Please use smaller images!\n\n");
     exit(1);
   }
   if (max_gray != MAX_BRIGHTNESS) {
     printf("     Invalid value of maximum gray level!\n\n");
     exit(1);
   }
   /* Input of image data*/
  #pragma simd
  for (y = 0; y < y_size1; y++) {
    #pragma simd
    for (x = 0; x < x_size1; x++) {
       image1[y][x] = (float)fgetc(fp);
     }
  }
  fclose(fp);
}

void save_image_file(char *filename)
/* Output of image2[ ][ ], x_size2, y_size2 */ 
/* into pgm file with header & body information */
{
  FILE *fp; /* File pointer */
  int x, y; /* Loop variable */

  fp = fopen(filename, "wb");
  /* output of pgm file header information */
  fputs("P5\n", fp);
  fputs("# Created by Image Processing\n", fp);
  fprintf(fp, "%d %d\n", x_size2, y_size2);
  fprintf(fp, "%d\n", MAX_BRIGHTNESS);
  /* Output of image data */
 #pragma simd
  for (y = 0; y < y_size2; y++) {
    #pragma simd
    for (x = 0; x < x_size2; x++) {
      fputc(image2[y][x], fp);
    }
  }
  fclose(fp);
}

**xeon.c**

/* sobel.c */
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>
#include <omp.h>
#include "mypgm.h"



void sobel_filtering( )
     /* Spatial filtering of image data */
     /* Sobel filter (horizontal differentiation */
     /* Input: image1 ---- Outout: image2 */
{
  /* Definition of Sobel filter in horizontal direction */
  float weight[3][3] __attribute__((aligned(64)))= {{ -1,  0,  1 },
              { -2,  0,  2 },
              { -1,  0,  1 }};
  float pixel_value;
  float min, max;
  int x, y, i, j;  /* Loop variable */

  /* Maximum values calculation after filtering*/
  printf("Now, filtering of input image is performed\n\n");


  min = DBL_MAX;
  max = -DBL_MAX;
  #pragma omp parallel shared(image2,weight,image1,min,max) private(y,x,j,i)
  {
    #pragma omp for collapse(2)

    for (y=0;y<y_size1;y++)  {
       for (x=0;x<x_size1;x++)  {
          image2[y][x]=0;
       }
    }
    #pragma omp for collapse(2) reduction(+:pixel_value)

    for (y = 1; y < y_size1 - 1; y++) {
      //#pragma simd
      for (x = 1; x < x_size1 - 1; x++) {
        pixel_value = 0.0;
        #pragma simd
        //#pragma omp for collapse(2)
        for (j = -1; j <= 1; j++) {
          #pragma simd
          for (i = -1; i <= 1; i++) {
             pixel_value += weight[j + 1][i + 1] * image1[y + j][x + i];
          }
        }
        image2[y][x] = (float)pixel_value;
        #pragma omp critical
       { 
          if (pixel_value < min) 
             min = pixel_value;
          if (pixel_value > max)
             max = pixel_value;
       } 
   }
 }
 #pragma omp barrier
 #pragma omp single
 {
   if ((int)(max - min) == 0) {
    printf("Nothing exists!!!\n\n");
    exit(1);
   }
  printf("%f\n",min);
  printf("%f\n",max);
 }

  /* Generation of image2 after linear transformtion */

  #pragma omp for private(x) collapse(2)
     //#pragma simd
     for (y=1;y<y_size1-1;y++)  {
       //#pragma simd
       for (x=1;x<x_size1-1;x++)  {
         image2[y][x] = MAX_BRIGHTNESS * (image2[y][x] - min) / (max - min);
       }
     }
 } // ends the parallel section
} //end of sobel filtering function

int main( )
{
  load_image_data( );   /* Input of image1 */ 
  clock_t begin=clock();
  sobel_filtering( );   /* Sobel filter is applied to image1 */
  clock_t end=clock();
  double time_spent = (double)(end-begin)/CLOCKS_PER_SEC;
  printf("\n\nTiming result of multiplication of matrix-vector: %f\n",time_spent);
  save_image_data( );   /* Output of image2 */
  return 0;
}

TimP · ‎06-08-2017

#pragma simd is irrelevant on file operations. The best you can hope for is for that to be ignored.

Most of the comments you got on stackoverflow are relevant, including any which may criticize you for drawing conclusions from clock_t, which totals up the times spent by all threads, so might multiply the times spent in critical and single regions by the number of threads waiting.

It seems your max and min variables would better be included as reduction(max: and reduction(min: specification clauses on your omp reduction, instead of critical, as long as you use an up to date omp which supports this. However, your specification of reduction for pixel_value appears wrong. In principle, you could specify it as simd reduction in the inner loops, but it's tricky, and the compiler may do the right thing without the reduction spec (with icc defaults or gcc -ffast-math). Better to get things working before adding optional pragmas. One might doubt how much testing icc has received with such combinations of recent omp and obsolete pre-omp pragmas, particularly where you seem to hope that #pragma simd means the same as #pragma omp simd.

jimdempseyatthecove · ‎06-08-2017

>>a) when do I parallelize and when do I vectorize

When vectorization is possible, always vectorize. In very few cases will vectoized code run slower than non-vectoized code (there are a few cases, so it behooves you to test both ways).

After vectorization, you then determine if there is sufficient work to be done with parallelization.

>>Sobel Filter in C... My input is a .pgm image of 1024 by 1024

Is your input a single frame .OR. are you processing multiple/many frames?
If multiple frames, are the images dependent upon adjacent images .OR. independent of each other?
While the filter above indicates indepent images, you have not shown/described the remainder of your program.

Often it is better to move the parallelization to outer levels of the code. In the above code, and using 1024 x 1024 images, on KNL using 64 threads each thread would process 16 rows (16K cells), using 128 threads 8 rows,... If the values are floats, and if 16-wide vectorization is attained, then, with 64 threads each thread would iterate 1024 times, 128 threads 512 iteratons. This is hardly enough iterations to warrant parallelization (though you should test). Therefor, if multiple images are to be processed, move the parallelization out to the per-image level.

Note, when constructing simple timing programs

a) instantiate the OpenMP thread pool outside of your timed section. This can be relatively long (especially with 256 threads)
int main()
{
#pragma omp parallel // instantiate OpenMP thread pool _prior_ to timing function
if(omp_get_thread_num) < 0) printf("Not going to happen");

The above "goofy" if statement should never print .BUT. assures that the compiler optimization does not remove the statement.

b) time the test function multiple times. First time may incur additional overhead.

Jim Dempsey

SergeyKostrov · ‎06-09-2017

>>...a) when do I parallelize and when do I vectorize. - Outer Loops need to be parallized ( Middle Loops are usually left as is but it depends on a type of processing ) - Inner Loops need to be vectorized >>...c) the #pragma omp critical is making my program very very slow... It is absolutely expected performance impact. Try to re-structure the algorithm to use: #pragma omp parallel { ... [ processing ] ... } instead of: #pragma omp parallel for ... for( .[ outer ]. ) for( .[ inner ]. )

jimdempseyatthecove · ‎06-10-2017

Try something along the line of this:

  min_pixel_value = DBL_MAX;  // please change name from intrinsic function name
  max_pixel_value = -DBL_MAX;
  #pragma omp parallel shared(image2,weight,image1) reduction(min:min_pixel_value, max:max_pixel_value)
  {
    // wipe the perimiter
    #pragma omp sections
    {
       #pragma omp section
       {
           for (int x = 0; x < x_size1; x++)
              image2[0] = 0;
       }
       #pragma omp section
       {
           for (int y = 0; y < y_size1; y++)
              image2[0] = 0;
       }
       #pragma omp section
       {
           for (int y = 0; y < y_size1; y++)
              image2[x_size1 - 1] = 0;
       }
       #pragma omp section
       {
           for (int x = 1; x < x_size1 - 1; x++)
              image2[y_size1 - 1] = 0;
       }
    }

    float inner_min_pixel_value = DBL_MAX;
    float inner_max_pixel_value = -DBL_MAX;

    #pragma simd collapse(2) reduction(min:inner_min_pixel_value, max:inner_max_pixel_value)
    for (int y = 1; y < y_size1 - 1; y++) {
      for (int x = 1; x < x_size1 - 1; x++) {
        pixel_value = image1[y - 1][x + 1] - image1[y - 1][x - 1]
                        + image1[x + 1]+ image1[x + 1] - image1[x - 1] - image1[x - 1]
                        + image1[y + 1][x + 1] - image1[y + 1][x - 1];

        image2 = pixel_value;
        if (pixel_value < inner_min_pixel_value) 
             inner_min_pixel_value = pixel_value;
        if (pixel_value > inner_max_pixel_value)
             inner_max_pixel_value = pixel_value;
      }
    }
    if (inner_min_pixel_value < min_pixel_value) 
         min_pixel_value = inner_min_pixel_value;
    if (inner_max_pixel_value > max_pixel_value)
         max_pixel_value = inner_pixel_value;
  }

Note, use "for(int x=0;..." C++ format as opposed to declaring the loop control variable outside of the loop.
Doing so improves optimization.

Jim Dempsey