You ask : why is this

Matara_Ma_Sukoy1 · ‎09-16-2013

Hi all,

I was getting Segmentation fault on my offloaded code, so I made a quick test. Below is a tiny code I used for testing and the results I got;

[cpp]
#include <stdlib.h>
#include <stdio.h>

#define ALLOC alloc_if(1)
#define REUSE alloc_if(0)
#define FREE free_if(1)
#define RETAIN free_if(1)

/*
 * Simple vector record: an element count plus a pointer to a separately
 * allocated int buffer. Because of the pointer member, copying the struct's
 * bytes does not copy the buffer it points to.
 */
struct test
{
   int length;   /* number of ints in data */
   int* data;    /* heap buffer allocated by the caller */
};

typedef struct test test_t;

/*
 * Runs one offload region that sends the vector's length and buffer to the
 * coprocessor and prints, from inside the region, the value of `test` and
 * the address of `test`.
 *
 * test: pointer to the host-side vector. It is received by value, so the
 *       `&test` printed below refers to this function's own parameter, not
 *       to the variable the caller passed.
 */
void transferVector(test_t* test)
{
   // Local copies of the struct members; these are what the in(...) clauses name.
   __declspec(target(mic)) int length = test->length;
   __declspec(target(mic)) int* data = test->data;

   // `test` itself is listed as nocopy; only length and the data buffer are sent.
   #pragma offload target(mic) \
       nocopy(test) \
       in(length) \
       in(data:length(length) ALLOC RETAIN)
   {
       printf("(METHOD) addresses on coprocessor test: %p\t &test %p\n", test, &test);
   }
}

/*
 * Driver for the experiment: builds a 10-element vector on the host, then
 * prints the coprocessor-side value and address of `test` before and after
 * calling transferVector(), so the three printed lines can be compared.
 *
 * Returns EXIT_SUCCESS.
 */
int main(int argsc, char* argsv[])
{
   __declspec(target(mic)) test_t* test;

   // Allocate memory on the host
   test = (test_t*) malloc(sizeof(test_t));
   test->length = 10;
   test->data = (int*) malloc(sizeof(int) * 10);

   // fill in some values for array
   int i;
   for(i = 0; i < test->length; ++i)
       test->data[i] = i;   // index the buffer; do not overwrite the pointer itself

   // First probe: `test` is nocopy, so its host value is not transferred.
   #pragma offload target(mic) nocopy(test)
   {
       printf("(MAIN) addresses on coprocessor test: %p\t &test %p\n", test, &test);
   }

   transferVector(test);

   // Second probe: same region as the first, run after transferVector().
   #pragma offload target(mic) nocopy(test)
   {
       printf("(MAIN) addresses on coprocessor test: %p\t &test %p\n", test, &test);
   }

   return EXIT_SUCCESS;
}
[/cpp]

Results;

(MAIN) addresses on coprocessor test: 0x5   &test 0x7fca9703cb28
(METHOD) addresses on coprocessor test: (nil)   &test 0x7fca9703cb40
(MAIN) addresses on coprocessor test: 0x5   &test 0x7fca9703cb28

When I check the printed results for the address of &test, I see that it is different in the offloaded region in the transferVector method. What is the cause of this? Shouldn't &test be the same? Whatever data I fill in inside the transferVector method is lost when I get back to main... And if I don't use a method to allocate data on the coprocessor, the code works as desired...

Answers, directions, questions.. are all welcome and greatly appreciated.

Thank you.

Sumedh_N_Intel · ‎09-17-2013

Hi,

The method you have used to transfer the non-bitwise copyable structure is not correct. Please refer to the following compiler BKM to find out more about transferring non-bitwise copyable data to the coprocessor: http://software.intel.com/en-us/articles/effective-use-of-the-intel-compilers-offload-features

You may also find the following forum thread to be of relevance to your issue:

http://software.intel.com/en-us/forums/topic/472328

I hope this helps.

Matara_Ma_Sukoy1 · ‎09-17-2013

Hi Sumedh,

Thanks for your time, I really appreciate it. This might seem like a really long post, but it's actually a really small one, so please bear with me.

I have read the article about using offload features and made a couple of changes, as shown below. Now struct1 is a global pointer, and the code below works without any problem.

[cpp]

#include <stdio.h>
#include <stdlib.h>

#define SIZE 10

#define ALLOC alloc_if(1) free_if(0)
#define REUSE alloc_if(0) free_if(0)
#define FREE alloc_if(0) free_if(1)

// Example of Non-Bitwise Object Transfer, All Data Elements Needed
/*
 * Non-bitwise-copyable struct used by the offload example: the pointer
 * member refers to a separately allocated array of SIZE ints.
 */
typedef struct
{
    int m1;    /* scalar member, set to 10 by send_inputs */
    int *m2;   /* array of SIZE ints allocated by send_inputs */
} nbwcs;

__declspec(target(mic)) nbwcs* struct1;

/*
 * Allocates and fills the global struct1 on the host, then performs the
 * first offload: the scalar and the array are sent through the local
 * variables m1 and m2, and the coprocessor-side struct1 is assembled from
 * them inside the offload region.
 */
void send_inputs()
{
   int m1;
   int *m2;

   // Initialize the struct
   struct1 = (nbwcs*) malloc(sizeof(nbwcs));
   struct1->m1 = 10;
   struct1->m2 = (int*) malloc(SIZE * sizeof(int));
   for (int i=0; i<SIZE; i++)
   {
       struct1->m2[i] = i;   // fill element i; do not overwrite the pointer
   }

   // In this offload data is transferred
   // The members are sent via local copies because the struct holds a pointer.
   m1 = struct1->m1;
   m2 = struct1->m2;
   #pragma offload target(mic:0) in(m1) in(m2[0:SIZE] : ALLOC) nocopy(struct1:length(1) ALLOC)
   {
       // Rebuild the struct on the coprocessor from the transferred pieces.
       struct1->m1 = m1;
       struct1->m2 = m2;
       printf("MIC offload1: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
       fflush(0);
   }
}

/*
 * Second offload: adds i to element i of struct1->m2 on the coprocessor and
 * prints the first and last elements. struct1 is listed as nocopy, so this
 * region relies on the coprocessor-side struct1 set up by an earlier offload.
 */
void use_the_data()
{
   // In this offload data is used and updated
   #pragma offload target(mic:0) nocopy(struct1)
   {
       for (int i=0; i<SIZE; i++)
       {
           struct1->m2[i] += i;   // update element i, not the pointer
       }

       printf("MIC offload2: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
       fflush(0);
   }
}

/*
 * Final offload: updates the array once more on the coprocessor, copies it
 * back to the host through the local pointer m2 (out clause with FREE), and
 * then prints the first and last elements from the host side.
 */
void receive_results()
{
   int *m2;
   // In this offload data is used, updated, freed on MIC and brought back to the CPU
   m2 = struct1->m2;   // host buffer named by the out(...) clause
   #pragma offload target(mic:0) out(m2[0:SIZE] : FREE) nocopy(struct1:length(1) FREE)
   {
       for (int i=0; i<SIZE; i++)
       {
           struct1->m2[i] += i;   // update element i, not the pointer
       }
       printf("MIC offload3: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
       fflush(0);
   }

   printf("CPU: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
}

/*
 * Runs the three offload stages in order: send the data, update it on the
 * coprocessor, then bring the results back. All three functions operate on
 * the file-scope pointer struct1.
 */
int main()
{
   send_inputs();
   use_the_data();
   receive_results();
   return 0;
}
[/cpp]

However, when I change it to something like this (instead of using global variables I used local pointer this time), I get a segmentation fault on coprocessor.

[cpp]

#include <stdio.h>
#include <stdlib.h>

#define SIZE 10

#define ALLOC alloc_if(1) free_if(0)
#define REUSE alloc_if(0) free_if(0)
#define FREE alloc_if(0) free_if(1)

// Example of Non-Bitwise Object Transfer, All Data Elements Needed
/*
 * Non-bitwise-copyable struct used by the offload example: the pointer
 * member refers to a separately allocated array of SIZE ints.
 */
typedef struct
{
    int m1;    /* scalar member, set to 10 by send_inputs */
    int *m2;   /* array of SIZE ints allocated by send_inputs */
} nbwcs;

/*
 * Parameter-passing variant of send_inputs: allocates and fills a struct on
 * the host, then offloads its members through the locals m1 and m2.
 *
 * struct1: received by value and immediately overwritten with the result of
 *          malloc(). NOTE: that assignment changes only this function's
 *          copy of the pointer; the caller's variable is not updated.
 */
void send_inputs(nbwcs* struct1)
{
   int m1;
   int *m2;

   // Initialize the struct
   struct1 = (nbwcs*) malloc(sizeof(nbwcs));
   struct1->m1 = 10;
   struct1->m2 = (int*) malloc(SIZE * sizeof(int));
   for (int i=0; i<SIZE; i++)
   {
       struct1->m2[i] = i;   // fill element i; do not overwrite the pointer
   }

   // In this offload data is transferred
   m1 = struct1->m1;
   m2 = struct1->m2;
   #pragma offload target(mic:0) in(m1) in(m2[0:SIZE] : ALLOC) nocopy(struct1:length(1) ALLOC)
   {
       // Rebuild the struct on the coprocessor from the transferred pieces.
       struct1->m1 = m1;
       struct1->m2 = m2;
       printf("MIC offload1: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
       fflush(0);
   }
}

/*
 * Parameter-passing variant of use_the_data: adds i to element i of
 * struct1->m2 inside an offload region and prints the first and last
 * elements.
 *
 * struct1: pointer received by value. It is a function parameter here
 *          (automatic storage), not a file-scope variable, and is listed
 *          as nocopy in the offload clause.
 */
void use_the_data(nbwcs* struct1)
{
   // In this offload data is used and updated
   #pragma offload target(mic:0) nocopy(struct1)
   {
       for (int i=0; i<SIZE; i++)
       {
           struct1->m2[i] += i;   // update element i, not the pointer
       }

       printf("MIC offload2: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
       fflush(0);
   }
}

/*
 * Parameter-passing variant of receive_results: updates the array inside an
 * offload region, copies it back through the local pointer m2 (out clause
 * with FREE), and prints the first and last elements from the host side.
 *
 * struct1: pointer received by value; dereferenced on the host before the
 *          offload (to read m2) and again after it (for the final printf).
 */
void receive_results(nbwcs* struct1)
{
   int *m2;
   // In this offload data is used, updated, freed on MIC and brought back to the CPU
   m2 = struct1->m2;   // host buffer named by the out(...) clause
   #pragma offload target(mic:0) out(m2[0:SIZE] : FREE) nocopy(struct1:length(1) FREE)
   {
       for (int i=0; i<SIZE; i++)
       {
           struct1->m2[i] += i;   // update element i, not the pointer
       }
       printf("MIC offload3: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
       fflush(0);
   }

   printf("CPU: struct1.m2[0] = %d, struct1.m2[SIZE-1] = %d\n", struct1->m2[0], struct1->m2[SIZE-1]);
}

/*
 * Runs the three stages with a local pointer instead of a file-scope one.
 *
 * NOTE: struct1 is never assigned in main and is passed to each function by
 * value, so no callee can change it; use_the_data() and receive_results()
 * receive the same uninitialized pointer value that main started with.
 */
int main()
{
   // Automatic (non-static) variable carrying the target(mic) declspec.
   __declspec(target(mic)) nbwcs* struct1;

   send_inputs(struct1);
   use_the_data(struct1);
   receive_results(struct1);
   return 0;
}
[/cpp]

So my question is: why do local variables in offloaded code not survive / persist across all offloads? How does the __declspec keyword function in this case? Or, more clearly, how does the coprocessor know whether a variable is defined with the __declspec keyword? By looking at its address?

And for the code in my first post, I have made a stupid error, so if I correct the function like this I still have the same outcome. In the same scope, the data allocated is same, but when I go back to main method it will be what it was back in main method (kind of similar to function call stack...)

[cpp]

/*
 * Pointer-to-pointer variant of transferVector: sends the vector's length
 * and buffer to the coprocessor and prints, from inside the offload region,
 * the vector pointer (*test) and the pointer-to-pointer (test).
 *
 * test: address of the caller's test_t pointer; listed as nocopy in the
 *       offload clause.
 */
void transferVector(test_t** test)
{
   // Local copies of the struct members; these are what the in(...) clauses name.
   __declspec(target(mic)) int length = (*test)->length;
   __declspec(target(mic)) int* data = (*test)->data;
   #pragma offload target(mic) \
       nocopy(test) \
       in(length) \
       in(data:length(length) ALLOC RETAIN)
   {
       // Tab between the two fields and a trailing newline, as in the other probes.
       printf("(METHOD) addresses on coprocessor test: %p\t &test %p\n", (*test), test);
   }
}

[/cpp]

Regards

Matara Ma Sukoy

Sumedh_N_Intel · ‎09-18-2013

I am investigating why the code does not work for a local variable. Let me get back to you with what I find.

Rajiv_D_Intel · ‎09-19-2013

__declspec(target(mic)) applies only to statically allocated variables, i.e.:

variables declared/defined at file scope, outside any function
variables declared inside functions with storage-class “static”

The effect of that declspec is to include the variable in the MIC executable. Without that declspec, the variable is not allocated in the MIC executable so you cannot reference it.

Using __declspec(target(mic)) on a variable declared inside a function that does not have storage-class “static” is meaningless.

Variable persistence across function calls can only be achieved for statically allocated variables. Those at file-scope can be accessed in multiple functions. Those in function-scope can only be accessed by the function in which they are declared.

Matara_Ma_Sukoy1 · ‎09-19-2013

Thanks a lot. That is the conclusion I and some other people on the forums had. I just wanted to confirm it. If I may ask, why is this feature implemented this way? Is it easier from a compiler perspective? or are there any specific reasons behind it?

On a side note, I have used (I believe) this feature in a prettier way using the "extern" keyword (to mimic function parameter passing). You simply declare the variables you want to use in a separate file (offload functions are declared there as well), and include this file in the original main file, in which the real definition occurs. To illustrate:

[cpp]

// offload_api.h

/*
 * Shared declarations for the offload helpers. The two vector pointers are
 * only declared here (extern); the file that includes this header and owns
 * the data provides the actual definitions.
 */
#ifndef OFFLOAD_API_H_
#define OFFLOAD_API_H_

#include "include/data_structure/vector.h"

// global variables to be used in spmxv functions
// not all of them will be used for every function though
// ---------------------------------------------------------
__attribute__((target(mic))) extern vector_real_t* x_offload;
__attribute__((target(mic))) extern vector_real_t* y_offload;
// ---------------------------------------------------------

// Transfer helpers, implemented in offload_api.c. They take no arguments and
// work on the globals declared above. Direction presumably follows the name
// (ToCoproc = host to coprocessor, ToProc = coprocessor to host) — confirm
// against the implementation.
extern void offload_xToCoproc(void);
extern void offload_yToCoproc(void);
extern void offload_xToProc(void);
extern void offload_yToProc(void);

#endif /* OFFLOAD_API_H_ */
[/cpp]

[cpp]

// offload_api.c

#include "offload_api.h"

/* Implement the function as you normally would using variables declared in "offload_api.h" */

[/cpp]

[cpp]

// some other file

#include "arch/mic/inc/offload_api.h"

__attribute__((target(mic))) vector_real_t* x_offload;
__attribute__((target(mic))) vector_real_t* y_offload;

int main(int argsc, char* argsv[])
{ /* use functions */ }

[/cpp]

Rajiv_D_Intel · ‎09-20-2013

You ask : why is this feature implemented this way?

Which feature or aspect of a feature are you referring to?

The visibility of variables inside/outside functions is no different than in standard C/C++. Similarly, the effect of storage-class static is also the same with/without offload. If you are asking about __declspec(target(mic)), that has been added to the offload specification in order to minimize the data declared within the MIC binary. Only variables with the declspec are included. This keeps unneeded data out of the MIC binary, reducing its size.

If you are asking about some other aspect of the implementation, please ask a more specific question.

Matara_Ma_Sukoy1 · ‎10-04-2013

Alright, sorry for ambiguous statements.

Here is a small code block;

[cpp]

#include <stdlib.h>
#include <stdio.h>
#define ALLOC alloc_if(1)
#define REUSE alloc_if(0)
#define FREE free_if(1)
#define RETAIN free_if(1)
/*
 * Simple vector record: an element count plus a pointer to a separately
 * allocated int buffer. Because of the pointer member, copying the struct's
 * bytes does not copy the buffer it points to.
 */
struct test
{
    int length;   /* number of ints in data */
    int* data;    /* heap buffer allocated by the caller */
};
typedef struct test test_t;

/*
 * Pointer-to-pointer variant of transferVector: sends the vector's length
 * and buffer to the coprocessor and prints, from inside the offload region,
 * the vector pointer (*test) and the pointer-to-pointer (test).
 *
 * test: address of the caller's test_t pointer; listed as nocopy in the
 *       offload clause.
 */
void transferVector(test_t** test)
{
    // Local copies of the struct members; these are what the in(...) clauses name.
    __declspec(target(mic)) int length = (*test)->length;
    __declspec(target(mic)) int* data = (*test)->data;
    // The clauses belong to the pragma, so each line must end in a continuation.
    #pragma offload target(mic) \
        nocopy(test) \
        in(length) \
        in(data:length(length) ALLOC RETAIN)
    {
        printf("(METHOD) addresses on coprocessor test: %p\t &test %p\n", (*test), test);
    }
}

/*
 * Driver for the pointer-to-pointer experiment: builds a 10-element vector
 * on the host, then prints the coprocessor-side value and address of `test`
 * before and after calling transferVector(&test).
 *
 * Returns EXIT_SUCCESS.
 */
int main(int argsc, char* argsv[])
{
    __declspec(target(mic)) test_t* test;
    // Allocate memory on the host
    test = (test_t*) malloc(sizeof(test_t));
    test->length = 10;
    test->data = (int*) malloc(sizeof(int) * 10);
    // fill in some values for array
    int i;
    for(i = 0; i < test->length; ++i)
        test->data[i] = i;   // index the buffer; do not overwrite the pointer itself
    // First probe: `test` is nocopy, so its host value is not transferred.
    #pragma offload target(mic) nocopy(test)
    {
        printf("(MAIN) addresses on coprocessor test: %p\t &test %p\n", test, &test);
    }
    transferVector(&test);
    // Second probe: same region as the first, run after transferVector().
    #pragma offload target(mic) nocopy(test)
    {
        printf("(MAIN) addresses on coprocessor test: %p\t &test %p\n", test, &test);
    }
    return EXIT_SUCCESS;
}
[/cpp]

And here is the output;

(MAIN) addresses on coprocessor test: 0x5   &test 0x7fca9703cb28
(METHOD) addresses on coprocessor test: (nil)   &test 0x7fca9703cb40
(MAIN) addresses on coprocessor test: 0x5   &test 0x7fca9703cb28

By normal C/C++ standards I am able to get multiple return values from functions using pointer parameter passing. However, this is not the case for offloaded execution.

I am guessing (I am quite inexperienced with this topic) that icc doesn't take into account that I passed the variable's address rather than the variable itself. Why is this implemented this way? And what is the difference from the compiler's perspective?

Regards

Matara Ma

Ravi_N_Intel · ‎10-04-2013

Every offload would create a new stack on MIC and the values on the host are copied by default or as indicated by the offload clause to MIC.

In your test case, memory for the "test" pointer is created for each offload. In the case of the offloads from main, the stack location happens to be the same for "test". The offload call in transferVector creates a new stack whose location is different, so the "test" on that stack is at a new location.

Persistency Problem when using dynamic memory allocation with pointers