Software Archive
Read-only legacy content
17061 Discussions

offload error: cannot release buffer memory on device 0 (error code 14)

TaeHyeok_J_
Beginner
1,050 Views

I need your help. 

I tried to run K-means algorithm on Xeon Phi by using offload mode.

But when i tried to get into offload region with the clause '#pragma offload ~~ (as attached pic 1) ' ,

i got an erorr 'offload error: cannot release buffer memory on device 0 (error code 14)' .

I have no idea to solve this problem, and i even cannot find any previous example similar to my problem on google.

I saw offload report by using 'export OFFLOAD_REPORT=3', but i couldn't get any hints. 

plz help me !

regards

TaeHyeok, Jang

0 Kudos
1 Solution
Ravi_N_Intel
Employee
1,050 Views

Move the allocation of clusters before  the #pragma offload target.  clusters is not allocated on the host to bring back the values computed on the card.

 

clusters = (float*)malloc(numclusters*numdim*sizeof(float));
        assert(clusters != NULL);

#pragma offload target .......

 

View solution in original post

0 Kudos
6 Replies
Kevin_D_Intel
Employee
1,050 Views

I cannot see the entire offload construct in your code and am unable to re-create the error in my mockup of what portion I can see. Can you attach the complete program that is failing so we can investigate using that?

Also, please let us know what version of the C++ compiler and MPSS you are using. Thank you.
 

0 Kudos
Kevin_D_Intel
Employee
1,050 Views

My suspicion is the error relates to clusters and its use in out() but without any host-side allocation.
 

0 Kudos
TaeHyeok_J_
Beginner
1,050 Views

/* jth_off_kmeans.c */ 


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "kmeans_1d.h"

#include "offload.h"

#define ALLOC alloc_if(1) free_if(0)
#define FREE alloc_if(0) free_if(1)
#define REUSE alloc_if(0) free_if(0)

#ifndef MIC_DEV
#define MIC_DEV 1
#endif

__declspec(target(mic))
__inline static // O(n)
float* compute_dist(int numdim, int numobjs,
                        float *objects, // objs [numdim][numobjs]
                        float *clusters, // centroid [numclusters][numdim]
                        float *dist_new,
                        int cluster_num
                        )
{
        int i, j;
        int jth_temp = 0;

        #pragma omp parallel for\
                private(j)\
                shared(dist_new)
        for(j=0;j<numobjs;j++)
                dist_new = 0;


        // dist calculation
        for(i=0;i<numdim;i++)
        {
                #pragma omp parallel for\
                        private(j)\
                        shared(dist_new, objects, clusters)
                for(j=0;j<numobjs;j++)
                {
                        dist_new+=(objects[i*numobjs+j]-clusters[cluster_num*numdim+i])*(objects[i*numobjs+j]-clusters[cluster_num*numdim+i]);
                }
        }

       // sqrt operation
        #pragma omp parallel for\
                private(i)\
                shared(dist_new)
        for(i=0;i<numobjs;i++)
        {
                dist_new = sqrt(dist_new);
        }


        return dist_new;
}

float* jth_off_kmeans(float *objects,// [numobjs][numdim],
                  int numdim,
                  int numobjs,
                  int numclusters,
                  float threshold,
                  int *membership, // [numobjs]
                  int *loop_iterations,
                  int xeon_threads)
{
        /* var declartion */
        int i, j;
        int num_of_iterations = 0;

        int xeon_threads_num = xeon_threads;

        char signal_1;
        char signal_2;
        char signal_3;
        char signal_4;

        float *clusters;

        float *dist_new;
        float *dist_old;
        float *dist_tmp;

        float *new_clusters;
        int *clusters_size;

        int *membership_xeon_phi;

        printf("test 01\n");

        printf("test 02\n");

#pragma offload target(mic:MIC_DEV) \
        signal(&signal_3) \
        in(objects : length(numobjs*numdim) alloc_if(1) free_if(0))\
        nocopy(dist_new, dist_old, dist_tmp, new_clusters, clusters_size, membership_xeon_phi : alloc_if(0) free_if(0))\
        out(clusters : length(numclusters*numdim))
{ /* start of Xeon Phi section */

        printf("test 02.4\n");

        /* set the number of xeon phi threads */
        omp_set_num_threads(xeon_threads_num);

        membership_xeon_phi = (int*)malloc(numobjs*sizeof(int));

        for(i=0;i<numobjs;i++)
                membership_xeon_phi = 0;


        /* var declaration */

        float delta;

        printf("test 02.5\n");

        /* memory allocation */
        clusters = (float*)malloc(numclusters*numdim*sizeof(float));
        assert(clusters != NULL);

        new_clusters = (float*)malloc(numclusters*numdim*sizeof(float));
        assert(new_clusters != NULL);

        clusters_size = (int*)malloc(numclusters*sizeof(int));
        assert(clusters_size != NULL);

        dist_new = (float*)malloc(numobjs*sizeof(float));
        assert(dist_new != NULL);

        dist_old = (float*)malloc(numobjs*sizeof(float));
        assert(dist_old != NULL);

        dist_tmp = (float*)malloc(numobjs*sizeof(float));
        assert(dist_tmp != NULL);

      printf("test 03\n");

        /* initialization of clusters_size[], new_clusters[] */
        for(i=0;i<numclusters;i++)
        {
                clusters_size = 0;

                for(j=0;j<numdim;j++)
                        new_clusters[i*numdim+j] = 0;
        }

        /* initialization of centroids as random objects */
        for(i=0;i<numdim;i++)
                for(j=0;j<numclusters;j++)
                {
                        clusters[numdim*j+i] = objects[i*numobjs+j];
                }

 //     for(j=0;j<numdim;j++)
 //             clusters_phi = clusters[0*numclusters+j];


        /* initialization of dist_new with centroid[0] */
        dist_new = compute_dist(numdim, numobjs, objects, clusters, dist_new, 0);


        /* initialization of dist_old as a dist_max */
        float dist_max = dist_new[0];

        for(i=0;i<numobjs;i++)
        {
                if(dist_max<dist_new)
                        dist_max = dist_new;
        }
        for(i=0;i<numobjs;i++)
        {
                dist_old = dist_max;
        }


        delta = threshold+1; // temp initialization for enterting while loop

        int temp = 0 ;

        printf("test 04\n");

       while(num_of_iterations++<5)
        {


                delta = 0.0; // var checking for proportion of changed objs

                for(i=1;i<numclusters;i++)
                {
                        int cluster_num;

                        cluster_num = i;

                        /* swap dist_tmp and dist_new */
                        for(j=0;j<numobjs;j++)
                                dist_tmp = dist_new;

                        dist_new = compute_dist(numdim, numobjs, objects, clusters, dist_new, cluster_num);


                #pragma omp parallel for\
                                private(j)\
                                shared(membership_xeon_phi, dist_old)\
                                reduction(+:delta)
                        for(j=0;j<numobjs;j++)
                        {
                                if(dist_tmp<dist_old)
                                {
                                        membership_xeon_phi = i; // i means cluster index
                                        dist_old = dist_tmp;
                                        delta++;
                                }
                        }
                }

                /* determine clusters_size */
                for(i=0;i<numobjs;i++)
                        clusters_size[membership_xeon_phi]++;

               /* centroids recalculation */
                for(i=0;i<numdim;i++)
                {
                        for(j=0;j<numobjs;j++)
                        {
                                new_clusters[membership_xeon_phi*numdim+i] += objects[i*numobjs+j];
                        }
                }

                for(i=0;i<numclusters;i++)
                {
                        for(j=0;j<numdim;j++)
                        {
                                clusters[i*numdim+j] = new_clusters[i*numdim+j]/clusters_size;
                                new_clusters[i*numdim+j] = 0; // set back to 0
                        }
                }

                delta = (delta)/(numobjs*numclusters);

                for(i=0;i<numclusters;i++)
                        clusters_size = 0;
        }


        free(clusters_size);
        free(new_clusters);

        free(dist_new);
        free(dist_old);
        free(dist_tmp);


} /* end of Xeon Phi section */

        #pragma offload_wait target(mic:MIC_DEV) wait(&signal_3)

        printf("\n");

        *loop_iterations = num_of_iterations;

 

        return clusters;
}

 

0 Kudos
TaeHyeok_J_
Beginner
1,050 Views

/* jth_off_main.c */ 


#include <stdio.h>
#include <stdlib.h>
#include <string.h>     /* strtok() */
#include <sys/types.h>  /* open() */
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>     /* getopt() */

#include <omp.h>
int      _debug;
#include "kmeans_1d.h"

/*---< usage() >------------------------------------------------------------*/
static void usage(char *argv0, float threshold) {
    char *help =
        "Usage: %s [switches] -i filename -n num_clusters\n"
        "       -i filename    : file containing data to be clustered\n"
        "       -b             : input file is in binary format (default no)\n"
        "       -n num_clusters: number of clusters (K must > 1)\n"
        "       -t threshold   : threshold value (default %.4f)\n"
        "               -p nproc           : number of threads (default system allocated)\n"
        "               -a                         : perform atomic OpenMP pragma (default no)\n"
        "       -o             : output timing results (default no)\n"
        "       -d             : enable debug mode\n";
    fprintf(stderr, help, argv0, threshold);
    exit(-1);
}

/*---< main() >-------------------------------------------------------------*/
int main(int argc, char **argv) {
           int     opt;
    extern char   *optarg;
    extern int     optind;
           int     i, j, nthreads;
           int     isBinaryFile, is_perform_atomic, is_output_timing;
           int     loop_iterations;

           int     xeon_threads;

           int     numClusters, numCoords, numObjs;
           int    *membership;    /* [numObjs] */
           char   *filename;
           float  *objects;       /* [numObjs][numCoords] data objects */
           float  *clusters;      /* [numClusters][numCoords] cluster center */
           float   threshold;

           double  timing, io_timing, clustering_timing;

    /* some default values */
    _debug           = 0;
        nthreads                 = 0;
    threshold        = 0.001;
    numClusters      = 0;
    isBinaryFile     = 0;
    is_output_timing = 0;
        is_perform_atomic = 0;
    filename         = NULL;
        xeon_threads = 0;
    while ( (opt=getopt(argc,argv,"p:x:i:n:t:abdo"))!= EOF) {
        switch (opt) {
            case 'i': filename=optarg;
                      break;
            case 'b': isBinaryFile = 1;
                      break;
            case 't': threshold=atof(optarg);
                      break;
            case 'n': numClusters = atoi(optarg);
                      break;
            case 'p': nthreads = atoi(optarg);
                  break;
            case 'a': is_perform_atomic = 1;
                  break;
            case 'o': is_output_timing = 1;
                      break;
            case 'd': _debug = 1;
                      break;
            case '?': usage(argv[0], threshold);
                      break;
            case 'x': xeon_threads = atoi(optarg);
                      break;
            default: usage(argv[0], threshold);
                      break;
        }
    }

    if (filename == 0 || numClusters <= 1) usage(argv[0], threshold);

    if (nthreads > 0)
        omp_set_num_threads(nthreads);

    if (is_output_timing) io_timing = omp_get_wtime();

   /* read data points from file ------------------------------------------*/
    objects = file_read(isBinaryFile, filename, &numObjs, &numCoords);
    if (objects == NULL) exit(1);

    if (is_output_timing) {
        timing            = omp_get_wtime();
        io_timing         = timing - io_timing;
    }


/* objects adjusted */
    float *temp_objects;
    temp_objects = (float*)malloc(numObjs * numCoords * sizeof(float));

    for(i=0;i<numObjs;i++)
        for(j=0;j<numCoords;j++)
        {
                temp_objects[j*numObjs+i] = objects[i*numCoords+j];
        }

 

    objects = temp_objects;


    if (is_output_timing) {
        timing = omp_get_wtime();
        clustering_timing = timing;
    }

//      printf("omp get max threads num in main: %d\n", omp_get_max_threads());
//      printf("omp get num threads num in main: %d\n", omp_get_num_threads());

    /* start the timer for the core computation -----------------------------*/
    /* membership: the cluster id for each data object */
    membership = (int*) malloc(numObjs * sizeof(int));
    assert(membership != NULL);

        for(i=0;i<numObjs;i++)
                membership=0;

    clusters = jth_off_kmeans(objects, numCoords, numObjs, numClusters, threshold, membership, &loop_iterations, xeon_threads);

    free(objects);

    if (is_output_timing) {
        timing            = omp_get_wtime();
        clustering_timing = timing - clustering_timing;
    }

    /* output: the coordinates of the cluster centres ----------------------*/
    file_write(filename, numClusters, numObjs, numCoords, clusters,
               membership);

    free(membership);
    free(clusters);
    //wefwefwef
    /*---- output performance numbers ---------------------------------------*/
    if (is_output_timing) {
        io_timing += omp_get_wtime() - timing;
//        printf("\n **** jth off kmeans  ****\n");
/*              if (is_perform_atomic)
                        printf(" using atomic pragma ******\n");
                else
                        printf(" using array reduction ******\n");  */
        printf("Number of threads = %d\n", omp_get_max_threads());
        printf("Number of xeon threads = %d\n", xeon_threads);
        printf("Input file    : %s\n", filename);
//        printf("numObjs       : %d\n", numObjs);
//        printf("numCoords     : %d\n", numCoords);
        printf("numClusters   : %d\n", numClusters);
//        printf("threshold     : %.4f\n", threshold);
//      printf("loop iterations : %d\n", loop_iterations);

//        printf("I/O time           = %10.4f sec\n", io_timing);
//        printf("Computation timing = %10.4f sec\n", clustering_timing);
        printf("Computation timing per iteration = %10.4f sec\n", clustering_timing/loop_iterations);
        }

    return(0);
}

0 Kudos
TaeHyeok_J_
Beginner
1,050 Views

 

yeah, i found that there is an error on out() statement, but still don't know why. :( 

compiler version gcc 4.4.7 .

mpss version 3.1.4. 

thank you for your help in advance ! 

regards 

TaeHyeok, Jang 

0 Kudos
Ravi_N_Intel
Employee
1,051 Views

Move the allocation of clusters before  the #pragma offload target.  clusters is not allocated on the host to bring back the values computed on the card.

 

clusters = (float*)malloc(numclusters*numdim*sizeof(float));
        assert(clusters != NULL);

#pragma offload target .......

 

0 Kudos
Reply