- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I need your help.
I tried to run K-means algorithm on Xeon Phi by using offload mode.
But when i tried to get into offload region with the clause '#pragma offload ~~ (as attached pic 1) ' ,
i got an erorr 'offload error: cannot release buffer memory on device 0 (error code 14)' .
I have no idea to solve this problem, and i even cannot find any previous example similar to my problem on google.
I saw offload report by using 'export OFFLOAD_REPORT=3', but i couldn't get any hints.
plz help me !
regards
TaeHyeok, Jang
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Move the allocation of clusters before the #pragma offload target. clusters is not allocated on the host to bring back the values computed on the card.
clusters = (float*)malloc(numclusters*numdim*sizeof(float));
assert(clusters != NULL);
#pragma offload target .......
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I cannot see the entire offload construct in your code and am unable to re-create the error in my mockup of what portion I can see. Can you attach the complete program that is failing so we can investigate using that?
Also, please let us know what version of the C++ compiler and MPSS you are using. Thank you.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
My suspicion is the error relates to clusters and its use in out() but without any host-side allocation.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
/* jth_off_kmeans.c */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "kmeans_1d.h"
#include "offload.h"
#define ALLOC alloc_if(1) free_if(0)
#define FREE alloc_if(0) free_if(1)
#define REUSE alloc_if(0) free_if(0)
#ifndef MIC_DEV
#define MIC_DEV 1
#endif
__declspec(target(mic))
__inline static // O(n)
float* compute_dist(int numdim, int numobjs,
float *objects, // objs [numdim][numobjs]
float *clusters, // centroid [numclusters][numdim]
float *dist_new,
int cluster_num
)
{
int i, j;
int jth_temp = 0;
#pragma omp parallel for\
private(j)\
shared(dist_new)
for(j=0;j<numobjs;j++)
dist_new
// dist calculation
for(i=0;i<numdim;i++)
{
#pragma omp parallel for\
private(j)\
shared(dist_new, objects, clusters)
for(j=0;j<numobjs;j++)
{
dist_new
}
}
// sqrt operation
#pragma omp parallel for\
private(i)\
shared(dist_new)
for(i=0;i<numobjs;i++)
{
dist_new = sqrt(dist_new);
}
return dist_new;
}
float* jth_off_kmeans(float *objects,// [numobjs][numdim],
int numdim,
int numobjs,
int numclusters,
float threshold,
int *membership, // [numobjs]
int *loop_iterations,
int xeon_threads)
{
/* var declartion */
int i, j;
int num_of_iterations = 0;
int xeon_threads_num = xeon_threads;
char signal_1;
char signal_2;
char signal_3;
char signal_4;
float *clusters;
float *dist_new;
float *dist_old;
float *dist_tmp;
float *new_clusters;
int *clusters_size;
int *membership_xeon_phi;
printf("test 01\n");
printf("test 02\n");
#pragma offload target(mic:MIC_DEV) \
signal(&signal_3) \
in(objects : length(numobjs*numdim) alloc_if(1) free_if(0))\
nocopy(dist_new, dist_old, dist_tmp, new_clusters, clusters_size, membership_xeon_phi : alloc_if(0) free_if(0))\
out(clusters : length(numclusters*numdim))
{ /* start of Xeon Phi section */
printf("test 02.4\n");
/* set the number of xeon phi threads */
omp_set_num_threads(xeon_threads_num);
membership_xeon_phi = (int*)malloc(numobjs*sizeof(int));
for(i=0;i<numobjs;i++)
membership_xeon_phi = 0;
/* var declaration */
float delta;
printf("test 02.5\n");
/* memory allocation */
clusters = (float*)malloc(numclusters*numdim*sizeof(float));
assert(clusters != NULL);
new_clusters = (float*)malloc(numclusters*numdim*sizeof(float));
assert(new_clusters != NULL);
clusters_size = (int*)malloc(numclusters*sizeof(int));
assert(clusters_size != NULL);
dist_new = (float*)malloc(numobjs*sizeof(float));
assert(dist_new != NULL);
dist_old = (float*)malloc(numobjs*sizeof(float));
assert(dist_old != NULL);
dist_tmp = (float*)malloc(numobjs*sizeof(float));
assert(dist_tmp != NULL);
printf("test 03\n");
/* initialization of clusters_size[], new_clusters[] */
for(i=0;i<numclusters;i++)
{
clusters_size = 0;
for(j=0;j<numdim;j++)
new_clusters[i*numdim+j] = 0;
}
/* initialization of centroids as random objects */
for(i=0;i<numdim;i++)
for(j=0;j<numclusters;j++)
{
clusters[numdim*j+i] = objects[i*numobjs+j];
}
// for(j=0;j<numdim;j++)
// clusters_phi
/* initialization of dist_new with centroid[0] */
dist_new = compute_dist(numdim, numobjs, objects, clusters, dist_new, 0);
/* initialization of dist_old as a dist_max */
float dist_max = dist_new[0];
for(i=0;i<numobjs;i++)
{
if(dist_max<dist_new)
dist_max = dist_new;
}
for(i=0;i<numobjs;i++)
{
dist_old = dist_max;
}
delta = threshold+1; // temp initialization for enterting while loop
int temp = 0 ;
printf("test 04\n");
while(num_of_iterations++<5)
{
delta = 0.0; // var checking for proportion of changed objs
for(i=1;i<numclusters;i++)
{
int cluster_num;
cluster_num = i;
/* swap dist_tmp and dist_new */
for(j=0;j<numobjs;j++)
dist_tmp
dist_new = compute_dist(numdim, numobjs, objects, clusters, dist_new, cluster_num);
#pragma omp parallel for\
private(j)\
shared(membership_xeon_phi, dist_old)\
reduction(+:delta)
for(j=0;j<numobjs;j++)
{
if(dist_tmp
{
membership_xeon_phi
dist_old
delta++;
}
}
}
/* determine clusters_size */
for(i=0;i<numobjs;i++)
clusters_size[membership_xeon_phi]++;
/* centroids recalculation */
for(i=0;i<numdim;i++)
{
for(j=0;j<numobjs;j++)
{
new_clusters[membership_xeon_phi
}
}
for(i=0;i<numclusters;i++)
{
for(j=0;j<numdim;j++)
{
clusters[i*numdim+j] = new_clusters[i*numdim+j]/clusters_size;
new_clusters[i*numdim+j] = 0; // set back to 0
}
}
delta = (delta)/(numobjs*numclusters);
for(i=0;i<numclusters;i++)
clusters_size = 0;
}
free(clusters_size);
free(new_clusters);
free(dist_new);
free(dist_old);
free(dist_tmp);
} /* end of Xeon Phi section */
#pragma offload_wait target(mic:MIC_DEV) wait(&signal_3)
printf("\n");
*loop_iterations = num_of_iterations;
return clusters;
}
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
/* jth_off_main.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* strtok() */
#include <sys/types.h> /* open() */
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h> /* getopt() */
#include <omp.h>
int _debug;
#include "kmeans_1d.h"
/*---< usage() >------------------------------------------------------------*/
static void usage(char *argv0, float threshold) {
char *help =
"Usage: %s [switches] -i filename -n num_clusters\n"
" -i filename : file containing data to be clustered\n"
" -b : input file is in binary format (default no)\n"
" -n num_clusters: number of clusters (K must > 1)\n"
" -t threshold : threshold value (default %.4f)\n"
" -p nproc : number of threads (default system allocated)\n"
" -a : perform atomic OpenMP pragma (default no)\n"
" -o : output timing results (default no)\n"
" -d : enable debug mode\n";
fprintf(stderr, help, argv0, threshold);
exit(-1);
}
/*---< main() >-------------------------------------------------------------*/
int main(int argc, char **argv) {
int opt;
extern char *optarg;
extern int optind;
int i, j, nthreads;
int isBinaryFile, is_perform_atomic, is_output_timing;
int loop_iterations;
int xeon_threads;
int numClusters, numCoords, numObjs;
int *membership; /* [numObjs] */
char *filename;
float *objects; /* [numObjs][numCoords] data objects */
float *clusters; /* [numClusters][numCoords] cluster center */
float threshold;
double timing, io_timing, clustering_timing;
/* some default values */
_debug = 0;
nthreads = 0;
threshold = 0.001;
numClusters = 0;
isBinaryFile = 0;
is_output_timing = 0;
is_perform_atomic = 0;
filename = NULL;
xeon_threads = 0;
while ( (opt=getopt(argc,argv,"p:x:i:n:t:abdo"))!= EOF) {
switch (opt) {
case 'i': filename=optarg;
break;
case 'b': isBinaryFile = 1;
break;
case 't': threshold=atof(optarg);
break;
case 'n': numClusters = atoi(optarg);
break;
case 'p': nthreads = atoi(optarg);
break;
case 'a': is_perform_atomic = 1;
break;
case 'o': is_output_timing = 1;
break;
case 'd': _debug = 1;
break;
case '?': usage(argv[0], threshold);
break;
case 'x': xeon_threads = atoi(optarg);
break;
default: usage(argv[0], threshold);
break;
}
}
if (filename == 0 || numClusters <= 1) usage(argv[0], threshold);
if (nthreads > 0)
omp_set_num_threads(nthreads);
if (is_output_timing) io_timing = omp_get_wtime();
/* read data points from file ------------------------------------------*/
objects = file_read(isBinaryFile, filename, &numObjs, &numCoords);
if (objects == NULL) exit(1);
if (is_output_timing) {
timing = omp_get_wtime();
io_timing = timing - io_timing;
}
/* objects adjusted */
float *temp_objects;
temp_objects = (float*)malloc(numObjs * numCoords * sizeof(float));
for(i=0;i<numObjs;i++)
for(j=0;j<numCoords;j++)
{
temp_objects[j*numObjs+i] = objects[i*numCoords+j];
}
objects = temp_objects;
if (is_output_timing) {
timing = omp_get_wtime();
clustering_timing = timing;
}
// printf("omp get max threads num in main: %d\n", omp_get_max_threads());
// printf("omp get num threads num in main: %d\n", omp_get_num_threads());
/* start the timer for the core computation -----------------------------*/
/* membership: the cluster id for each data object */
membership = (int*) malloc(numObjs * sizeof(int));
assert(membership != NULL);
for(i=0;i<numObjs;i++)
membership=0;
clusters = jth_off_kmeans(objects, numCoords, numObjs, numClusters, threshold, membership, &loop_iterations, xeon_threads);
free(objects);
if (is_output_timing) {
timing = omp_get_wtime();
clustering_timing = timing - clustering_timing;
}
/* output: the coordinates of the cluster centres ----------------------*/
file_write(filename, numClusters, numObjs, numCoords, clusters,
membership);
free(membership);
free(clusters);
//wefwefwef
/*---- output performance numbers ---------------------------------------*/
if (is_output_timing) {
io_timing += omp_get_wtime() - timing;
// printf("\n **** jth off kmeans ****\n");
/* if (is_perform_atomic)
printf(" using atomic pragma ******\n");
else
printf(" using array reduction ******\n"); */
printf("Number of threads = %d\n", omp_get_max_threads());
printf("Number of xeon threads = %d\n", xeon_threads);
printf("Input file : %s\n", filename);
// printf("numObjs : %d\n", numObjs);
// printf("numCoords : %d\n", numCoords);
printf("numClusters : %d\n", numClusters);
// printf("threshold : %.4f\n", threshold);
// printf("loop iterations : %d\n", loop_iterations);
// printf("I/O time = %10.4f sec\n", io_timing);
// printf("Computation timing = %10.4f sec\n", clustering_timing);
printf("Computation timing per iteration = %10.4f sec\n", clustering_timing/loop_iterations);
}
return(0);
}
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
yeah, i found that there is an error on out() statement, but still don't know why. :(
compiler version gcc 4.4.7 .
mpss version 3.1.4.
thank you for your help in advance !
regards
TaeHyeok, Jang
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Move the allocation of clusters before the #pragma offload target. clusters is not allocated on the host to bring back the values computed on the card.
clusters = (float*)malloc(numclusters*numdim*sizeof(float));
assert(clusters != NULL);
#pragma offload target .......
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page