[cpp]void* threadTask(void* params) { struct timeval start, stop; gettimeofday(&start, 0); int tid = (int)params; int i, j, k, l, add; add = 0; int* elements = (int*) malloc((global_offsets_len / 2 + 1) * sizeof(int)); for(i = tid; i < global_height; i += numProcs) { while(!events) // thread is released only when previous thread has finished its first chunk. { //asm volatile("":::"memory"); //__sync_synchronize(); sleep(0); } for(l = 0; l < global_width / grain; l++) { if(i > 0 && progress[i - 1] < global_width / grain) { while(progress + 1 == progress[i - 1]) // wait for previous thread to finish its current chunk. { //asm volatile("":::"memory"); //__sync_synchronize(); sleep(0); } } pthread_mutex_lock(&locks); // each row is striped add = 0; if (l + 1 == global_width / grain) { // accommodate division reminders add = global_width % grain; } for(j = l * grain; j < (l + 1) * grain + add; j++) { elements[0] = global_input ; for(k = 0; k < global_offsets_len; k += 2) { int d1 = indexWrap(i + global_offsets , global_height); int d2 = indexWrap(j + global_offsets[k + 1], global_width); elements[k / 2 + 1] = global_input[d1][d2]; } global_input = global_function(elements, global_offsets_len / 2 + 1); } progress++; if (progress == 1 && i + cPoint[0] < global_height) { events[i + cPoint[0]] = 1; // release the next thread to begin computation } pthread_mutex_unlock(&locks ); } } gettimeofday(&stop, 0); int total = (stop.tv_sec - start.tv_sec); printf("Thread %d time %d sec\r\n", tid, total); return 0; } [/cpp]
/*
 * Start numProcs worker threads, each pinned to one logical CPU.
 *
 * cpus[] gives the order in which CPU ids are handed out (even ids first,
 * then odd). Thread i runs threadTask with i packed into the void* argument.
 *
 * NOTE(review): the [i] subscripts on cpus and threads were missing from the
 * posted text and have been reconstructed from context.
 */
int cpus[] = { 0, 2, 4, 6, 1, 3, 5, 7 };
for (i = 0; i < numProcs; i++) {
    cpu_set_t cpuset;
    pthread_attr_t attr;
    int rc;

    CPU_ZERO(&cpuset);
    /* Wrap the index so more threads than table entries cannot read past cpus[]. */
    CPU_SET(cpus[i % (int)(sizeof cpus / sizeof cpus[0])], &cpuset);

    pthread_attr_init(&attr);
    rc = pthread_attr_setaffinity_np(&attr, sizeof(cpuset), &cpuset);
    if (rc != 0) {
        /* pthread functions return the error number instead of setting errno. */
        fprintf(stderr, "pthread_attr_setaffinity_np failed for thread %d (error %d)\n", i, rc);
    }

    printf("Thread %d started\r\n", i);
    /* Go through long so the int-to-pointer conversion is well-sized on LP64. */
    rc = pthread_create(&threads[i], &attr, threadTask, (void *)(long)i);
    if (rc != 0) {
        fprintf(stderr, "pthread_create failed for thread %d (error %d)\n", i, rc);
    }

    /* The attribute object is only needed during creation. */
    pthread_attr_destroy(&attr);
}
Link Copied
For more complete information about compiler optimizations, see our Optimization Notice.