- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I am now optimizing a simple a program. Element of a and b are made sure to be from 0 to 8, so in my opitimization I turned them from int to short.
void ans(int n, const int *a, const int *b, int *answer)
{
for (int k = 0; k < n; k++)
for (int i = 0; i < n - k; ++i)
answer[k] += a[i + k] * b[i];
}
But I found the output is correct when input size is less than 25k, but being wrong when size exceeds this number and the wrong number of value is exactly (total input size -25k).
And here is what I have done with AVX2 and openmp.
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <omp.h>
#include <string.h>
#include <malloc.h>
static short n_zero = 0;
static int MAX_CORE = omp_get_max_threads();
void calc(int n, const int *a, const int *b, int *c)
{
#pragma omp parallel for schedule(dynamic)
for (int k = 0; k < n; k++)
{
short *new_a =(short*)alloca(sizeof(short)*16);
short *new_b = (short*)alloca(sizeof(short)*16);
{
__m256i partial_sums = _mm256_set1_epi16(n_zero);
for (int i = 0; i < (n - k) / 16 * 16; i += 16)
{
for (int j = 0; j < 16; j++)
{
new_a[j] = a[i+j+k];
new_b[j] = b[i+j];
}
__m256i vec_a_1 = _mm256_loadu_si256((__m256i *)(new_a ));
__m256i vec_b_1 = _mm256_loadu_si256((__m256i *)(new_b ));
__m256i partial_pd = _mm256_mullo_epi16(vec_a_1, vec_b_1);
partial_sums = _mm256_add_epi16(partial_pd, partial_sums);
}
short arr[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
_mm256_storeu_si256(((__m256i *)arr), partial_sums);
for (int i = 0; i < 16; i++)
{
c[k] += arr[i];
}
for (int i = (n - k) / 16 * 16 + k; i < (n); i++)
{
c[k] += a[i] * b[i - k];
}
}
}
}
And here is the main program to test them:
#include <chrono>
#include <cstring>
#include <iostream>
#include <ctime>
extern void calc(int, const int *, const int *, int *);
extern void ans(int, const int *, const int *, int *);
int main()
{
std::ios_base::sync_with_stdio(false);
int n;
int flag = 0;
int ff=0;
std::cout << "Enter n:";
std::cin >> n;
int *a = new int[n];
int *b = new int[n];
int *c = new int[n];
int *answer = new int[n];
// Initialize the arrays
std::memset(a, 0, sizeof(int) * n);
std::memset(b, 0, sizeof(int) * n);
std::memset(c, 0, sizeof(int) * n);
//create random arrry
unsigned seed;
seed = time(0);
srand(seed);
for (int i = 0; i < n; ++i) {
a[i] = rand() % 8+1;
b[i] = rand() % 8+1;
//std::cout << a[i] << "/" << b[i] << "\n";
}
// Start the calculation
auto start = std::chrono::high_resolution_clock::now();
calc(n, a, b, c);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = stop - start;
std::clog << "Time elapsed: " << diff.count() << '\n';
//correctness feedback
std::memset(answer, 0, sizeof(int) * n);
ans(n, a, b, answer);
for (int i = 0; i < n; i++) {
if (c[i] != answer[i]) {
if(answer[i]<0){
ff+=1;
}
flag +=1;
}
}
std::cout << "--------correctness feedback--------"<< '\n';
if (flag == 0) {
std::cout << "ALL PASS"<< '\n';
}
else {
std::cout <<ff<<"****"<<flag<<"error detected"<< '\n';
}
delete[] a;
delete[] b;
delete[] c;
delete[] answer;
return 0;
}
void ans(int n, const int *a, const int *b, int *answer)
{
for (int k = 0; k < n; k++)
for (int i = 0; i < n - k; ++i)
answer[k] += a[i + k] * b[i];
}
And here is the makefile:
ARCH = native
ifndef DEBUG
CXXFLAGS = -march=$(ARCH) -O2 -fopenmp -pthread
else
CXXFLAGS = -march=$(ARCH) -g -fopenmp -pthread
endif
%.o: %.cpp
$(CXX) $(CXXFLAGS) $^ -c -o $@
calc: calc.o main.o
$(CXX) $(CXXFLAGS) $^ -o $@
clean:
$(RM) calc *.o
.PHONY: clean
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello aasda38,
Thank you for posting on the Intel* Community Forums.
To better assist you, we will send you a private message to request personal information.
Best regards,
Maria R.
Intel Customer Support Technician
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello aasda38,
Were you able to check the previous post and the private message?
Let me know if you need more assistance.
Best regards,
Maria R.
Intel Customer Support Technician
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hello aasda38,
We have not heard back from you, so we will close this inquiry. If you need further assistance, please post a new question.
Best regards,
Maria R.
Intel Customer Support Technician

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page