Intel® oneAPI DPC++/C++ Compiler
Talk to fellow users of Intel® oneAPI DPC++/C++ Compiler and companion tools like Intel® oneAPI DPC++ Library, Intel® DPC++ Compatibility Tool, and Intel® Distribution for GDB*
598 Discussions

CMake finds CPU supports AVX2 but the binary execution causes illegal instruction

cqwei
Beginner
694 Views

I found that the CMake file compiles a single C++ snippet to detect AVX2, concludes that AVX2 is supported, and the binary is then generated.

However, the binary core dumps with an illegal instruction, which is caused by a multiplication optimized with AVX2 (listed in part 3).

What could the reason be? For example, is vextracti128 unsupported on some CPUs that advertise AVX2, or is it something else?

 

 

1. CPU information

 

We use an Intel Xeon E5-2620 v4 CPU, which supports AVX2 according to https://www.intel.com/content/www/us/en/products/sku/92986/intel-xeon-processor-e52620-v4-20m-cache-2-10-ghz/specifications.html.

 

2. CMake reports that AVX2 is supported

 

Using the following detection module, we find that the CPU supports AVX2:

https://github.com/microsoft/APSI/blob/main/cmake/FindAVX.cmake

 

3. The binary execution returns illegal instruction

 

After generating the binary, we find that lines 61-62 of the file at the following URL cause the illegal instruction.

https://github.com/microsoft/APSI/blob/main/common/apsi/oprf/oprf_sender.h

 

The relevant source code can be extracted into a single C++ file as follows, and compiled using

gcc -std=c++17 -O3 -mavx2 testavx.cpp

 

#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdlib.h>
// File descriptor of /dev/urandom, opened on first use by random_bytes();
// -1 means it has not been opened yet.
static int lock = -1;

// 128-bit unsigned integer (TImode), used as the double-width accumulator by the macros below
typedef unsigned uint128_t __attribute__((mode(TI)));
#define RADIX 64 // Number of bits in one digit
typedef uint64_t digit_t; // Unsigned 64-bit digit
#define NWORDS_FIELD 2 // Number of words of a field element
#define NWORDS_ORDER 4 // Number of words of the curve order (operand size of multiply())

// All three macros are wrapped in do { ... } while (0) so that an invocation followed by ';'
// is exactly one statement and can be used safely in an unbraced if/else.

// Digit multiplication: (*hi, lo) = multiplier * multiplicand.
// Note that `hi` is a POINTER to a digit, while `lo` is an lvalue.
#define MUL(multiplier, multiplicand, hi, lo)                                    \
    do {                                                                         \
        uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \
        *(hi) = (digit_t)(tempReg >> RADIX);                                     \
        (lo) = (digit_t)tempReg;                                                 \
    } while (0)

// Digit addition with carry: (carryOut, sumOut) = addend1 + addend2 + carryIn.
// The inputs are evaluated before either output is written, so an output may name
// the same object as an input (e.g. carryIn and carryOut).
#define ADDC(carryIn, addend1, addend2, carryOut, sumOut)                        \
    do {                                                                         \
        uint128_t tempReg =                                                      \
            (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn);  \
        (carryOut) = (digit_t)(tempReg >> RADIX);                                \
        (sumOut) = (digit_t)tempReg;                                             \
    } while (0)

// Digit subtraction with borrow: differenceOut = minuend - subtrahend - borrowIn.
// borrowOut is the top bit of the 128-bit difference, i.e. 1 when the result wrapped
// below zero and 0 otherwise.
#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut)            \
    do {                                                                         \
        uint128_t tempReg =                                                      \
            (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \
        (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t) * 8 - 1));         \
        (differenceOut) = (digit_t)tempReg;                                      \
    } while (0)

static void multiply(const digit_t *a, const digit_t *b, digit_t *c)
{ // Schoolbook (operand-scanning) multiprecision product, c = a*b.
  // Reads NWORDS_ORDER words from each of a and b and writes 2*NWORDS_ORDER words to c.
  // c is cleared before a and b are read, so it must not overlap either input.
    unsigned int row, col;

    for (row = 0; row < (2 * NWORDS_ORDER); row++) {
        c[row] = 0;
    }

    for (row = 0; row < NWORDS_ORDER; row++) {
        digit_t high = 0; // upper word carried from one column into the next

        for (col = 0; col < NWORDS_ORDER; col++) {
            // a*b + high + c cannot overflow 128 bits:
            // (2^64 - 1)^2 + 2*(2^64 - 1) = 2^128 - 1
            uint128_t acc = (uint128_t)a[row] * (uint128_t)b[col];
            acc += (uint128_t)high;
            acc += (uint128_t)c[row + col];

            c[row + col] = (digit_t)acc;
            high = (digit_t)(acc >> RADIX);
        }
        c[NWORDS_ORDER + row] = high;
    }
}

static unsigned char add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{ // Multiprecision addition over nwords digits, least-significant digit first: c = a+b.
  // Returns the carry out of the top digit (0 or 1). Each a[k]/b[k] is read before c[k]
  // is written, so c may be the same array as a or b.
    unsigned char carry_bit = 0;
    unsigned int k;

    for (k = 0; k < nwords; k++) {
        uint128_t sum = (uint128_t)a[k] + (uint128_t)b[k] + (uint128_t)carry_bit;

        carry_bit = (unsigned char)(sum >> RADIX);
        c[k] = (digit_t)sum;
    }

    return carry_bit;
}

unsigned char subtract(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{ // Multiprecision subtraction over nwords digits, least-significant digit first: c = a-b.
  // Returns the borrow out of the top digit (0 or 1). Each a[k]/b[k] is read before c[k]
  // is written, so c may be the same array as a or b.
    unsigned char borrow_bit = 0;
    unsigned int k;

    for (k = 0; k < nwords; k++) {
        uint128_t diff = (uint128_t)a[k] - (uint128_t)b[k] - (uint128_t)borrow_bit;

        // The top bit of the 128-bit difference is set exactly when it wrapped below zero.
        borrow_bit = (unsigned char)(diff >> (sizeof(uint128_t) * 8 - 1));
        c[k] = (digit_t)diff;
    }

    return borrow_bit;
}

// All three tables hold a 256-bit value as four 64-bit words, least-significant word first.

// The modulus r (the curve order) used by Montgomery_multiply_mod_order().
static const uint64_t curve_order[4] = {
    0x2FB2540EC7768CE7ULL,
    0xDFBD004DFE0F7999ULL,
    0xF05397829CBC14E5ULL,
    0x0029CBC14E5E0A72ULL
};

// Multiplier used by modulo_order() to bring its input into Montgomery representation.
static const uint64_t Montgomery_Rprime[4] = {
    0xC81DB8795FF3D621ULL,
    0x173EA5AAEA6B387DULL,
    0x3D01B7C72136F61CULL,
    0x0006A5F16AC8F9D3ULL
};

// Montgomery constant r' consumed by Montgomery_multiply_mod_order()
// (described there as r' = -r^(-1) mod 2^(log_2(r))).
static const uint64_t Montgomery_rprime[4] = {
    0xE12FE5F079BC3929ULL,
    0xD75E78B8D1FCDCF3ULL,
    0xBCE409ED76B5DB21ULL,
    0xF32702FDAFC1C074ULL
};

void Montgomery_multiply_mod_order(const digit_t *ma, const digit_t *mb, digit_t *mc)
{ // 256-bit Montgomery multiplication modulo the curve order: mc = ma*mb*r' mod order.
  // ma, mb and mc are assumed to be in Montgomery representation, in [0, order-1].
  // The Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the file-scope table
  // "Montgomery_rprime", and r is the order held in "curve_order".
    digit_t product[2 * NWORDS_ORDER];
    digit_t quotient[2 * NWORDS_ORDER];
    digit_t scratch[2 * NWORDS_ORDER];
    digit_t select;
    unsigned char carry_out, borrow_out;
    unsigned int k;

    // multiply() reads only the low NWORDS_ORDER words of each operand, which is what
    // implements the "mod 2^(log_2(r))" truncation of P and of Q below.
    multiply(ma, mb, product);                      // P = ma * mb
    multiply(product, Montgomery_rprime, quotient); // Q = P * r' mod 2^(log_2(r))
    multiply(quotient, curve_order, scratch);       // scratch = Q * r
    carry_out = add(product, scratch, scratch, 2 * NWORDS_ORDER); // (carry, scratch) = P + Q * r

    // (carry, mc) = (P + Q * r) / 2^(log_2(r)): keep the upper half only
    for (k = 0; k < NWORDS_ORDER; k++) {
        mc[k] = scratch[NWORDS_ORDER + k];
    }

    // Final, constant-time correction: subtract r unconditionally ...
    borrow_out = subtract(mc, curve_order, mc, NWORDS_ORDER);

    // ... then build an all-zeros mask if the result stayed non-negative,
    // or an all-ones mask if it went negative.
    select = (digit_t)(carry_out - borrow_out);

    // ... and add back (mask & r), which restores mc only in the negative case.
    for (k = 0; k < NWORDS_ORDER; k++) {
        scratch[k] = (curve_order[k] & select);
    }
    add(mc, scratch, mc, NWORDS_ORDER);
}

void modulo_order(digit_t *a, digit_t *c)
{ // Reduces a modulo the curve order r using two Montgomery multiplications:
  //   mont_a = a * Montgomery_Rprime mod r   (into Montgomery representation)
  //   c      = mont_a * 1 mod r              (back out of Montgomery representation)
  // a and c are NWORDS_ORDER-word values (a, r < 2^256); the result c is in [0, r-1].
    digit_t unit[NWORDS_ORDER] = { 1 }; // the integer 1; the remaining words are zero
    digit_t mont_a[NWORDS_ORDER];

    Montgomery_multiply_mod_order(a, Montgomery_Rprime, mont_a);
    Montgomery_multiply_mod_order(mont_a, unit, c);
}

static __inline void delay(unsigned int count)
{ // Busy-wait helper: counts `count` down to zero with an empty loop body.
  // NOTE(review): the loop has no side effects, so an optimizing compiler may remove it.
    for (; count != 0; count--) {
    }
}

int random_bytes(unsigned char *random_array, unsigned int nbytes)
{ // Fills random_array[0 .. nbytes-1] with bytes read from /dev/urandom.
  // The device is opened on the first call and its descriptor is cached in the
  // file-scope variable `lock`. Failed open()/read() calls are retried indefinitely
  // after a short busy-wait, so the function only returns once all nbytes bytes
  // have been stored. Always returns true.
    unsigned int filled = 0; // bytes stored so far; unsigned so requests above INT_MAX are honored

    while (lock == -1) {
        lock = open("/dev/urandom", O_RDONLY);
        if (lock == -1) {
            delay(0xFFFFF);
        }
    }

    while (filled < nbytes) {
        // read() returns ssize_t and may deliver fewer bytes than requested
        ssize_t got = read(lock, random_array + filled, nbytes - filled);

        if (got <= 0) { // error, or nothing delivered: back off, then retry
            delay(0xFFFF);
            continue;
        }
        filled += (unsigned int)got;
    }
    return true;
}

int main()
{ // Reproducer: fill two NWORDS_ORDER-word buffers with random bytes, then reduce the
  // first one modulo the curve order, storing the result in the second.
    digit_t input[NWORDS_ORDER];
    digit_t output[NWORDS_ORDER];
    const unsigned int buffer_bytes = sizeof(digit_t) * NWORDS_ORDER;

    random_bytes((unsigned char *)input, buffer_bytes);
    random_bytes((unsigned char *)output, buffer_bytes); // overwritten by modulo_order() below
    modulo_order(input, output);
    return 0;
}
Labels (1)
0 Kudos
1 Reply
cqwei
Beginner
681 Views

According to the URL below (which describes the same problem as mine), AVX2 seems to cause problems during dynamic linking.

I wonder whether there is any official explanation: for example, might an address be misaligned during dynamic linking, or is it something else along those lines?
 

 

https://github.com/LGro/PyAPSI/blob/main/docker/base.Dockerfile#L13-L17

 

0 Kudos
Reply