CMake finds CPU supports AVX2 but the binary execution causes illegal instruction

cqwei · ‎03-25-2024

I find that the cmake file uses a single c++ segment and obtains that avx2 supported, which further generates the binary.

However, the binary will core dump due to illegal instruction, which caused by a multiplication optimized with avx2 (listed in part 3).

What are the reasons? For example, vextracti128 is not supported in some CPU with avx2? or else?

1. CPU information

We use CPU Intel Xeon E5-2620 v4 which supports avx2, according to https://www.intel.com/content/www/us/en/products/sku/92986/intel-xeon-processor-e52620-v4-20m-cache-2-10-ghz/specifications.html.

2. CMake returns we support AVX2

Use the following method, we find CPU supports avx2

https://github.com/microsoft/APSI/blob/main/cmake/FindAVX.cmake

3. The binary execution returns illegal instruction

After generating the binary, we find lines 61-62 in the file of the following url causes illegal instruction.

https://github.com/microsoft/APSI/blob/main/common/apsi/oprf/oprf_sender.h

The above source codes could be extracted into one c++ file as follows, and compile using

gcc -std=c++17 -O3 -mavx2 testavx.cpp

#include <stdint.h>

#include <fcntl.h>

#include <unistd.h>

#include <stdbool.h>

#include <stdlib.h>

static int lock = -1;

typedef unsigned uint128_t __attribute__((mode(TI)));

#define RADIX 64

typedef uint64_t digit_t; // Unsigned 64-bit digit

#define NWORDS_FIELD 2 // Number of words of a field element

#define NWORDS_ORDER 4

// Digit multiplication

#define MUL(multiplier, multiplicand, hi, lo) \

{ \

uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \

*(hi) = (digit_t)(tempReg >> RADIX); \

(lo) = (digit_t)tempReg; \

}

// Digit addition with carry

#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \

{ \

uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \

(carryOut) = (digit_t)(tempReg >> RADIX); \

(sumOut) = (digit_t)tempReg; \

}

// Digit subtraction with borrow

#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \

{ \

uint128_t tempReg = \

(uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \

(borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t) * 8 - 1)); \

(differenceOut) = (digit_t)tempReg; \

}

static void multiply(const digit_t *a, const digit_t *b, digit_t *c)

{ // Schoolbook multiprecision multiply, c = a*b

unsigned int i, j;

digit_t u, v, UV[2];

unsigned char carry = 0;

for (i = 0; i < (2 * NWORDS_ORDER); i++)

c[i] = 0;

for (i = 0; i < NWORDS_ORDER; i++) {

u = 0;

for (j = 0; j < NWORDS_ORDER; j++) {

MUL(a[i], b[j], UV + 1, UV[0]);

ADDC(0, UV[0], u, carry, v);

u = UV[1] + carry;

ADDC(0, c[i + j], v, carry, v);

u = u + carry;

c[i + j] = v;

}

c[NWORDS_ORDER + i] = u;

}

static unsigned char add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)

{ // Multiprecision addition, c = a+b. Returns the carry bit

unsigned int i;

unsigned char carry = 0;

for (i = 0; i < nwords; i++) {

ADDC(carry, a[i], b[i], carry, c[i]);

}

return carry;

}

unsigned char subtract(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)

{ // Multiprecision subtraction, c = a-b. Returns the borrow bit

unsigned int i;

unsigned char borrow = 0;

for (i = 0; i < nwords; i++) {

SUBC(borrow, a[i], b[i], borrow, c[i]);

}

return borrow;

}

static const uint64_t curve_order[4] = {

0x2FB2540EC7768CE7, 0xDFBD004DFE0F7999, 0xF05397829CBC14E5, 0x0029CBC14E5E0A72

};

static const uint64_t Montgomery_Rprime[4] = {

0xC81DB8795FF3D621, 0x173EA5AAEA6B387D, 0x3D01B7C72136F61C, 0x0006A5F16AC8F9D3

};

static const uint64_t Montgomery_rprime[4] = {

0xE12FE5F079BC3929, 0xD75E78B8D1FCDCF3, 0xBCE409ED76B5DB21, 0xF32702FDAFC1C074

};

void Montgomery_multiply_mod_order(const digit_t *ma, const digit_t *mb, digit_t *mc)

{ // 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where

// ma,mb,mc in [0, order-1] ma, mb and mc are assumed to be in Montgomery representation The

// Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime",

// where r is the order

unsigned int i;

digit_t mask, P[2 * NWORDS_ORDER], Q[2 * NWORDS_ORDER], temp[2 * NWORDS_ORDER];

digit_t *order = (digit_t *)curve_order;

unsigned char cout = 0, bout = 0;

multiply(ma, mb, P); // P = ma * mb

multiply(P, (digit_t *)&Montgomery_rprime, Q); // Q = P * r' mod 2^(log_2(r))

multiply(Q, (digit_t *)&curve_order, temp); // temp = Q * r

cout = add(P, temp, temp, 2 * NWORDS_ORDER); // (cout, temp) = P + Q * r

for (i = 0; i < NWORDS_ORDER; i++) { // (cout, mc) = (P + Q * r)/2^(log_2(r))

mc[i] = temp[NWORDS_ORDER + i];

}

// Final, constant-time subtraction

bout = subtract(mc, (digit_t *)&curve_order, mc, NWORDS_ORDER); // (cout, mc) = (cout, mc) - r

mask = (digit_t)(

cout -

bout); // if (cout, mc) >= 0 then mask = 0x00..0, else if (cout, mc) < 0 then mask = 0xFF..F

for (i = 0; i < NWORDS_ORDER; i++) { // temp = mask & r

temp[i] = (order[i] & mask);

}

add(mc, temp, mc, NWORDS_ORDER); // mc = mc + (mask & r)

return;

}

void modulo_order(digit_t *a, digit_t *c)

{ // Reduction modulo the order using Montgomery arithmetic

// ma = a*Montgomery_Rprime mod r, where a,ma in [0, r-1], a,ma,r < 2^256

// c = ma*1*Montgomery_Rprime^(-1) mod r, where ma,c in [0, r-1], ma,c,r < 2^256

digit_t ma[NWORDS_ORDER], one[NWORDS_ORDER] = { 0 };

one[0] = 1;

Montgomery_multiply_mod_order(a, (digit_t *)&Montgomery_Rprime, ma);

Montgomery_multiply_mod_order(ma, one, c);

}

static __inline void delay(unsigned int count)

{

while (count--) {

}

int random_bytes(unsigned char *random_array, unsigned int nbytes)

{

int r, n = nbytes, count = 0;

if (lock == -1) {

do {

lock = open("/dev/urandom", O_RDONLY);

if (lock == -1) {

delay(0xFFFFF);

}

} while (lock == -1);

}

while (n > 0) {

do {

r = read(lock, random_array + count, n);

if (r == -1) {

delay(0xFFFF);

}

} while (r == -1);

count += r;

n -= r;

}

return true;

}

int main()

{

digit_t temp[NWORDS_ORDER], temp2[NWORDS_ORDER];

random_bytes((unsigned char *)temp, sizeof(digit_t) * NWORDS_ORDER);

random_bytes((unsigned char *)temp2, sizeof(digit_t) * NWORDS_ORDER);

modulo_order(temp, temp2);

}

cqwei · ‎03-26-2024

As from this URL (My issue is the same problem), AVX2 seems will causes problem during dynamic linking,

I wonder is there any official interpretation, for example the address may not be aligned during dynamic linking? (or else such as this)

https://github.com/LGro/PyAPSI/blob/main/docker/base.Dockerfile#L13-L17

CMake finds CPU supports AVX2 but the binary execution causes illegal instruction

Intel® oneAPI Data Parallel C++