MKL and arithmetic with 2^136279841 - 1

richter__dan · ‎12-02-2024

I need to calculate square i.e. x^2 of the very big uint64_t array number, then subtract 2 and calculate the MOD(result, x).

Can you advise me if the oneAPI MKL is suitable for that purpose ? What speed I can expect if the x is 2^136279841 - 1. It's one step of Lucas-Lehmer Primality Testing.

What MKL commands do I need? Does the MKL library automatically call the best algorithm for the calculation (FFT/iFFT, Karatsuba etc.).

#include <iostream>
#include <immintrin.h>
#include <cstdint>
#include <windows.h>   // For VirtualAlloc and VirtualFree

// Constants and definitions
constexpr size_t GB = 3;
constexpr size_t giga = 1024 * 1024 * 1024;
size_t num_bits = 136279841;
size_t num_uint64 = ((num_bits + 255) / 256) * 4; // Number of uint64_t elements

// Allocate 3GB memory
static const size_t giga = 1024 * 1024 * 1024;
static const size_t size = GB * giga;
uint64_t* ARRAY = static_cast<uint64_t*>(VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE));

// Initialize the large number in POLE (example initialization)
uint64_t* x = ARRAY;
uint64_t* result = ARRAY + num_uint64;
uint64_t* tmp = ARRAY + 3 * num_uint64;

// Store the number 2^136279841 - 1 using _mm256_maskstore_epi64 in a loop
__m256i ones = _mm256_set1_epi64x(-1);
size_t i = 0;
for (; i < (num_uint64)-4; i += 4) {
    _mm256_store_si256((__m256i*) & x[i], ones);
}

// Handle remaining bits
_mm256_maskstore_epi64((long long int*) & x[i], _mm256_setr_epi64x(-1, -1, -1, -1), _mm256_setr_epi64x(0x01FFFFFFFF, 0, 0, 0));

richter__dan · ‎12-03-2024

What about this code generated by ChatGPT 4o ? Can someone test the speed of squaring and MOD before I install MKL library in my Visual Studio 2022?

#include <iostream>
#include <immintrin.h>
#include <cstdint>
#include <windows.h>   // For VirtualAlloc and VirtualFree
#include <oneapi/mkl.hpp>

// Constants and definitions
constexpr size_t GB = 3;
constexpr size_t giga = 1024 * 1024 * 1024;
size_t num_bits = 136279841;
size_t num_uint64 = ((num_bits + 255) / 256) * 4; // Number of uint64_t elements

void print_bigint(const uint64_t* num, size_t len) {
    for (size_t i = len; i > 0; --i) {
        printf("%016llx", num[i-1]);
    }
    printf("\n");
}

int main() {
	// Allocate 3GB memory
	static const size_t size = GB * giga;
	uint64_t* ARRAY = static_cast<uint64_t*>(VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE));

	// Initialize the large number in POLE (example initialization)
	uint64_t* x = ARRAY;
	uint64_t* result = ARRAY + num_uint64;
	uint64_t* tmp = ARRAY + 3 * num_uint64;

	// Store the number 2^136279841 - 1 using _mm256_maskstore_epi64 in a loop
	__m256i ones = _mm256_set1_epi64x(-1);
	size_t i = 0;
	for (; i < (num_uint64)-4; i += 4) {
	    _mm256_store_si256((__m256i*) & x[i], ones);
	}

	// Handle remaining bits
	_mm256_maskstore_epi64((long long int*) & x[i], _mm256_setr_epi64x(-1, -1, -1, -1), _mm256_setr_epi64x(0x01FFFFFFFF, 0, 0, 0));

    try {
        // Create a oneAPI queue
        sycl::queue queue{sycl::default_selector{}};
        
        // Using MKL to square the number x
        oneapi::mkl::vm::mul(queue, num_uint64, x, x, result).wait();

        // Subtract 2 from result
        result[0] -= 2;

        // Compute result mod x
        uint64_t* mod_result = tmp;
        oneapi::mkl::vm::mod(queue, num_uint64, result, x, mod_result).wait();

        // Print the result
        std::cout << "Result mod x: ";
        print_bigint(mod_result, num_uint64);

    } catch (const sycl::exception& e) {
        std::cerr << "SYCL exception: " << e.what() << std::endl;
        return 1;
    } catch (const std::exception& e) {
        std::cerr << "Exception: " << e.what() << std::endl;
        return 1;
    }

    VirtualFree(ARRAY, 0, MEM_RELEASE);
    return 0;
}

MKL and arithmetic with 2^136279841 - 1

Performance