Having no inline assembly kind of sucks.
Anyway, the function call overhead is actually extremely small. Parameters are passed in volatile registers and no cleanup is needed.
I don't have an assembler, and x64 targets don't support __asm, so I had no choice but to "assemble" my function from opcodes myself.
Obviously it depends on . I'm using mpir (gmp) as a reference to show the function produces correct results.
#include "stdafx.h"
// mulmod64(a, b, m) == (a * b) % m
typedef uint64_t(__cdecl *mulmod64_fnptr_t)(uint64_t a, uint64_t b, uint64_t m);
uint8_t mulmod64_opcodes[] = {
0x48, 0x89, 0xC8, // mov rax, rcx
0x48, 0xF7, 0xE2, // mul rdx
0x4C, 0x89, 0xC1, // mov rcx, r8
0x48, 0xF7, 0xF1, // div rcx
0x48, 0x89, 0xD0, // mov rax,rdx
0xC3 // ret
};
mulmod64_fnptr_t mulmod64_fnptr;
void init() {
DWORD dwOldProtect;
VirtualProtect(
&mulmod64_opcodes,
sizeof(mulmod64_opcodes),
PAGE_EXECUTE_READWRITE,
&dwOldProtect);
// NOTE: reinterpret byte array as a function pointer
mulmod64_fnptr = (mulmod64_fnptr_t)(void*)mulmod64_opcodes;
}
int main() {
init();
uint64_t a64 = 2139018971924123ull;
uint64_t b64 = 1239485798578921ull;
uint64_t m64 = 8975489368910167ull;
// reference code
mpz_t a, b, c, m, r;
mpz_inits(a, b, c, m, r, NULL);
mpz_set_ui(a, a64);
mpz_set_ui(b, b64);
mpz_set_ui(m, m64);
mpz_mul(c, a, b);
mpz_mod(r, c, m);
gmp_printf("(%Zd * %Zd) mod %Zd = %Zd\n", a, b, m, r);
// using mulmod64
uint64_t r64 = mulmod64_fnptr(a64, b64, m64);
printf("(%llu * %llu) mod %llu = %llu\n", a64, b64, m64, r64);
return 0;
}