Question

I wrote a very nice integer library for large integers, but it is limited to 512 bits (it is faster than GMP for various reasons). I am trying to generalize the library to arbitrary sizes, so I have to loop over an adcq instruction.

// long addition in little-endian order, because of the incq/jnz technique
// I cannot use a compare because it destroys the carry bit
template<int n>
void test_add(boost::uint64_t*, boost::uint64_t* ){    
    asm volatile (
        "clc                                     \n"
        "movq %0, %%rcx                          \n"
    "loop:                                       \n"
        "movq 8(%%rsi,%%rcx,8), %%rax            \n"  /* original -8(%%rsi,%%rbx,8) */
        "adcq %%rax           , 8(%%rdi,%%rcx,8) \n"  /* original -8(%%rsi,%%rbx,8) */
        "incq %%rcx                              \n"  /* original decq */
    "jnz loop                                    \n"
        :   
        :"g"(n)
        :"rax","rcx","cc","memory"
    );  
}


int main(int argc, char* argv[]) {
boost::uint64_t c[4],d[4];

c[0] = -1; 
c[1] = -1; 
c[2] = -1; 
c[3] =  0;  

d[0] = 1;
d[1] = 0;
d[2] = 0;
d[3] = 0;

test_add<-4>(&d[3],&c[3]); // <-- BigEndian to LittleEndian

This works well in debug mode (-O0), but as soon as I turn on optimization it segfaults.

I do not really understand why, because I respect the ABI for rsi and rdi, I clobber the registers I use, and I use the right registers. So I compiled with GCC -O0 -S and -O2 -S.

For -O0 -S I get

 3 .globl main
 4         .type   main, @function
 5 main:
 6 .LFB1:
 7         .cfi_startproc
 8         .cfi_personality 0x3,__gxx_personality_v0
 9         pushq   %rbp
 10         .cfi_def_cfa_offset 16
 11         .cfi_offset 6, -16
 12         movq    %rsp, %rbp
 13         .cfi_def_cfa_register 6
 14         subq    $80, %rsp
 15         movl    %edi, -68(%rbp)
 16         movq    %rsi, -80(%rbp)
 17         movq    $-1, -32(%rbp)
 18         movq    $-1, -24(%rbp)
 19         movq    $-1, -16(%rbp)
 20         movq    $0, -8(%rbp)
 21         movq    $1, -64(%rbp)
 22         movq    $0, -56(%rbp)
 23         movq    $0, -48(%rbp)
 24         movq    $0, -40(%rbp)
 25         leaq    -32(%rbp), %rax
 26         leaq    24(%rax), %rdx
 27         leaq    -64(%rbp), %rax
 28         addq    $24, %rax
 29         movq    %rdx, %rsi
 30         movq    %rax, %rdi
 31         call    _Z8test_addILin4EEvPyS0_
 32         movl    $0, %eax
 33         leave
 34         .cfi_def_cfa 7, 8
 35         ret
 36         .cfi_endproc
 37 .LFE1:
 38         .size   main, .-main
 39         .section        .text._Z8test_addILin4EEvPyS0_,"axG",@progbits,_Z8test_addILin4EEvPyS0_,comdat
 40         .weak   _Z8test_addILin4EEvPyS0_
 41         .type   _Z8test_addILin4EEvPyS0_, @function
 42 _Z8test_addILin4EEvPyS0_:
 43 .LFB2:
 44         .cfi_startproc
 45         .cfi_personality 0x3,__gxx_personality_v0
 46         pushq   %rbp
 47         .cfi_def_cfa_offset 16
 48         .cfi_offset 6, -16
 49         movq    %rsp, %rbp
 50         .cfi_def_cfa_register 6
 51         movq    %rdi, -8(%rbp)
 52         movq    %rsi, -16(%rbp)
 53 #APP
 54 # 14 "test.cpp" 1
 55         clc
 56 movq $-4, %rcx
 57 loop:
 58 movq 8(%rsi,%rcx,8), %rax
 59 adcq %rax           , 8(%rdi,%rcx,8)
 60 incq %rcx
 61 jnz loop
 62 
 63 # 0 "" 2
 64 #NO_APP
 65         leave
 66         .cfi_def_cfa 7, 8
 67         ret
 68         .cfi_endproc
 69 .LFE2:
 70         .size   _Z8test_addILin4EEvPyS0_, .-_Z8test_addILin4EEvPyS0_
 71         .ident  "GCC: (GNU) 4.4.6 20120305 (Red Hat 4.4.6-4)"
 72         .section        .note.GNU-stack,"",@progbits

At lines 20-30 we see the compiler set up the stack and pass the arguments in rsi and rdi (lines 29-30), followed by the call. Perfect, exactly as the ABI prescribes.

If I now look at the optimized version, I get

  1         .file   "test.cpp"
  2         .text
  3         .p2align 4,,15
  4 .globl main
  5         .type   main, @function
  6 main:
  7 .LFB1:
  8         .cfi_startproc
  9         .cfi_personality 0x3,__gxx_personality_v0
  10 #APP
  11 # 14 "test.cpp" 1
  12         clc
  13 movq $-4, %rcx
  14 loop:
  15 movq 8(%rsi,%rcx,8), %rax
  16 adcq %rax           , 8(%rdi,%rcx,8)
  17 incq %rcx
  18 jnz loop
  19 
  20 # 0 "" 2
  21 #NO_APP
  22         xorl    %eax, %eax
  23         ret
  24         .cfi_endproc
  25 .LFE1:
  26         .size   main, .-main
  27         .ident  "GCC: (GNU) 4.4.6 20120305 (Red Hat 4.4.6-4)"
  28         .section        .note.GNU-stack,"",@progbits

Well, bye bye ABI; I do not understand. What is managing the stack now?

Does an ASM guru have an idea? I refuse to move the function into a separate file; that would go against the metaprogramming spirit.

Cheers.

------- Edit :

I found a bug with your solution when I call it inside a loop:

#include <boost/cstdint.hpp> //boost type

template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t const* y) {
    boost::uint64_t dummy;
    boost::uint64_t loop_index(n);
    __asm__ __volatile__ (
        "clc\n\t"
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"
        "incq %[counter]\n\t"
        "jnz 1b\n\t"
        : [dummy] "=&r" (dummy)
        : [x] "r" (x), [y] "r" (y), [counter] "r" (loop_index)
        : "memory", "cc");
 }


int main(int argc, char* argv[]) {
    boost::uint64_t c[4],d[4];

    c[0] = -1; 
    c[1] = -1; 
    c[2] = -1; 
    c[3] =  0;  

    d[0] = 1;
    d[1] = 0;
    d[2] = 0;
    d[3] = 0;

    for(int i=0; i < 0xfff; ++i)
        test_add<-4>(&c[4],&d[4]);

 return 0;

}

This will give the following ASM:

      movq    $-4, %rdx <---------------------template parameter
      leaq    -32(%rsp), %rcx
      movq    $-1, -32(%rsp)
      movq    $-1, -24(%rsp)
      movq    $-1, -16(%rsp)
      movq    $0, -8(%rsp)
      movq    $1, -64(%rsp)
      movq    $0, -56(%rsp)
      movq    $0, -48(%rsp)
      movq    $0, -40(%rsp)
      .p2align 4,,10
      .p2align 3
  .L2: <-------- OUTER loop (the for loop in main)
#APP
# 16 "main.cpp" 1
       clc
       1: <-------- INNER loop (inside the asm block)
       movq (%rcx,%rdx,8), %rsi
       adcq %rsi, (%rsp, %rdx, 8)
       incq %rdx <------------ rdx++ -> (-4)++ (for the 2nd iteration of .L2 it is not reset to -4)
       jnz 1b

 # 0 "" 2
 #NO_APP
       addl    $1, %eax
       cmpl    $4095, %eax <----- outer-loop test
       jne     .L2

On the second iteration of the outer loop, rdx is not reset to -4, so the movq instruction reads out of bounds and the program segfaults. I patched it very crudely (I reset the counter by hand to -4): I just added "movq $-4, %[counter]\n\t" after the jnz, but I need something more general. Is there a constraint that resets the counter to the template parameter value?

At present the correction is:

template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t const* y) {
    boost::uint64_t dummy;
    __asm__ __volatile__ (
        "clc\n\t"
        "movq %[counter_value], %[counter]\n\t" // set the counter to the template value, it's not sure if the function is reused
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"
        "incq %[counter]\n\t"
        "jnz 1b\n\t"
        : [dummy] "=&r" (dummy)
        : [x] "r" (x), [y] "r" (y), [counter] "r" (n), [counter_value] "i" (n)
        : "memory", "cc");
}
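
To the question about a constraint: a more general way (a sketch added here for illustration, not part of the original exchange) is to declare the counter as a read-write operand with "+r" on a local variable initialized to n. gcc then knows the asm block modifies that register and reloads the value before every call, so no explicit movq reset is needed:

template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t const* y) {
    boost::uint64_t dummy;
    boost::uint64_t counter = n;   // reinitialized on every call
    __asm__ __volatile__ (
        "clc\n\t"
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"
        "adcq %[dummy], (%[x],%[counter],8)\n\t"
        "incq %[counter]\n\t"
        "jnz 1b\n\t"
        : [dummy] "=&r" (dummy), [counter] "+r" (counter) // "+r": the asm both reads and writes it
        : [x] "r" (x), [y] "r" (y)
        : "memory", "cc");
}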

Solution

You should use constraints to access the arguments. gcc is not required to follow the ABI for internal functions, and even if it does, it is not required to keep the initial state intact by the time your asm block executes. Of course the point of inline asm is to have the compiler inline it, and then no function call takes place at all. (Many people wrongly think inline means "embedded in the C source file" and use it as a convenience feature even when no actual code inlining is required.)

gcc is also quite capable of putting things in the register you want them in (not that you particularly care about the counter being in rcx here). It is also generally a good idea to leave as much to the compiler as possible, so that it can do register allocation, loop unrolling and other optimizations. Unfortunately I could not get gcc to generate ADC, so the asm block stays, this time. Using inc is not recommended either, due to its partial flags update, but I do not see an obvious way around that right now.

Finally, if you pass the address of d[3] you will access items d[-1] through d[2], which is not what you want. You should pass &d[4].

A fixed version could look like this (with named arguments):

template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t* y) {
    boost::uint64_t dummy, dummy2;
    __asm__ __volatile__ (
        "clc\n\t"
        "1:\n\t"
        "movq (%[y], %[counter], 8), %[dummy]\n\t"
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"
        "incq %[counter]\n\t"
        "jnz 1b\n\t"
        : [dummy] "=&r" (dummy), "=r" (dummy2)
        : [x] "r" (x), [y] "r" (y), [counter] "1" (n)
        : "memory", "cc");
}

Note that the dummy variable will be optimized away while allowing gcc to pick a suitable register instead of forcing it to use a particular one.
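
On the inc remark above: one possible workaround (a sketch under the assumption that pinning the counter to rcx is acceptable; it is not part of the original answer) is to step the counter with lea, which writes no flags at all, and to test it with jrcxz, which reads rcx rather than the flags, so the carry chain is never disturbed:

template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t* y) {
    boost::uint64_t dummy;
    boost::uint64_t counter = n;
    __asm__ __volatile__ (
        "clc\n\t"
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"
        "adcq %[dummy], (%[x],%[counter],8)\n\t"
        "leaq 1(%[counter]), %[counter]\n\t"  // increment without touching the flags
        "jrcxz 2f\n\t"                        // leave the loop when the counter hits zero
        "jmp 1b\n\t"
        "2:\n\t"
        : [dummy] "=&r" (dummy), [counter] "+c" (counter) // "c" pins the counter to rcx for jrcxz
        : [x] "r" (x), [y] "r" (y)
        : "memory", "cc");
}

Whether this beats the inc version depends on the microarchitecture, so measure before adopting it.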


Update: here is a pure C++ version that the compiler can fully unroll and otherwise optimize (including calculating things at compile time!). While in the generic case the compiler's code is not as efficient as the hand-written one, the mentioned optimizations may make it better under some circumstances. Note: since you are using gcc inline asm, your code is already gcc and x86-64 specific, so using __uint128_t is not a further restriction (in fact this will work on any architecture where gcc supports 128-bit integers).

template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t* y) {
    __uint128_t work = 0;
    for(long i = n; i < 0; i += 1) {
        work = work + x[i] + y[i];
        x[i] = work; // automatic truncation
        work >>= 64;
    }
}
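
For completeness, a small usage sketch (assuming the test_add template above is in scope; the array values and expected output are my own illustration, not from the original post). It uses the one-past-the-end pointers discussed earlier; adding 1 to a value whose three low limbs are all ones should ripple the carry into the top limb:

#include <boost/cstdint.hpp>
#include <cstdio>

int main() {
    boost::uint64_t x[4] = { boost::uint64_t(-1), boost::uint64_t(-1), boost::uint64_t(-1), 0 };
    boost::uint64_t y[4] = { 1, 0, 0, 0 };

    test_add<-4>(&x[4], &y[4]); // one past the end; the indices run from -4 to -1

    // expected output: 0 0 0 1
    std::printf("%llu %llu %llu %llu\n",
                (unsigned long long)x[0], (unsigned long long)x[1],
                (unsigned long long)x[2], (unsigned long long)x[3]);
    return 0;
}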
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow