Question

I'm trying to use the inline asm directive of gcc/g++ (I have to say I've been using the Intel syntax on MSVC previously and that was a breeze). I'm playing around with double values and the following my_func2 seems to crash after execution:

  #include <iostream>

  void my_func(const double *in, double *out) {  // Adds 128 bits (two packed doubles) read at `in` to themselves and writes the 128-bit sum at `out`.
    asm("mov %0, %%r8" : : "r"(in));             // r8 = in; note that r8 is not listed as a clobber of this statement
    asm("movupd (%%r8), %%xmm0" :);              // xmm0 = 16 bytes read from [r8] (unaligned load)
    asm("movupd (%%r8), %%xmm1" :);              // xmm1 = the same 16 bytes
    asm("addpd %%xmm1, %%xmm0" :);               // xmm0 += xmm1, lane by lane
    asm("movupd %%xmm0, (%0)" : : "r"(out) : "%r8", "%xmm0", "%xmm1");  // stores 16 bytes at [out]; the clobbers are declared only on this last statement
  }

  double my_func2(const double *in) {  // Same packed addition as my_func, but the result goes through the local `ret` and is returned.
    double  ret = 0.0;

    asm("mov %0, %%r8" : : "r"(in));             // r8 = in; r8 is not listed as a clobber of this statement
    asm("movupd (%%r8), %%xmm0" :);              // xmm0 = 16 bytes read from [r8] (unaligned load)
    asm("movupd (%%r8), %%xmm1" :);              // xmm1 = the same 16 bytes
    asm("addpd %%xmm1, %%xmm0" :);               // xmm0 += xmm1, lane by lane
    asm("movupd %%xmm0, %0" : "=m"(ret) : : "memory", "%r8", "%xmm0", "%xmm1");  // stores all 16 bytes of xmm0 at &ret, although ret is a single 8-byte double

    return ret;
 }

  int main(int argc, char *argv[]) {  // Runs both asm helpers on one double and prints the result of each.
    const double    a = 1.0;  // the only input value; both helpers receive &a
    double      b = 0.0;      // receives the result of each call in turn
    my_func(&a, &b);
    std::cout << "b:" << b << std::endl;
    b = my_func2(&a);  // the line on which gdb reports the SIGBUS
    std::cout << "b:" << b << std::endl;
  }

The error I get is specifically (when I'm running with gdb):

Program received signal SIGBUS, Bus error.
0x00000000004008e1 in main (argc=<error reading variable: Cannot access memory at address 0x400fffffffffffec>, 
    argv=<error reading variable: Cannot access memory at address 0x400fffffffffffe0>) at asm_test.cpp:28
28      b = my_func2(&a);

What am I doing wrong? In the last line of my_func2 I've specified that memory is clobbered too, so I don't understand why it crashes. Where can I find a good guide on how to use the infamous AT&T syntax?
I compile with: g++ -g -o asm_test asm_test.cpp, g++ version g++ (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3 on Ubuntu Linux scv 3.2.0-48-generic #74-Ubuntu SMP Thu Jun 6 19:43:26 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux.

I've found http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html and http://www.delorie.com/djgpp/doc/brennan/brennan_att_inline_djgpp.html , is there something more you would recommend?

Thanks,
Ema

Was it helpful?

Solution

The mistake here is that one has to be careful when using movupd. This instruction always copies 128 bits (two doubles) of memory, both when loading and when storing.

By chance the first function gets away with writing these 128 bits out, but in the second one the destination is the ret variable, which has room for only 64 bits. As expected, this corrupts the stack and results in undefined behaviour.
Substituting movupd with movlpd (or movhpd), which move only 64 bits, makes things work like a charm.

Am I still clobbering the right registers?

Following code works just fine when compiled with g++ -O3 -o asm_test asm_test.cpp

  // Stores (*in + *in) into *out.
  //
  // The whole sequence is ONE asm statement. GCC does not promise that a
  // register written by one asm statement still holds that value in the next
  // one, so chaining separate statements through r8/xmm0/xmm1 (and declaring
  // the clobbers only on the last statement) works by luck at best. Passing
  // *in and *out as memory operands lets the compiler pick the addressing
  // registers, so no fixed scratch register such as r8 is needed, and the
  // "=m" output tells the compiler exactly which object is written.
  //
  // movhpd moves exactly 64 bits between memory and the HIGH lane of an xmm
  // register. The low lanes are never loaded, so the low half of the addpd
  // result is meaningless and is deliberately not stored.
  void my_func(const double *in, double *out) {
    asm("movhpd %1, %%xmm0\n\t"       // high lane of xmm0 = *in
        "movhpd %1, %%xmm1\n\t"       // high lane of xmm1 = *in
        "addpd %%xmm1, %%xmm0\n\t"    // xmm0 += xmm1 (both lanes)
        "movhpd %%xmm0, %0"           // *out = high lane of xmm0
        : "=m"(*out)
        : "m"(*in)
        : "xmm0", "xmm1");
  }

  // Returns (*in + *in).
  //
  // The whole sequence is ONE asm statement. GCC does not promise that a
  // register written by one asm statement still holds that value in the next
  // one, so chaining separate statements through r8/xmm0/xmm1 works by luck
  // at best. *in is passed as a memory operand, so the compiler picks the
  // addressing register and no fixed scratch register such as r8 is needed.
  //
  // movlpd moves exactly 64 bits between memory and the LOW lane of an xmm
  // register, so the store touches only the 8 bytes that `ret` owns. The
  // high lanes are never loaded and their sum is not stored.
  double my_func2(const double *in) {
    double ret;

    asm("movlpd %1, %%xmm0\n\t"       // low lane of xmm0 = *in
        "movlpd %1, %%xmm1\n\t"       // low lane of xmm1 = *in
        "addpd %%xmm1, %%xmm0\n\t"    // xmm0 += xmm1 (both lanes)
        "movlpd %%xmm0, %0"           // ret = low lane of xmm0
        : "=m"(ret)
        : "m"(*in)
        : "xmm0", "xmm1");

    return ret;
  }

OTHER TIPS

gcc inline assembly doesn't particularly like it if you have separate lines of asm() statements that are not actually independent. You'd better code the above like:

#include <emmintrin.h> // SSE2 header that defines __m128d
#include <xmmintrin.h>

// Computes out[i] = in[i] + in[i] for i = 0 and 1 using packed SSE2 adds.
//
// Both buffers are handed to the asm as __m128d objects, i.e. 16 bytes each.
// The "xm" constraints let the compiler supply each operand either as an
// SSE register or as a memory reference; movupd accepts both forms. The
// general-purpose-register alternative ("r") must not be offered here,
// because movupd cannot encode a general-purpose register operand.
//
// NOTE: __m128d is a 16-byte-aligned type. If the compiler chooses the
// register alternative it performs the load/store of *in / *out itself, so
// callers should pass 16-byte-aligned buffers.
static  void my_func(const double *in, double *out) {
    asm("movupd %1, %%xmm0\n\t"      // xmm0 = in[0..1]
        "movupd %1, %%xmm1\n\t"      // xmm1 = in[0..1]
        "addpd %%xmm1, %%xmm0\n\t"   // xmm0 += xmm1, lane by lane
        "movupd %%xmm0, %0"          // out[0..1] = xmm0
        : "=xm"(*(__m128d*)out)
        : "xm"(*(const __m128d*)in)
        : "xmm0", "xmm1");
}

// Returns in[0] + in[0], computed with a packed SSE2 add.
//
// The input is handed to the asm as a __m128d object, so 16 bytes starting
// at `in` are read even though only the low lane of the sum is returned.
// "xm" lets the compiler supply it as an SSE register or as memory (movupd
// accepts both); a general-purpose register ("r") is not a valid movupd
// operand and is therefore not offered.
//
// The result must be a memory operand ("=m"): movlpd only exists in
// register<->memory forms, so a register destination could not be assembled.
//
// NOTE: __m128d is a 16-byte-aligned type; if the compiler picks the
// register alternative for the input it performs the load itself, so callers
// should pass a 16-byte-aligned buffer.
static double my_func2(const double *in) {
    double ret;
    asm("movupd %1, %%xmm0\n\t"      // xmm0 = in[0..1]
        "movupd %1, %%xmm1\n\t"      // xmm1 = in[0..1]
        "addpd %%xmm1, %%xmm0\n\t"   // xmm0 += xmm1, lane by lane
        "movlpd %%xmm0, %0"          // ret = low lane of xmm0 (8 bytes)
        : "=m"(ret)
        : "xm"(*(const __m128d*)in)
        : "xmm0", "xmm1");
    return ret;
}

because this lets the compiler choose where to put things (mem or reg). For your source, this inlines the following two blocks into main():

  1c:   66 0f 10 44 24 10       movupd 0x10(%rsp),%xmm0
  22:   66 0f 10 4c 24 10       movupd 0x10(%rsp),%xmm1
  28:   66 0f 58 c1             addpd  %xmm1,%xmm0
  2c:   66 0f 11 44 24 20       movupd %xmm0,0x20(%rsp)
[ ... ]
  63:   66 0f 10 44 24 10       movupd 0x10(%rsp),%xmm0
  69:   66 0f 10 4c 24 10       movupd 0x10(%rsp),%xmm1
  6f:   66 0f 58 c1             addpd  %xmm1,%xmm0
  73:   66 0f 13 44 24 08       movlpd %xmm0,0x8(%rsp)

This is _not_ optimal, though ... if you change it to:

// Computes out[i] = in[i] + in[i] for i = 0 and 1, leaving register
// allocation to the compiler.
//
// The output operand is the __m128d OBJECT that `out` points to (note the
// dereference); an asm output has to be an lvalue, so the bare cast pointer
// cannot be used. It is constrained to an SSE register ("=x") because addpd
// can only write to a register, never to memory.
//
// No early-clobber is needed: if the compiler assigns %0 and %1 to the same
// register, movapd becomes a self-move and addpd still yields in + in.
//
// NOTE: __m128d is a 16-byte-aligned type and the compiler emits the load of
// *in and the store to *out itself, so both buffers should be 16-byte aligned.
static  void my_func(const double *in, double *out) {
    asm volatile("movapd %1, %0\n\t"   // %0 = in[0..1]
                 "addpd %1, %0"        // %0 += in[0..1], lane by lane
                 : "=x"(*(__m128d*)out)
                 : "x"(*(const __m128d*)in));
}

you leave it to the compiler where to put the variables. The compiler detects that it can get away with not doing loads/stores at all ... as this gets inlined simply as:

  18:   66 0f 28 c1             movapd %xmm1,%xmm0
  1c:   66 0f 58 c1             addpd  %xmm1,%xmm0
since the compiler recognizes it's got all variables in registers / wants all returns in registers.

Although it's not at all necessary to do this using assembly; with a decent compiler (your gcc will do) the plain C/C++ version,

// Plain C/C++ equivalent of the asm versions: each of the two output
// elements receives the matching input element added to itself.
static void my_func(const double *in, double *out) {
    for (int lane = 0; lane < 2; ++lane) {
        const double value = in[lane];
        out[lane] = value + value;
    }
}

is most likely going to be turned into no less efficient code.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top