未知GCC错误，在编制ARM NEON（严重）

https://stackoverflow.com/questions/3811148

26-09-2019
|

题

我有一个基于ARM NEON Cortex-A8处理器目标。我被利用NEON的优化我的代码。但是，当我编译我的代码，我得到这个奇怪的错误。不知道如何解决这个问题。

我试图编译我的主机上使用代码的Sourcery（PART2）以下代码（第1部分）。而我得到这个奇怪的错误（PART3）。难道我做错了什么吗？任何人都可以编译，看看是否他们也得到相同的编译错误？

奇怪部分，在代码，如果我注释掉代码的else if(step_size == 4)部分，则误差消失。但遗憾的是我的优化是不完整的有出，所以我必须拥有它。

起初我以为它的问题CodeSourcey编译器（我的主机上），所以我编我的目标直接（在Ubuntu上我的目标运行）的程序。我用gcc那里，我再次得到了同样的错误，当我注释掉else if(step_size == 4)一部分，那么错误就消失了。

帮助！

<强>第1部分

#include<stdio.h>
#include"arm_neon.h"

#define IMAGE_HEIGHT 480
#define IMAGE_WIDTH  640

float32_t integral_image[IMAGE_HEIGHT][IMAGE_WIDTH];

float32x4_t box_area_compute3(int, int , int , int , unsigned int , float);

inline int min(int, int);

int main()
{

 box_area_compute3(1, 1, 4, 4, 2, 0);

 return 0;
}

float32x4_t box_area_compute3(int row, int col, int num_rows, int num_cols, unsigned int step_size, float three)
{
 unsigned int height = IMAGE_HEIGHT;
 unsigned int width = IMAGE_WIDTH;

 int temp_row = row + num_rows;
 int temp_col = col + num_cols;

 int r1 = (min(row, height))- 1 ;
 int r2 = (min(temp_row, height)) - 1;

 int c1 = (min(col, width)) - 1;
 int c2 = (min(temp_col, width)) - 1;

 float32x4_t v128_areas;

 if(step_size == 2)
 {
  float32x4x2_t top_left, top_right, bottom_left, bottom_right;
  top_left    = vld2q_f32((float32_t *)integral_image[r1] + c1);
  top_right   = vld2q_f32((float32_t *)integral_image[r1] + c2);
  bottom_left  = vld2q_f32((float32_t *)integral_image[r2] + c1);
  bottom_right  = vld2q_f32((float32_t *)integral_image[r2] + c2);

  v128_areas = vsubq_f32(vsubq_f32(vaddq_f32(top_left.val[0], bottom_right.val[0]), top_right.val[0]), bottom_left.val[0]);


 }
 else if(step_size == 4)
 {
  float32x4x4_t top_left, top_right, bottom_left, bottom_right;
  top_left   = vld4q_f32((float32_t *)integral_image[r1] + c1);
  top_right   = vld4q_f32((float32_t *)integral_image[r1] + c2);
  bottom_left  = vld4q_f32((float32_t *)integral_image[r2] + c1);
  bottom_right  = vld4q_f32((float32_t *)integral_image[r2] + c2);

  v128_areas = vsubq_f32(vsubq_f32(vaddq_f32(top_left.val[0], bottom_right.val[0]), top_right.val[0]), bottom_left.val[0]);

 }

 if(three == 3.0)
  v128_areas = vmulq_n_f32(v128_areas, three);

 return v128_areas;

}

inline int min(int X, int Y)
{
 return (X < Y ? X : Y);
}

<强>第2部分

arm-none-linux-gnueabi-gcc -O0 -g3 -Wall -c -fmessage-length=0 -fcommon -MMD -MP -MF"main.d" -MT"main.d" -mcpu=cortex-a8 -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -o"main.o" "../main.c"

<强>第3部分

../main.c: In function 'box_area_compute3':
../main.c:65: error: unable to find a register to spill in class 'GENERAL_REGS'
../main.c:65: error: this is the insn:
(insn 226 225 227 5 c:\program files\codesourcery\sourcery g++\bin\../lib/gcc/arm-none-linux-gnueabi/4.4.1/include/arm_neon.h:9863 (parallel [
           (set (reg:XI 148 [ D.17028 ])
               (unspec:XI [
                       (mem:XI (reg:SI 3 r3 [301]) [0 S64 A64])
                       (reg:XI 148 [ D.17028 ])
                       (unspec:V4SF [
                               (const_int 0 [0x0])
                           ] 191)
                   ] 111))
           (set (reg:SI 3 r3 [301])
               (plus:SI (reg:SI 3 r3 [301])
                   (const_int 32 [0x20])))
       ]) 1605 {neon_vld4qav4sf} (nil))
../main.c:65: confused by earlier errors, bailing out
cs-make: *** [main.o] Error 1

解决方案 2

嗯，我已经联系了代码的Sourcery关于这个问题，他们已经考虑到这种在GCC编译器的错误。所以我写了do_it4（）{} .....功能的组件，而不是采用德内部函数。现在，它的工作好！

其他提示

我不能对此进行测试，因为我没有工具链，但这种类型的错误通常可以通过重新措辞的代码一点点被合作周围。一般来说，这是不应该的，应该被报告为错误，但您使用的处理器特定的功能，这可能是较差的测试，比编译器的其余部分抛光。

由于它是一个寄存器溢出错误，你有几个指针参与我高度怀疑，编译器可能是想比它需要的是担心可能会有一些混淆回事更多的数据加载到寄存器（其中可能没有实际发生）。下面我将讨论的是，可能性以及做可以从编译器的角度来看减轻代码的复杂性（尽管它可能看起来不像是这种情况），一些其他的东西。

#include<stdio.h>
#include"arm_neon.h"

#define IMAGE_HEIGHT 480
#define IMAGE_WIDTH  640

float32_t integral_image[IMAGE_HEIGHT][IMAGE_WIDTH];

float32x4_t box_area_compute3(int, int , int , int , unsigned int , float);

inline int min(int, int);

int main()
{

 box_area_compute3(1, 1, 4, 4, 2, 0);

 return 0;
}

/* By putting these in separate functions the compiler will initially
 * think about them by themselves, without the complications of the
 * surrounding code.  This may give it the abiltiy to optimise the
 * code somewhat before trying to inline it.
 * This may also serve to make it more obvious to the compiler that
 * the local variables are dead after their use (since they are
 * dead after the call returns, and that the lifetimes of some variable
 * cannot actually overlap (hopefully reducing the register needs).
 */
static inline float32x4_t do_it2(float32_t *tl, float32_t *tr, float32_t *bl, float32_t * br) {
    float32x4x2_t top_left, top_right, bottom_left, bottom_right;
    float32x4_t A, B;

    top_left = vld2q_f32(tl);
    top_right = vld2q_f32(tr);
    bottom_left = vld2q_f32(bl);
    bottom_right = vld2q_f32(br);

    /* By spreading this across several statements I have created several
     * additional sequence points.  The compiler does not think that it
     * has to dereference all of the pointers before doing any of the
     * computations.... maybe. */
    A = vaddq_f32(*top_left.val, *bottom_right.val);
    B = vsubq_f32(A, *top_right.val);
    return vsubq_f32(B, *bottom_left);
}

static inline float32x4_t do_it4(float32_t *tl, float32_t *tr, float32_t *bl, float32_t * br) {
    float32x4x4_t top_left, top_right, bottom_left, bottom_right;
    float32x4_t A, B;

    top_left = vld4q_f32(tl);
    top_right = vld4q_f32(tr);
    bottom_left = vld4q_f32(bl);
    bottom_right = vld4q_f32(br);

    A = vaddq_f32(*top_left.val, *bottom_right.val);
    B = vsubq_f32(A, *top_right.val);
    return vsubq_f32(B, *bottom_left);
}

float32x4_t box_area_compute3(int row, int col, int num_rows, int num_cols, unsigned int step_size, float three)
{
 unsigned int height = IMAGE_HEIGHT;
 unsigned int width = IMAGE_WIDTH;

 int temp_row = row + num_rows;
 int temp_col = col + num_cols;

 int r1 = (min(row, height))- 1 ;
 int r2 = (min(temp_row, height)) - 1;

 int c1 = (min(col, width)) - 1;
 int c2 = (min(temp_col, width)) - 1;

 float32x4_t v128_areas;

     float32_t *tl = (float32_t *)integral_image[r1] + c1;
 float32_t *tr = (float32_t *)integral_image[r1] + c2;
 float32_t *bl = (float32_t *)integral_image[r2] + c1;
 float32_t *br = (float32_t *)integral_image[r2] + c2;


 switch (step_size) {
    case 2:
      v128_areas = do_it2(tl, tr, bl, br);
      break;

 case 4:
      v128_areas = do_it4(tl, tr, bl, br);
      break;
 }

 if(three == 3.0)
  v128_areas = vmulq_n_f32(v128_areas, three);

 return v128_areas;

}

inline int min(int X, int Y)
{
 return (X < Y ? X : Y);
}

我希望这可以帮助和我没有引入任何错误。

行：

float32x4x4_t top_left, top_right, bottom_left, bottom_right;

使用全部16 Q寄！这不是太奇怪，编译器无法处理这个问题。你也许可以通过重新编写有固定的这个使用较少的寄存器。

ARM NEON Cortex-A8的有vfpv3可使支持，皮质-A5具有vfpv4和neon2载体，（如用于：如果使用-mfloat-ABI =硬跳过在软件能力来模拟丢失的指示，所以可以不产生代码对于vfpv4会被优化，但会与软件仿真vfpv3可使运行）

许可以下： CC-BY-SA 和归因

不隶属于 StackOverflow