Why will GCC copy word into the return register but not byte?

Question 1

Since you multiply a uchar x and a uint y in the function foo, the compiler must first promote the uchar x to int, which is exactly what the movzbl instruction does.

See the explanation of movz instructions here.

Afterwards, I recompiled your code with gcc 4.6.1 using the -O3 option and got the following assembly:

foo:
.LFB34:
    .cfi_startproc
    movzbl  (%rdi), %eax
    imull   %esi, %eax
    ret 
    .cfi_endproc

bar:
.LFB35:
    .cfi_startproc
    movl    (%rdi), %eax
    imull   %esi, %eax
    ret 
    .cfi_endproc

It doesn't use %edx any more.

Question 2

The short answer

Why will GCC copy word into the return register but not byte?

Because you asked it to return a word not a byte. The compilers did what they were asked based on your code. You asked for a size promotion in one case and unsigned to signed in both cases. There was more than one way to do that and clang/llvm and gcc happened to vary.

Is there a logical reason GCC (4.4.7) is not moving the byte from a structure into %eax directly, or is it just an optimization oversight?

I think based on what we see in the current compilers it was an oversight. See generated code below. (-O2 used in each case).

Interesting experiments related to this question.

clang

0000000000000000 <foo>:
   0:   0f b6 07                movzbl (%rdi),%eax
   3:   0f af c6                imul   %esi,%eax
   6:   c3                      retq   

0000000000000010 <bar>:
  10:   0f af 37                imul   (%rdi),%esi
  13:   89 f0                   mov    %esi,%eax
  15:   c3                      retq

gcc

0000000000000000 <foo>:
   0:   0f b6 07                movzbl (%rdi),%eax
   3:   0f af c6                imul   %esi,%eax
   6:   c3                      retq   

0000000000000010 <bar>:
  10:   8b 07                   mov    (%rdi),%eax
  12:   0f af c6                imul   %esi,%eax
  15:   c3                      retq

They both generated proper code. The tiny difference in the number of bytes of instruction could have really gone either way with these small functions on this instruction set.

Your compiler at the time must not have seen that optimization for some reason.

mips:

00000000 <foo>:
   0:   90820000    lbu v0,0(a0)
   4:   00000000    nop
   8:   00450018    mult    v0,a1
   c:   00001012    mflo    v0
  10:   03e00008    jr  ra
  14:   00000000    nop

00000018 <bar>:
  18:   8c820000    lw  v0,0(a0)
  1c:   00000000    nop
  20:   00a20018    mult    a1,v0
  24:   00001012    mflo    v0
  28:   03e00008    jr  ra
  2c:   00000000    nop

arm

00000000 <foo>:
   0:   e5d00000    ldrb    r0, [r0]
   4:   e0000091    mul r0, r1, r0
   8:   e12fff1e    bx  lr

0000000c <bar>:
   c:   e5900000    ldr r0, [r0]
  10:   e0000091    mul r0, r1, r0
  14:   e12fff1e    bx  lr

No big surprise there: as on x86, the difference is in the load and in how it deals with the other 24 bits. Then, as the code says, it promotes the unsigned char or int to a signed integer, multiplies, and returns a signed integer.

Another equally interesting example to complement your question.

struct foo { unsigned char x; };
struct bar { unsigned int x; };

char foo (const struct foo *x, char y) { return x->x * y; }
char bar (const struct bar *x, char y) { return x->x * y; }

clang

0000000000000000 <foo>:
   0:   8a 07                   mov    (%rdi),%al
   2:   40 f6 e6                mul    %sil
   5:   0f be c0                movsbl %al,%eax
   8:   c3                      retq   

0000000000000010 <bar>:
  10:   0f af 37                imul   (%rdi),%esi
  13:   40 0f be c6             movsbl %sil,%eax
  17:   c3                      retq

gcc

0000000000000000 <foo>:
   0:   89 f0                   mov    %esi,%eax
   2:   f6 27                   mulb   (%rdi)
   4:   c3                      retq   

0000000000000010 <bar>:
  10:   89 f0                   mov    %esi,%eax
  12:   f6 27                   mulb   (%rdi)
  14:   c3                      retq

gcc arm

00000000 <foo>:
   0:   e5d00000    ldrb    r0, [r0]
   4:   e0010190    mul r1, r0, r1
   8:   e20100ff    and r0, r1, #255    ; 0xff
   c:   e12fff1e    bx  lr

00000010 <bar>:
  10:   e5900000    ldr r0, [r0]
  14:   e0010190    mul r1, r0, r1
  18:   e20100ff    and r0, r1, #255    ; 0xff
  1c:   e12fff1e    bx  lr

mips

00000000 <foo>:
   0:   90820000    lbu v0,0(a0)
   4:   00052e00    sll a1,a1,0x18
   8:   00052e03    sra a1,a1,0x18
   c:   00a20018    mult    a1,v0
  10:   00001012    mflo    v0
  14:   00021600    sll v0,v0,0x18
  18:   03e00008    jr  ra
  1c:   00021603    sra v0,v0,0x18

00000020 <bar>:
  20:   8c820000    lw  v0,0(a0)
  24:   00052e00    sll a1,a1,0x18
  28:   00052e03    sra a1,a1,0x18
  2c:   00a20018    mult    a1,v0
  30:   00001012    mflo    v0
  34:   00021600    sll v0,v0,0x18
  38:   03e00008    jr  ra
  3c:   00021603    sra v0,v0,0x18

That code in particular punished mips.

and lastly

struct foo { unsigned char x; };
struct bar { unsigned int x; };

unsigned char foo (const struct foo *x, unsigned char y) { return x->x * y; }
unsigned char bar (const struct bar *x, unsigned char y) { return x->x * y; }

gcc and clang for x86 produce the same code as above (the version with plain chars, i.e. without an explicit signed/unsigned qualifier), but

arm

00000000 <foo>:
   0:   e5d00000    ldrb    r0, [r0]
   4:   e0010190    mul r1, r0, r1
   8:   e20100ff    and r0, r1, #255    ; 0xff
   c:   e12fff1e    bx  lr

00000010 <bar>:
  10:   e5900000    ldr r0, [r0]
  14:   e0010190    mul r1, r0, r1
  18:   e20100ff    and r0, r1, #255    ; 0xff
  1c:   e12fff1e    bx  lr

mips

00000000 <foo>:
   0:   90820000    lbu v0,0(a0)
   4:   30a500ff    andi    a1,a1,0xff
   8:   00a20018    mult    a1,v0
   c:   00001012    mflo    v0
  10:   03e00008    jr  ra
  14:   304200ff    andi    v0,v0,0xff

00000018 <bar>:
  18:   8c820000    lw  v0,0(a0)
  1c:   30a500ff    andi    a1,a1,0xff
  20:   00a20018    mult    a1,v0
  24:   00001012    mflo    v0
  28:   03e00008    jr  ra
  2c:   304200ff    andi    v0,v0,0xff

Masking is needed because of a combination of calling convention and instruction set. It is a punishment on both of these instruction sets. You will see this often when using variables whose size does not match the register size on instruction sets like these. x86 has a much wider array of instruction choices; the cost for x86 is the power (watts) that the additional logic consumes.

For grins, even if you go way, way back, the register-sized choice is slightly cheaper.

00000000 <_foo>:
   0:   1166            mov r5, -(sp)
   2:   1185            mov sp, r5
   4:   9f40 0004       movb    *4(r5), r0
   8:   45c0 ff00       bic $-400, r0
   c:   1001            mov r0, r1
   e:   7075 0006       mul 6(r5), r1
  12:   1040            mov r1, r0
  14:   1585            mov (sp)+, r5
  16:   0087            rts pc

00000018 <_bar>:
  18:   1166            mov r5, -(sp)
  1a:   1185            mov sp, r5
  1c:   1d41 0006       mov 6(r5), r1
  20:   707d 0004       mul *4(r5), r1
  24:   1040            mov r1, r0
  26:   1585            mov (sp)+, r5
  28:   0087            rts pc

compiler versions

gcc --version
gcc (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

clang --version
clang version 3.4 (branches/release_34 201060)
Target: x86_64-unknown-linux-gnu
Thread model: posix

arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 4.8.2
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

mips-elf-gcc --version
mips-elf-gcc (GCC) 4.8.2
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

And identifying that last instruction set is left as an exercise for the reader; there is a bit of a clue in the disassembly...