Here is a starting point. From here you can add cache preloads, loop unrolling, etc. Performance will be best when each iteration uses several NEON registers, so that consecutive loads and stores do not stall waiting on one another's data.
.equ    CAM_WIDTH,  640         @ pixels per row; the conversion loop consumes 8 per pass
.equ    CAM_HEIGHT, 480         @ rows per frame -- fill in the correct values for your camera
@-----------------------------------------------------------------------
@ void convert_yuyv_to_y(const void *src, char *dest);
@
@ Copies the Y (luma) bytes out of a packed YUYV image of
@ CAM_WIDTH x CAM_HEIGHT pixels, dropping the interleaved U/V bytes.
@
@ Target: 32-bit ARM with NEON, GNU as syntax, AAPCS calling convention.
@         NEON must be enabled when assembling (e.g. -mfpu=neon).
@ In:     r0 = src   packed YUYV, CAM_WIDTH * CAM_HEIGHT * 2 bytes are read
@         r1 = dest  CAM_WIDTH * CAM_HEIGHT bytes are written
@ Out:    nothing
@ Clobb:  r0-r3, d0, d1, flags (all caller-saved under AAPCS)
@ Stack:  none (leaf function)
@
@ Both buffers are walked contiguously; there is no per-row stride.
@-----------------------------------------------------------------------

@ The inner loop always moves 8 pixels, so a width that is not a multiple
@ of 8 would overrun every row. Reject it when assembling instead.
.if (CAM_WIDTH & 7)
.error "convert_yuyv_to_y: CAM_WIDTH must be a multiple of 8"
.endif
.ifle CAM_WIDTH
.error "convert_yuyv_to_y: CAM_WIDTH must be positive"
.endif
.ifle CAM_HEIGHT
.error "convert_yuyv_to_y: CAM_HEIGHT must be positive"
.endif

        .text
        .p2align 2
        .global convert_yuyv_to_y               @ make the symbol visible to the C linker
        .type   convert_yuyv_to_y, %function    @ lets the linker handle ARM/Thumb interworking
convert_yuyv_to_y:
        @ "ldr rX, =const" is used instead of "mov rX, #const" so that any
        @ dimension assembles; the assembler still emits a plain mov when
        @ the value fits an immediate.
        ldr     r2, =CAM_HEIGHT                 @ r2 = rows remaining
.Lyuyv_next_row:
        ldr     r3, =CAM_WIDTH                  @ r3 = pixels remaining in this row
.Lyuyv_next_8px:
        vld2.8  {d0, d1}, [r0]!                 @ de-interleave 16 bytes: d0 = 8 x Y, d1 = U/V
        vst1.8  {d0}, [r1]!                     @ store only the 8 luma bytes
        subs    r3, r3, #8                      @ 8 pixels consumed
        bgt     .Lyuyv_next_8px
        subs    r2, r2, #1                      @ one row finished
        bgt     .Lyuyv_next_row
        bx      lr
        .size   convert_yuyv_to_y, . - convert_yuyv_to_y
        .ltorg                                  @ keep any literal-pool entries within ldr range