I have a code generation question for ARM with VFP and NEON.
I am generating code for the following function as a test:
/*
 * Code-generation test case: multiplies f1 by f2 and compares the product
 * with f3. Prints f2 when the product is greater than f3, otherwise prints
 * f3. Returns nothing; the only observable effect is the printf call.
 */
void FloatingPointTest(float f1, float f2, float f3)
{
float f4 = f1 * f2;
/* A float passed to printf is promoted to double, which is what %f expects. */
if (f4 > f3)
printf("%f\n",f2);
else
printf("%f\n",f3);
}
I have tried compiling with:
1. -mfloat-abi=softfp and -mfpu=neon
2. -mfloat-abi=hard and -mfpu=neon
3. -mfloat-abi=softfp and -mfpu=vfp3
4. -mfloat-abi=hard and -mfpu=vfp3
When I use --emit-llvm -c flags to generate bitcode, and then use llc to
generate ARM assembler, I have tried supplying these flag variations to
llc:
5. llc -mattr=+neon
6. llc -mattr=+vfp3
I am building for armv7-a.
In all cases, I get code that looks pretty much the same; it's like what
is below. However, I am expecting to see instruction level differences
between the vfp3 and neon versions. When I do the same with gcc 4.2 I do
see differences in the generated code.
Am I mistaken in expecting to see a difference between the NEON and VFP
instructions, is this my mistake, or is there something else going on
here?
thanks,
-David
@ ---------------------------------------------------------------------
@ _FloatingPointTest: compiler-generated ARM code for the C test function
@ FloatingPointTest(float, float, float).
@ In:    r0, r1, r2 = the three float arguments as raw 32-bit values
@        (each is copied into an s-register with vmov below).
@ Frame: r7 is the frame pointer. Slots used relative to r7:
@          r7-4, r7-8, r7-12 = copies of r0, r1, r2
@          r7-16             = product of the r7-4 and r7-8 values
@ Calls: _printf, once on either path.
@ ---------------------------------------------------------------------
.private_extern _FloatingPointTest
.globl _FloatingPointTest
.align 2
_FloatingPointTest: @ @FloatingPointTest
@ BB#0: @ %entry
@ Prologue: reserve 8 bytes, save lr and r7 there, point r7 at the saved
@ pair, then reserve 36 more bytes for locals and spills.
sub sp, sp, #8
str lr, [sp, #4]
str r7, [sp]
mov r7, sp
sub sp, sp, #36
@ Store each incoming argument to its frame slot and also copy it into a
@ single-precision register (s0, s1, s2).
str r0, [r7, #-4]
vmov s0, r0
str r1, [r7, #-8]
vmov s1, r1
str r2, [r7, #-12]
vmov s2, r2
@ Reload the first two arguments from the frame, multiply them, and store
@ the product at r7-16.
vldr.32 s3, [r7, #-4]
vldr.32 s4, [r7, #-8]
vmul.f32 s3, s3, s4
vstr.32 s3, [r7, #-16]
@ Compare the product (s3) with the third argument reloaded into s4, then
@ copy the floating-point status flags into the APSR so that the
@ conditional branch below can test them.
vldr.32 s4, [r7, #-12]
vcmpe.f32 s3, s4
vmrs apsr_nzcv, fpscr
@ Spill the s0/s2/s1 copies to sp+16/sp+12/sp+8. These slots are not read
@ back anywhere in this listing.
vstr.32 s0, [sp, #16]
vstr.32 s2, [sp, #12]
vstr.32 s1, [sp, #8]
@ Branch to the "else" path when the product is not greater than the third
@ argument. After a VFP compare, the le condition is also true for an
@ unordered (NaN) result.
ble LBB20_2
@ BB#1: @ %bb
@ "Greater than" path. This reloads the r7-16 slot, i.e. the product.
@ NOTE(review): the C listing passes f2 to printf in this branch, but the
@ value loaded here is the product slot - confirm that the listing and the
@ C source match.
vldr.32 s0, [r7, #-16]
@ r0 = address of L_.str107, formed pc-relatively: the literal at LCPI20_0
@ is L_.str107-(LPC20_0+8), and pc at LPC20_0 reads as LPC20_0+8.
@ L_.str107 is defined outside this listing - presumably the "%f\n"
@ format string; verify.
ldr r0, LCPI20_0
LPC20_0:
add r0, pc, r0
@ Widen the value to double precision and pass it in the r1:r2 pair.
vcvt.f64.f32 d1, s0
vmov r1, r2, d1
bl _printf
@ Store printf's return value (r0) at sp+4; it is not read again here.
str r0, [sp, #4]
b LBB20_3
LBB20_2: @ %bb1
@ "Not greater" path: same sequence as above, but the value printed is the
@ third argument, reloaded from r7-12.
vldr.32 s0, [r7, #-12]
ldr r0, LCPI20_1
LPC20_1:
add r0, pc, r0
vcvt.f64.f32 d1, s0
vmov r1, r2, d1
bl _printf
@ Store printf's return value (r0) at sp; it is not read again here.
str r0, [sp]
LBB20_3: @ %bb2
@ BB#4: @ %return
@ Epilogue: drop the locals by resetting sp to r7, restore r7 and lr,
@ release the 8-byte save area, and return.
mov sp, r7
ldr r7, [sp]
ldr lr, [sp, #4]
add sp, sp, #8
bx lr
@ BB#5:
@ Literal pool: one pc-relative offset to L_.str107 per call site. Both
@ call sites reference the same string.
.align 2
LCPI20_0:
.long L_.str107-(LPC20_0+8)
.align 2
LCPI20_1:
.long L_.str107-(LPC20_1+8)