On Sep 29, 2010, at 8:35 AM PDT, Ralf Karrenberg wrote:
> Hello everybody,
>
> I have stumbled upon a test case (the attached module is a slightly
> reduced version) that shows extremely reduced performance on linux
> compared to windows when executed using LLVM's JIT.
>
> We narrowed the problem down to the actual code being generated, the
> source IR on both systems is the same.
> Try compiling the attached module:
>
> llc -O3 -filetype=asm -o BAD.s BAD.ll
>
> Under linux, the resulting assembly file shows that only registers up to
> xmm5 are used, while the same command under windows generates assembly
> that uses all registers up to xmm15 (on the same 64bit Intel Q9550).
> At the same time, the linux-assembly shows lots and lots of spills and
> reloads.
The Win64 calling convention defines XMM6..XMM15 as callee-saved, so their
values can remain live across the calls. On Linux all XMM registers are
call-clobbered, so any live values must be spilled across calls. That's the
basic reason for the difference. There may be something that can be done to
improve the code on Linux, such as scheduling differently, but I haven't
looked in detail.
> Although I did not check whether the code generated by the JIT is the
> same or comparable, the fact that this occurs with the static llc seems
> to prove that there is a major problem here.
>
> This applies both to the current SVN trunk and SVN revision 112036.
>
> Can somebody reproduce that or give comments on what happens there?
>
>
> Best regards,
> Ralf
> ; ModuleID = 'BAD.bc'
> target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
>
> %0 = type { i8*, i8*, i8*, i8*, i32 }
> %1 = type { float addrspace(1)*, i32, float addrspace(1)*, float
addrspace(1)* }
>
> @sgv = internal constant [1 x i8] zeroinitializer
> @fgv = internal constant [1 x i8] zeroinitializer
> @lvgv = internal constant [0 x i8*] zeroinitializer
>
> declare float @llvm.sqrt.f32(float) nounwind readonly
>
> declare float @llvm.exp.f32(float) nounwind readonly
>
> declare float @llvm.log.f32(float) nounwind readonly
>
> declare float @fabs(float)
>
> define void @BAD(%1* noalias nocapture %arg_struct, i32 %get_work_dim, i32*
%get_global_size, i32* %get_local_size, i32* %get_group_id) {
> entry:
> %0 = getelementptr %1* %arg_struct, i64 0, i32 0
> %1 = load float addrspace(1)** %0, align 8
> %2 = getelementptr %1* %arg_struct, i64 0, i32 2
> %3 = load float addrspace(1)** %2, align 8
> %4 = getelementptr %1* %arg_struct, i64 0, i32 3
> %5 = load float addrspace(1)** %4, align 8
> %local_size_0 = load i32* %get_local_size, align 16
> %group_id_0 = load i32* %get_group_id, align 16
> %6 = mul i32 %group_id_0, %local_size_0
> br label %entry.header.loop
>
> entry.header.loop.end: ; preds =
%cond.then.i201.i, %phi.exit138.i
> %cond.i204.i = phi float [ %tmp43.i200.i, %cond.then.i201.i ], [
%tmp38.i194.i, %phi.exit138.i ]
> %arrayidx82.i = getelementptr float addrspace(1)* %5, i64 %8
> %tmp85.i = fmul float %tmp63.i, %cond.i204.i
> %tmp88.i = fmul float %tmp9.i, %cond.i135.i
> %tmp89.i = fsub float %tmp85.i, %tmp88.i
> store float %tmp89.i, float addrspace(1)* %arrayidx82.i, align 4
> %inc = add i32 %7, 1
> %exitcond = icmp eq i32 %inc, %local_size_0
> br i1 %exitcond, label %exit, label %entry.header.loop
>
> entry.header.loop: ; preds = %entry,
%entry.header.loop.end
> %7 = phi i32 [ %inc, %entry.header.loop.end ], [ 0, %entry ]
> %global_id_0 = add i32 %7, %6
> %8 = sext i32 %global_id_0 to i64
> %arrayidx.i = getelementptr float addrspace(1)* %1, i64 %8
> %tmp3.i = load float addrspace(1)* %arrayidx.i, align 4
> %tmp5.i = fmul float %tmp3.i, 1.000000e+01
> %tmp7.i = fsub float 1.000000e+00, %tmp3.i
> %tmp8.i = fmul float %tmp7.i, 1.000000e+02
> %tmp9.i = fadd float %tmp5.i, %tmp8.i
> %tmp20.i = fmul float %tmp7.i, 1.000000e+01
> %tmp21.i = fadd float %tmp3.i, %tmp20.i
> %tmp23.i = fmul float %tmp3.i, 0x3F847AE140000000
> %tmp26.i = fmul float %tmp7.i, 0x3FA99999A0000000
> %tmp27.i = fadd float %tmp23.i, %tmp26.i
> %tmp32.i = fmul float %tmp7.i, 0x3FB99999A0000000
> %tmp33.i = fadd float %tmp23.i, %tmp32.i
> %call36.i = tail call float @llvm.sqrt.f32(float %tmp21.i) nounwind
> %tmp37.i = fmul float %tmp33.i, %call36.i
> %tmp40.i = fdiv float %tmp9.i, %tmp9.i
> %call41.i = tail call float @llvm.log.f32(float %tmp40.i) nounwind
> %tmp45.i = fmul float %tmp33.i, %tmp33.i
> %tmp47.i = fdiv float %tmp45.i, 2.000000e+00
> %tmp48.i = fadd float %tmp27.i, %tmp47.i
> %tmp50.i = fmul float %tmp48.i, %tmp21.i
> %tmp51.i = fadd float %call41.i, %tmp50.i
> %tmp53.i = fdiv float %tmp51.i, %tmp37.i
> %tmp56.i = fsub float %tmp53.i, %tmp37.i
> %tmp59.i = fsub float -0.000000e+00, %tmp27.i
> %tmp61.i = fmul float %tmp21.i, %tmp59.i
> %call62.i = tail call float @llvm.exp.f32(float %tmp61.i) nounwind
> %tmp63.i = fmul float %tmp9.i, %call62.i
> %call.i.i = tail call float @fabs(float %tmp53.i) nounwind
> %tmp5.i.i = fmul float %call.i.i, 0x3FCDA67120000000
> %tmp6.i.i = fadd float %tmp5.i.i, 1.000000e+00
> %tmp7.i.i = fdiv float 1.000000e+00, %tmp6.i.i
> %tmp11.i.i = fsub float -0.000000e+00, %tmp53.i
> %tmp13.i.i = fmul float %tmp53.i, %tmp11.i.i
> %tmp15.i.i = fdiv float %tmp13.i.i, 2.000000e+00
> %call16.i.i = tail call float @llvm.exp.f32(float %tmp15.i.i) nounwind
> %tmp17.i.i = fmul float %call16.i.i, 0x3FD9884540000000
> %tmp19.i.i = fmul float %tmp17.i.i, %tmp7.i.i
> %tmp29.i.i = fmul float %tmp7.i.i, 0x3FF548CDE0000000
> %tmp30.i.i = fadd float %tmp29.i.i, 0xBFFD23DD40000000
> %tmp31.i.i = fmul float %tmp7.i.i, %tmp30.i.i
> %tmp32.i.i = fadd float %tmp31.i.i, 0x3FFC80EF00000000
> %tmp33.i.i = fmul float %tmp7.i.i, %tmp32.i.i
> %tmp34.i.i = fadd float %tmp33.i.i, 0xBFD6D1F0E0000000
> %tmp35.i.i = fmul float %tmp7.i.i, %tmp34.i.i
> %tmp36.i.i = fadd float %tmp35.i.i, 0x3FD470BF40000000
> %tmp37.i.i = fmul float %tmp19.i.i, %tmp36.i.i
> %tmp38.i.i = fsub float 1.000000e+00, %tmp37.i.i
> %cmp.i.i = fcmp olt float %tmp53.i, 0.000000e+00
> br i1 %cmp.i.i, label %cond.then.i.i, label %phi.exit.i
>
> cond.then.i.i: ; preds =
%entry.header.loop
> %tmp43.i.i = fsub float 1.000000e+00, %tmp38.i.i
> br label %phi.exit.i
>
> phi.exit.i: ; preds = %cond.then.i.i,
%entry.header.loop
> %cond.i.i = phi float [ %tmp43.i.i, %cond.then.i.i ], [ %tmp38.i.i,
%entry.header.loop ]
> %call.i18.i = tail call float @fabs(float %tmp56.i) nounwind
> %tmp5.i23.i = fmul float %call.i18.i, 0x3FCDA67120000000
> %tmp6.i24.i = fadd float %tmp5.i23.i, 1.000000e+00
> %tmp7.i25.i = fdiv float 1.000000e+00, %tmp6.i24.i
> %tmp11.i29.i = fsub float -0.000000e+00, %tmp56.i
> %tmp13.i31.i = fmul float %tmp56.i, %tmp11.i29.i
> %tmp15.i33.i = fdiv float %tmp13.i31.i, 2.000000e+00
> %call16.i34.i = tail call float @llvm.exp.f32(float %tmp15.i33.i) nounwind
> %tmp17.i35.i = fmul float %call16.i34.i, 0x3FD9884540000000
> %tmp19.i37.i = fmul float %tmp17.i35.i, %tmp7.i25.i
> %tmp29.i47.i = fmul float %tmp7.i25.i, 0x3FF548CDE0000000
> %tmp30.i48.i = fadd float %tmp29.i47.i, 0xBFFD23DD40000000
> %tmp31.i49.i = fmul float %tmp7.i25.i, %tmp30.i48.i
> %tmp32.i50.i = fadd float %tmp31.i49.i, 0x3FFC80EF00000000
> %tmp33.i51.i = fmul float %tmp7.i25.i, %tmp32.i50.i
> %tmp34.i52.i = fadd float %tmp33.i51.i, 0xBFD6D1F0E0000000
> %tmp35.i53.i = fmul float %tmp7.i25.i, %tmp34.i52.i
> %tmp36.i54.i = fadd float %tmp35.i53.i, 0x3FD470BF40000000
> %tmp37.i55.i = fmul float %tmp19.i37.i, %tmp36.i54.i
> %tmp38.i56.i = fsub float 1.000000e+00, %tmp37.i55.i
> %cmp.i59.i = fcmp olt float %tmp56.i, 0.000000e+00
> br i1 %cmp.i59.i, label %cond.then.i63.i, label %phi.exit69.i
>
> cond.then.i63.i: ; preds = %phi.exit.i
> %tmp43.i62.i = fsub float 1.000000e+00, %tmp38.i56.i
> br label %phi.exit69.i
>
> phi.exit69.i: ; preds =
%cond.then.i63.i, %phi.exit.i
> %cond.i66.i = phi float [ %tmp43.i62.i, %cond.then.i63.i ], [
%tmp38.i56.i, %phi.exit.i ]
> %arrayidx68.i = getelementptr float addrspace(1)* %3, i64 %8
> %tmp71.i = fmul float %tmp9.i, %cond.i.i
> %tmp74.i = fmul float %tmp63.i, %cond.i66.i
> %tmp75.i = fsub float %tmp71.i, %tmp74.i
> store float %tmp75.i, float addrspace(1)* %arrayidx68.i, align 4
> %call.i87.i = tail call float @fabs(float %tmp11.i.i) nounwind
> %tmp5.i92.i = fmul float %call.i87.i, 0x3FCDA67120000000
> %tmp6.i93.i = fadd float %tmp5.i92.i, 1.000000e+00
> %tmp7.i94.i = fdiv float 1.000000e+00, %tmp6.i93.i
> %call16.i103.i = tail call float @llvm.exp.f32(float %tmp15.i.i) nounwind
> %tmp17.i104.i = fmul float %call16.i103.i, 0x3FD9884540000000
> %tmp19.i106.i = fmul float %tmp17.i104.i, %tmp7.i94.i
> %tmp29.i116.i = fmul float %tmp7.i94.i, 0x3FF548CDE0000000
> %tmp30.i117.i = fadd float %tmp29.i116.i, 0xBFFD23DD40000000
> %tmp31.i118.i = fmul float %tmp7.i94.i, %tmp30.i117.i
> %tmp32.i119.i = fadd float %tmp31.i118.i, 0x3FFC80EF00000000
> %tmp33.i120.i = fmul float %tmp7.i94.i, %tmp32.i119.i
> %tmp34.i121.i = fadd float %tmp33.i120.i, 0xBFD6D1F0E0000000
> %tmp35.i122.i = fmul float %tmp7.i94.i, %tmp34.i121.i
> %tmp36.i123.i = fadd float %tmp35.i122.i, 0x3FD470BF40000000
> %tmp37.i124.i = fmul float %tmp19.i106.i, %tmp36.i123.i
> %tmp38.i125.i = fsub float 1.000000e+00, %tmp37.i124.i
> %cmp.i128.i = fcmp olt float %tmp11.i.i, 0.000000e+00
> br i1 %cmp.i128.i, label %cond.then.i132.i, label %phi.exit138.i
>
> cond.then.i132.i: ; preds = %phi.exit69.i
> %tmp43.i131.i = fsub float 1.000000e+00, %tmp38.i125.i
> br label %phi.exit138.i
>
> phi.exit138.i: ; preds =
%cond.then.i132.i, %phi.exit69.i
> %cond.i135.i = phi float [ %tmp43.i131.i, %cond.then.i132.i ], [
%tmp38.i125.i, %phi.exit69.i ]
> %call.i156.i = tail call float @fabs(float %tmp11.i29.i) nounwind
> %tmp5.i161.i = fmul float %call.i156.i, 0x3FCDA67120000000
> %tmp6.i162.i = fadd float %tmp5.i161.i, 1.000000e+00
> %tmp7.i163.i = fdiv float 1.000000e+00, %tmp6.i162.i
> %call16.i172.i = tail call float @llvm.exp.f32(float %tmp15.i33.i)
nounwind
> %tmp17.i173.i = fmul float %call16.i172.i, 0x3FD9884540000000
> %tmp19.i175.i = fmul float %tmp17.i173.i, %tmp7.i163.i
> %tmp29.i185.i = fmul float %tmp7.i163.i, 0x3FF548CDE0000000
> %tmp30.i186.i = fadd float %tmp29.i185.i, 0xBFFD23DD40000000
> %tmp31.i187.i = fmul float %tmp7.i163.i, %tmp30.i186.i
> %tmp32.i188.i = fadd float %tmp31.i187.i, 0x3FFC80EF00000000
> %tmp33.i189.i = fmul float %tmp7.i163.i, %tmp32.i188.i
> %tmp34.i190.i = fadd float %tmp33.i189.i, 0xBFD6D1F0E0000000
> %tmp35.i191.i = fmul float %tmp7.i163.i, %tmp34.i190.i
> %tmp36.i192.i = fadd float %tmp35.i191.i, 0x3FD470BF40000000
> %tmp37.i193.i = fmul float %tmp19.i175.i, %tmp36.i192.i
> %tmp38.i194.i = fsub float 1.000000e+00, %tmp37.i193.i
> %cmp.i197.i = fcmp olt float %tmp11.i29.i, 0.000000e+00
> br i1 %cmp.i197.i, label %cond.then.i201.i, label %entry.header.loop.end
>
> cond.then.i201.i: ; preds = %phi.exit138.i
> %tmp43.i200.i = fsub float 1.000000e+00, %tmp38.i194.i
> br label %entry.header.loop.end
>
> exit: ; preds =
%entry.header.loop.end
> ret void
> }
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev