For example, I have the following IR code,
for.cond.preheader: ; preds = %if.end18
%mul = mul i32 %12, %3
%cmp21128 = icmp sgt i32 %mul, 0
br i1 %cmp21128, label %for.body.preheader, label %return
for.body.preheader: ; preds %for.cond.preheader
%19 = mul i32 %12, %3
%20 = add i32 %19, -1
%21 = zext i32 %20 to i64
%22 = add i64 %21, 1
%end.idx = add i64 %21, 1
%n.vec = and i64 %22, 8589934584
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %middle.block, label %vector.ph
The corresponding assembly code is:
# BB#3: # %for.cond.preheader
imull %r9d, %ebx
testl %ebx, %ebx
jle .LBB10_63
# BB#4: # %for.body.preheader
leal -1(%rbx), %eax
incq %rax
xorl %edx, %edx
movabsq $8589934584, %rcx # imm = 0x1FFFFFFF8
andq %rax, %rcx
je .LBB10_8
I changed all the scalar operands to <2 x ValueType> ones. The IR becomes
the following:
for.cond.preheader: ; preds = %if.end18
%mulS44_D = mul <2 x i32> %splatLDS24_D.splat, %splatLDS7_D.splat
%cmp21128S45_D = icmp sgt <2 x i32> %mulS44_D, zeroinitializer
%sextS46_D = sext <2 x i1> %cmp21128S45_D to <2 x i64>
%BCS46_D = bitcast <2 x i64> %sextS46_D to i128
%mskS46_D = icmp ne i128 %BCS46_D, 0
br i1 %mskS46_D, label %for.body.preheader, label %return
for.body.preheader: ; preds %for.cond.preheader
%S47_D = mul <2 x i32> %splatLDS24_D.splat, %splatLDS7_D.splat
%S48_D = add <2 x i32> %S47_D, <i32 -1, i32 -1>
%S49_D = zext <2 x i32> %S48_D to <2 x i64>
%S50_D = add <2 x i64> %S49_D, <i64 1, i64 1>
%end.idxS51_D = add <2 x i64> %S49_D, <i64 1, i64 1>
%n.vecS52_D = and <2 x i64> %S50_D, <i64 8589934584, i64
8589934584>
%cmp.zeroS53_D = icmp eq <2 x i64> %n.vecS52_D, zeroinitializer
%sextS54_D = sext <2 x i1> %cmp.zeroS53_D to <2 x i64>
%BCS54_D = bitcast <2 x i64> %sextS54_D to i128
%mskS54_D = icmp ne i128 %BCS54_D, 0
br i1 %mskS54_D, label %middle.block, label %vector.ph
Now the assembly for the above IR code is:
# BB#4: # %for.cond.preheader
vmovdqa 144(%rsp), %xmm0 # 16-byte Reload
vpmuludq %xmm7, %xmm0, %xmm2
vpsrlq $32, %xmm7, %xmm4
vpmuludq %xmm4, %xmm0, %xmm4
vpsllq $32, %xmm4, %xmm4
vpaddq %xmm4, %xmm2, %xmm2
vpsrlq $32, %xmm0, %xmm4
vpmuludq %xmm7, %xmm4, %xmm4
vpsllq $32, %xmm4, %xmm4
vpaddq %xmm4, %xmm2, %xmm2
vpextrq $1, %xmm2, %rax
cltq
vmovq %rax, %xmm4
vmovq %xmm2, %rax
cltq
vmovq %rax, %xmm5
vpunpcklqdq %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0],xmm4[0]
vpcmpgtq %xmm3, %xmm4, %xmm3
vptest %xmm3, %xmm3
je .LBB10_66
# BB#5: # %for.body.preheader
vpaddq %xmm15, %xmm2, %xmm3
vpand %xmm15, %xmm3, %xmm3
vpaddq .LCPI10_1(%rip), %xmm3, %xmm8
vpand .LCPI10_5(%rip), %xmm8, %xmm5
vpxor %xmm4, %xmm4, %xmm4
vpcmpeqq %xmm4, %xmm5, %xmm6
vptest %xmm6, %xmm6
jne .LBB10_9
It turns out that the vector version is far more complicated than the scalar
one. I was expecting it to be much less tedious.
On Fri, Jun 26, 2015 at 3:49 AM, suyog sarda <sardask01 at gmail.com>
wrote:
>
> >
> > Is LLVM able to generate code for the following?
> >
> > %mul = mul <2 x i32> %1, %2, where %1 and %2 are of type <2 x i32>.
>
> > I am running it on a Haswell processor with LLVM-3.4.2. It seems that
it
> will generate really complicated code with vpaddq, vpmuludq, vpsllq,
> vpsrlq.
> >
>
> Can you please elaborate on what your test case is and what final output
> you expect to see? It would be good if you could share the test case you
> are running LLVM on.
>
> Regards,
> Suyog Sarda
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20150626/47d7e933/attachment.html>
> For example, I have the following IR code, > > for.cond.preheader: ; preds = %if.end18 > %mul = mul i32 %12, %3 > %cmp21128 = icmp sgt i32 %mul, 0 > br i1 %cmp21128, label %for.body.preheader, label %return > > for.body.preheader: ; preds %for.cond.preheader > %19 = mul i32 %12, %3 > %20 = add i32 %19, -1 > %21 = zext i32 %20 to i64 > %22 = add i64 %21, 1 > %end.idx = add i64 %21, 1 > %n.vec = and i64 %22, 8589934584 > %cmp.zero = icmp eq i64 %n.vec, 0 > br i1 %cmp.zero, label %middle.block, label %vector.ph > > The corresponding assembly code is: > # BB#3: # %for.cond.preheader> imull %r9d, %ebx> testl %ebx, %ebx> jle .LBB10_63> # BB#4: # %for.body.preheader> leal -1(%rbx), %eax> incq %rax> xorl %edx, %edx> movabsq $8589934584, %rcx # imm = 0x1FFFFFFF8> andq %rax, %rcx> je .LBB10_8 > > I changed all the scalar operands to <2 x ValueType> ones. The IR becomesthe following> for.cond.preheader: ; preds = %if.end18 > %mulS44_D = mul <2 x i32> %splatLDS24_D.splat, %splatLDS7_D.splat > %cmp21128S45_D = icmp sgt <2 x i32> %mulS44_D, zeroinitializer > %sextS46_D = sext <2 x i1> %cmp21128S45_D to <2 x i64> > %BCS46_D = bitcast <2 x i64> %sextS46_D to i128 > %mskS46_D = icmp ne i128 %BCS46_D, 0 > br i1 %mskS46_D, label %for.body.preheader, label %return > > for.body.preheader: ; preds %for.cond.preheader > %S47_D = mul <2 x i32> %splatLDS24_D.splat, %splatLDS7_D.splat > %S48_D = add <2 x i32> %S47_D, <i32 -1, i32 -1> > %S49_D = zext <2 x i32> %S48_D to <2 x i64> > %S50_D = add <2 x i64> %S49_D, <i64 1, i64 1> > %end.idxS51_D = add <2 x i64> %S49_D, <i64 1, i64 1> > %n.vecS52_D = and <2 x i64> %S50_D, <i64 8589934584, i64 8589934584> > %cmp.zeroS53_D = icmp eq <2 x i64> %n.vecS52_D, zeroinitializer > %sextS54_D = sext <2 x i1> %cmp.zeroS53_D to <2 x i64> > %BCS54_D = bitcast <2 x i64> %sextS54_D to i128 > %mskS54_D = icmp ne i128 %BCS54_D, 0 > br i1 %mskS54_D, label %middle.block, label %vector.ph > > Now the assembly for the above IR code is: > # BB#4: # 
%for.cond.preheader > vmovdqa 144(%rsp), %xmm0 # 16-byte Reload > vpmuludq %xmm7, %xmm0, %xmm2 > vpsrlq $32, %xmm7, %xmm4 > vpmuludq %xmm4, %xmm0, %xmm4 > vpsllq $32, %xmm4, %xmm4 > vpaddq %xmm4, %xmm2, %xmm2 > vpsrlq $32, %xmm0, %xmm4 > vpmuludq %xmm7, %xmm4, %xmm4 > vpsllq $32, %xmm4, %xmm4 > vpaddq %xmm4, %xmm2, %xmm2 > vpextrq $1, %xmm2, %rax > cltq > vmovq %rax, %xmm4 > vmovq %xmm2, %rax > cltq > vmovq %rax, %xmm5 > vpunpcklqdq %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0],xmm4[0] > vpcmpgtq %xmm3, %xmm4, %xmm3 > vptest %xmm3, %xmm3 > je .LBB10_66 > # BB#5: # %for.body.preheader > vpaddq %xmm15, %xmm2, %xmm3 > vpand %xmm15, %xmm3, %xmm3 > vpaddq .LCPI10_1(%rip), %xmm3, %xmm8 > vpand .LCPI10_5(%rip), %xmm8, %xmm5 > vpxor %xmm4, %xmm4, %xmm4 > vpcmpeqq %xmm4, %xmm5, %xmm6 > vptest %xmm6, %xmm6 > jne .LBB10_9 > As Mats pointed out, this may be the same problem as: https://llvm.org/bugs/show_bug.cgi?id=22703 Basically, the code is generated for AVX2, where the XMM registers are 128 bits wide. Some of the above ops are <2 x i32> and involve sext to <2 x i64>, bitcast, etc. Hence the code has extra vector instructions. Regards, Suyog Sarda -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150627/e3c54be6/attachment.html>
Thanks for pointing that out. In this case, it seems that vectorization is not actually profitable. Does that mean we always need to sext i32 to i64 so that we can use the whole lanes of an XMM register? Thanks. On Fri, Jun 26, 2015 at 11:56 AM, suyog sarda <sardask01 at gmail.com> wrote:> > > For example, I have the following IR code, > > > > for.cond.preheader: ; preds = %if.end18 > > %mul = mul i32 %12, %3 > > %cmp21128 = icmp sgt i32 %mul, 0 > > br i1 %cmp21128, label %for.body.preheader, label %return > > > > for.body.preheader: ; preds > %for.cond.preheader > > %19 = mul i32 %12, %3 > > %20 = add i32 %19, -1 > > %21 = zext i32 %20 to i64 > > %22 = add i64 %21, 1 > > %end.idx = add i64 %21, 1 > > %n.vec = and i64 %22, 8589934584 > > %cmp.zero = icmp eq i64 %n.vec, 0 > > br i1 %cmp.zero, label %middle.block, label %vector.ph > > > > The corresponding assembly code is: > > # BB#3: # %for.cond.preheader > > > imull %r9d, %ebx > > > testl %ebx, %ebx > > > jle .LBB10_63 > > > # BB#4: # %for.body.preheader > > > leal -1(%rbx), %eax > > > incq %rax > > > xorl %edx, %edx > > > movabsq $8589934584, %rcx # imm = 0x1FFFFFFF8 > > > andq %rax, %rcx > > > je .LBB10_8 > > > > I changed all the scalar operands to <2 x ValueType> ones. 
The IR > becomes the following > > for.cond.preheader: ; preds = %if.end18 > > %mulS44_D = mul <2 x i32> %splatLDS24_D.splat, %splatLDS7_D.splat > > %cmp21128S45_D = icmp sgt <2 x i32> %mulS44_D, zeroinitializer > > %sextS46_D = sext <2 x i1> %cmp21128S45_D to <2 x i64> > > %BCS46_D = bitcast <2 x i64> %sextS46_D to i128 > > %mskS46_D = icmp ne i128 %BCS46_D, 0 > > br i1 %mskS46_D, label %for.body.preheader, label %return > > > > for.body.preheader: ; preds > %for.cond.preheader > > %S47_D = mul <2 x i32> %splatLDS24_D.splat, %splatLDS7_D.splat > > %S48_D = add <2 x i32> %S47_D, <i32 -1, i32 -1> > > %S49_D = zext <2 x i32> %S48_D to <2 x i64> > > %S50_D = add <2 x i64> %S49_D, <i64 1, i64 1> > > %end.idxS51_D = add <2 x i64> %S49_D, <i64 1, i64 1> > > %n.vecS52_D = and <2 x i64> %S50_D, <i64 8589934584, i64 8589934584> > > %cmp.zeroS53_D = icmp eq <2 x i64> %n.vecS52_D, zeroinitializer > > %sextS54_D = sext <2 x i1> %cmp.zeroS53_D to <2 x i64> > > %BCS54_D = bitcast <2 x i64> %sextS54_D to i128 > > %mskS54_D = icmp ne i128 %BCS54_D, 0 > > br i1 %mskS54_D, label %middle.block, label %vector.ph > > > > Now the assembly for the above IR code is: > > # BB#4: # %for.cond.preheader > > vmovdqa 144(%rsp), %xmm0 # 16-byte Reload > > vpmuludq %xmm7, %xmm0, %xmm2 > > vpsrlq $32, %xmm7, %xmm4 > > vpmuludq %xmm4, %xmm0, %xmm4 > > vpsllq $32, %xmm4, %xmm4 > > vpaddq %xmm4, %xmm2, %xmm2 > > vpsrlq $32, %xmm0, %xmm4 > > vpmuludq %xmm7, %xmm4, %xmm4 > > vpsllq $32, %xmm4, %xmm4 > > vpaddq %xmm4, %xmm2, %xmm2 > > vpextrq $1, %xmm2, %rax > > cltq > > vmovq %rax, %xmm4 > > vmovq %xmm2, %rax > > cltq > > vmovq %rax, %xmm5 > > vpunpcklqdq %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0],xmm4[0] > > vpcmpgtq %xmm3, %xmm4, %xmm3 > > vptest %xmm3, %xmm3 > > je .LBB10_66 > > # BB#5: # %for.body.preheader > > vpaddq %xmm15, %xmm2, %xmm3 > > vpand %xmm15, %xmm3, %xmm3 > > vpaddq .LCPI10_1(%rip), %xmm3, %xmm8 > > vpand .LCPI10_5(%rip), %xmm8, %xmm5 > > vpxor %xmm4, %xmm4, %xmm4 > > vpcmpeqq %xmm4, %xmm5, 
%xmm6 > > vptest %xmm6, %xmm6 > > jne .LBB10_9 > > > > As Mats pointed out, this may be the same problem as: > https://llvm.org/bugs/show_bug.cgi?id=22703 > > Basically, the code is generated for AVX2 where register XMM are 128 bits. > Some of the above ops are <2 x i32> and involve sext to <2 x i64>, bitcast, > etc. Hence the code has extra vector instructions. > > Regards, > Suyog Sarda >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150626/ab3da0bf/attachment.html>