On 07/24/2015 03:42 AM, Benjamin Kramer wrote:
>> On 24.07.2015, at 08:06, zhi chen <zchenhn at gmail.com> wrote:
>>
>> It seems that it's hard to vectorize int64 in LLVM. For example, LLVM 3.4
>> generates very complicated code for the following IR. I am running on a
>> Haswell processor. Is it because there are no alternative AVX/AVX2
>> instructions for int64? The same thing also happens to zext <2 x i32> ->
>> <2 x i64> and trunc <2 x i64> -> <2 x i32>. Any ideas to optimize these
>> instructions? Thanks.
>>
>>   %sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i, %sub.ptr.rhs.cast.i5.i.i.i.i
>>   %sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24, i64 24>
>>
>> Assembly:
>>   vpsubq  %xmm6, %xmm5, %xmm5
>>   vmovq   %xmm5, %rax
>>   movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
>>   imulq   %rbx
>>   movq    %rdx, %rcx
>>   movq    %rcx, %rax
>>   shrq    $63, %rax
>>   shrq    $2, %rcx
>>   addl    %eax, %ecx
>>   vpextrq $1, %xmm5, %rax
>>   imulq   %rbx
>>   movq    %rdx, %rax
>>   shrq    $63, %rax
>>   shrq    $2, %rdx
>>   addl    %eax, %edx
>>   movslq  %edx, %rax
>>   vmovq   %rax, %xmm5
>>   movslq  %ecx, %rax
>>   vmovq   %rax, %xmm6
>>   vpunpcklqdq %xmm5, %xmm6, %xmm5 # xmm5 = xmm6[0],xmm5[0]
>
> AVX2 doesn't have integer vector division instructions and LLVM lowers
> divides by constants into (128 bit) multiplies. However, AVX2 doesn't have
> a way to get to the upper 64 bits of a 64x64->128 bit multiply either, so
> LLVM uses the scalar imulq instruction to do that. There's not much room
> to optimize here given the limitations of AVX2.
>
> You seem to be subtracting pointers though, so if you can guarantee that
> the pointers are aligned you could set the exact bit on your 'sdiv'
> instruction. That should give better code.

Depending on what you're using the result of the divide for, there might be
optimizations which could be applied as well. Can you give a slightly larger
context for your source IR? (1-2 levels of uses/defs out from the
instructions would help.)

> - Ben
>
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
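As an aside, the scalar sequence above is the usual magic-number lowering of
a signed divide by 24: 0x2AAAAAAAAAAAAAAB is roughly 2^66/24, so each lane is
computed as the high 64 bits of a 64x64->128 multiply, shifted right by a
further 2, with the copied-out sign bit added back to round toward zero. A
minimal sketch of the 'exact' variant suggested above, assuming the pointers
really are aligned to the 24-byte element size so the remainder is known to
be zero (value names reuse the snippet above):

  %sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i, %sub.ptr.rhs.cast.i5.i.i.i.i
  ; 'exact' promises there is no remainder, so the backend may use a
  ; shift plus multiply-by-inverse sequence instead of the full
  ; magic-number high-multiply shown in the assembly above.
  %sub.ptr.div.i7.i.i.i.i = sdiv exact <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24, i64 24>

Whether that lowers to something fully vectorized on AVX2 still depends on
the target lowering, so treat it as a sketch to try rather than a guaranteed
win.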
------------------------------------ IR
------------------------------------------------------------------
if.then.i.i.i.i.i.i:                              ; preds = %if.then4
  %S25_D = zext <2 x i32> %splatLDS17_D.splat to <2 x i64>
  %umul_with_overflow.i.iS26_D = shl <2 x i64> %S25_D, <i64 3, i64 3>
  %extumul_with_overflow.i.iS26_D = extractelement <2 x i64> %umul_with_overflow.i.iS26_D, i32 1
  %call5.i.i = tail call noalias i8* @_Znam(i64 %extumul_with_overflow.i.iS26_D) #22
  %splatCallS27_D.splatinsert = insertelement <2 x i8*> undef, i8* %call5.i.i, i32 0
  %splatCallS27_D.splat = shufflevector <2 x i8*> %splatCallS27_D.splatinsert, <2 x i8*> undef, <2 x i32> zeroinitializer
  %bitcastS28_D = bitcast <2 x i8*> %splatCallS27_D.splat to <2 x double*>
  %extractS29_D = extractelement <2 x double*> %bitcastS28_D, i32 1
  store double* %extractS29_D, double** %val.i.i, align 8
  %val.i3.i.i = getelementptr inbounds %class.Vector* %__x, i64 0, i32 3
  %4 = load double** %val.i3.i.i, align 8, !tbaa !22
  %splatLDS31_D.splatinsert = insertelement <2 x double*> undef, double* %4, i32 0
  %splatLDS31_D.splat = shufflevector <2 x double*> %splatLDS31_D.splatinsert, <2 x double*> undef, <2 x i32> zeroinitializer
  %bitcastS32_D = bitcast <2 x double*> %splatLDS31_D.splat to <2 x i8*>
  %extbitcastS32_D = extractelement <2 x i8*> %bitcastS32_D, i32 1
  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %call5.i.i, i8* %extbitcastS32_D, i64 %extumul_with_overflow.i.iS26_D, i32 8, i1 false) #9
  br label %invoke.cont

invoke.cont:                                      ; preds = %if.then.i.i.i.i.i.i, %if.then4
  %sub.ptr.rhs.cast.i = ptrtoint %class.Vector* %__position.coerce to i64
  %sub.ptr.rhs.cast.iS35_D = ptrtoint <2 x %class.Vector*> %splatInsMapS35_D.splat to <2 x i64>
  %sub.ptr.sub.iS36_D = sub <2 x i64> %sub.ptr.rhs.castS8_D, %sub.ptr.rhs.cast.iS35_D
  %sub.ptr.div.iS37_D = sdiv <2 x i64> %sub.ptr.sub.iS36_D, <i64 24, i64 24>
  %extractS196_D = extractelement <2 x i64> %sub.ptr.div.iS37_D, i32 1
  %cmp10S38_D = icmp ugt <2 x i64> %sub.ptr.div.iS37_D, %splatInsMapS1_D.splat
  %zextS39_D = sext <2 x i1> %cmp10S38_D to <2 x i64>
  %BCS39_D = bitcast <2 x i64> %zextS39_D to i128
  %mskS39_D = icmp ne i128 %BCS39_D, 0
  br i1 %mskS39_D, label %if.then11, label %if.else
-------------------------------------------- Assembly
-----------------------------------------------------------------
# BB#3: # %if.then.i.i.i.i.i.i
vpsllq $3, %xmm0, %xmm0
vpextrq $1, %xmm0, %rbx
movq %rbx, %rdi
vmovaps %xmm2, 96(%rsp) # 16-byte Spill
vmovaps %xmm5, 64(%rsp) # 16-byte Spill
vmovdqa %xmm6, 16(%rsp) # 16-byte Spill
callq _Znam
movq %rax, 128(%rsp)
movq 16(%r12), %rsi
movq %rax, %rdi
movq %rbx, %rdx
callq memmove
vmovdqa 16(%rsp), %xmm6 # 16-byte Reload
vmovaps 64(%rsp), %xmm5 # 16-byte Reload
vmovaps 96(%rsp), %xmm2 # 16-byte Reload
vmovdqa .LCPI582_0(%rip), %xmm4
.LBB582_4: # %invoke.cont
vmovaps %xmm2, 96(%rsp) # 16-byte Spill
vmovdqa 48(%rsp), %xmm0 # 16-byte Reload
vpsubq %xmm0, %xmm2, %xmm0
vpextrq $1, %xmm0, %rax
movabsq $3074457345618258603, %rcx # imm = 0x2AAAAAAAAAAAAAAB
imulq %rcx
movq %rdx, %rax
shrq $63, %rax
sarq $2, %rdx
addq %rax, %rdx
vmovq %rdx, %xmm1
vmovq %xmm0, %rax
imulq %rcx
movq %rdx, %rax
shrq $63, %rax
sarq $2, %rdx
addq %rax, %rdx
vmovq %rdx, %xmm0
vpunpcklqdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[0],xmm1[0]
vpxor %xmm4, %xmm1, %xmm0
vpcmpgtq %xmm6, %xmm0, %xmm0
vptest %xmm0, %xmm0
je .LBB582_49
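One note on the tail of that block: AVX2 has no unsigned 64-bit vector
compare, so the icmp ugt is lowered with the usual sign-bit trick.
.LCPI582_0 presumably holds a splat of 0x8000000000000000; it is XORed into
the divide results and the comparison is then done with the signed vpcmpgtq
(the other operand, reloaded into xmm6, has presumably been biased the same
way earlier), and vptest/je branches on whether any lane of the result is
set. In IR terms the lowering is effectively doing something like the
following, where %a and %b are illustrative stand-ins for the two compare
operands:

  %a.biased = xor <2 x i64> %a, <i64 -9223372036854775808, i64 -9223372036854775808>
  %b.biased = xor <2 x i64> %b, <i64 -9223372036854775808, i64 -9223372036854775808>
  ; unsigned > becomes signed > once both sides are biased by 2^63
  %cmp = icmp sgt <2 x i64> %a.biased, %b.biased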
Thanks,
Zhi
On Fri, Jul 24, 2015 at 10:16 AM, Philip Reames <listmail at philipreames.com> wrote:
>
> On 07/24/2015 03:42 AM, Benjamin Kramer wrote:
>
>> On 24.07.2015, at 08:06, zhi chen <zchenhn at gmail.com> wrote:
>>>
>>> It seems that it's hard to vectorize int64 in LLVM. For example,
>>> LLVM 3.4 generates very complicated code for the following IR. I am
>>> running on a Haswell processor. Is it because there are no alternative
>>> AVX/AVX2 instructions for int64? The same thing also happens to
>>> zext <2 x i32> -> <2 x i64> and trunc <2 x i64> -> <2 x i32>. Any ideas
>>> to optimize these instructions? Thanks.
>>>
>>>   %sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i, %sub.ptr.rhs.cast.i5.i.i.i.i
>>>   %sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24, i64 24>
>>>
>>> Assembly:
>>>   vpsubq  %xmm6, %xmm5, %xmm5
>>>   vmovq   %xmm5, %rax
>>>   movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
>>>   imulq   %rbx
>>>   movq    %rdx, %rcx
>>>   movq    %rcx, %rax
>>>   shrq    $63, %rax
>>>   shrq    $2, %rcx
>>>   addl    %eax, %ecx
>>>   vpextrq $1, %xmm5, %rax
>>>   imulq   %rbx
>>>   movq    %rdx, %rax
>>>   shrq    $63, %rax
>>>   shrq    $2, %rdx
>>>   addl    %eax, %edx
>>>   movslq  %edx, %rax
>>>   vmovq   %rax, %xmm5
>>>   movslq  %ecx, %rax
>>>   vmovq   %rax, %xmm6
>>>   vpunpcklqdq %xmm5, %xmm6, %xmm5 # xmm5 = xmm6[0],xmm5[0]
>>>
>> AVX2 doesn't have integer vector division instructions and LLVM lowers
>> divides by constants into (128 bit) multiplies. However, AVX2 doesn't have
>> a way to get to the upper 64 bits of a 64x64->128 bit multiply either, so
>> LLVM uses the scalar imulq instruction to do that. There's not much room
>> to optimize here given the limitations of AVX2.
>>
>> You seem to be subtracting pointers though, so if you can guarantee that
>> the pointers are aligned you could set the exact bit on your 'sdiv'
>> instruction. That should give better code.
>>
> Depending on what you're using the result of the divide for, there might
> be optimizations which could be applied as well. Can you give a slightly
> larger context for your source IR? (1-2 levels of uses/defs out from the
> instructions would help.)
>
>>
>> - Ben
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>>
>
This snippet of IR is interesting:

  %sub.ptr.div.iS37_D = sdiv <2 x i64> %sub.ptr.sub.iS36_D, <i64 24, i64 24>
  %cmp10S38_D = icmp ugt <2 x i64> %sub.ptr.div.iS37_D, %splatInsMapS1_D.splat
  %zextS39_D = sext <2 x i1> %cmp10S38_D to <2 x i64>
  %BCS39_D = bitcast <2 x i64> %zextS39_D to i128
  %mskS39_D = icmp ne i128 %BCS39_D, 0
  br i1 %mskS39_D, label %if.then11, label %if.else

It looks like %mskS39_D is basically a test of whether either of the vector
elements produced by the divide is ugt %splatInsMapS1_D.splat. I can't say
for sure that we can do better here - I haven't studied our vector
canonicalization rules enough - but this seems like something which could
possibly be improved.

This is interesting:

  %splatCallS27_D.splatinsert = insertelement <2 x i8*> undef, i8* %call5.i.i, i32 0
  %splatCallS27_D.splat = shufflevector <2 x i8*> %splatCallS27_D.splatinsert, <2 x i8*> undef, <2 x i32> zeroinitializer

Can't that shufflevector be replaced with:

  %splatCallS27_D.splat = insertelement <2 x i8*> %splatCallS27_D.splatinsert, i8* %call5.i.i, i32 1

Again, without knowledge of how we canonicalize such things, that's not
necessarily a win. Just suspicious.

The bitcast/extractelement sequence following that shufflevector is also
interesting. It looks like it could be rewritten in terms of the
i8* %call5.i.i and a bitcast.
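For concreteness, here is what those two rewrites might look like, assuming
nothing else uses the intermediate splat values (untested sketch, reusing the
value names from the IR above):

  ; splat of %call5.i.i built from two insertelements instead of a shufflevector
  %splatCallS27_D.splatinsert = insertelement <2 x i8*> undef, i8* %call5.i.i, i32 0
  %splatCallS27_D.splat = insertelement <2 x i8*> %splatCallS27_D.splatinsert, i8* %call5.i.i, i32 1

  ; and the bitcast/extractelement pair reduced to a scalar bitcast of the call
  ; result, since every lane of the splat holds the same pointer
  %extractS29_D = bitcast i8* %call5.i.i to double*
  store double* %extractS29_D, double** %val.i.i, align 8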