Displaying 20 results from an estimated 70 matches for "xmm5".
2013 Jul 19
0
[LLVMdev] llvm.x86.sse2.sqrt.pd not using sqrtpd, calling a function that modifies ECX
...0h],xmm1
002E0126 movapd xmmword ptr [esp+0B0h],xmm7
002E012F movapd xmm3,xmm1
002E0133 movlpd qword ptr [esp+0F0h],xmm3
002E013C movhpd qword ptr [esp+0E0h],xmm3
002E0145 movlpd qword ptr [esp+100h],xmm7
002E014E pshufd xmm0,xmm7,44h
002E0153 movdqa xmm5,xmm0
002E0157 xorpd xmm4,xmm4
002E015B mulpd xmm5,xmm4
002E015F pshufd xmm2,xmm3,44h
002E0164 movdqa xmm1,xmm2
002E0168 mulpd xmm1,xmm4
002E016C xorpd xmm7,xmm7
002E0170 movapd xmm4,xmmword ptr [esp+70h]
002E0176 subpd xmm4,xmm1
002E017A...
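For context, the operation under discussion corresponds to the SSE2 intrinsic _mm_sqrt_pd; a minimal C sketch, assuming the standard <emmintrin.h> mapping (clang of that era lowered it through llvm.x86.sse2.sqrt.pd), which with SSE2 enabled should compile to a single sqrtpd rather than a call:

#include <emmintrin.h>

/* element-wise double-precision square root of both lanes;
   expected to lower to a single sqrtpd when SSE2 is available */
__m128d sqrt_both(__m128d v) {
    return _mm_sqrt_pd(v);
}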
2015 Jul 24
2
[LLVMdev] SIMD for sdiv <2 x i64>
...<2 x i32>. Any ideas to optimize these instructions?
Thanks.
%sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i,
%sub.ptr.rhs.cast.i5.i.i.i.i
%sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24,
i64 24>
Assembly:
vpsubq %xmm6, %xmm5, %xmm5
vmovq %xmm5, %rax
movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
imulq %rbx
movq %rdx, %rcx
movq %rcx, %rax
shrq $63, %rax
shrq $2, %rcx
addl %eax, %ecx
vpextrq $1, %xmm5, %rax
imulq %rbx
movq %rdx, %rax...
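The movabsq/imulq/shift sequence above is the classic strength reduction of signed division by a constant: 0x2AAAAAAAAAAAAAAB is ceil(2^66 / 24), so each lane is divided via a 128-bit multiply-high plus shifts. A scalar C sketch of what is computed per lane (illustrative, not the compiler's exact output):

#include <stdint.h>

/* x / 24 via multiply-high: hi = high 64 bits of x * ceil(2^66 / 24),
   then an arithmetic shift by 2 and a sign-bit correction so the
   result truncates toward zero, matching sdiv semantics. */
static int64_t sdiv_by_24(int64_t x) {
    __int128 p = (__int128)x * 0x2AAAAAAAAAAAAAABLL; /* imulq */
    int64_t hi = (int64_t)(p >> 64);                 /* %rdx */
    return (hi >> 2) + (int64_t)((uint64_t)hi >> 63);
}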
2015 Jul 24
0
[LLVMdev] SIMD for sdiv <2 x i64>
...optimize these instructions? Thanks.
>
> %sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i, %sub.ptr.rhs.cast.i5.i.i.i.i
> %sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24, i64 24>
>
> Assembly:
> vpsubq %xmm6, %xmm5, %xmm5
> vmovq %xmm5, %rax
> movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
> imulq %rbx
> movq %rdx, %rcx
> movq %rcx, %r...
2015 Jan 29
2
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
...m0[0,0]
> vshufps $-0x68, %xmm0, %xmm3, %xmm0 ## xmm0 = xmm3[0,2],xmm0[1,2]
>
>
> Also, I see differences when some loads are shuffled, that I'm a bit
> conflicted about:
> vmovaps -0xXX(%rbp), %xmm3
> ...
> vinsertps $0xc0, %xmm4, %xmm3, %xmm5 ## xmm5 = xmm4[3],xmm3[1,2,3]
> becomes:
> vpermilps $-0x6d, -0xXX(%rbp), %xmm2 ## xmm2 = mem[3,0,1,2]
> ...
> vinsertps $0xc0, %xmm4, %xmm2, %xmm2 ## xmm2 = xmm4[3],xmm2[1,2,3]
>
> Note that the second version does the shuffle in-place, in xmm2.
>
>
>...
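Decoding the immediate: $0xc0 selects element 3 of the second source and writes it into element 0 of the destination. A hedged C intrinsics sketch of what both versions compute (SSE4.1, <smmintrin.h>; function name is illustrative):

#include <smmintrin.h>

/* dst = { b[3], a[1], a[2], a[3] } -- i.e. xmm5 = xmm4[3],xmm3[1,2,3] */
__m128 insert_high_elem(__m128 a, __m128 b) {
    return _mm_insert_ps(a, b, 0xC0);
}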
2004 Aug 06
2
[PATCH] Make SSE Run Time option. Add Win32 SSE code
...eax+4]
+ movaps xmm3, [ebx+4]
+ mulps xmm2, xmm0
+ mulps xmm3, xmm1
+ movaps xmm4, [eax+20]
+ mulps xmm4, xmm0
+ addps xmm2, [ecx+4]
+ movaps xmm5, [ebx+20]
+ mulps xmm5, xmm1
+ addps xmm4, [ecx+20]
+ subps xmm2, xmm3
+ movups [ecx], xmm2
+ subps xmm4, xmm5
+ movups [ecx+16], xmm4
+
+ mo...
2013 Jul 19
4
[LLVMdev] SIMD instructions and memory alignment on X86
Hmm, I'm not able to get those .ll files to compile if I disable SSE, and I
end up with SSE instructions (including sqrtpd) if I don't disable it.
On Thu, Jul 18, 2013 at 10:53 PM, Peter Newman <peter at uformia.com> wrote:
> Is there something specifically required to enable SSE? If it's not
> detected as available (based on the target triple?) then I don't think
2015 Jul 24
2
[LLVMdev] SIMD for sdiv <2 x i64>
...tions? Thanks.
>>
>> %sub.ptr.sub.i6.i.i.i.i = sub <2 x i64> %sub.ptr.lhs.cast.i4.i.i.i.i, %sub.ptr.rhs.cast.i5.i.i.i.i
>> %sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24, i64 24>
>>
>> Assembly:
>> vpsubq %xmm6, %xmm5, %xmm5
>> vmovq %xmm5, %rax
>> movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
>> imulq %rbx
>> movq %rdx, %rcx
>> movq %rcx, %rax
>> shrq $63, %rax
>> shrq $2, %rcx
>> addl %eax...
2015 Jan 30
4
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
...>>> xmm3[0,2],xmm0[1,2]
>>>
>>>
>>> Also, I see differences when some loads are shuffled, that I'm a bit
>>> conflicted about:
>>> vmovaps -0xXX(%rbp), %xmm3
>>> ...
>>> vinsertps $0xc0, %xmm4, %xmm3, %xmm5 ## xmm5 =
>>> xmm4[3],xmm3[1,2,3]
>>> becomes:
>>> vpermilps $-0x6d, -0xXX(%rbp), %xmm2 ## xmm2 = mem[3,0,1,2]
>>> ...
>>> vinsertps $0xc0, %xmm4, %xmm2, %xmm2 ## xmm2 =
>>> xmm4[3],xmm2[1,2,3]
>>>
>>> Note...
2015 Jan 29
0
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
..., %xmm0, %xmm3, %xmm0 ## xmm0 =
>> xmm3[0,2],xmm0[1,2]
>>
>>
>> Also, I see differences when some loads are shuffled, that I'm a bit
>> conflicted about:
>> vmovaps -0xXX(%rbp), %xmm3
>> ...
>> vinsertps $0xc0, %xmm4, %xmm3, %xmm5 ## xmm5 = xmm4[3],xmm3[1,2,3]
>> becomes:
>> vpermilps $-0x6d, -0xXX(%rbp), %xmm2 ## xmm2 = mem[3,0,1,2]
>> ...
>> vinsertps $0xc0, %xmm4, %xmm2, %xmm2 ## xmm2 = xmm4[3],xmm2[1,2,3]
>>
>> Note that the second version does the shuffle in-place,...
2015 Jan 30
0
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
...1,2]
>>>>
>>>>
>>>> Also, I see differences when some loads are shuffled, that I'm a bit
>>>> conflicted about:
>>>> vmovaps -0xXX(%rbp), %xmm3
>>>> ...
>>>> vinsertps $0xc0, %xmm4, %xmm3, %xmm5 ## xmm5 =
>>>> xmm4[3],xmm3[1,2,3]
>>>> becomes:
>>>> vpermilps $-0x6d, -0xXX(%rbp), %xmm2 ## xmm2 = mem[3,0,1,2]
>>>> ...
>>>> vinsertps $0xc0, %xmm4, %xmm2, %xmm2 ## xmm2 =
>>>> xmm4[3],xmm2[1,2,3]
>>...
2015 Jul 24
0
[LLVMdev] SIMD for sdiv <2 x i64>
...--------------- Assembly
-----------------------------------------------------------------
# BB#3: # %if.then.i.i.i.i.i.i
vpsllq $3, %xmm0, %xmm0
vpextrq $1, %xmm0, %rbx
movq %rbx, %rdi
vmovaps %xmm2, 96(%rsp) # 16-byte Spill
vmovaps %xmm5, 64(%rsp) # 16-byte Spill
vmovdqa %xmm6, 16(%rsp) # 16-byte Spill
callq _Znam
movq %rax, 128(%rsp)
movq 16(%r12), %rsi
movq %rax, %rdi
movq %rbx, %rdx
callq memmove
vmovdqa 16(%rsp), %xmm6 # 16-byte Reload
vmovaps 64(%rsp),...
2015 Jul 24
1
[LLVMdev] SIMD for sdiv <2 x i64>
...--------------------------------------------------------
>
> # BB#3: # %if.then.i.i.i.i.i.i
> vpsllq $3, %xmm0, %xmm0
> vpextrq $1, %xmm0, %rbx
> movq %rbx, %rdi
> vmovaps %xmm2, 96(%rsp) # 16-byte Spill
> vmovaps %xmm5, 64(%rsp) # 16-byte Spill
> vmovdqa %xmm6, 16(%rsp) # 16-byte Spill
> callq _Znam
> movq %rax, 128(%rsp)
> movq 16(%r12), %rsi
> movq %rax, %rdi
> movq %rbx, %rdx
> callq memmove
> vmovdqa 16(%rsp), %xmm6...
2013 Aug 22
2
New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
...ation_asm_ia32_sse_lag_16
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
@@ -596,7 +597,7 @@
movss xmm3, xmm2
movss xmm2, xmm0
- ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
+ ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
movaps xmm1, xmm0
mulps xmm1, xmm2
addps xmm5, xmm1
@@ -619,6 +620,95 @@
ret
ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
+ ;[ebp + 20] == autoc[]
+ ;[ebp + 16] == lag...
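For orientation, a plain C reference of what these SSE autocorrelation routines compute (a sketch with illustrative parameter names, not FLAC's exact signature):

#include <stddef.h>

/* autoc[l] = sum over i of data[i] * data[i - l], for l = 0..lag-1 */
static void autocorrelation_ref(const float *data, size_t len,
                                size_t lag, float *autoc) {
    for (size_t l = 0; l < lag; ++l) {
        double sum = 0.0;
        for (size_t i = l; i < len; ++i)
            sum += (double)data[i] * (double)data[i - l];
        autoc[l] = (float)sum;
    }
}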
2015 Jun 26
2
[LLVMdev] Can LLVM vectorize <2 x i32> type
...vpsllq $32, %xmm4, %xmm4
vpaddq %xmm4, %xmm2, %xmm2
vpsrlq $32, %xmm0, %xmm4
vpmuludq %xmm7, %xmm4, %xmm4
vpsllq $32, %xmm4, %xmm4
vpaddq %xmm4, %xmm2, %xmm2
vpextrq $1, %xmm2, %rax
cltq
vmovq %rax, %xmm4
vmovq %xmm2, %rax
cltq
vmovq %rax, %xmm5
vpunpcklqdq %xmm4, %xmm5, %xmm4 # xmm4 = xmm5[0],xmm4[0]
vpcmpgtq %xmm3, %xmm4, %xmm3
vptest %xmm3, %xmm3
je .LBB10_66
# BB#5: # %for.body.preheader
vpaddq %xmm15, %xmm2, %xmm3
vpand %xmm15, %xmm3, %xmm3
vpaddq .LCPI10_1(%rip), %x...
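The vpmuludq/vpsrlq/vpsllq/vpaddq pattern above is the standard synthesis of a 64-bit lane multiply from 32x32->64 multiplies; per lane it computes the following (scalar C sketch):

#include <stdint.h>

/* a * b (mod 2^64) from 32-bit pieces: the low product plus the two
   cross products shifted into the high half (the high*high term
   vanishes mod 2^64). */
static uint64_t mul64_via_32(uint64_t a, uint64_t b) {
    uint64_t lo    = (uint64_t)(uint32_t)a * (uint32_t)b;  /* vpmuludq */
    uint64_t cross = (a >> 32) * (uint32_t)b               /* vpsrlq + vpmuludq */
                   + (uint32_t)a * (b >> 32);
    return lo + (cross << 32);                             /* vpsllq + vpaddq */
}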
2015 Jan 23
5
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
Greetings LLVM hackers and x86 vector shufflers!
I would like to flip on another chunk of the new vector shuffling,
specifically the logic to mark ~all shuffles as "legal".
This can be tested today with the flag
"-x86-experimental-vector-shuffle-legality". I would essentially like to
make this the default (by removing the "false" path). Doing this will allow
me to
2016 Aug 12
4
Invoke loop vectorizer
...%xmm1 ## xmm1 =
> zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
> pshufd $68, %xmm3, %xmm3 ## xmm3 = xmm3[0,1,0,1]
> paddq %xmm1, %xmm3
> pshufd $78, %xmm3, %xmm4 ## xmm4 = xmm3[2,3,0,1]
> punpckldq %xmm5, %xmm4 ## xmm4 =
> xmm4[0],xmm5[0],xmm4[1],xmm5[1]
> pshufd $212, %xmm4, %xmm4 ## xmm4 = xmm4[0,1,1,3]
>
>
>
> Note:
> It also vectorizes at SIZE=8.
>
> Not sure what the exact translation of options from clang-cl to clang is.
> Maybe try adding /O3?
&...
2016 Aug 05
3
enabling interleaved access loop vectorization
...# %vector.body
# =>This Inner Loop Header: Depth=1
movdqu (%rdi,%rax,4), %xmm3
movd %xmm0, %rcx
movdqu 4(%rdi,%rcx,4), %xmm4
paddd %xmm3, %xmm4
movdqu 8(%rdi,%rcx,4), %xmm3
paddd %xmm4, %xmm3
movdqa %xmm1, %xmm4
paddq %xmm4, %xmm4
movdqa %xmm0, %xmm5
paddq %xmm5, %xmm5
movd %xmm5, %rcx
pextrq $1, %xmm5, %rdx
movd %xmm4, %r8
pextrq $1, %xmm4, %r9
movd (%rdi,%rcx,4), %xmm4 # xmm4 = mem[0],zero,zero,zero
pinsrd $1, (%rdi,%rdx,4), %xmm4
pinsrd $2, (%rdi,%r8,4), %xmm4
pinsrd $3, (%rdi,%r9,4), %xmm4
paddd %xmm3, %xmm4
movdqu %xmm4, (%rsi,%rax,4)
a...
2016 May 26
2
enabling interleaved access loop vectorization
Interleaved access is not enabled on X86 yet.
We looked at this feature and came to the conclusion that interleaving (as loads + shuffles) is not always profitable on X86. We should provide the right cost, which depends on the number of shuffles. The number of shuffles depends on the permutations (shuffle mask). And even if we estimate the number of shuffles, the shuffles are not generated in-place. Vectorizer
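A hypothetical example of the access pattern at issue (illustrative C, not taken from the thread): vectorizing this loop needs wide loads plus de-interleaving shuffles, which is exactly the cost question described above.

/* stride-2 (interleaved) loads: a[2*i] and a[2*i+1] */
void sum_pairs(const float *a, float *out, int n) {
    for (int i = 0; i < n; ++i)
        out[i] = a[2 * i] + a[2 * i + 1];
}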
2015 Jun 24
2
[LLVMdev] Can LLVM vectorize <2 x i32> type
Hi,
Is LLVM able to generate code for the following?
%mul = mul <2 x i32> %1, %2, where %1 and %2 are of <2 x i32> type.
I am running it on a Haswell processor with LLVM-3.4.2. It seems that it
generates really complicated code with vpaddq, vpmuludq, vpsllq,
vpsrlq.
Thanks,
Zhi
2016 Aug 12
2
Invoke loop vectorizer
Hi Daniel,
I increased the size of your test to 128, but -stats still shows no loop
optimized...
Xiaochu
On Aug 12, 2016 11:11 AM, "Daniel Berlin" <dberlin at dberlin.org> wrote:
> It's not possible to know that A and B don't alias in this example. It's
> almost certainly not profitable to add a runtime check given the size of
> the loop.
>
>
>
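A hypothetical loop in the spirit of this exchange (illustrative C, not the original test): without further information the compiler must assume A and B may overlap, so it either emits a runtime overlap check or keeps the loop scalar; restrict asserts no overlap and removes the need for the check.

void add_loop(float *A, const float *B, int n) {
    /* A and B may alias: vectorizing requires a runtime check */
    for (int i = 0; i < n; ++i)
        A[i] += B[i];
}

void add_loop_noalias(float *restrict A, const float *restrict B, int n) {
    /* restrict promises no overlap: vectorizable without a check */
    for (int i = 0; i < n; ++i)
        A[i] += B[i];
}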