Displaying 20 results from an estimated 168 matches for "xmm2".
2012 Mar 28
2
[LLVMdev] Suboptimal code due to excessive spilling
...have been more explicit.
The really strange thing is that if the assignment to p[i] is removed
(line marked with "xxx..."), then the code produced is optimal and
exactly what one expects. I show this result in "Output B" where you
get a beautiful sequence of addsd into register xmm2.
It's all very strange and it points to some questionable decision
making on the part of LLVM. I tried different versions of the sum()
function (eliminating the loop, for example) but it does not help.
Another observation is that the loop variable i (in foo) must be
involved: if one does *p =...
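For context, here is a minimal C sketch of the shape of code being discussed;
the bodies of sum() and foo() below are assumptions for illustration, not the
reporter's actual source:

    /* Hypothetical reconstruction of the scenario described above: a
     * reduction whose result is stored through p[i] inside the caller's
     * loop. */
    static double sum(const double *v, int n) {
        double s = 0.0;
        for (int j = 0; j < n; j++)
            s += v[j];        /* ideally a tight chain of addsd in one register */
        return s;
    }

    void foo(double *p, const double *v, int n) {
        for (int i = 0; i < n; i++)
            p[i] = sum(v, n); /* removing this store reportedly gives Output B */
    }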
2012 Apr 05
0
[LLVMdev] Suboptimal code due to excessive spilling
...have been more explicit.
The really strange thing is that if the assignment to p[i] is removed
(line marked with "xxx..."), then the code produced is optimal and
exactly what one expects. I show this result in "Output B" where you
get a beautiful sequence of addsd into register xmm2.
It's all very strange and it points to some questionable decision
making on the part of LLVM. I tried different versions of the sum()
function (eliminating the loop, for example) but it does not help.
Another observation is that the loop variable i (in foo) must be
involved: if one does *p =...
2015 Jan 29
2
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
...on reducing them, but for now, here are
> some raw observations, in case any of it rings a bell:
>
Very cool, and thanks for the analysis!
>
>
> Another problem I'm seeing is that in some cases we can't fold memory
> anymore:
> vpermilps $-0x6d, -0xXX(%rdx), %xmm2 ## xmm2 = mem[3,0,1,2]
> vblendps $0x1, %xmm2, %xmm0, %xmm0
> becomes:
> vmovaps -0xXX(%rdx), %xmm2
> vshufps $0x3, %xmm0, %xmm2, %xmm3 ## xmm3 = xmm2[3,0],xmm0[0,0]
> vshufps $-0x68, %xmm0, %xmm3, %xmm0 ## xmm0 = xmm3[0,2],xmm0[1,2]
>
>
>...
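For reference, a hypothetical C intrinsics rendering of the folded form quoted
above (the function name and surrounding context are assumptions): it rotates a
loaded vector by one element and blends its lane 0 into another vector, which
is what the single vpermilps+vblendps pair expresses.

    #include <immintrin.h>

    /* $-0x6d is 0x93 as an imm8: vpermilps 0x93 gives m[3,0,1,2];
     * vblendps $0x1 then takes lane 0 from the permuted value. */
    __m128 fold_example(const float *p, __m128 x) {
        __m128 m = _mm_loadu_ps(p);
        __m128 r = _mm_permute_ps(m, 0x93);   /* r = m[3,0,1,2] */
        return _mm_blend_ps(x, r, 0x1);       /* lane 0 from r, rest from x */
    }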
2004 Aug 06
2
[PATCH] Make SSE Run Time option. Add Win32 SSE code
...x]
+
+ movss xmm1, [ecx]
+ addss xmm1, xmm0
+
+ mov edx, in2
+ movss [edx], xmm1
+
+ shufps xmm0, xmm0, 0x00
+ shufps xmm1, xmm1, 0x00
+
+ movaps xmm2, [eax+4]
+ movaps xmm3, [ebx+4]
+ mulps xmm2, xmm0
+ mulps xmm3, xmm1
+ movaps xmm4, [eax+20]
+ mulps xmm4, xmm0
+ addps xmm2, [ecx+4]
+ mova...
2015 Jul 29
2
[LLVMdev] x86-64 backend generates aligned ADDPS with unaligned address
...# %loop2
# =>This Inner Loop Header: Depth=1
movq offset_array3(,%rsi,8), %rdi
movq offset_array2(,%rsi,8), %r10
movss -28(%rax), %xmm0
movss -8(%rax), %xmm1
movss -4(%rax), %xmm2
unpcklps %xmm0, %xmm2             # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
movss (%rax), %xmm0
unpcklps %xmm0, %xmm1             # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
unpcklps %xmm2, %xmm1             # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
addps (%...
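A note on the issue in the subject line: addps with a memory operand requires a
16-byte-aligned address, so folding a load that is only known to be 4-byte
aligned, as described here, yields a faulting instruction. A minimal intrinsics
sketch (not the reporter's code):

    #include <xmmintrin.h>

    __m128 add_from_memory(__m128 acc, const float *p) {
        /* _mm_loadu_ps keeps the unaligned access explicit; a compiler may
         * only fold it into an addps memory operand when it can prove
         * 16-byte alignment. */
        return _mm_add_ps(acc, _mm_loadu_ps(p));
    }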
2014 Sep 05
3
[LLVMdev] Please benchmark new x86 vector shuffle lowering, planning to make it the default very soon!
On Fri, Sep 5, 2014 at 9:32 AM, Robert Lougher <rob.lougher at gmail.com>
wrote:
> Unfortunately, another team, while doing internal testing, has seen the
> new path generating illegal insertps masks. A sample here:
>
> vinsertps $256, %xmm0, %xmm13, %xmm4 # xmm4 = xmm0[0],xmm13[1,2,3]
> vinsertps $256, %xmm1, %xmm0, %xmm6 # xmm6 = xmm1[0],xmm0[1,2,3]
>
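For context, vinsertps takes an 8-bit immediate, so the $256 (0x100) shown
above does not fit the encoding; the lane selection described by the comments
corresponds to immediate 0x00. A small intrinsics sketch (function name
assumed):

    #include <smmintrin.h>

    /* imm 0x00: copy src lane 0 into dst lane 0, keep dst lanes 1-3,
     * matching the "xmm0[0],xmm13[1,2,3]" comment above. */
    __m128 insert_lane0(__m128 dst, __m128 src) {
        return _mm_insert_ps(dst, src, 0x00);
    }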
2015 Jan 30
4
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
...f it rings a bell:
>>>
>>
>> Very cool, and thanks for the analysis!
>>
>>
>>>
>>>
>>> Another problem I'm seeing is that in some cases we can't fold memory
>>> anymore:
>>> vpermilps $-0x6d, -0xXX(%rdx), %xmm2 ## xmm2 = mem[3,0,1,2]
>>> vblendps $0x1, %xmm2, %xmm0, %xmm0
>>> becomes:
>>> vmovaps -0xXX(%rdx), %xmm2
>>> vshufps $0x3, %xmm0, %xmm2, %xmm3 ## xmm3 = xmm2[3,0],xmm0[0,0]
>>> vshufps $-0x68, %xmm0, %xmm3, %xmm0 #...
2013 Jul 19
0
[LLVMdev] llvm.x86.sse2.sqrt.pd not using sqrtpd, calling a function that modifies ECX
...133 movlpd qword ptr [esp+0F0h],xmm3
002E013C movhpd qword ptr [esp+0E0h],xmm3
002E0145 movlpd qword ptr [esp+100h],xmm7
002E014E pshufd xmm0,xmm7,44h
002E0153 movdqa xmm5,xmm0
002E0157 xorpd xmm4,xmm4
002E015B mulpd xmm5,xmm4
002E015F pshufd xmm2,xmm3,44h
002E0164 movdqa xmm1,xmm2
002E0168 mulpd xmm1,xmm4
002E016C xorpd xmm7,xmm7
002E0170 movapd xmm4,xmmword ptr [esp+70h]
002E0176 subpd xmm4,xmm1
002E017A pshufd xmm3,xmm3,0EEh
002E017F subpd xmm4,xmm3
002E0183 subpd xmm4,xmm5
002...
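For readers unfamiliar with the intrinsic in the subject line,
llvm.x86.sse2.sqrt.pd is what the C intrinsic below maps to, and it is expected
to lower to a single sqrtpd instruction rather than the library call reported
here (a sketch, assuming a plain C entry point):

    #include <emmintrin.h>

    __m128d sqrt_pd(__m128d x) {
        return _mm_sqrt_pd(x);   /* expected codegen: one sqrtpd */
    }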
2015 Jan 29
0
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
...> some raw observations, in case any of it rings a bell:
>>
>
> Very cool, and thanks for the analysis!
>
>
>>
>>
>> Another problem I'm seeing is that in some cases we can't fold memory
>> anymore:
>> vpermilps $-0x6d, -0xXX(%rdx), %xmm2 ## xmm2 = mem[3,0,1,2]
>> vblendps $0x1, %xmm2, %xmm0, %xmm0
>> becomes:
>> vmovaps -0xXX(%rdx), %xmm2
>> vshufps $0x3, %xmm0, %xmm2, %xmm3 ## xmm3 = xmm2[3,0],xmm0[0,0]
>> vshufps $-0x68, %xmm0, %xmm3, %xmm0 ## xmm0 = xm...
2015 Jan 30
0
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
...>>
>>> Very cool, and thanks for the analysis!
>>>
>>>
>>>>
>>>>
>>>> Another problem I'm seeing is that in some cases we can't fold memory
>>>> anymore:
>>>> vpermilps $-0x6d, -0xXX(%rdx), %xmm2 ## xmm2 = mem[3,0,1,2]
>>>> vblendps $0x1, %xmm2, %xmm0, %xmm0
>>>> becomes:
>>>> vmovaps -0xXX(%rdx), %xmm2
>>>> vshufps $0x3, %xmm0, %xmm2, %xmm3 ## xmm3 = xmm2[3,0],xmm0[0,0]
>>>> vshufp...
2014 Sep 05
2
[LLVMdev] Please benchmark new x86 vector shuffle lowering, planning to make it the default very soon!
...00000e+00,
> float undef, float undef>, <4 x float> %1, <4 x i32> <i32 4, i32 1,
> i32 6, i32 7>
> ret <4 x float> %2
> }
>
>
> llc -march=x86-64 -mattr=+avx test.ll -o -
>
> test: # @test
> vxorps %xmm2, %xmm2, %xmm2
> vmovss %xmm0, %xmm2, %xmm2
> vblendps $4, %xmm0, %xmm2, %xmm0 # xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
> vinsertps $48, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm1[0]
> retl
>
> test2: # @test2
> vinsertps...
2015 Jul 29
0
[LLVMdev] x86-64 backend generates aligned ADDPS with unaligned address
...# %loop2
> # =>This Inner Loop Header: Depth=1
> movq offset_array3(,%rsi,8), %rdi
> movq offset_array2(,%rsi,8), %r10
> movss -28(%rax), %xmm0
> movss -8(%rax), %xmm1
> movss -4(%rax), %xmm2
> unpcklps %xmm0, %xmm2           # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
> movss (%rax), %xmm0
> unpcklps %xmm0, %xmm1           # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
> unpcklps %xmm2, %xmm1           # xmm1 = xmm1[0],xmm2[0],xmm1[1]...
2014 Sep 06
2
[LLVMdev] Please benchmark new x86 vector shuffle lowering, planning to make it the default very soon!
...t undef>, <4 x float> %1, <4 x i32> <i32 4, i32 1,
>> i32 6, i32 7>
>> ret <4 x float> %2
>> }
>>
>>
>> llc -march=x86-64 -mattr=+avx test.ll -o -
>>
>> test: # @test
>> vxorps %xmm2, %xmm2, %xmm2
>> vmovss %xmm0, %xmm2, %xmm2
>> vblendps $4, %xmm0, %xmm2, %xmm0 # xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
>> vinsertps $48, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm1[0]
>> retl
>>
>> test2: # @tes...
2015 Jan 25
4
[LLVMdev] RFB: Would like to flip the vector shuffle legality flag
I ran the benchmarking subset of test-suite on a btver2 machine,
optimizing for btver2 (so enabling AVX codegen).
I don't see anything outside of the noise with
x86-experimental-vector-shuffle-legality=1.
On Fri, Jan 23, 2015 at 5:19 AM, Andrea Di Biagio <andrea.dibiagio at gmail.com
> wrote:
> Hi Chandler,
>
> On Fri, Jan 23, 2015 at 8:15 AM, Chandler Carruth
2015 Jun 26
2
[LLVMdev] Can LLVM vectorize <2 x i32> type
...S54_D to i128
%mskS54_D = icmp ne i128 %BCS54_D, 0
br i1 %mskS54_D, label %middle.block, label %vector.ph
Now the assembly for the above IR code is:
# BB#4: # %for.cond.preheader
vmovdqa 144(%rsp), %xmm0 # 16-byte Reload
vpmuludq %xmm7, %xmm0, %xmm2
vpsrlq $32, %xmm7, %xmm4
vpmuludq %xmm4, %xmm0, %xmm4
vpsllq $32, %xmm4, %xmm4
vpaddq %xmm4, %xmm2, %xmm2
vpsrlq $32, %xmm0, %xmm4
vpmuludq %xmm7, %xmm4, %xmm4
vpsllq $32, %xmm4, %xmm4
vpaddq %xmm4, %xmm2, %xmm2
vpextrq $1, %xmm2, %rax
cltq
vm...
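The vpmuludq/vpsllq/vpaddq sequence above is the standard decomposition of a
64-bit multiply into 32x32->64 multiplies, which is what you get once the
<2 x i32> elements have been widened to 64 bits. A scalar C sketch of the same
decomposition:

    #include <stdint.h>

    uint64_t mul64_from_32(uint64_t a, uint64_t b) {
        uint64_t lo_lo = (a & 0xffffffffu) * (b & 0xffffffffu);
        uint64_t lo_hi = (a & 0xffffffffu) * (b >> 32);
        uint64_t hi_lo = (a >> 32) * (b & 0xffffffffu);
        /* the two cross terms only contribute to the high 32 bits */
        return lo_lo + ((lo_hi + hi_lo) << 32);
    }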
2013 Aug 22
2
New routine: FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
...12
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
@@ -596,7 +597,7 @@
movss xmm3, xmm2
movss xmm2, xmm0
- ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
+ ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
movaps xmm1, xmm0
mulps xmm1, xmm2
addps xmm5, xmm1
@@ -619,6 +620,95 @@
ret
ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
+ ;[eb...
2010 May 11
2
[LLVMdev] How does SSEDomainFix work?
...$ llc -mcpu=nehalem -debug-pass=Structure foo.bc -o foo.s
(snip)
Code Placement Optimizer
SSE execution domain fixup
Machine Natural Loop Construction
X86 AT&T-Style Assembly Printer
Delete Garbage Collector Information
foo.s: (edited)
_foo:
movaps %xmm0, %xmm3
andps %xmm2, %xmm3
andnps %xmm1, %xmm2
movaps %xmm2, %xmm0
xorps %xmm3, %xmm0
ret
_bar:
movaps %xmm0, %xmm3
andps %xmm2, %xmm3
andnps %xmm1, %xmm2
movaps %xmm2, %xmm0
xorps %xmm3, %xmm0
ret
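As a side note on what foo/bar above compute: the andps/andnps/xorps sequence
is the usual bitwise-select idiom; the two masked terms have disjoint bits, so
the final xorps acts as an or. A hypothetical intrinsics equivalent:

    #include <xmmintrin.h>

    __m128 select_ps(__m128 mask, __m128 a, __m128 b) {
        __m128 t = _mm_and_ps(mask, a);      /* andps:  a & mask  */
        __m128 u = _mm_andnot_ps(mask, b);   /* andnps: b & ~mask */
        return _mm_xor_ps(u, t);             /* xorps acts as or  */
    }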
2014 Sep 08
2
[LLVMdev] Please benchmark new x86 vector shuffle lowering, planning to make it the default very soon!
...4 x i32> <i32 4, i32 1,
>>> i32 6, i32 7>
>>> ret <4 x float> %2
>>> }
>>>
>>>
>>> llc -march=x86-64 -mattr=+avx test.ll -o -
>>>
>>> test: # @test
>>> vxorps %xmm2, %xmm2, %xmm2
>>> vmovss %xmm0, %xmm2, %xmm2
>>> vblendps $4, %xmm0, %xmm2, %xmm0 # xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
>>> vinsertps $48, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],xmm1[0]
>>> retl
>>>
>>> test2:...
2010 Nov 20
2
[LLVMdev] Poor floating point optimizations?
I wanted to use LLVM for my math parser but it seems that floating point
optimizations are poor.
For example consider such C code:
float foo(float x) { return x+x+x; }
and here is the code generated with the "optimized" live demo:
define float @foo(float %x) nounwind readnone {
entry:
  %0 = fmul float %x, 2.000000e+00              ; <float> [#uses=1]
  %1 = fadd float %0, %x
2010 Nov 20
0
[LLVMdev] Poor floating point optimizations?
And also the resulting assembly code is very poor:
00460013 movss xmm0,dword ptr [esp+8]
00460019 movaps xmm1,xmm0
0046001C addss xmm1,xmm1
00460020 pxor xmm2,xmm2
00460024 addss xmm2,xmm1
00460028 addss xmm2,xmm0
0046002C movss dword ptr [esp],xmm2
00460031 fld dword ptr [esp]
Especially the pxor & add instead of movss (which is unnecessary anyway) is just
pure madness.
Bob D.