Displaying 19 results from an estimated 19 matches for "vmulp".
Did you mean:
vulp
2020 Sep 01
2
Vector evolution?
...get the following codegen:
0000000000000160 <_Z4fct6PDv4_f>:
160: 31 c0 xor %eax,%eax
162: c4 e2 79 18 05 00 00 vbroadcastss 0x0(%rip),%xmm0 # 16b
<_Z4fct6PDv4_f+0xb>
169: 00 00
16b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
170: c5 f8 59 0c 07 vmulps (%rdi,%rax,1),%xmm0,%xmm1
175: c5 f8 29 0c 07 vmovaps %xmm1,(%rdi,%rax,1)
17a: c5 f8 59 4c 07 10 vmulps 0x10(%rdi,%rax,1),%xmm0,%xmm1
180: c5 f8 29 4c 07 10 vmovaps %xmm1,0x10(%rdi,%rax,1)
186: c5 f8 59 4c 07 20 vmulps 0x20(%rdi,%rax,1),%xmm0,%xmm1
18c: c5 f8 29 4c 07 20 vm...
2019 Sep 02
3
AVX2 codegen - question reg. FMA generation
...ator (with cpu set to haswell or later types) turning it into an
AVX2 FMA instructions. Here's the snippet in the output it generates:
$ llc -O3 -mcpu=skylake
---------------------
.LBB0_2: # =>This Inner Loop Header: Depth=1
vbroadcastss (%rsi,%rdx,4), %ymm0
vmulps (%rdi,%rcx), %ymm0, %ymm0
vaddps (%rax,%rcx), %ymm0, %ymm0
vmovups %ymm0, (%rax,%rcx)
incq %rdx
addq $32, %rcx
cmpq $15, %rdx
jle .LBB0_2
-----------------------
$ llc --version
LLVM (http://llvm.org/):
LLVM version 8.0.0
Optimized build.
Default target: x86_64-unknown-linux-gnu
Host CPU:...
2013 Dec 11
2
[LLVMdev] AVX code gen
...nd this post on the llvm blog: http://blog.llvm.org/2012/12/new-loop-vectorizer.html which makes me think that clang / llvm are capable of generating AVX with packed instructions as well as utilizing the full width of the YMM registers… I have an environment where icc generates these instructions (vmulps %ymm1, %ymm3, %ymm2 for example) but I can not get clang/llvm to generate such instructions (using the 3.3 release or either 3.4 rc1 or 3.4 rc2). I am new to clang / llvm so I may not be invoking the tools correctly but given that -fvectorize and -fslp-vectorize are on by default at 3.4 I would h...
2016 Jun 29
2
avx512 JIT backend generates wrong code on <4 x float>
...dx
leaq 16(%r9,%rdx), %rsi
orq $16, %rdx
movq 16(%rsp), %rdi
addq %rdx, %rdi
addq 8(%rsp), %rdx
.align 16, 0x90
.LBB0_1:
vmovaps -16(%rdx), %xmm0
vmovaps (%rdx), %xmm1
vmovaps -16(%rdi), %xmm2
vmovaps (%rdi), %xmm3
vmulps %xmm3, %xmm1, %xmm4
vmulps %xmm2, %xmm1, %xmm1
vfmadd213ss %xmm4, %xmm0, %xmm2
vfmsub213ss %xmm1, %xmm0, %xmm3
vmovaps %xmm2, -16(%rsi)
vmovaps %xmm3, (%rsi)
addq $1, %rax
addq $32, %rsi
addq $32, %rdi
addq $32, %rdx
c...
2016 Jun 29
0
avx512 JIT backend generates wrong code on <4 x float>
..., %rdx
> movq 16(%rsp), %rdi
> addq %rdx, %rdi
> addq 8(%rsp), %rdx
> .align 16, 0x90
> .LBB0_1:
> vmovaps -16(%rdx), %xmm0
> vmovaps (%rdx), %xmm1
> vmovaps -16(%rdi), %xmm2
> vmovaps (%rdi), %xmm3
> vmulps %xmm3, %xmm1, %xmm4
> vmulps %xmm2, %xmm1, %xmm1
> vfmadd213ss %xmm4, %xmm0, %xmm2
> vfmsub213ss %xmm1, %xmm0, %xmm3
> vmovaps %xmm2, -16(%rsi)
> vmovaps %xmm3, (%rsi)
> addq $1, %rax
> addq $32, %rsi
> addq...
2013 Dec 12
0
[LLVMdev] AVX code gen
...movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
xorl %eax, %eax
.align 4, 0x90
LBB0_1: ## %vector.body
## =>This Inner Loop Header: Depth=1
vmovups (%rdx,%rax,4), %ymm0
vmulps (%rsi,%rax,4), %ymm0, %ymm0
vaddps (%rdi,%rax,4), %ymm0, %ymm0
vmovups %ymm0, (%rdi,%rax,4)
addq $8, %rax
cmpq $256, %rax ## imm = 0x100
jne LBB0_1
## BB#2: ## %for.end
popq %rbp
vzer...
2016 Jun 30
1
avx512 JIT backend generates wrong code on <4 x float>
...> addq %rdx, %rdi
>> addq 8(%rsp), %rdx
>> .align 16, 0x90
>> .LBB0_1:
>> vmovaps -16(%rdx), %xmm0
>> vmovaps (%rdx), %xmm1
>> vmovaps -16(%rdi), %xmm2
>> vmovaps (%rdi), %xmm3
>> vmulps %xmm3, %xmm1, %xmm4
>> vmulps %xmm2, %xmm1, %xmm1
>> vfmadd213ss %xmm4, %xmm0, %xmm2
>> vfmsub213ss %xmm1, %xmm0, %xmm3
>> vmovaps %xmm2, -16(%rsi)
>> vmovaps %xmm3, (%rsi)
>> addq $1, %rax
>>...
2014 Oct 13
2
[LLVMdev] Unexpected spilling of vector register during lane extraction on some x86_64 targets
...,%xmm0,%rdx
400533: movslq %edx,%rsi
400536: sar $0x20,%rdx
40053a: vmovss 0x4006c0(,%rcx,4),%xmm0
400543: vinsertps $0x10,0x4006c0(,%rax,4),%xmm0,%xmm0
40054e: vinsertps $0x20,0x4006c0(,%rsi,4),%xmm0,%xmm0
400559: vinsertps $0x30,0x4006c0(,%rdx,4),%xmm0,%xmm0
400564: vmulps 0x144(%rip),%xmm0,%xmm0 # 4006b0
<__dso_handle+0x38>
40056c: vmovaps %xmm0,0x20046c(%rip) # 6009e0 <r>
400574: xor %eax,%eax
400576: retq
$ clang++ test.cpp -O3 -fstrict-aliasing -funroll-loops -ffast-math
-march=native -mtune=native -DSPILLING_ENSUES=1...
2015 Jul 14
4
[LLVMdev] Poor register allocation (constants causing spilling)
...follows:
llc -mcpu=btver2 test.ll
Examining the assembly in test.s we can see a constant is being loaded
into %xmm8 (second instruction in foo). Tracing the constant we can
see the following:
foo:
...
vmovaps .LCPI0_0(%rip), %xmm8 # xmm8 = [6.366197e-01,6.366197e-01,...]
...
vmulps %xmm8, %xmm0, %xmm1 # first use of constant
vmovaps %xmm8, %xmm9 # move constant into another register
...
vmovaps %xmm0, -40(%rsp) # 16-byte Spill
vmovaps %xmm9, %xmm0 # move constant into vacated register
...
vmulps %xmm0, %xmm3,...
2018 Mar 02
0
[RFC] llvm-mca: a static performance analysis tool
...7] [8] [9]
> - - - - 2.00 1.00 - - - -
>
> Resource pressure by instruction:
> [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
> Instructions:
> - - - - - 1.00 - - - -
> vmulps %xmm0, %xmm1, %xmm2
> - - - - 1.00 - - - - -
> vhaddps %xmm2, %xmm2, %xmm3
> - - - - 1.00 - - - - -
> vhaddps %xmm3, %xmm3, %xmm4
>
>
> Instruction Info:
> [1]: #uOps
>...
2018 Mar 01
9
[RFC] llvm-mca: a static performance analysis tool
...2] [3] [4] [5] [6] [7] [8] [9]
- - - - 2.00 1.00 - - - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
Instructions:
- - - - - 1.00 - - - -
vmulps %xmm0, %xmm1, %xmm2
- - - - 1.00 - - - - -
vhaddps %xmm2, %xmm2, %xmm3
- - - - 1.00 - - - - -
vhaddps %xmm3, %xmm3, %xmm4
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: Ma...
2018 Mar 02
0
[RFC] llvm-mca: a static performance analysis tool
...[7] [8] [9]
> - - - - 2.00 1.00 - - - -
>
> Resource pressure by instruction:
> [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
> Instructions:
> - - - - - 1.00 - - - -
> vmulps %xmm0, %xmm1, %xmm2
> - - - - 1.00 - - - - -
> vhaddps %xmm2, %xmm2, %xmm3
> - - - - 1.00 - - - - -
> vhaddps %xmm3, %xmm3, %xmm4
>
>
> Instruction Info:
> [1]: #uOps
> [2]: La...
2019 Sep 02
2
AVX2 codegen - question reg. FMA generation
...2 FMA instructions. Here's the snippet in the output it generates:
> >
> > $ llc -O3 -mcpu=skylake
> >
> > ---------------------
> > .LBB0_2: # =>This Inner Loop Header: Depth=1
> > vbroadcastss (%rsi,%rdx,4), %ymm0
> > vmulps (%rdi,%rcx), %ymm0, %ymm0
> > vaddps (%rax,%rcx), %ymm0, %ymm0
> > vmovups %ymm0, (%rax,%rcx)
> > incq %rdx
> > addq $32, %rcx
> > cmpq $15, %rdx
> > jle .LBB0_2
> > -----------------------
> >
> > $ llc --version
> > LLVM (http://llvm.or...
2018 Mar 02
0
[RFC] llvm-mca: a static performance analysis tool
...report which contains the so-called "timeline view". Below is the
> timeline view for the dot-product example from the previous section.
>
> ///////////////
> Timeline view:
> 012345
> Index 0123456789
>
> [0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2
> [0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3
> [0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
>
> [1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
> [1,1] . D=eeeE---R . vhaddps %xmm2, %xmm2, %xmm3
> [1,2]...
2017 Aug 06
2
VBROADCAST Implementation Issues
...>>>>>>>>>>>>>>>>>>>>> vbroadcastss zmm1, dword ptr [rip + .LCPI0_0]
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> vmulps zmm2, zmm2, zmm1
>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>> how does it lowered the above IR code into
>>>>>>>>>>>>>>>>>...
2017 Aug 07
2
VBROADCAST Implementation Issues
...>>>>>>>>>>>>>>>>> vbroadcastss zmm1, dword ptr [rip + .LCPI0_0]
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> vmulps zmm2, zmm2, zmm1
>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>> how does it lowered the above IR code into
>>>>>>>>>>>>>...
2017 Aug 07
3
VBROADCAST Implementation Issues
...>>>>>>>>> vbroadcastss zmm1, dword ptr [rip + .LCPI0_0]
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> vmulps zmm2, zmm2, zmm1
>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>> how does it lowered the above IR code into
>>>>>...
2018 Mar 02
5
[RFC] llvm-mca: a static performance analysis tool
...in the report which contains the so-called "timeline view". Below is the
> timeline view for the dot-product example from the previous section.
>
> ///////////////
> Timeline view:
> 012345
> Index 0123456789
>
> [0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2
> [0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3
> [0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4
>
> [1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2
> [1,1] . D=eeeE---R . vhaddps %xmm2, %xmm2, %xmm3
> [1,2]...
2013 Oct 15
0
[LLVMdev] [llvm-commits] r192750 - Enable MI Sched for x86.
...CodeGen optimizes
>> ; away the bugpointed code. Just ensure the basics are still there.
>> ;CHECK-LABEL: func:
>> -;CHECK: vxorps
>> -;CHECK: vinsertf128
>> +;CHECK: vpxor
>> +;CHECK: vinserti128
>> ;CHECK: vpshufd
>> ;CHECK: vpshufd
>> ;CHECK: vmulps
>>
>> Modified: llvm/trunk/test/CodeGen/X86/3addr-16bit.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/3addr-16bit.ll?rev=192750&r1=192749&r2=192750&view=diff
>> =======================================================================...