Displaying 20 results from an estimated 1410 matches for "rcx".
Did you mean:
rc
2013 Aug 20
0
[LLVMdev] Memory optimizations for LLVM JIT
...[4]; mov %rdx,0x28(%rax)
--------------------------------------------------------------------
JIT (map p to GlobalVariable) ==> JIT (map p to constant GlobalVariable)
1* movabsq $0x18c6b88, %rax 1* movabsq $0x18c6b88, %rax
2* movq (%rax), %rcx // p 2* movq (%rax), %rax
3* movq 0x8(%rcx), %rdx // a[1] 3* movq 0x8(%rax), %rcx
4* movq %rdx, 0x10(%rcx) // a[2] 4* movq %rcx, 0x10(%rax)
5 movq (%rax), %rcx 5
6 movq 0x8(%rcx), %rdx 6 movq 0x8(%rax), %rcx
7*...
2013 Aug 20
4
[LLVMdev] Memory optimizations for LLVM JIT
...[4]; mov %rdx,0x28(%rax)
--------------------------------------------------------------------
JIT (map p to GlobalVariable) ==> JIT (map p to constant
GlobalVariable)
1* movabsq $0x18c6b88, %rax 1* movabsq $0x18c6b88, %rax
2* movq (%rax), %rcx // p 2* movq (%rax), %rax
3* movq 0x8(%rcx), %rdx // a[1] 3* movq 0x8(%rax), %rcx
4* movq %rdx, 0x10(%rcx) // a[2] 4* movq %rcx, 0x10(%rax)
5 movq (%rax), %rcx 5
6 movq 0x8(%rcx), %rdx 6 movq 0x8(%rax), %rcx
7*...
2017 Oct 03
2
invalid code generated on Windows x86_64 using skylake-specific features
I figured it out. I was using this implementation of __chkstk from
compiler-rt:
DEFINE_COMPILERRT_FUNCTION(___chkstk)
push %rcx
cmp $0x1000,%rax
lea 16(%rsp),%rcx // rsp before calling this routine -> rcx
jb 1f
2:
sub $0x1000,%rcx
test %rcx,(%rcx)
sub $0x1000,%rax
cmp $0x1000,%rax
ja 2b
1:
sub %rax,%rcx
test...
2015 Feb 13
2
[LLVMdev] trunk's optimizer generates slower code than 3.5
...FEh
js loc_100000DFA
test r15d, r15d
mov r11d, [rax+r8*4]
jle loc_100000EAE
mov ecx, r15d
add ecx, 0FFFFFFFEh
mov [rsp+48h+var_34], ecx
movsxd rcx, ecx
lea rcx, [rax+rcx*4]
mov [rsp+48h+var_40], rcx
lea rcx, [rax+4]
mov [rsp+48h+var_48], rcx
xor r14d, r14d
jmp short loc_100000D33
; -------------------------------------------...
2018 Sep 11
2
Byte-wide stores aren't coalesced if interspersed with other stores
Andres:
FWIW, codegen will do the merge if you turn on global alias analysis for it
"-combiner-global-alias-analysis". That said, we should be able to do this
merging earlier.
-Nirav
On Mon, Sep 10, 2018 at 8:33 PM, Andres Freund via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Hi,
>
> On 2018-09-10 13:42:21 -0700, Andres Freund wrote:
> > I have, in postgres,
2017 Mar 01
2
[Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm
...fcmp une double %rtb_Sum3_737, 0.000000e+00
%_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
br i1 %603, label %true73, label %false74
Now, in broken.asm, notice the same merge128 is missing the branch instruction:
.LBB6_55: # %merge128
movq 184(%rsp), %rcx
movq %rax, 728(%rcx)
movq 184(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 184(%rsp), %rax
movq $0, 744(%rax)
movq 184(%rsp), %rax
movq $0, 752(%rax)
movq 184(%rsp), %rax
movq $0, 760(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mu...
2015 Feb 14
2
[LLVMdev] trunk's optimizer generates slower code than 3.5
...test r15d, r15d
>> mov r11d, [rax+r8*4]
>> jle loc_100000EAE
>> mov ecx, r15d
>> add ecx, 0FFFFFFFEh
>> mov [rsp+48h+var_34], ecx
>> movsxd rcx, ecx
>> lea rcx, [rax+rcx*4]
>> mov [rsp+48h+var_40], rcx
>> lea rcx, [rax+4]
>> mov [rsp+48h+var_48], rcx
>> xor r14d, r14d
>> jmp short loc_1...
2015 Feb 14
2
[LLVMdev] trunk's optimizer generates slower code than 3.5
...mov r11d, [rax+r8*4]
>>>> jle loc_100000EAE
>>>> mov ecx, r15d
>>>> add ecx, 0FFFFFFFEh
>>>> mov [rsp+48h+var_34], ecx
>>>> movsxd rcx, ecx
>>>> lea rcx, [rax+rcx*4]
>>>> mov [rsp+48h+var_40], rcx
>>>> lea rcx, [rax+4]
>>>> mov [rsp+48h+var_48], rcx
>>>> xor r14d, r14d
>...
2013 Aug 19
3
[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012
...word ptr [rbp-4],0
0000000581D3001F mov al,1
0000000581D30021 test al,al
0000000581D30023 jne 0000000581D30042
0000000581D30029 mov eax,10h
0000000581D3002E call 00000005F08425D0
0000000581D30033 sub rsp,rax
0000000581D30036 mov byte ptr [rcx+94h],0
0000000581D3003D jmp 0000000581D30056
0000000581D30042 mov eax,10h
0000000581D30047 call 00000005F08425D0
0000000581D3004C sub rsp,rax
0000000581D3004F mov byte ptr [rcx+94h],1
0000000581D30056 test byte ptr [rbp-1],80h
0000000581D3005A je...
2020 Aug 17
3
Code generation option for wide integers on x86_64?
...erator to emit a loop for the following code:
define i4096 @add(i4096 %a, i4096 %b) alwaysinline {
%c = add i4096 %a, %b
ret i4096 %c
}
instead of:
movq %rdi, %rax
addq 96(%rsp), %rsi
adcq 104(%rsp), %rdx
movq %rdx, 8(%rdi)
movq %rsi, (%rdi)
adcq 112(%rsp), %rcx
movq %rcx, 16(%rdi)
adcq 120(%rsp), %r8
movq %r8, 24(%rdi)
adcq 128(%rsp), %r9
movq %r9, 32(%rdi)
movq 8(%rsp), %rcx
adcq 136(%rsp), %rcx
movq %rcx, 40(%rdi)
movq 16(%rsp), %rcx
:
:
:
What is the best strategy for lowering wide inte...
2018 Sep 11
2
Byte-wide stores aren't coalesced if interspersed with other stores
...case at hand, with a manual 64bit store (this is on a 64bit
> target), llvm then combines 8 byte-wide stores into one.
>
>
> Without -combiner-global-alias-analysis it generates:
>
> movb $0, 1(%rdx)
> movl 4(%rsi,%rdi), %ebx
> movq %rbx, 8(%rcx)
> movb $0, 2(%rdx)
> movl 8(%rsi,%rdi), %ebx
> movq %rbx, 16(%rcx)
> movb $0, 3(%rdx)
> movl 12(%rsi,%rdi), %ebx
> movq %rbx, 24(%rcx)
> movb $0, 4(%rdx)
> movq 16(%rsi,%rdi), %rbx
>...
2018 Nov 25
3
BUGS in code generated for target i386 compiling __bswapdi3, and for target x86-64 compiling __bswapsi2()
...ected result for the input value
0x0123456789ABCDEF is 0xEFCDAB8967452301; but the compiled code
produces 0x67452301EFCDAB89
And compiled for x86-64 this yields the following code (see
<https://godbolt.org/z/uM9nvN>):
__bswapsi2: # @__bswapsi2
mov eax, edi
shr eax, 24
mov rcx, rdi
shr rcx, 8
and ecx, 65280
or rax, rcx
mov rcx, rdi
shl rcx, 8
and ecx, 16711680
or rax, rcx
and rdi, 255
shl rdi, 24
or rax, rdi
ret
__bswapdi2: # @__bswapdi2
bswap rdi
mov rax, rdi
ret
Both are correct, but __b...
2010 Sep 01
5
[LLVMdev] equivalent IR, different asm
...oxModelObjectEPNS_10StyleImageE
.align 4, 0x90
__ZN7WebCore6kolos1ERiS0_PKNS_20RenderBoxModelObjectEPNS_10StyleImageE: ## @_ZN7WebCore6kolos1ERiS0_PKNS_20RenderBoxModelObjectEPNS_10StyleImageE
## BB#0:
pushq %r14
pushq %rbx
subq $8, %rsp
movq %rsi, %rbx
movq %rdi, %r14
movq %rdx, %rdi
movq %rcx, %rsi
callq __ZN7WebCore4viziEPKNS_20RenderBoxModelObjectEPNS_10StyleImageE
movq %rax, %rcx
shrq $32, %rcx
testl %ecx, %ecx
je LBB0_2
## BB#1:
imull (%rbx), %eax
cltd
idivl %ecx
movl %eax, (%r14)
LBB0_2:
addq $8, %rsp
popq %rbx
popq %r14
ret
$ llc opt-fail.ll -o -
.section __TEXT,_...
2015 Mar 03
2
[LLVMdev] Need a clue to improve the optimization of some C code
...erator and try to add an optimization pass ?
Thanks for any feedback.
Ciao
Nat!
P.S. In case someone is interested, here is the assembler code and the IR that produced it.
Relevant LLVM generated x86_64 assembler portion with -Os
~~~
testq %r12, %r12
je LBB0_5
## BB#1:
movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
je LBB0_2
## BB#3:
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *(%rcx)
movq %rax, %rbx
LBB0_5:
~~~
Better/tighter assembler code woul...
2020 Nov 12
2
LLVM X86 MachineBasicBlock inserting push and pop instructions causes segmentation fault
...g a
function. I'm able to add the instructions and verify they get added, but
when the compiled program runs, it stops with a segfault.
For brevity, I'm not sharing the whole code here but basically I have a X86
MachineFunctionPass added to addPreEmitPass2 stage which simply inserts a
push rcx immediately followed by pop rcx before each basic block (only the
relevant logic portions are included):
/* Inserts push rcx followed by pop rcx before each MachineBasicBlock */
void VirtualTimeManager::__insertVtlLogic(MachineFunction &MF,
MachineBasicBlock* origMBB) {
const llvm::Tar...
2019 Sep 02
3
AVX2 codegen - question reg. FMA generation
...set to haswell or later types) turning it into an
AVX2 FMA instructions. Here's the snippet in the output it generates:
$ llc -O3 -mcpu=skylake
---------------------
.LBB0_2: # =>This Inner Loop Header: Depth=1
vbroadcastss (%rsi,%rdx,4), %ymm0
vmulps (%rdi,%rcx), %ymm0, %ymm0
vaddps (%rax,%rcx), %ymm0, %ymm0
vmovups %ymm0, (%rax,%rcx)
incq %rdx
addq $32, %rcx
cmpq $15, %rdx
jle .LBB0_2
-----------------------
$ llc --version
LLVM (http://llvm.org/):
LLVM version 8.0.0
Optimized build.
Default target: x86_64-unknown-linux-gnu
Host CPU: skylake
(ll...
2010 Sep 01
0
[LLVMdev] equivalent IR, different asm
...90
> __ZN7WebCore6kolos1ERiS0_PKNS_20RenderBoxModelObjectEPNS_10StyleImageE: ## @_ZN7WebCore6kolos1ERiS0_PKNS_20RenderBoxModelObjectEPNS_10StyleImageE
> ## BB#0:
> pushq %r14
> pushq %rbx
> subq $8, %rsp
> movq %rsi, %rbx
> movq %rdi, %r14
> movq %rdx, %rdi
> movq %rcx, %rsi
> callq __ZN7WebCore4viziEPKNS_20RenderBoxModelObjectEPNS_10StyleImageE
> movq %rax, %rcx
> shrq $32, %rcx
> testl %ecx, %ecx
> je LBB0_2
> ## BB#1:
> imull (%rbx), %eax
> cltd
> idivl %ecx
> movl %eax, (%r14)
> LBB0_2:
> addq $8, %rsp
> popq...
2016 Aug 05
3
enabling interleaved access loop vectorization
...mate we need to fix, since the vectorized code is actually fairly decent - e.g. forcing vectorization, with SSE4.2, we get:
.LBB0_3: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqu (%rdi,%rax,4), %xmm3
movd %xmm0, %rcx
movdqu 4(%rdi,%rcx,4), %xmm4
paddd %xmm3, %xmm4
movdqu 8(%rdi,%rcx,4), %xmm3
paddd %xmm4, %xmm3
movdqa %xmm1, %xmm4
paddq %xmm4, %xmm4
movdqa %xmm0, %xmm5
paddq %xmm5, %xmm5
movd %xmm5, %rcx
pextrq $1, %xmm5, %rdx
movd %xmm4, %r8
pextrq $1, %xmm4, %r9
movd (%rdi,%rcx,4), %xmm4 # xmm4 = mem[0],ze...
2013 Aug 27
0
[LLVMdev] Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012
...3001F mov al,1
> 0000000581D30021 test al,al
> 0000000581D30023 jne 0000000581D30042
> 0000000581D30029 mov eax,10h
> 0000000581D3002E call 00000005F08425D0
> 0000000581D30033 sub rsp,rax
> 0000000581D30036 mov byte ptr [rcx+94h],0
> 0000000581D3003D jmp 0000000581D30056
> 0000000581D30042 mov eax,10h
> 0000000581D30047 call 00000005F08425D0
> 0000000581D3004C sub rsp,rax
> 0000000581D3004F mov byte ptr [rcx+94h],1
> 0000000581D30056 test byte ptr [r...
2015 Oct 27
4
How can I tell llvm, that a branch is preferred ?
...for "branch"
or "switch". And __builtin_expect does nothing, that I am sure of.
Unfortunately llvm has this knack for ordering my one most crucial part
of code exactly the opposite I want to, it does: (x86_64)
cmpq %r15, (%rax,%rdx)
jne LBB0_3
Ltmp18:
leaq 8(%rax,%rdx), %rcx
jmp LBB0_4
LBB0_3:
addq $8, %rcx
LBB0_4:
when I want,
cmpq %r15, (%rax,%rdx)
je LBB0_3
addq $8, %rcx
jmp LBB0_4
LBB0_3:
leaq 8(%rax,%rdx), %rcx
LBB0_4:
since that saves me executing a jump 99.9% of the time. Is there
anything I can do ?
Ciao
Nat!