Displaying 20 results from an estimated 52 matches for "shrq".
Did you mean:
shrb
2015 Jul 24
2
[LLVMdev] SIMD for sdiv <2 x i64>
...i
%sub.ptr.div.i7.i.i.i.i = sdiv <2 x i64> %sub.ptr.sub.i6.i.i.i.i, <i64 24,
i64 24>
Assembly:
vpsubq %xmm6, %xmm5, %xmm5
vmovq %xmm5, %rax
movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
imulq %rbx
movq %rdx, %rcx
movq %rcx, %rax
shrq $63, %rax
shrq $2, %rcx
addl %eax, %ecx
vpextrq $1, %xmm5, %rax
imulq %rbx
movq %rdx, %rax
shrq $63, %rax
shrq $2, %rdx
addl %eax, %edx
movslq %edx, %rax
vmovq %rax, %xmm5
movslq %ecx, %rax
vmovq %rax, %xmm6
vpun...
2010 Sep 01
5
[LLVMdev] equivalent IR, different asm
...tEPNS_10StyleImageE: ## @_ZN7WebCore6kolos1ERiS0_PKNS_20RenderBoxModelObjectEPNS_10StyleImageE
## BB#0:
pushq %r14
pushq %rbx
subq $8, %rsp
movq %rsi, %rbx
movq %rdi, %r14
movq %rdx, %rdi
movq %rcx, %rsi
callq __ZN7WebCore4viziEPKNS_20RenderBoxModelObjectEPNS_10StyleImageE
movq %rax, %rcx
shrq $32, %rcx
testl %ecx, %ecx
je LBB0_2
## BB#1:
imull (%rbx), %eax
cltd
idivl %ecx
movl %eax, (%r14)
LBB0_2:
addq $8, %rsp
popq %rbx
popq %r14
ret
$ llc opt-fail.ll -o -
.section __TEXT,__text,regular,pure_instructions
.globl __ZN7WebCore6kolos1ERiS0_PKNS_20RenderBoxModelObjectEPNS_10S...
2015 Jul 24
0
[LLVMdev] SIMD for sdiv <2 x i64>
...AAAAAAAAAAAAAAB
> imulq %rbx
> movq %rdx, %rcx
> movq %rcx, %rax
> shrq $63, %rax
> shrq $2, %rcx
> addl %eax, %ecx
> vpextrq $1, %xmm5, %rax
> imulq %rbx
> movq %rd...
2015 Jul 24
2
[LLVMdev] SIMD for sdiv <2 x i64>
..., i64 24>
>>
>> Assembly:
>> vpsubq %xmm6, %xmm5, %xmm5
>> vmovq %xmm5, %rax
>> movabsq $3074457345618258603, %rbx # imm = 0x2AAAAAAAAAAAAAAB
>> imulq %rbx
>> movq %rdx, %rcx
>> movq %rcx, %rax
>> shrq $63, %rax
>> shrq $2, %rcx
>> addl %eax, %ecx
>> vpextrq $1, %xmm5, %rax
>> imulq %rbx
>> movq %rdx, %rax
>> shrq $63, %rax
>> shrq $2, %rdx
>> addl %eax, %edx
>> movslq %edx,...
2015 Jan 19
2
[LLVMdev] X86TargetLowering::LowerToBT
Which BTQ? There are three flavors.
BTQ reg/reg
BTQ reg/mem
BTQ reg/imm
I can imagine that the reg/reg and especially the reg/mem versions would be
slow. However the shrq/and versions *with the same operands* would be slow
as well. There's even a compiler comment about the reg/mem version saying
"this is for disassembly only".
But I doubt BTQ reg/imm would be microcoded.
--
Ite Ursi
-------------- next part --------------
An HTML attachment was scr...
2010 Sep 01
0
[LLVMdev] equivalent IR, different asm
...0RenderBoxModelObjectEPNS_10StyleImageE
> ## BB#0:
> pushq %r14
> pushq %rbx
> subq $8, %rsp
> movq %rsi, %rbx
> movq %rdi, %r14
> movq %rdx, %rdi
> movq %rcx, %rsi
> callq __ZN7WebCore4viziEPKNS_20RenderBoxModelObjectEPNS_10StyleImageE
> movq %rax, %rcx
> shrq $32, %rcx
> testl %ecx, %ecx
> je LBB0_2
> ## BB#1:
> imull (%rbx), %eax
> cltd
> idivl %ecx
> movl %eax, (%r14)
> LBB0_2:
> addq $8, %rsp
> popq %rbx
> popq %r14
> ret
>
>
> $ llc opt-fail.ll -o -
>
> .section __TEXT,__text,regular,p...
2015 Jul 24
0
[LLVMdev] SIMD for sdiv <2 x i64>
...# %invoke.cont
vmovaps %xmm2, 96(%rsp) # 16-byte Spill
vmovdqa 48(%rsp), %xmm0 # 16-byte Reload
vpsubq %xmm0, %xmm2, %xmm0
vpextrq $1, %xmm0, %rax
movabsq $3074457345618258603, %rcx # imm = 0x2AAAAAAAAAAAAAAB
imulq %rcx
movq %rdx, %rax
shrq $63, %rax
sarq $2, %rdx
addq %rax, %rdx
vmovq %rdx, %xmm1
vmovq %xmm0, %rax
imulq %rcx
movq %rdx, %rax
shrq $63, %rax
sarq $2, %rdx
addq %rax, %rdx
vmovq %rdx, %xmm0
vpunpcklqdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[0],xmm1[0]...
2015 Jul 24
1
[LLVMdev] SIMD for sdiv <2 x i64>
...s %xmm2, 96(%rsp) # 16-byte Spill
> vmovdqa 48(%rsp), %xmm0 # 16-byte Reload
> vpsubq %xmm0, %xmm2, %xmm0
> vpextrq $1, %xmm0, %rax
> movabsq $3074457345618258603, %rcx # imm = 0x2AAAAAAAAAAAAAAB
> imulq %rcx
> movq %rdx, %rax
> shrq $63, %rax
> sarq $2, %rdx
> addq %rax, %rdx
> vmovq %rdx, %xmm1
> vmovq %xmm0, %rax
> imulq %rcx
> movq %rdx, %rax
> shrq $63, %rax
> sarq $2, %rdx
> addq %rax, %rdx
> vmovq %rdx, %xmm0
> vpunp...
2016 Jun 23
2
AVX512 instruction generated when JIT compiling for an avx2 architecture
...lobl main
.align 16, 0x90
.type main, @function
main:
.cfi_startproc
movq 8(%rsp), %r10
leaq (%rdi,%r8), %rdx
addq %rsi, %r8
testb $1, %cl
cmoveq %rdi, %rdx
cmoveq %rsi, %r8
movq %rdx, %rax
sarq $63, %rax
shrq $62, %rax
addq %rdx, %rax
sarq $2, %rax
movq %r8, %rcx
sarq $63, %rcx
shrq $62, %rcx
addq %r8, %rcx
sarq $2, %rcx
movq (%r10), %r8
movq 8(%r10), %r10
movq %r8, %rdi
shrq $32, %rdi
movq %r10, %rsi...
2016 Jun 23
2
AVX512 instruction generated when JIT compiling for an avx2 architecture
....cfi_startproc
> movq 8(%rsp), %r10
> leaq (%rdi,%r8), %rdx
> addq %rsi, %r8
> testb $1, %cl
> cmoveq %rdi, %rdx
> cmoveq %rsi, %r8
> movq %rdx, %rax
> sarq $63, %rax
> shrq $62, %rax
> addq %rdx, %rax
> sarq $2, %rax
> movq %r8, %rcx
> sarq $63, %rcx
> shrq $62, %rcx
> addq %r8, %rcx
> sarq $2, %rcx
> movq (%r10), %r8
> movq 8(%r10), %r10
&g...
2015 Jan 19
2
[LLVMdev] X86TargetLowering::LowerToBT
Sure. Attached is the file but here are the functions. The first uses a
fixed bit offset. The second has a indexed bit offset. Compiling with llc
-O3, LLVM version 3.7.0svn, it compiles the IR from IsBitSetB() using btq %rsi,
%rdi. Good. But then it compiles IsBitSetA() with shrq/andq, which is
pretty much what Clang had generated as IR.
shrq $25, %rdi
andq $1, %rdi
LLVM should be able to replace these two with a single X86_64 instruction:
btq reg,25
The generated code is correct in both cases. It just isn't optimized in the
immediate operand case.
unsigned long...
2016 Jun 29
2
avx512 JIT backend generates wrong code on <4 x float>
...fter.ll"
.globl adjmul
.align 16, 0x90
.type adjmul, @function
adjmul:
.cfi_startproc
leaq (%rdi,%r8), %rdx
addq %rsi, %r8
testb $1, %cl
cmoveq %rdi, %rdx
cmoveq %rsi, %r8
movq %rdx, %rax
sarq $63, %rax
shrq $62, %rax
addq %rdx, %rax
sarq $2, %rax
movq %r8, %rcx
sarq $63, %rcx
shrq $62, %rcx
addq %r8, %rcx
sarq $2, %rcx
movq %rax, %rdx
shlq $5, %rdx
leaq 16(%r9,%rdx), %rsi
orq $16, %rdx
movq 16(%rsp), %rd...
2016 Jun 29
0
avx512 JIT backend generates wrong code on <4 x float>
...0x90
> .type adjmul, @function
> adjmul:
> .cfi_startproc
> leaq (%rdi,%r8), %rdx
> addq %rsi, %r8
> testb $1, %cl
> cmoveq %rdi, %rdx
> cmoveq %rsi, %r8
> movq %rdx, %rax
> sarq $63, %rax
> shrq $62, %rax
> addq %rdx, %rax
> sarq $2, %rax
> movq %r8, %rcx
> sarq $63, %rcx
> shrq $62, %rcx
> addq %r8, %rcx
> sarq $2, %rcx
> movq %rax, %rdx
> shlq $5, %rdx
> leaq 16(%r9,%rdx), %rs...
2010 Sep 01
2
[LLVMdev] equivalent IR, different asm
...> ## BB#0:
>> pushq %r14
>> pushq %rbx
>> subq $8, %rsp
>> movq %rsi, %rbx
>> movq %rdi, %r14
>> movq %rdx, %rdi
>> movq %rcx, %rsi
>> callq __ZN7WebCore4viziEPKNS_20RenderBoxModelObjectEPNS_10StyleImageE
>> movq %rax, %rcx
>> shrq $32, %rcx
>> testl %ecx, %ecx
>> je LBB0_2
>> ## BB#1:
>> imull (%rbx), %eax
>> cltd
>> idivl %ecx
>> movl %eax, (%r14)
>> LBB0_2:
>> addq $8, %rsp
>> popq %rbx
>> popq %r14
>> ret
>>
>>
>> $ llc o...
2016 Jun 30
1
avx512 JIT backend generates wrong code on <4 x float>
...gt; adjmul:
>> .cfi_startproc
>> leaq (%rdi,%r8), %rdx
>> addq %rsi, %r8
>> testb $1, %cl
>> cmoveq %rdi, %rdx
>> cmoveq %rsi, %r8
>> movq %rdx, %rax
>> sarq $63, %rax
>> shrq $62, %rax
>> addq %rdx, %rax
>> sarq $2, %rax
>> movq %r8, %rcx
>> sarq $63, %rcx
>> shrq $62, %rcx
>> addq %r8, %rcx
>> sarq $2, %rcx
>> movq %rax, %rdx
>> shlq...
2017 Oct 11
1
[PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
...107c961bb4..64eb5c87d04a 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -98,16 +98,20 @@
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
- movzbl src ## bh, RID1d; \
- movzbl src ## bl, RID2d; \
- shrq $16, src; \
- movl s1(, RID1, 4), dst ## d; \
- op1 s2(, RID2, 4), dst ## d; \
- movzbl src ## bh, RID1d; \
- movzbl src ## bl, RID2d; \
- interleave_op(il_reg); \
- op2 s3(, RID1, 4), dst ## d; \
- op3 s4(, RID2, 4), dst ## d;
+ movzbl src ## bh, R...
2010 Sep 01
0
[LLVMdev] equivalent IR, different asm
...les seem equivalent, but the resulting asm from
> 'opt-fail.ll' causes a crash to webkit.
> I suspect the usage of registers is wrong, can someone take a look ?
Yes, the code here is wrong:
> movl (%rbx), %ecx
> imull %ecx, %eax
This computes h*((int32)%1) in %eax.
> shrq $32, %rax
> movl %eax, %ecx
This is trying to compute (int32)(%1>>32) into %ecx, but is using the
wrong input value since %rax has been clobbered by the above code, and
further is clobbering the value in %eax computed above, which is
implicit input to the divide. This is some kind...
2015 Jan 19
6
[LLVMdev] X86TargetLowering::LowerToBT
I'm tracking down an X86 code generation malfeasance regarding BT (bit
test) and I have some questions.
This IR *matches* and then *X86TargetLowering::LowerToBT **is called:*
%and = and i64 %shl, %val * ; (val & (1 << index)) != 0 ; *bit test
with a *register* index
This IR *does not match* and so *X86TargetLowering::LowerToBT **is not
called:*
%and = lshr i64 %val, 25
2010 Sep 01
1
[LLVMdev] equivalent IR, different asm
...>> 'opt-fail.ll' causes a crash to webkit.
>> I suspect the usage of registers is wrong, can someone take a look ?
>
> Yes, the code here is wrong:
>
>> movl (%rbx), %ecx
>> imull %ecx, %eax
>
> This computes h*((int32)%1) in %eax.
>
>> shrq $32, %rax
>> movl %eax, %ecx
>
> This is trying to compute (int32)(%1>>32) into %ecx, but is using the
> wrong input value since %rax has been clobbered by the above code, and
> further is clobbering the value in %eax computed above, which is
> implicit input to t...
2013 Aug 09
2
[LLVMdev] [RFC] Poor code generation for paired load
...on the target. Truncate and shift instructions are useless (instructions 2., 4., and 5.).
Cost: ldi64 + 2 trunc + 1 shift vs. 1 ldpair
** To Reproduce **
Here is a way to reproduce the poor code generation for x86-64.
opt -sroa current_input.ll -S -o - | llc -O3 -o -
You will see 2 vmovd and 1 shrq that can be avoided as illustrated with the next command.
Here is a nicer code produced by modifying the input so that SROA generates friendlier code for this case.
opt -sroa mod_input.ll -S -o - | llc -O3 -o -
Basically the difference between both inputs is that memcpy has not been expanded in...