Displaying 20 results from an estimated 32 matches for "andq".
Did you mean:
and
2015 Jan 19
2
[LLVMdev] X86TargetLowering::LowerToBT
Which BTQ? There are three flavors.
BTQ reg/reg
BTQ reg/mem
BTQ reg/imm
I can imagine that the reg/reg and especially the reg/mem versions would be
slow. However the shrq/and versions *with the same operands* would be slow
as well. There's even a compiler comment about the reg/mem version saying
"this is for disassembly only".
But I doubt BTQ reg/imm would be microcoded.
--
Ite
2015 Jan 19
2
[LLVMdev] X86TargetLowering::LowerToBT
Sure. Attached is the file but here are the functions. The first uses a
fixed bit offset. The second has an indexed bit offset. Compiling with llc
-O3, LLVM version 3.7.0svn, it compiles the IR from IsBitSetB() using btq %rsi,
%rdi. Good. But then it compiles IsBitSetA() with shrq/andq, which is
pretty much what Clang had generated as IR.
shrq $25, %rdi
andq $1, %rdi
LLVM should be able to replace these two with a single X86_64 instruction:
btq reg,25
The generated code is correct in both cases. It just isn't optimized in the
immediate operand case.
unsigned long long...
2014 Jul 23
4
[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops
..., %rcx
leaq 4(%rdi), %rax
cmpq %rax, %rcx
cmovaq %rcx, %rax
movq %rdi, %rsi
notq %rsi
addq %rax, %rsi
shrq $2, %rsi
incq %rsi
xorl %edx, %edx
movabsq $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
andq %rsi, %rax
pxor %xmm0, %xmm0
je .LBB0_1
# BB#2: # %vector.body.preheader
leaq (%rdi,%rax,4), %r8
addq $16, %rdi
movq %rsi, %rdx
andq $-8, %rdx
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
.align...
2015 Mar 03
2
[LLVMdev] Need a clue to improve the optimization of some C code
...Thanks for any feedback.
Ciao
Nat!
P.S. In case someone is interested, here is the assembler code and the IR that produced it.
Relevant LLVM generated x86_64 assembler portion with -Os
~~~
testq %r12, %r12
je LBB0_5
## BB#1:
movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
je LBB0_2
## BB#3:
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *(%rcx)
movq %rax, %rbx
LBB0_5:
~~~
Better/tighter assembler code would be (saves 2 instructions, one jump less)
~~~...
2011 Jul 12
0
[LLVMdev] GCC Atomic NAND implementation
Hey Guys,
I have a newbie question about supporting the GNU atomic
builtin, __sync_fetch_and_nand. It appears that LLVM 29 produces X86
assembly like the GCC versions below v4.4, i.e.
NEGATE and AND
notq %rax
movq 48(%rsp), %rcx
andq %rcx, %rax
I'm looking to produce X86 assembly like GCC v4.4 and greater, i.e.
NOT AND
movq 48(%rsp), %rcx
andq %rcx, %rax
notq %rax
I currently have custom code to make the switch between implementations, but
it's invasive at best. Has the newer...
2017 Jun 06
4
LLD support for ld64 mach-o linker synthesised symbols
...$__section
- section$end$__SEGMENT$__section
In asm:
/* get imagebase and slide for static PIE and ASLR support in x86_64-xnu-musl */
.align 3
__image_base:
.quad segment$start$__TEXT
__start_static:
.quad start
.text
.align 3
.global start
start:
xor %rbp,%rbp
mov %rsp,%rdi
andq $-16,%rsp
movq __image_base(%rip), %rsi
leaq start(%rip), %rdx
subq __start_static(%rip), %rdx
call __start_c
In C:
/* run C++ constructors in __libc_start_main for x86_64-xnu-musl */
typedef void (*__init_fn)(int, char **, char **, char **);
extern __init_fn __init_...
2015 Jan 19
6
[LLVMdev] X86TargetLowering::LowerToBT
I'm tracking down an X86 code generation malfeasance regarding BT (bit
test) and I have some questions.
This IR *matches* and then *X86TargetLowering::LowerToBT **is called:*
%and = and i64 %shl, %val * ; (val & (1 << index)) != 0 ; *bit test
with a *register* index
This IR *does not match* and so *X86TargetLowering::LowerToBT **is not
called:*
%and = lshr i64 %val, 25
2015 Jun 26
2
[LLVMdev] Can LLVM vectorize <2 x i32> type
...# %for.cond.preheader
imull %r9d, %ebx
testl %ebx, %ebx
jle .LBB10_63
# BB#4: # %for.body.preheader
leal -1(%rbx), %eax
incq %rax
xorl %edx, %edx
movabsq $8589934584, %rcx # imm = 0x1FFFFFFF8
andq %rax, %rcx
je .LBB10_8
I changed all the scalar operands to <2 x ValueType> ones. The IR becomes
the following
for.cond.preheader: ; preds = %if.end18
%mulS44_D = mul <2 x i32> %splatLDS24_D.splat, %splatLDS7_D.splat
%cmp21128S45_D = icmp sgt...
2017 Jun 06
2
LLD support for ld64 mach-o linker synthesised symbols
...gebase and slide for static PIE and ASLR support in x86_64-xnu-musl */
>
> .align 3
> __image_base:
> .quad segment$start$__TEXT
> __start_static:
> .quad start
> .text
> .align 3
> .global start
> start:
> xor %rbp,%rbp
> mov %rsp,%rdi
> andq $-16,%rsp
> movq __image_base(%rip), %rsi
> leaq start(%rip), %rdx
> subq __start_static(%rip), %rdx
> call __start_c
>
> In C:
>
> /* run C++ constructors in __libc_start_main for x86_64-xnu-musl */
>
> typedef void (*__init_fn)(int, cha...
2015 Jan 22
2
[LLVMdev] X86TargetLowering::LowerToBT
...>
>>> Sure. Attached is the file but here are the functions. The first uses a fixed bit offset. The second has an indexed bit offset. Compiling with llc -O3, LLVM version 3.7.0svn, it compiles the IR from IsBitSetB() using btq %rsi, %rdi. Good. But then it compiles IsBitSetA() with shrq/andq, which is pretty much what Clang had generated as IR.
>>>
>>> shrq $25, %rdi
>>> andq $1, %rdi
>>>
>>> LLVM should be able to replace these two with a single X86_64 instruction: btq reg,25
>>> The generated code is correct in both cases. It...
2015 Mar 03
2
[LLVMdev] Need a clue to improve the optimization of some C code
...d the IR that produced it.
>>
>>
>>
>> Relevant LLVM generated x86_64 assembler portion with -Os
>> ~~~
>> testq %r12, %r12
>> je LBB0_5
>> ## BB#1:
>> movq -8(%r12), %rcx
>> movq (%rcx), %rax
>> movq -8(%rax), %rdx
>> andq %r15, %rdx
>> cmpq %r15, (%rax,%rdx)
>> je LBB0_2
>> ## BB#3:
>> addq $8, %rcx
>> jmp LBB0_4
>> LBB0_2:
>> leaq 8(%rdx,%rax), %rcx
>> LBB0_4:
>> movq %r12, %rdi
>> movq %r15, %rsi
>> movq %r14, %rdx
>> callq *(%rcx)
&g...
2012 Nov 20
12
[PATCH v2 00/11] xen: Initial kexec/kdump implementation
Hi,
This set of patches contains initial kexec/kdump implementation for Xen v2
(previous version were posted to few people by mistake; sorry for that).
Currently only dom0 is supported, however, almost all infrastructure
required for domU support is ready.
Jan Beulich suggested to merge Xen x86 assembler code with baremetal x86 code.
This could simplify and reduce a bit size of kernel code.
2012 Nov 20
12
[PATCH v2 00/11] xen: Initial kexec/kdump implementation
Hi,
This set of patches contains initial kexec/kdump implementation for Xen v2
(previous version were posted to few people by mistake; sorry for that).
Currently only dom0 is supported, however, almost all infrastructure
required for domU support is ready.
Jan Beulich suggested to merge Xen x86 assembler code with baremetal x86 code.
This could simplify and reduce a bit size of kernel code.
2012 Nov 20
12
[PATCH v2 00/11] xen: Initial kexec/kdump implementation
Hi,
This set of patches contains initial kexec/kdump implementation for Xen v2
(previous version were posted to few people by mistake; sorry for that).
Currently only dom0 is supported, however, almost all infrastructure
required for domU support is ready.
Jan Beulich suggested to merge Xen x86 assembler code with baremetal x86 code.
This could simplify and reduce a bit size of kernel code.
2017 Jun 07
3
LLD support for ld64 mach-o linker synthesised symbols
...x86_64-xnu-musl */
>>
>> .align 3
>> __image_base:
>> .quad segment$start$__TEXT
>> __start_static:
>> .quad start
>> .text
>> .align 3
>> .global start
>> start:
>> xor %rbp,%rbp
>> mov %rsp,%rdi
>> andq $-16,%rsp
>> movq __image_base(%rip), %rsi
>> leaq start(%rip), %rdx
>> subq __start_static(%rip), %rdx
>> call __start_c
>>
>>
>> In C:
>>
>> /* run C++ constructors in __libc_start_main for x86_64-xnu-musl */
>&g...
2015 Feb 03
2
[LLVMdev] RFC: Constant Hoisting
I've had a bug/pessimization which I've tracked down for 1 bit bitmasks:
if (((xx) & (1ULL << (40))))
return 1;
if (!((yy) & (1ULL << (40))))
...
The second time Constant Hoisting sees the value (1<<40) it wraps it up
with a bitcast.
That value then gets hoisted. However, the first (1<<40) is not bitcast and
gets recognized
as a BT. The second
2007 Apr 18
0
[RFC/PATCH PV_OPS X86_64 03/17] paravirt_ops - system routines
...__("movq %%cr4,%%rax\n\t"
- "orq %0,%%rax\n\t"
- "movq %%rax,%%cr4\n"
- : : "irg" (mask)
- :"ax");
-}
-
-static inline void clear_in_cr4 (unsigned long mask)
-{
- mmu_cr4_features &= ~mask;
- __asm__("movq %%cr4,%%rax\n\t"
- "andq %0,%%rax\n\t"
- "movq %%rax,%%cr4\n"
- : : "irg" (~mask)
- :"ax");
-}
-
-
-/*
* User space process size. 47bits minus one guard page.
*/
#define TASK_SIZE64 (0x800000000000UL - 4096)
@@ -299,6 +270,10 @@ struct thread_struct {
set_fs(USER_DS); \...
2007 Apr 18
0
[RFC/PATCH PV_OPS X86_64 03/17] paravirt_ops - system routines
...__("movq %%cr4,%%rax\n\t"
- "orq %0,%%rax\n\t"
- "movq %%rax,%%cr4\n"
- : : "irg" (mask)
- :"ax");
-}
-
-static inline void clear_in_cr4 (unsigned long mask)
-{
- mmu_cr4_features &= ~mask;
- __asm__("movq %%cr4,%%rax\n\t"
- "andq %0,%%rax\n\t"
- "movq %%rax,%%cr4\n"
- : : "irg" (~mask)
- :"ax");
-}
-
-
-/*
* User space process size. 47bits minus one guard page.
*/
#define TASK_SIZE64 (0x800000000000UL - 4096)
@@ -299,6 +270,10 @@ struct thread_struct {
set_fs(USER_DS); \...
2017 May 04
4
Xen package security updates for jessie 4.4, XSA-213, XSA-214
...RCX, R11, [DS-GS,] [ERRCODE,] RIP, CS, RFLAGS, RSP, SS } */
+ /* %rdx: trap_bounce, %rbx: struct vcpu */
+ /* On return only %rbx and %rdx are guaranteed non-clobbered. */
+ create_bounce_frame:
+@@ -366,7 +366,7 @@ create_bounce_frame:
+ 2: andq $~0xf,%rsi # Stack frames are 16-byte aligned.
+ movq $HYPERVISOR_VIRT_START,%rax
+ cmpq %rax,%rsi
+- movq $HYPERVISOR_VIRT_END+60,%rax
++ movq $HYPERVISOR_VIRT_END+12*8,%rax
+ sbb %ecx,%ecx # In +ve address space? Then oka...
2015 Jun 24
2
[LLVMdev] Can LLVM vectorize <2 x i32> type
Hi,
Is LLVM be able to generate code for the following code?
%mul = mul <2 x i32> %1, %2, where %1 and %2 are <2 x i32> type.
I am running it on a Haswell processor with LLVM-3.4.2. It seems that it
will generates really complicated code with vpaddq, vpmuludq, vpsllq,
vpsrlq.
Thanks,
Zhi
-------------- next part --------------
An HTML attachment was scrubbed...
URL: