Displaying 20 results from an estimated 49 matches for "shrl".
Did you mean:
shrb
2014 Jan 18
2
[LLVMdev] Scheduling quirks
....size _Z13test_registeri, .Ltmp0-_Z13test_registeri
.cfi_endproc
.globl _Z14test_scheduleri
.align 16, 0x90
.type _Z14test_scheduleri,@function
_Z14test_scheduleri: # @_Z14test_scheduleri
.cfi_startproc
# BB#0: # %entry
movl %edi, %eax
shrl $2, %eax
andl $15, %eax
shrl $3, %edi
andl $31, %edi
xorl %eax, %edi
movl %edi, %eax
retq
.Ltmp1:
.size _Z14test_scheduleri, .Ltmp1-_Z14test_scheduleri
.cfi_endproc
.ident "clang version 3.5 (trunk 199507)"
.section ".note.GNU-stack","",@progbits
<===...
2019 Aug 15
2
Slow XCHG in arch/i386/libgcc/__ashrdi3.S and arch/i386/libgcc/__lshrdi3.S
...pub/scm/libs/klibc/klibc.git/plain/usr/klibc/arch/i386/libgcc/__ashldi3.S
and
https://git.kernel.org/pub/scm/libs/klibc/klibc.git/plain/usr/klibc/arch/i386/libgcc/__lshrdi3.S
use the following code sequences for shift counts greater than 31:
1: 1:
xorl %edx,%edx shrl %cl,%edx
shl %cl,%eax xorl %eax,%eax
^
xchgl %edx,%eax xchgl %edx,%eax
ret ret
At least and especially on Intel processors XCHG was and
still is a rather slow instruction and should be avoided.
Use the following better code s...
2017 Oct 20
1
[PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
...- movl TAB+1024(,r5,4),r5 ## E;\
>> + round_mov(TAB+1024, r5, r5 ## E)\
>> movw r4 ## X,r2 ## X; \
>> - movl TAB(,r6,4),r6 ## E; \
>> + round_mov(TAB, r6, r6 ## E) \
>> roll $16,r2 ## E; \
>> shrl $16,r4 ## E; \
>> movzbl r4 ## L,r7 ## E; \
>> movzbl r4 ## H,r4 ## E; \
>> xorl OFFSET(r8),ra ## E; \
>> xorl OFFSET+4(r8),rb ## E; \
>> - xorl TAB+3072(,r4,4),r5 ## E;\
>> - xorl TA...
2017 Oct 20
1
[PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
...- movl TAB+1024(,r5,4),r5 ## E;\
>> + round_mov(TAB+1024, r5, r5 ## E)\
>> movw r4 ## X,r2 ## X; \
>> - movl TAB(,r6,4),r6 ## E; \
>> + round_mov(TAB, r6, r6 ## E) \
>> roll $16,r2 ## E; \
>> shrl $16,r4 ## E; \
>> movzbl r4 ## L,r7 ## E; \
>> movzbl r4 ## H,r4 ## E; \
>> xorl OFFSET(r8),ra ## E; \
>> xorl OFFSET+4(r8),rb ## E; \
>> - xorl TAB+3072(,r4,4),r5 ## E;\
>> - xorl TA...
2003 Aug 22
2
kernel: locore.s doesn't assemble (fillkpt, $PAGE_SHIFT, $PTESHIFT)
...NG_4/src/sys/../include
-I/www/freebsd/RELENG_4/src/sys/contrib/ipfilter -D_KERNEL -include
opt_global.h -mpreferred-stack-boundary=2
/www/freebsd/RELENG_4/src/sys/i386/i386/locore.s
/tmp/ccOO8Chb.s: Assembler messages:
/tmp/ccOO8Chb.s:2495: Error: suffix or operands invalid for `shr'
shrl $PAGE_SHIFT,%ecx
/tmp/ccOO8Chb.s:2496: Error: suffix or operands invalid for `shr'
/tmp/ccOO8Chb.s:2496: Error: suffix or operands invalid for `shl'
movl %eax, %ebx ; shrl $PAGE_SHIFT, %ebx ;
shll $PTESHIFT,%ebx ; addl (( KPTphys
)-KERNBASE)...
2017 Oct 20
0
[PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
...r8,ra,rb,rc,rd) \
> movzbl r2 ## H,r5 ## E; \
> movzbl r2 ## L,r6 ## E; \
> - movl TAB+1024(,r5,4),r5 ## E;\
> + round_mov(TAB+1024, r5, r5 ## E)\
> movw r4 ## X,r2 ## X; \
> - movl TAB(,r6,4),r6 ## E; \
> + round_mov(TAB, r6, r6 ## E) \
> roll $16,r2 ## E; \
> shrl $16,r4 ## E; \
> movzbl r4 ## L,r7 ## E; \
> movzbl r4 ## H,r4 ## E; \
> xorl OFFSET(r8),ra ## E; \
> xorl OFFSET+4(r8),rb ## E; \
> - xorl TAB+3072(,r4,4),r5 ## E;\
> - xorl TAB+2048(,r7,4),r6 ## E;\
> + round_xor(TAB+3072, r4, r5 ## E)\
> + round_xor(TAB+2048, r7,...
2019 Aug 20
1
Slow XCHG in arch/i386/libgcc/__ashrdi3.S and arch/i386/libgcc/__lshrdi3.S
.../libgcc/__ashldi3.S
>> and
>> https://git.kernel.org/pub/scm/libs/klibc/klibc.git/plain/usr/klibc/arch/i386/libgcc/__lshrdi3.S
>> use the following code sequences for shift counts greater than 31:
>>
>> 1: 1:
>> xorl %edx,%edx shrl %cl,%edx
>> shl %cl,%eax xorl %eax,%eax
>> ^
>> xchgl %edx,%eax xchgl %edx,%eax
>> ret ret
>>
>> At least and especially on Intel processors XCHG was and
>> still is a rather slow ins...
2017 Oct 11
1
[PATCH v1 01/27] x86/crypto: Adapt assembly for PIE support
...define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
movzbl r2 ## H,r5 ## E; \
movzbl r2 ## L,r6 ## E; \
- movl TAB+1024(,r5,4),r5 ## E;\
+ round_mov(TAB+1024, r5, r5 ## E)\
movw r4 ## X,r2 ## X; \
- movl TAB(,r6,4),r6 ## E; \
+ round_mov(TAB, r6, r6 ## E) \
roll $16,r2 ## E; \
shrl $16,r4 ## E; \
movzbl r4 ## L,r7 ## E; \
movzbl r4 ## H,r4 ## E; \
xorl OFFSET(r8),ra ## E; \
xorl OFFSET+4(r8),rb ## E; \
- xorl TAB+3072(,r4,4),r5 ## E;\
- xorl TAB+2048(,r7,4),r6 ## E;\
+ round_xor(TAB+3072, r4, r5 ## E)\
+ round_xor(TAB+2048, r7, r6 ## E)\
movzbl r1 ## L,r7 ## E; \...
2010 Dec 15
3
[LLVMdev] opinions on turning on encoding info by default in -S
...## fixup A - offset: 3, value: _last_tf_arg_u@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
movq %rsi, (%rax) ## encoding: [0x48,0x89,0x30]
imull $43691, %esi, %eax ## encoding: [0x69,0xc6,0xab,0xaa,0x00,0x00]
## imm = 0xAAAB
shrl $17, %eax ## encoding: [0xc1,0xe8,0x11]
ret ## encoding: [0xc3]
.comm _last_tf_arg_u,8,3 ## @last_tf_arg_u
## @last_tf_arg_u
.subsections_via_symbols
Relatively recently, we turned on verbose-asm output by...
2019 Aug 19
0
Slow XCHG in arch/i386/libgcc/__ashrdi3.S and arch/i386/libgcc/__lshrdi3.S
...lain/usr/klibc/arch/i386/libgcc/__ashldi3.S
> and
> https://git.kernel.org/pub/scm/libs/klibc/klibc.git/plain/usr/klibc/arch/i386/libgcc/__lshrdi3.S
> use the following code sequences for shift counts greater than 31:
>
> 1: 1:
> xorl %edx,%edx shrl %cl,%edx
> shl %cl,%eax xorl %eax,%eax
> ^
> xchgl %edx,%eax xchgl %edx,%eax
> ret ret
>
> At least and especially on Intel processors XCHG was and
> still is a rather slow instruction and should be avoid...
2012 Jul 31
0
[LLVMdev] [llvm-commits] rotate
On Tue, Jul 31, 2012 at 8:42 AM, Cameron McInally
<cameron.mcinally at nyu.edu> wrote:
> Andy,
>
> Here is the left circular shift operator patch. I apologize to the reviewer
> in advance. The patch has a good bit of fine detail. Any
> comments/criticisms?
>
> Some caveats...
>
> 1) This is just the bare minimum needed to make the left circular shift
> operator
2015 Jan 23
2
[LLVMdev] X86TargetLowering::LowerToBT
I suspect that this is because the mask in your example is the result of a variable shift, which (a) has its own performance and flags hazards pre-SHLX and (b) requires additional µops to do with TEST. I expect that ICC is putting a dummy TEST or XOR ahead of the BT to break the false flags dependency, as well.
If the mask were constant, I expect ICC would generate TEST instead (but I don’t
2010 Dec 16
0
[LLVMdev] opinions on turning on encoding info by default in -S
...A - offset: 3, value: _last_tf_arg_u@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
> movq %rsi, (%rax) ## encoding: [0x48,0x89,0x30]
> imull $43691, %esi, %eax ## encoding: [0x69,0xc6,0xab,0xaa,0x00,0x00]
> ## imm = 0xAAAB
> shrl $17, %eax ## encoding: [0xc1,0xe8,0x11]
> ret ## encoding: [0xc3]
>
> .comm _last_tf_arg_u,8,3 ## @last_tf_arg_u
> ## @last_tf_arg_u
>
> .subsections_via_symbols
>
>
>
> Relati...
2012 Mar 27
1
[LLVMdev] Compiling integer mod
....
Thanks for any help,
Brent
int f(int n)
{
return (n + 1) % 18;
}
"clang -O2 -S" produces this code:
_f: # @f
# BB#0:
movl 4(%esp), %ecx
incl %ecx
movl $954437177, %edx # imm = 0x38E38E39
movl %ecx, %eax
imull %edx
movl %edx, %eax
shrl $31, %eax
sarl $2, %edx
addl %eax, %edx
imull $18, %edx, %eax
subl %eax, %ecx
movl %ecx, %eax
ret
The visual studio compiler (/O2) instead issues the idiv instruction:
PUBLIC _f
; Function compile flags: /Ogtpy
; COMDAT _f
_TEXT SEGMENT
_n$ = 8 ; size = 4
_f PROC ; COMDAT
; File...
2012 Jul 29
0
[LLVMdev] rotate
...def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
movl %edi, -4(%rbp)
movq %rsi, -16(%rbp)
movl -4(%rbp), %edi
movq -16(%rbp), %rsi
movl %esi, %eax
movl %eax, %ecx
## kill: CL<def> ECX<kill>
shrl %cl, %edi
movl -4(%rbp), %eax
movabsq $32, %rsi
subq -16(%rbp), %rsi
movl %esi, %edx
movl %edx, %ecx
## kill: CL<def> ECX<kill>
shll %cl, %eax
orl %eax, %edi
movl %edi, %eax
popq %rbp
ret
.cfi_endproc
.subsections_via_symbols
=====
M...
2005 Aug 17
2
MMX loop filter for theora-exp
...*/ \
+" packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */ \
+" punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \
+" movd %%mm5,%%eax\n" /* eax = newpix21 */ \
+" movw %%ax,1(%0)\n" \
+" psrlq $32,%%mm5\n" /* why is so big stall here ? */ \
+" shrl $16,%%eax\n" \
+" lea 1(%0,%1,2),%%edi\n" \
+" movw %%ax,1(%0,%1,1)\n" \
+" movd %%mm5,%%eax\n" /* eax = newpix21 high part */ \
+" lea (%1,%1,2),%%esi\n" \
+" movw %%ax,(%%edi)\n" \
+" shrl $16,%%eax\n" \
+" movw %%ax,1(%0,%%es...
2012 Jul 29
3
[LLVMdev] rotate
Nice!
Clever compiler..
On 07/28/2012 08:55 PM, Michael Gottesman wrote:
> I can get clang/llvm to emit a rotate instruction on x86-64 when compiling C by just using -Os and the rotate from Hacker's Delight i.e.,
>
> ======
> #include<stdlib.h>
> #include<stdint.h>
>
> uint32_t ror(uint32_t input, size_t rot_bits)
> {
> return (input>>
2012 Jul 31
3
[LLVMdev] rotate
Andy,
Here is the left circular shift operator patch. I apologize to the reviewer
in advance. The patch has a good bit of fine detail. Any
comments/criticisms?
Some caveats...
1) This is just the bare minimum needed to make the left circular shift
operator work (e.g. no instruction combining).
2) I tried my best to select operator names in the existing style; please
feel free to change them as
2017 Oct 11
32
[PATCH v1 00/27] x86: PIE support and option to extend KASLR randomization
Changes:
- patch v1:
- Simplify ftrace implementation.
- Use gcc mstack-protector-guard-reg=%gs with PIE when possible.
- rfc v3:
- Use --emit-relocs instead of -pie to reduce dynamic relocation space on
mapped memory. It also simplifies the relocation process.
- Move the start of the module section next to the kernel. Remove the need for
-mcmodel=large on modules. Extends
2017 Oct 11
32
[PATCH v1 00/27] x86: PIE support and option to extend KASLR randomization
Changes:
- patch v1:
- Simplify ftrace implementation.
- Use gcc mstack-protector-guard-reg=%gs with PIE when possible.
- rfc v3:
- Use --emit-relocs instead of -pie to reduce dynamic relocation space on
mapped memory. It also simplifies the relocation process.
- Move the start of the module section next to the kernel. Remove the need for
-mcmodel=large on modules. Extends