Displaying 20 results from an estimated 59 matches for "lbb0_3".
2015 Oct 27
4
How can I tell llvm, that a branch is preferred ?
...correct? I see nothing in the specs for "branch"
or "switch". And __builtin_expect does nothing, of that I am sure.
Unfortunately llvm has this knack for ordering the one most crucial part
of my code exactly the opposite of how I want. It does (x86_64):
cmpq %r15, (%rax,%rdx)
jne LBB0_3
Ltmp18:
leaq 8(%rax,%rdx), %rcx
jmp LBB0_4
LBB0_3:
addq $8, %rcx
LBB0_4:
when I want,
cmpq %r15, (%rax,%rdx)
je LBB0_3
addq $8, %rcx
jmp LBB0_4
LBB0_3:
leaq 8(%rax,%rdx), %rcx
LBB0_4:
since that saves me executing a jump 99.9% of the time. Is there
anything I can do?
Ciao
Nat!
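For illustration, the usual way to state such a preference from C is
__builtin_expect; clang lowers it to the llvm.expect intrinsic, which becomes
branch-weight metadata once optimizations run, though whether the final block
layout honours it is exactly what this thread is about. A minimal sketch with
invented names, not Nat!'s actual code:

#include <stddef.h>

/* Sketch only: mark the match as the unlikely case so the common
 * "advance by 8" path can stay on the fall-through.  The names and the
 * loop are invented for illustration. */
static const long *find(const long *p, size_t n, long key)
{
    for (size_t i = 0; i < n; ++i) {
        if (__builtin_expect(p[i] == key, 0))  /* expected false */
            return p + i;                      /* rare: hit */
    }
    return NULL;                               /* common: keep going */
}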
2020 Jun 01
3
Aarch64: unaligned access despite -mstrict-align
...$ cat test.s
.text
.file "test.c"
.globl f // -- Begin function f
.p2align 2
.type f,@function
f: // @f
// %bb.0:
adrp x8, g
ldr x10, [x8, :lo12:g]
ldr x9, [x0]
ldr x8, [x10]
rev x9, x9
rev x8, x8
cmp x8, x9
b.ne .LBB0_3
// %bb.1:
ldr x8, [x10, #8]
ldr x9, [x0, #8]
rev x8, x8
rev x9, x9
cmp x8, x9
b.ne .LBB0_3
// %bb.2:
mov w0, wzr
ret
.LBB0_3:
cmp x8, x9
mov w8, #-1
cneg w0, w8, hs
ret
.Lfunc_end0:
.size f, .Lfunc_end0-f
// -- End function
.ident "clang ve...
2018 Apr 12
3
[RFC] __builtin_constant_p() Improvements
...# %entry
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
movl %edi, -16(%rbp)
cmpl $0, -16(%rbp)
je .LBB0_2
# %bb.1: # %if.then
movl $42, -8(%rbp)
movl $0, -4(%rbp)
movl -4(%rbp), %eax
movl %eax, -12(%rbp)
jmp .LBB0_3
.LBB0_2: # %if.else
movl $927, -12(%rbp) # imm = 0x39F
.LBB0_3: # %return
movl -12(%rbp), %eax
popq %rbp
retq
If the patch looks okay to people, I can shove it onto Phabricator for a
review. (My phab-fu is bad.)
Thoughts?
-bw...
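A hedged reconstruction of the kind of source that produces the assembly above
(variable names are guesses; this is not necessarily the thread's test case):
the movl $0, -4(%rbp) in %if.then suggests __builtin_constant_p(d) was folded
to 0 even though d is plainly 42, which is the behaviour the proposed
improvement targets.

/* Hedged reconstruction, not the patch's own test case. */
int foo(int a)
{
    int b;
    if (a) {
        int d = 42;
        b = __builtin_constant_p(d);  /* currently folds to 0 here */
    } else {
        b = 927;                      /* matches the imm = 0x39F above */
    }
    return b;
}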
2019 Jun 30
6
[hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
...// encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75]
// fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL
// %bb.2:
{
r0 = #-100
} // encoding: [0x80,0xf3,0xdf,0x78]
.LBB0_3: // %while.body
// =>This Inner Loop Header: Depth=1
{
r3 = add(r0,#1)
r4 = memw(r2++#4)
memw(r1++#4) = r4.new
} // encoding: [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab]...
2015 Sep 01
2
[RFC] New pass: LoopExitValues
...uter)
for (int Inner = 0; Inner < Size; ++Inner)
Dst[Outer * Size + Inner] = Src[Outer * Size + Inner] * Val;
}
With LoopExitValues
-------------------------------
matrix_mul:
testl %edi, %edi
je .LBB0_5
xorl %r9d, %r9d
xorl %r8d, %r8d
.LBB0_2:
xorl %r11d, %r11d
.LBB0_3:
movl %r9d, %r10d
movl (%rdx,%r10,4), %eax
imull %ecx, %eax
movl %eax, (%rsi,%r10,4)
incl %r11d
incl %r9d
cmpl %r11d, %edi
jne .LBB0_3
incl %r8d
cmpl %edi, %r8d
jne .LBB0_2
.LBB0_5:
retq
Without LoopExitValues:
-----------------------------------
m...
2016 May 27
2
Handling post-inc users in LSR
...i32 %StoredValue, i32* %StoredAddr
%cmp = icmp sgt i64 %K, 1
br i1 %cmp, label %for.body, label %for.end
for.end:
ret void
}
# Output in AArch64 where you can see redundant add instructions for
stored value, store address, and in cmp :
foo:
.cfi_startproc
// BB#0:
cmp w0, #2
b.lt .LBB0_3
// BB#1:
sxtw x9, w0
add w8, w0, #1
.LBB0_2:
add x10, x1, x9, lsl #2
add x9, x9, #1
str w8, [x10, #4]
add w8, w8, #1
cmp x9, #1
b.gt .LBB0_2
.LBB0_3:
ret
2012 Jan 13
2
[LLVMdev] Odd weak symbol thing on i386
...lldiv_t r;
r.quot = num / denom;
r.rem = num % denom;
if (num >= 0 && r.rem < 0) {
r.quot++;
r.rem -= denom;
}
return (r);
}
I get the following code emitted for the return if the alias line is present:
LBB0_3: # %if.end
movl 64(%esp), %eax
movsd 24(%esp), %xmm0
movsd 32(%esp), %xmm1
movsd %xmm1, 8(%eax)
movsd %xmm0, (%eax)
addl $56, %esp
popl %esi
ret
.Ltmp0:
.size _lldiv, .Ltmp0-_lldiv
.weak lldiv
lldiv = _lldiv
And this if it isn't:
LBB0_3:...
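The "alias line" being toggled is presumably a weak alias along the lines of
the sketch below (the exact macro used in the original source is an
assumption); it is what emits the .weak lldiv / lldiv = _lldiv directives seen
in the output:

#include <stdlib.h>  /* lldiv_t */

/* Implementation under the private name, as in the message above. */
lldiv_t _lldiv(long long num, long long denom)
{
    lldiv_t r;
    r.quot = num / denom;
    r.rem  = num % denom;
    if (num >= 0 && r.rem < 0) {
        r.quot++;
        r.rem -= denom;
    }
    return r;
}

/* The "alias line": exports lldiv as a weak alias of _lldiv, which is what
 * produces ".weak lldiv" and "lldiv = _lldiv" in the emitted assembly. */
extern __typeof(_lldiv) lldiv __attribute__((weak, alias("_lldiv")));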
2015 Aug 31
2
[RFC] New pass: LoopExitValues
Hello LLVM,
This is a proposal for a new pass that improves performance and code
size in some nested loop situations. The pass is target independent.
From the description in the file header:
This optimization finds loop exit values reevaluated after the loop
execution and replaces them by the corresponding exit values if they
are available. Such sequences can arise after the
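As a small, invented illustration of the pattern described (not taken from the
patch): the value recomputed after the loop is exactly the exit value of an
expression already computed inside it, so the re-evaluation can be replaced by
the value left over in a register.

/* Sketch: "a + n" after the loop equals the exit value of "a + i", so the
 * recomputation after the loop is redundant. */
int sum_and_end(const int *a, int n, const int **end)
{
    int s = 0;
    for (int i = 0; i < n; ++i)
        s += a[i];
    *end = a + n;  /* same as the exit value of a + i */
    return s;
}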
2020 Jul 20
2
[ARM] Should Use Load and Store with Register Offset
...void* src, size_t len) {
char* save = (char*)dst;
for (size_t i = 0; i < len; ++i)
*((char*)(dst + i)) = *((char*)(src + i));
return save;
}
clang --target=armv6m-none-eabi -Os -fomit-frame-pointer
memcpy_alt1:
push {r4, lr}
cmp r2, #0
beq .LBB0_3
mov r3, r0
.LBB0_2:
ldrb r4, [r1]
strb r4, [r3]
adds r1, r1, #1
adds r3, r3, #1
subs r2, r2, #1
bne .LBB0_2
.LBB0_3:
pop {r4, pc}
arm-none-eabi-gcc -march=armv6-m -Os
memcpy_alt1:
movs r3, #0...
2019 Jul 01
0
[hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
...// fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL
// %bb.2:
{
r0 = #-100
} // encoding: [0x80,0xf3,0xdf,0x78]
.LBB0_3: // %while.body
// =>This Inner Loop Header: Depth=1
{
r3 = add(r0,#1)
r4 = memw(r2++#4)...
2018 Sep 20
3
Comparing Clang and GCC: only clang stores updated value in each iteration.
... .type b,@function
b: # @b
# %bb.0: # %entry
lrl %r0, a
.LBB0_1: # %do.body
# =>This Inner Loop Header: Depth=1
cije %r0, 0, .LBB0_3
# %bb.2: # %if.then
# in Loop: Header=BB0_1 Depth=1
ahi %r0, 1
strl %r0, a
.LBB0_3: # %do.cond
# in Loop: Header=BB0_1 Depth=1...
2018 Apr 13
0
[RFC] __builtin_constant_p() Improvements
...offset %rbp, -16
> movq %rsp, %rbp
> .cfi_def_cfa_register %rbp
> movl %edi, -16(%rbp)
> cmpl $0, -16(%rbp)
> je .LBB0_2
> # %bb.1: # %if.then
> movl $42, -8(%rbp)
> movl $0, -4(%rbp)
> movl -4(%rbp), %eax
> movl %eax, -12(%rbp)
> jmp .LBB0_3
> .LBB0_2: # %if.else
> movl $927, -12(%rbp) # imm = 0x39F
> .LBB0_3: # %return
> movl -12(%rbp), %eax
> popq %rbp
> retq
>
> If the patch looks okay to people, I can shove it onto Phabricator for a
> r...
2020 Jul 21
2
[ARM] Should Use Load and Store with Register Offset
...ompiling with clang and
confirmed that Clang's generated assembly is equivalent to GCC's for the
code snippet I posted above.
clang --target=armv6m-none-eabi -Oz -fomit-frame-pointer
memcpy_alt1:
push {r4, lr}
movs r3, #0
.LBB0_1:
cmp r2, r3
beq .LBB0_3
ldrb r4, [r1, r3]
strb r4, [r0, r3]
adds r3, r3, #1
b .LBB0_1
.LBB0_3:
pop {r4, pc}
On the other hand, -O2 in GCC still uses the register-offset load and store
instructions while Clang -O2 generates the same assembly as -Os:
immediate-offs...
2017 May 30
3
[atomics][AArch64] Possible bug in cmpxchg lowering
...efine i1 @foo(i32* %obj, i32 %old, i32 %new) {
entry:
%v0 = cmpxchg weak volatile i32* %obj, i32 %old, i32 %new *release
acquire*
%v1 = extractvalue { i32, i1 } %v0, 1
ret i1 %v1
}
to the equivalent of the following on AArch64:
*ldxr w8, [x0]*
cmp w8, w1
b.ne .LBB0_3
// BB#1: // %cmpxchg.trystore
stlxr w8, w2, [x0]
cbz w8, .LBB0_4
// BB#2: // %cmpxchg.failure
mov w0, wzr
ret
.LBB0_3: // %cmpxchg.nostore
clrex
mov w0, wzr...
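For readers who prefer C11 atomics to LLVM IR, the cmpxchg above corresponds
roughly to the following hedged counterpart (not code from the thread): a weak
compare-exchange with release ordering on success and acquire ordering on
failure.

#include <stdatomic.h>
#include <stdbool.h>

/* Hedged C11 counterpart of the IR above (not taken from the thread):
 * weak compare-exchange, release on success, acquire on failure. */
bool foo(volatile atomic_int *obj, int expected, int desired)
{
    return atomic_compare_exchange_weak_explicit(
        obj, &expected, desired,
        memory_order_release,   /* success ordering */
        memory_order_acquire);  /* failure ordering */
}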
2011 Feb 07
1
[LLVMdev] Post-inc combining
...i<n2;i+=n3)
{
s+=a[i];
}
, with GCC, I get the following loop body, with a post-modify load:
.L4:
add r1, r1, r3
ldr r4, [ip], r6
rsb r5, r3, r1
cmp r2, r5
add r0, r0, r4
bgt .L4
With LLVM, however, I get:
.LBB0_3: @ %for.body
@ =>This Inner Loop Header: Depth=1
add r12, lr, r3
ldr lr, [r0, lr, lsl #2]
add r1, lr, r1
cmp r12, r2
mov lr, r12
blt .LBB0_3
, which doe...
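A hedged reconstruction of the loop under discussion (names and the lower
bound are guesses; only "i < n2; i += n3" and "s += a[i]" are visible in the
excerpt). The address a + i advances by n3 elements per iteration, which is
the shape GCC covers with the post-modify load ldr r4, [ip], r6 above.

/* Hedged reconstruction, not the original source. */
int strided_sum(const int *a, int n1, int n2, int n3)
{
    int s = 0;
    for (int i = n1; i < n2; i += n3)
        s += a[i];
    return s;
}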
2016 May 27
0
Handling post-inc users in LSR
...gt; br i1 %cmp, label %for.body, label %for.end
>
> for.end:
> ret void
> }
>
>
> # Output in AArch64 where you can see redundant add instructions for stored value, store address, and in cmp :
>
> foo:
> .cfi_startproc
> // BB#0:
> cmp w0, #2
> b.lt .LBB0_3
> // BB#1:
> sxtw x9, w0
> add w8, w0, #1
> .LBB0_2:
> add x10, x1, x9, lsl #2
> add x9, x9, #1
> str w8, [x10, #4]
> add w8, w8, #1
> cmp x9, #1
> b.gt .LBB0_2
> .LBB0_3:
> ret
2018 Nov 06
4
Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
...0_4: # in Loop: Header=BB0_2 Depth=1
add rdi, 1
test esi, esi
je .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
add esi, -1
movzx edx, byte ptr [rdi]
shl edx, 24
xor edx, eax
mov ecx, -8
mov eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 not clobbered!
lea r8d, [rax + rax] | add eax, eax
mov edx, r8d | # CF is set from the MSB of EAX
xor edx, -306674912 | sbb edx, edx
test eax, eax...
2017 Dec 19
4
A code layout related side-effect introduced by rL318299
...--------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
|~/workarea/llvm-r318298/dbuild/bin/llc
.cfi_startproc
# BB#0: # %entry
pushq %rax
.cfi_def_cfa_offset 16
movl $i, %eax
cmpq %rax, %rsi
ja .LBB0_5
# BB#1:
movl $i, %eax
.p2align 4, 0x90
.LBB0_3: # %while.body
# =>This Inner Loop Header: Depth=1
movq (%rdi), %rcx
movq %rcx, (%rsi)
movq 8(%rdi), %rcx
movq %rcx, (%rsi)
addq $6, %rsi
cmpq %rdx, %rsi
jae .LBB0_4
# BB#2: # %while.cond...
2016 Aug 05
3
enabling interleaved access loop vectorization
...in[i + 2] + in[i * 2];
}
}
We don't vectorize this loop at all, because we calculate the cost of the in[i * 2] gather to be 14 cycles per lane (!).
This is an overestimate we need to fix, since the vectorized code is actually fairly decent - e.g. forcing vectorization, with SSE4.2, we get:
.LBB0_3: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqu (%rdi,%rax,4), %xmm3
movd %xmm0, %rcx
movdqu 4(%rdi,%rcx,4), %xmm4
paddd %xmm3, %xmm4
movdqu 8(%rdi,%rcx,4), %xmm3
paddd %xmm4, %xmm3
movdqa %xmm1, %xmm4
paddq %xmm4,...
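The loop body is cut off in this excerpt; a hedged guess at its shape is below
(only the trailing "in[i + 2] + in[i * 2]" is visible). The in[i * 2] access
is the strided load that the cost model prices as a 14-cycle-per-lane gather.

/* Hedged sketch only: everything before "in[i + 2] + in[i * 2]" is a guess. */
void interleaved(int *out, const int *in, int n)
{
    for (int i = 0; i < n; ++i)
        out[i] = in[i] + in[i + 1] + in[i + 2] + in[i * 2];
}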
2017 Nov 20
2
Nowaday Scalar Evolution's Problem.
...; eax++
cmp eax, 4 ; cmpv = (eax == 4)
je .LBB0_4 ; if(cmpv == true) goto LBB0_4
.LBB0_2:
cmp eax, 10 ; cmpv = (eax == 10)
jne .LBB0_5 ; if(cmpv == false) goto LBB0_5
jmp .LBB0_3 ; goto LBB0_3
.LBB0_4:
mov eax, 5 ; eax = 5
jmp .LBB0_5 ; goto LBB0_5
.LBB0_3:
ret ; return;
.Lfunc_end0:
The loop isn't even deleted! What's happening to SCEV?
Yes, reason...
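A hedged reconstruction of a loop with the shape of the assembly above (names,
types, and the exact bounds are assumptions): it has no side effects and a
computable trip count, so the complaint is that SCEV-driven loop deletion
should have removed it entirely.

/* Hedged sketch matching the branches shown: i counts up, jumps from 4 to 5,
 * and the loop exits when i reaches 10.  Nothing escapes the loop, so it
 * should be deletable. */
void f(void)
{
    int i = 0;
    while (i != 10) {
        ++i;
        if (i == 4)
            i = 5;
    }
}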