Displaying 20 results from an estimated 59 matches for "lbb0_3".
2015 Oct 27
4
How can I tell llvm, that a branch is preferred ?
...correct? I see nothing in the specs for "branch"
or "switch". And __builtin_expect does nothing, of that I am sure.
Unfortunately llvm has this knack for ordering the one most crucial part
of my code exactly the opposite of how I want. It does (x86_64):
cmpq %r15, (%rax,%rdx)
jne LBB0_3
Ltmp18:
leaq 8(%rax,%rdx), %rcx
jmp LBB0_4
LBB0_3:
addq $8, %rcx
LBB0_4:
when I want,
cmpq %r15, (%rax,%rdx)
je LBB0_3
addq $8, %rcx
jmp LBB0_4
LBB0_3:
leaq 8(%rax,%rdx), %rcx
LBB0_4:
since that saves me executing a jump 99.9% of the time. Is there
anything I can do?
Ciao
Nat!
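For illustration, the usual way to state such a preference from C is
__builtin_expect; clang lowers it to the llvm.expect intrinsic, which becomes
branch-weight metadata once optimizations run, though whether the final block
layout honours it is exactly what this thread is about. A minimal sketch with
invented names, not Nat!'s actual code:

#include <stddef.h>

/* Sketch only: mark the match as the unlikely case so the common
 * "advance by 8" path can stay on the fall-through.  The names and the
 * loop are invented for illustration. */
static const long *find(const long *p, size_t n, long key)
{
    for (size_t i = 0; i < n; ++i) {
        if (__builtin_expect(p[i] == key, 0))  /* expected false */
            return p + i;                      /* rare: hit */
    }
    return NULL;                               /* common: keep going */
}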
2020 Jun 01
3
Aarch64: unaligned access despite -mstrict-align
...$ cat test.s
.text
.file "test.c"
.globl f // -- Begin function f
.p2align 2
.type f,@function
f: // @f
// %bb.0:
adrp x8, g
ldr x10, [x8, :lo12:g]
ldr x9, [x0]
ldr x8, [x10]
rev x9, x9
rev x8, x8
cmp x8, x9
b.ne .LBB0_3
// %bb.1:
ldr x8, [x10, #8]
ldr x9, [x0, #8]
rev x8, x8
rev x9, x9
cmp x8, x9
b.ne .LBB0_3
// %bb.2:
mov w0, wzr
ret
.LBB0_3:
cmp x8, x9
mov w8, #-1
cneg w0, w8, hs
ret
.Lfunc_end0:
.size f, .Lfunc_end0-f
// -- End function
.ident "clang ve...
2018 Apr 12
3
[RFC] __builtin_constant_p() Improvements
...# %entry
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
movl %edi, -16(%rbp)
cmpl $0, -16(%rbp)
je .LBB0_2
# %bb.1: # %if.then
movl $42, -8(%rbp)
movl $0, -4(%rbp)
movl -4(%rbp), %eax
movl %eax, -12(%rbp)
jmp .LBB0_3
.LBB0_2: # %if.else
movl $927, -12(%rbp) # imm = 0x39F
.LBB0_3: # %return
movl -12(%rbp), %eax
popq %rbp
retq
If the patch looks okay to people, I can shove it onto Phabricator for a
review. (My phab-fu is bad.)
Thoughts?
-bw...
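A hedged reconstruction of the kind of source that produces the assembly above
(variable names are guesses; this is not necessarily the thread's test case):
the movl $0, -4(%rbp) in %if.then suggests __builtin_constant_p(d) was folded
to 0 even though d is plainly 42, which is the behaviour the proposed
improvement targets.

/* Hedged reconstruction, not the patch's own test case. */
int foo(int a)
{
    int b;
    if (a) {
        int d = 42;
        b = __builtin_constant_p(d);  /* currently folds to 0 here */
    } else {
        b = 927;                      /* matches the imm = 0x39F above */
    }
    return b;
}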
2019 Jun 30
6
[hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
...// encoding: [A,0x48'A',A,0x5c'A',0xe0,0xf1,0x40,0x75]
// fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL
// %bb.2:
{
r0 = #-100
} // encoding: [0x80,0xf3,0xdf,0x78]
.LBB0_3: // %while.body
// =>This Inner Loop Header: Depth=1
{
r3 = add(r0,#1)
r4 = memw(r2++#4)
memw(r1++#4) = r4.new
} // encoding: [0x23,0x40,0x00,0xb0,0x24,0x40,0x82,0x9b,0x08,0xd2,0xa1,0xab]...
2015 Sep 01
2
[RFC] New pass: LoopExitValues
...uter)
for (int Inner = 0; Inner < Size; ++Inner)
Dst[Outer * Size + Inner] = Src[Outer * Size + Inner] * Val;
}
With LoopExitValues
-------------------------------
matrix_mul:
testl %edi, %edi
je .LBB0_5
xorl %r9d, %r9d
xorl %r8d, %r8d
.LBB0_2:
xorl %r11d, %r11d
.LBB0_3:
movl %r9d, %r10d
movl (%rdx,%r10,4), %eax
imull %ecx, %eax
movl %eax, (%rsi,%r10,4)
incl %r11d
incl %r9d
cmpl %r11d, %edi
jne .LBB0_3
incl %r8d
cmpl %edi, %r8d
jne .LBB0_2
.LBB0_5:
retq
Without LoopExitValues:
-----------------------------------
m...
2016 May 27
2
Handling post-inc users in LSR
...i32 %StoredValue, i32* %StoredAddr
%cmp = icmp sgt i64 %K, 1
br i1 %cmp, label %for.body, label %for.end
for.end:
ret void
}
# Output in AArch64 where you can see redundant add instructions for
stored value, store address, and in cmp :
foo:
.cfi_startproc
// BB#0:
cmp w0, #2
b.lt .LBB0_3
// BB#1:
sxtw x9, w0
add w8, w0, #1
.LBB0_2:
add x10, x1, x9, lsl #2
add x9, x9, #1
str w8, [x10, #4]
add w8, w8, #1
cmp x9, #1
b.gt .LBB0_2
.LBB0_3:
ret
2012 Jan 13
2
[LLVMdev] Odd weak symbol thing on i386
...lldiv_t r;
r.quot = num / denom;
r.rem = num % denom;
if (num >= 0 && r.rem < 0) {
r.quot++;
r.rem -= denom;
}
return (r);
}
I get the following code emitted for the return if the alias line is present:
LBB0_3: # %if.end
movl 64(%esp), %eax
movsd 24(%esp), %xmm0
movsd 32(%esp), %xmm1
movsd %xmm1, 8(%eax)
movsd %xmm0, (%eax)
addl $56, %esp
popl %esi
ret
.Ltmp0:
.size _lldiv, .Ltmp0-_lldiv
.weak lldiv
lldiv = _lldiv
And this if it isn't:
LBB0_3:...
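The "alias line" being toggled is presumably a weak alias along the lines of
the sketch below (the exact macro used in the original source is an
assumption); it is what emits the .weak lldiv / lldiv = _lldiv directives seen
in the output:

#include <stdlib.h>  /* lldiv_t */

/* Implementation under the private name, as in the message above. */
lldiv_t _lldiv(long long num, long long denom)
{
    lldiv_t r;
    r.quot = num / denom;
    r.rem  = num % denom;
    if (num >= 0 && r.rem < 0) {
        r.quot++;
        r.rem -= denom;
    }
    return r;
}

/* The "alias line": exports lldiv as a weak alias of _lldiv, which is what
 * produces ".weak lldiv" and "lldiv = _lldiv" in the emitted assembly. */
extern __typeof(_lldiv) lldiv __attribute__((weak, alias("_lldiv")));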
2015 Aug 31
2
[RFC] New pass: LoopExitValues
Hello LLVM,
This is a proposal for a new pass that improves performance and code
size in some nested loop situations. The pass is target independent.
From the description in the file header:
This optimization finds loop exit values reevaluated after the loop
execution and replaces them by the corresponding exit values if they
are available. Such sequences can arise after the
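As a small, invented illustration of the pattern described (not taken from the
patch): the value recomputed after the loop is exactly the exit value of an
expression already computed inside it, so the re-evaluation can be replaced by
the value left over in a register.

/* Sketch: "a + n" after the loop equals the exit value of "a + i", so the
 * recomputation after the loop is redundant. */
int sum_and_end(const int *a, int n, const int **end)
{
    int s = 0;
    for (int i = 0; i < n; ++i)
        s += a[i];
    *end = a + n;  /* same as the exit value of a + i */
    return s;
}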
2020 Jul 20
2
[ARM] Should Use Load and Store with Register Offset
...void* src, size_t len) {
char* save = (char*)dst;
for (size_t i = 0; i < len; ++i)
*((char*)(dst + i)) = *((char*)(src + i));
return save;
}
clang --target=armv6m-none-eabi -Os -fomit-frame-pointer
memcpy_alt1:
push {r4, lr}
cmp r2, #0
beq .LBB0_3
mov r3, r0
.LBB0_2:
ldrb r4, [r1]
strb r4, [r3]
adds r1, r1, #1
adds r3, r3, #1
subs r2, r2, #1
bne .LBB0_2
.LBB0_3:
pop {r4, pc}
arm-none-eabi-gcc -march=armv6-m -Os
memcpy_alt1:
movs r3, #0...
2019 Jul 01
0
[hexagon][PowerPC] code regression (sub-optimal code) on LLVM 9 when generating hardware loops, and the "llvm.uadd" intrinsic.
...// fixup A - offset: 0, value: .LBB0_5, kind: fixup_Hexagon_B15_PCREL
// %bb.2:
{
r0 = #-100
} // encoding: [0x80,0xf3,0xdf,0x78]
.LBB0_3: // %while.body
// =>This Inner Loop Header: Depth=1
{
r3 = add(r0,#1)
r4 = memw(r2++#4)...
2018 Sep 20
3
Comparing Clang and GCC: only clang stores updated value in each iteration.
... .type b,@function
b: # @b
# %bb.0: # %entry
lrl %r0, a
.LBB0_1: # %do.body
# =>This Inner Loop Header: Depth=1
cije %r0, 0, .LBB0_3
# %bb.2: # %if.then
# in Loop: Header=BB0_1 Depth=1
ahi %r0, 1
strl %r0, a
.LBB0_3: # %do.cond
# in Loop: Header=BB0_1 Depth=1...
2018 Apr 13
0
[RFC] __builtin_constant_p() Improvements
...offset %rbp, -16
> movq %rsp, %rbp
> .cfi_def_cfa_register %rbp
> movl %edi, -16(%rbp)
> cmpl $0, -16(%rbp)
> je .LBB0_2
> # %bb.1: # %if.then
> movl $42, -8(%rbp)
> movl $0, -4(%rbp)
> movl -4(%rbp), %eax
> movl %eax, -12(%rbp)
> jmp .LBB0_3
> .LBB0_2: # %if.else
> movl $927, -12(%rbp) # imm = 0x39F
> .LBB0_3: # %return
> movl -12(%rbp), %eax
> popq %rbp
> retq
>
> If the patch looks okay to people, I can shove it onto Phabricator for a
> r...
2020 Jul 21
2
[ARM] Should Use Load and Store with Register Offset
...ompiling with clang and
confirmed that Clang's generated assembly is equivalent to GCC's for the
code snippet I posted above.
clang --target=armv6m-none-eabi -Oz -fomit-frame-pointer
memcpy_alt1:
push {r4, lr}
movs r3, #0
.LBB0_1:
cmp r2, r3
beq .LBB0_3
ldrb r4, [r1, r3]
strb r4, [r0, r3]
adds r3, r3, #1
b .LBB0_1
.LBB0_3:
pop {r4, pc}
On the other hand, -O2 in GCC still uses the register-offset load and store
instructions while Clang -O2 generates the same assembly as -Os:
immediate-offs...
2017 May 30
3
[atomics][AArch64] Possible bug in cmpxchg lowering
...efine i1 @foo(i32* %obj, i32 %old, i32 %new) {
entry:
%v0 = cmpxchg weak volatile i32* %obj, i32 %old, i32 %new *release
acquire*
%v1 = extractvalue { i32, i1 } %v0, 1
ret i1 %v1
}
to the equivalent of the following on AArch64:
*ldxr w8, [x0]*
cmp w8, w1
b.ne .LBB0_3
// BB#1: // %cmpxchg.trystore
stlxr w8, w2, [x0]
cbz w8, .LBB0_4
// BB#2: // %cmpxchg.failure
mov w0, wzr
ret
.LBB0_3: // %cmpxchg.nostore
clrex
mov w0, wzr...
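For readers who prefer C11 atomics to LLVM IR, the cmpxchg above corresponds
roughly to the following hedged counterpart (not code from the thread): a weak
compare-exchange with release ordering on success and acquire ordering on
failure.

#include <stdatomic.h>
#include <stdbool.h>

/* Hedged C11 counterpart of the IR above (not taken from the thread):
 * weak compare-exchange, release on success, acquire on failure. */
bool foo(volatile atomic_int *obj, int expected, int desired)
{
    return atomic_compare_exchange_weak_explicit(
        obj, &expected, desired,
        memory_order_release,   /* success ordering */
        memory_order_acquire);  /* failure ordering */
}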
2011 Feb 07
1
[LLVMdev] Post-inc combining
...i<n2;i+=n3)
{
s+=a[i];
}
, with GCC, I get the following loop body, with a post-modify load:
.L4:
add r1, r1, r3
ldr r4, [ip], r6
rsb r5, r3, r1
cmp r2, r5
add r0, r0, r4
bgt .L4
With LLVM, however, I get:
.LBB0_3: @ %for.body
@ =>This Inner Loop Header: Depth=1
add r12, lr, r3
ldr lr, [r0, lr, lsl #2]
add r1, lr, r1
cmp r12, r2
mov lr, r12
blt .LBB0_3
, which doe...
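A hedged reconstruction of the loop under discussion (names and the lower
bound are guesses; only "i < n2; i += n3" and "s += a[i]" are visible in the
excerpt). The address a + i advances by n3 elements per iteration, which is
the shape GCC covers with the post-modify load ldr r4, [ip], r6 above.

/* Hedged reconstruction, not the original source. */
int strided_sum(const int *a, int n1, int n2, int n3)
{
    int s = 0;
    for (int i = n1; i < n2; i += n3)
        s += a[i];
    return s;
}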
2016 May 27
0
Handling post-inc users in LSR
...gt; br i1 %cmp, label %for.body, label %for.end
>
> for.end:
> ret void
> }
>
>
> # Output in AArch64 where you can see redundant add instructions for stored value, store address, and in cmp :
>
> foo:
> .cfi_startproc
> // BB#0:
> cmp w0, #2
> b.lt .LBB0_3
> // BB#1:
> sxtw x9, w0
> add w8, w0, #1
> .LBB0_2:
> add x10, x1, x9, lsl #2
> add x9, x9, #1
> str w8, [x10, #4]
> add w8, w8, #1
> cmp x9, #1
> b.gt .LBB0_2
> .LBB0_3:
> ret
2018 Nov 06
4
Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
...0_4: # in Loop: Header=BB0_2 Depth=1
add rdi, 1
test esi, esi
je .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
add esi, -1
movzx edx, byte ptr [rdi]
shl edx, 24
xor edx, eax
mov ecx, -8
mov eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 not clobbered!
lea r8d, [rax + rax] | add eax, eax
mov edx, r8d | # CF is set from the MSB of EAX
xor edx, -306674912 | sbb edx, edx
test eax, eax...
2017 Dec 19
4
A code layout related side-effect introduced by rL318299
...--------------
~/workarea/llvm-r318298/dbuild/bin/opt -loop-rotate -S < b.ll
|~/workarea/llvm-r318298/dbuild/bin/llc
.cfi_startproc
# BB#0: # %entry
pushq %rax
.cfi_def_cfa_offset 16
movl $i, %eax
cmpq %rax, %rsi
ja .LBB0_5
# BB#1:
movl $i, %eax
.p2align 4, 0x90
.LBB0_3: # %while.body
# =>This Inner Loop Header: Depth=1
movq (%rdi), %rcx
movq %rcx, (%rsi)
movq 8(%rdi), %rcx
movq %rcx, (%rsi)
addq $6, %rsi
cmpq %rdx, %rsi
jae .LBB0_4
# BB#2: # %while.cond...
2016 Aug 05
3
enabling interleaved access loop vectorization
...in[i + 2] + in[i * 2];
}
}
We don't vectorize this loop at all, because we calculate the cost of the in[i * 2] gather to be 14 cycles per lane (!).
This is an overestimate we need to fix, since the vectorized code is actually fairly decent - e.g. forcing vectorization, with SSE4.2, we get:
.LBB0_3: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqu (%rdi,%rax,4), %xmm3
movd %xmm0, %rcx
movdqu 4(%rdi,%rcx,4), %xmm4
paddd %xmm3, %xmm4
movdqu 8(%rdi,%rcx,4), %xmm3
paddd %xmm4, %xmm3
movdqa %xmm1, %xmm4
paddq %xmm4,...
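The loop body is cut off in this excerpt; a hedged guess at its shape is below
(only the trailing "in[i + 2] + in[i * 2]" is visible). The in[i * 2] access
is the strided load that the cost model prices as a 14-cycle-per-lane gather.

/* Hedged sketch only: everything before "in[i + 2] + in[i * 2]" is a guess. */
void interleaved(int *out, const int *in, int n)
{
    for (int i = 0; i < n; ++i)
        out[i] = in[i] + in[i + 1] + in[i + 2] + in[i * 2];
}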
2017 Nov 20
2
Nowaday Scalar Evolution's Problem.
...; eax++
cmp eax, 4 ; cmpv = (eax == 4)
je .LBB0_4 ; if(cmpv == true) goto LBB0_4
.LBB0_2:
cmp eax, 10 ; cmpv = (eax == 10)
jne .LBB0_5 ; if(cmpv == false) goto LBB0_5
jmp .LBB0_3 ; goto LBB0_3
.LBB0_4:
mov eax, 5 ; eax = 5
jmp .LBB0_5 ; goto LBB0_5
.LBB0_3:
ret ; return;
.Lfunc_end0:
The loop isn't even deleted! What's happening to SCEV?
Yes, reason...
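A hedged reconstruction of a loop with the shape of the assembly above (names,
types, and the exact bounds are assumptions): it has no side effects and a
computable trip count, so the complaint is that SCEV-driven loop deletion
should have removed it entirely.

/* Hedged sketch matching the branches shown: i counts up, jumps from 4 to 5,
 * and the loop exits when i reaches 10.  Nothing escapes the loop, so it
 * should be deletable. */
void f(void)
{
    int i = 0;
    while (i != 10) {
        ++i;
        if (i == 4)
            i = 5;
    }
}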