Nat!
2015-Mar-03 14:06 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
Hi
I have some inline-function C code that LLVM could be optimizing better.
Since I am new to this, I wonder if someone could give me a few pointers on how
to approach this in LLVM.
Should I try to change the IR code somehow to get the code generator to produce
better code, or should I rather go to the code generator and try to add an
optimization pass?
Thanks for any feedback.
Ciao
Nat!
P.S. In case someone is interested, here is the assembler code and the IR that
produced it.
Relevant LLVM-generated x86_64 assembler portion with -Os:
~~~
testq %r12, %r12
je LBB0_5
## BB#1:
movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
je LBB0_2
## BB#3:
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *(%rcx)
movq %rax, %rbx
LBB0_5:
~~~
Better/tighter assembler code would be (it saves two instructions and one jump):
~~~
testq %r12, %r12
je LBB0_5
movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
jne LBB0_4
leaq 0(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *8(%rcx)
movq %rax, %rbx
LBB0_5:
~~~
This is the IR code that produces the above:
~~~
; Function Attrs: inlinehint nounwind
define internal %struct._foo* @call(%struct._foo* %foo, i8* %key, i8* %value) #2 {
%1 = alloca %struct._foo*, align 8
%2 = alloca %struct._foo*, align 8
%3 = alloca i8*, align 8
%4 = alloca i8*, align 8
%dispatch = alloca %struct._dispatch*, align 8
%table = alloca %struct._table*, align 8
%entrys = alloca %struct._entry*, align 8
%entry = alloca %struct._entry*, align 8
%offset = alloca i64, align 8
%mask = alloca i64, align 8
%f = alloca %struct._foo* (%struct._foo*, i8*, i8*)*, align 8
store %struct._foo* %foo, %struct._foo** %2, align 8
store i8* %key, i8** %3, align 8
store i8* %value, i8** %4, align 8
%5 = load %struct._foo** %2, align 8
%6 = icmp ne %struct._foo* %5, null
br i1 %6, label %9, label %7
; <label>:7 ; preds = %0
%8 = load %struct._foo** %2, align 8
store %struct._foo* %8, %struct._foo** %1
br label %50
; <label>:9 ; preds = %0
%10 = load %struct._foo** %2, align 8
%11 = call %struct._dispatch** @_dispatch_p_from_foo(%struct._foo* %10)
%12 = load %struct._dispatch** %11, align 8
store %struct._dispatch* %12, %struct._dispatch** %dispatch, align 8
%13 = load %struct._dispatch** %dispatch, align 8
%14 = getelementptr inbounds %struct._dispatch* %13, i32 0, i32 0
%15 = load %struct._entry** %14, align 8
store %struct._entry* %15, %struct._entry** %entrys, align 8
%16 = load %struct._entry** %entrys, align 8
%17 = call %struct._table* @_table_from_entrys(%struct._entry* %16)
store %struct._table* %17, %struct._table** %table, align 8
%18 = load %struct._table** %table, align 8
%19 = getelementptr inbounds %struct._table* %18, i32 0, i32 2
%20 = load i64* %19, align 8
store i64 %20, i64* %mask, align 8
%21 = load i8** %3, align 8
%22 = ptrtoint i8* %21 to i64
%23 = load i64* %mask, align 8
%24 = and i64 %22, %23
store i64 %24, i64* %offset, align 8
%25 = load i64* %offset, align 8
%26 = load %struct._entry** %entrys, align 8
%27 = bitcast %struct._entry* %26 to i8*
%28 = getelementptr inbounds i8* %27, i64 %25
%29 = bitcast i8* %28 to %struct._entry*
store %struct._entry* %29, %struct._entry** %entry, align 8
%30 = load %struct._entry** %entry, align 8
%31 = getelementptr inbounds %struct._entry* %30, i32 0, i32 0
%32 = load i8** %31, align 8
%33 = load i8** %3, align 8
%34 = icmp eq i8* %32, %33
br i1 %34, label %35, label %39
; <label>:35 ; preds = %9
%36 = load %struct._entry** %entry, align 8
%37 = getelementptr inbounds %struct._entry* %36, i32 0, i32 1
%38 = load %struct._foo* (%struct._foo*, i8*, i8*)** %37, align 8
br label %43
; <label>:39 ; preds = %9
%40 = load %struct._dispatch** %dispatch, align 8
%41 = getelementptr inbounds %struct._dispatch* %40, i32 0, i32 1
%42 = load %struct._foo* (%struct._foo*, i8*, i8*)** %41, align 8
br label %43
; <label>:43 ; preds = %39, %35
%44 = phi %struct._foo* (%struct._foo*, i8*, i8*)* [ %38, %35 ], [ %42, %39 ]
store %struct._foo* (%struct._foo*, i8*, i8*)* %44, %struct._foo* (%struct._foo*, i8*, i8*)** %f, align 8
%45 = load %struct._foo* (%struct._foo*, i8*, i8*)** %f, align 8
%46 = load %struct._foo** %2, align 8
%47 = load i8** %3, align 8
%48 = load i8** %4, align 8
%49 = call %struct._foo* %45(%struct._foo* %46, i8* %47, i8* %48)
store %struct._foo* %49, %struct._foo** %1
br label %50
; <label>:50 ; preds = %43, %7
%51 = load %struct._foo** %1
ret %struct._foo* %51
}
~~~
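
For context, here is roughly what the C source behind this IR looks like. I have
reconstructed it from the IR above, so the struct layouts, field names and the
two helper declarations are guesses that merely match the getelementptr indices,
not my real headers:
~~~
#include <stdint.h>

struct _foo;
typedef struct _foo *(*foo_call_t)(struct _foo *, void *, void *);

/* Layouts reconstructed from the getelementptr indices above; the real
   field names and the first two _table fields are unknown.              */
struct _entry    { void *key;             foo_call_t imp;      };
struct _dispatch { struct _entry *entrys; foo_call_t fallback; };
struct _table    { void *reserved[2];     uintptr_t  mask;     };

/* Helpers called from the IR; declared here only so the sketch compiles. */
extern struct _dispatch **_dispatch_p_from_foo(struct _foo *foo);
extern struct _table     *_table_from_entrys(struct _entry *entrys);

static inline struct _foo *call(struct _foo *foo, void *key, void *value)
{
    struct _dispatch *dispatch;
    struct _entry    *entrys, *entry;
    uintptr_t         offset, mask;
    foo_call_t        f;

    if (!foo)
        return foo;                      /* the testq/je LBB0_5 guard */

    dispatch = *_dispatch_p_from_foo(foo);
    entrys   = dispatch->entrys;
    mask     = _table_from_entrys(entrys)->mask;
    offset   = (uintptr_t) key & mask;
    entry    = (struct _entry *) ((char *) entrys + offset);

    /* both arms read a function pointer at offset 8 of their struct */
    f = (entry->key == key) ? entry->imp : dispatch->fallback;
    return (*f)(foo, key, value);
}
~~~
The part that matters for this thread is the final conditional: both arms load a
function pointer that sits at the same offset (8 bytes) in its struct, which is
what should allow the single `callq *8(%rcx)` shown above.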
------------------------------------------------------
When vanity and rivalry disappear, all the lines go
out of your stomach and you slow down and coast
slowly to a stop in the middle. -- DLR
Philip Reames
2015-Mar-03 18:49 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
You'll need to provide a bit more information to get any useful response. Questions:

1) What's your use case? Are you using clang to compile C code? Are you manually generating LLVM IR?
2) Are you tracking ToT? If you're not, you should be.

Depending on your use case, you might find this document useful:
http://llvm.org/docs/Frontend/PerformanceTips.html

On 03/03/2015 06:06 AM, Nat! wrote:
> Should I try to change the IR code somehow to get the code generator to produce
> better code, or should I rather go to the code generator and try to add an
> optimization pass?
Nat!
2015-Mar-03 19:20 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
On 03.03.2015, at 19:49, Philip Reames <listmail at philipreames.com> wrote:

Hi Philip,
first, thanks for your response.

> You'll need to provide a bit more information to get any useful response. Questions:
> 1) What's your use case? Are you using clang to compile C code? Are you manually generating LLVM IR?

Yes, the "inline function C code" will be compiled by clang.

> 2) Are you tracking ToT? If you're not, you should be.

Currently the git repo doesn't build for me (libLLVMR600CodeGen fails).
Admittedly my IR output is from an older version - 3.6.

> Depending on your use case, you might find this document useful:
> http://llvm.org/docs/Frontend/PerformanceTips.html

Maybe, yes - if rewriting the IR is sensible at all, which is exactly what I am
wondering: is that a good/useful idea? I don't know.

I basically need to get LLVM to emit
~~~
cmpq %r15, (%rax,%rdx)
jne LBB0_4
leaq 0(%rdx,%rax), %rcx
LBB0_4:
callq *8(%rcx)
~~~
instead of
~~~
cmpq %r15, (%rax,%rdx)
je LBB0_2
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
callq *(%rcx)
~~~
If I can do this by rewriting the IR, that would be nice, because it hopefully
translates to other architectures, and I would dig into that. If in general the
approach is to keep the IR as it is but tweak the code generation, I'll try
that first.

I have only a coarse understanding of how LLVM optimization works. It would be
nice if I could avoid some common dead-ends right from the start.

Ciao
Nat!
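
P.S. To make it concrete, here is a small, self-contained sketch of the two
source-level shapes I am talking about. The struct layouts and names are made up
for illustration; the only assumption that matters is that the function pointer
sits at the same offset (8 bytes on x86_64) in both structs, which is what would
let the backend share a single `callq *8(%rcx)`:
~~~
#include <stddef.h>

struct _foo;
typedef struct _foo *(*foo_call_t)(struct _foo *, void *, void *);

/* Made-up layouts for illustration; what matters is that the function
   pointer lives at the same offset in both structs.                    */
struct _entry    { void *key;             foo_call_t imp;      };
struct _dispatch { struct _entry *entrys; foo_call_t fallback; };

/* current shape: the function pointer is loaded on either side of the
   branch, so the backend materializes two different slot addresses     */
static foo_call_t pick_branchy(struct _entry *entry,
                               struct _dispatch *dispatch, void *key)
{
    if (entry->key == key)
        return entry->imp;
    return dispatch->fallback;
}

/* desired shape: select the base pointer first, then do one load at the
   shared offset - this is what the tighter callq *8(%rcx) sequence does */
static foo_call_t pick_select(struct _entry *entry,
                              struct _dispatch *dispatch, void *key)
{
    void *base = (entry->key == key) ? (void *) entry : (void *) dispatch;

    return *(foo_call_t *) ((char *) base + offsetof(struct _entry, imp));
}
~~~
Whether a base-pointer select like pick_select() is something the optimizer
should be expected to form on its own, or something I should hand it in the IR,
is exactly my question.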
------------------------------------------------------
Lemmings will hurl themselves at projects, promising the world and
then getting bored within a couple of days as they realise that
enthusiasm, good will, and desperation do not produce much code by
themselves. -- H. Suleiman