Nat!
2015-Mar-03 14:06 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
Hi

I have some inline-function C code that LLVM could be optimizing better. Since I am new to this, I wonder if someone could give me a few pointers on how to approach this in LLVM.

Should I try to change the IR code -somehow- to get the code generator to generate better code, or should I rather go to the code generator and try to add an optimization pass?

Thanks for any feedback.

Ciao
Nat!

P.S. In case someone is interested, here is the assembler code and the IR that produced it.

Relevant LLVM-generated x86_64 assembler portion with -Os
~~~
testq %r12, %r12
je LBB0_5
## BB#1:
movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
je LBB0_2
## BB#3:
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *(%rcx)
movq %rax, %rbx
LBB0_5:
~~~

Better/tighter assembler code would be (saves two instructions, one jump less):
~~~
testq %r12, %r12
je LBB0_5

movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
jne LBB0_4

leaq 0(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *8(%rcx)
movq %rax, %rbx
LBB0_5:
~~~

This is the IR code which produces the above:
~~~
; Function Attrs: inlinehint nounwind
define internal %struct._foo* @call(%struct._foo* %foo, i8* %key, i8* %value) #2 {
  %1 = alloca %struct._foo*, align 8
  %2 = alloca %struct._foo*, align 8
  %3 = alloca i8*, align 8
  %4 = alloca i8*, align 8
  %dispatch = alloca %struct._dispatch*, align 8
  %table = alloca %struct._table*, align 8
  %entrys = alloca %struct._entry*, align 8
  %entry = alloca %struct._entry*, align 8
  %offset = alloca i64, align 8
  %mask = alloca i64, align 8
  %f = alloca %struct._foo* (%struct._foo*, i8*, i8*)*, align 8
  store %struct._foo* %foo, %struct._foo** %2, align 8
  store i8* %key, i8** %3, align 8
  store i8* %value, i8** %4, align 8
  %5 = load %struct._foo** %2, align 8
  %6 = icmp ne %struct._foo* %5, null
  br i1 %6, label %9, label %7

; <label>:7                                       ; preds = %0
  %8 = load %struct._foo** %2, align 8
  store %struct._foo* %8, %struct._foo** %1
  br label %50

; <label>:9                                       ; preds = %0
  %10 = load %struct._foo** %2, align 8
  %11 = call %struct._dispatch** @_dispatch_p_from_foo(%struct._foo* %10)
  %12 = load %struct._dispatch** %11, align 8
  store %struct._dispatch* %12, %struct._dispatch** %dispatch, align 8
  %13 = load %struct._dispatch** %dispatch, align 8
  %14 = getelementptr inbounds %struct._dispatch* %13, i32 0, i32 0
  %15 = load %struct._entry** %14, align 8
  store %struct._entry* %15, %struct._entry** %entrys, align 8
  %16 = load %struct._entry** %entrys, align 8
  %17 = call %struct._table* @_table_from_entrys(%struct._entry* %16)
  store %struct._table* %17, %struct._table** %table, align 8
  %18 = load %struct._table** %table, align 8
  %19 = getelementptr inbounds %struct._table* %18, i32 0, i32 2
  %20 = load i64* %19, align 8
  store i64 %20, i64* %mask, align 8
  %21 = load i8** %3, align 8
  %22 = ptrtoint i8* %21 to i64
  %23 = load i64* %mask, align 8
  %24 = and i64 %22, %23
  store i64 %24, i64* %offset, align 8
  %25 = load i64* %offset, align 8
  %26 = load %struct._entry** %entrys, align 8
  %27 = bitcast %struct._entry* %26 to i8*
  %28 = getelementptr inbounds i8* %27, i64 %25
  %29 = bitcast i8* %28 to %struct._entry*
  store %struct._entry* %29, %struct._entry** %entry, align 8
  %30 = load %struct._entry** %entry, align 8
  %31 = getelementptr inbounds %struct._entry* %30, i32 0, i32 0
  %32 = load i8** %31, align 8
  %33 = load i8** %3, align 8
  %34 = icmp eq i8* %32, %33
  br i1 %34, label %35, label %39

; <label>:35                                      ; preds = %9
  %36 = load %struct._entry** %entry, align 8
  %37 = getelementptr inbounds %struct._entry* %36, i32 0, i32 1
  %38 = load %struct._foo* (%struct._foo*, i8*, i8*)** %37, align 8
  br label %43

; <label>:39                                      ; preds = %9
  %40 = load %struct._dispatch** %dispatch, align 8
  %41 = getelementptr inbounds %struct._dispatch* %40, i32 0, i32 1
  %42 = load %struct._foo* (%struct._foo*, i8*, i8*)** %41, align 8
  br label %43

; <label>:43                                      ; preds = %39, %35
  %44 = phi %struct._foo* (%struct._foo*, i8*, i8*)* [ %38, %35 ], [ %42, %39 ]
  store %struct._foo* (%struct._foo*, i8*, i8*)* %44, %struct._foo* (%struct._foo*, i8*, i8*)** %f, align 8
  %45 = load %struct._foo* (%struct._foo*, i8*, i8*)** %f, align 8
  %46 = load %struct._foo** %2, align 8
  %47 = load i8** %3, align 8
  %48 = load i8** %4, align 8
  %49 = call %struct._foo* %45(%struct._foo* %46, i8* %47, i8* %48)
  store %struct._foo* %49, %struct._foo** %1
  br label %50

; <label>:50                                      ; preds = %43, %7
  %51 = load %struct._foo** %1
  ret %struct._foo* %51
}
~~~

------------------------------------------------------
When vanity and rivalry disappear, all the lines go
out of your stomach and you slow down and coast
slowly to a stop in the middle. -- DLR
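For context, here is a hypothetical C reconstruction of @call that matches the IR above. The struct layouts and field names are guesses inferred from the IR and the assembly (only the struct tags, the field indices, and the helpers @_dispatch_p_from_foo and @_table_from_entrys appear in the listing); this is a sketch of the lookup logic, not Nat!'s actual source.

~~~
#include <stddef.h>
#include <stdint.h>

/* Hypothetical declarations reconstructed from the IR; the real headers are
   not part of the thread, so the layouts and names below are guesses.      */

struct _foo;
typedef struct _foo *(*call_f)(struct _foo *, void *, void *);

struct _entry {
    void   *key;                 /* field 0: key stored in this slot        */
    call_f  call;                /* field 1: function pointer               */
};

struct _dispatch {
    struct _entry *entrys;       /* field 0: entry table                    */
    call_f         call;         /* field 1: fallback function pointer      */
};

struct _table {
    void     *unknown0;          /* fields 0 and 1 are not read here        */
    void     *unknown1;
    uintptr_t mask;              /* field 2: mask applied to the key        */
};

/* helpers that exist in the IR; their bodies are not shown in the thread */
struct _dispatch **_dispatch_p_from_foo(struct _foo *foo);
struct _table     *_table_from_entrys(struct _entry *entrys);

static inline struct _foo *call(struct _foo *foo, void *key, void *value)
{
    struct _dispatch *dispatch;
    struct _entry    *entrys;
    struct _entry    *entry;
    uintptr_t         offset;
    call_f            f;

    if (foo == NULL)
        return foo;

    dispatch = *_dispatch_p_from_foo(foo);
    entrys   = dispatch->entrys;

    /* index into the entry table with the masked key bits */
    offset = (uintptr_t) key & _table_from_entrys(entrys)->mask;
    entry  = (struct _entry *) ((char *) entrys + offset);

    /* hit: call the entry's function; miss: call the dispatch fallback */
    f = (entry->key == key) ? entry->call : dispatch->call;
    return (*f)(foo, key, value);
}
~~~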
Philip Reames
2015-Mar-03 18:49 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
You'll need to provide a bit more information to get any useful response. Questions:
1) What's your use case? Are you using clang to compile C code? Are you manually generating LLVM IR?
2) Are you tracking ToT? If you're not, you should be.

Depending on your use case, you might find this document useful:
http://llvm.org/docs/Frontend/PerformanceTips.html

On 03/03/2015 06:06 AM, Nat! wrote:
> Hi
>
> I have some inline-function C code that LLVM could be optimizing better.
> Since I am new to this, I wonder if someone could give me a few pointers on how to approach this in LLVM.
>
> Should I try to change the IR code -somehow- to get the code generator to generate better code, or should I rather go to the code generator and try to add an optimization pass?
>
> Thanks for any feedback.
>
> Ciao
> Nat!
>
> P.S. In case someone is interested, here is the assembler code and the IR that produced it.
> [assembler and IR listings quoted from the original message snipped]
Nat!
2015-Mar-03 19:20 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
On 03.03.2015, at 19:49, Philip Reames <listmail at philipreames.com> wrote:

Hi Philip,

first, thanks for your response.

> You'll need to provide a bit more information to get any useful response. Questions:
> 1) What's your use case? Are you using clang to compile C code? Are you manually generating LLVM IR?

Yes, the "inline function C code" will be compiled by clang.

> 2) Are you tracking ToT? If you're not, you should be.

Currently the git repo doesn't build for me (libLLVMR600CodeGen fails). Admittedly my IR output is from an older version - 3.6.

> Depending on your use case, you might find this document useful:
> http://llvm.org/docs/Frontend/PerformanceTips.html

Maybe, yes - if rewriting the IR is a sensible approach at all, which is what I am wondering: is that a good/useful idea? I don't know. I basically need to get LLVM to emit

~~~
cmpq %r15, (%rax,%rdx)
jne LBB0_4
leaq 0(%rdx,%rax), %rcx
LBB0_4:
callq *8(%rcx)
~~~

instead of

~~~
cmpq %r15, (%rax,%rdx)
je LBB0_2
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
callq *(%rcx)
~~~

If I can do this by rewriting the IR, that would be nice, because it hopefully translates to other architectures, and I would dig into that. If, in general, the approach is to keep the IR as is but tweak the code generation, I'll try that first. I have only a coarse understanding of how LLVM optimization works, so it would be nice if I could avoid some common dead ends right from the start.

Ciao
Nat!
------------------------------------------------------
Lemmings will hurl themselves at projects, promising
the world and then getting bored within a couple of
days as they realise that enthusiasm, good will, and
desperation do not produce much code by themselves.
-- H. Suleiman
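The reshaping Nat! is asking for can also be sketched at the C level, under the assumption (implied by the assembly) that the function pointer sits at the same offset, 8, in both struct _entry and struct _dispatch, and reusing the hypothetical field names from the reconstruction earlier in the thread: instead of selecting between two separately loaded function pointers, select the base pointer and read the function pointer through the one common offset, which is what lets the backend emit "callq *8(%rcx)". Whether clang preserves this shape through optimization is a separate question; the snippet only illustrates the transformation.

~~~
#include <stddef.h>

/* Minimal hypothetical types, as in the reconstruction above; only the fact
   that the function pointer lives at offset 8 in both structs is actually
   implied by the assembly in this thread.                                  */
struct _foo;
typedef struct _foo *(*call_f)(struct _foo *, void *, void *);
struct _entry    { void *key;             call_f call; };
struct _dispatch { struct _entry *entrys; call_f call; };

/* Shape 1 - mirrors the posted IR: each arm loads its own function pointer,
   so the backend ends up with two addresses that differ only by the
   constant +8, followed by "callq *(%rcx)".                                */
static struct _foo *shape1(struct _dispatch *dispatch, struct _entry *entry,
                           struct _foo *foo, void *key, void *value)
{
    call_f f = (entry->key == key) ? entry->call : dispatch->call;
    return (*f)(foo, key, value);
}

/* Shape 2 - the form the tighter assembly corresponds to: pick the base
   pointer first, then read the function pointer through the one common
   offset, so the +8 can fold into the call ("callq *8(%rcx)"). The pointer
   punning is only there to illustrate the shape.                           */
static struct _foo *shape2(struct _dispatch *dispatch, struct _entry *entry,
                           struct _foo *foo, void *key, void *value)
{
    char  *base = (entry->key == key) ? (char *) entry : (char *) dispatch;
    call_f f    = *(call_f *) (base + offsetof(struct _entry, call));
    return (*f)(foo, key, value);
}
~~~

At the IR level the same idea would correspond to merging the two struct pointers first (a select or a phi of the pointers) and performing the field-1 getelementptr and load once after the join, rather than phi-ing two separately loaded function pointers.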