Nat!
2015-Mar-03 14:06 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
Hi
I have some inline-function C code that LLVM could be optimizing better.
Since I am new to this, I wonder if someone could give me a few pointers on how
to approach this in LLVM.
Should I try to change the IR code somehow to get the code generator to produce
better code, or should I rather go to the code generator and try to add an
optimization pass?
Thanks for any feedback.
Ciao
Nat!
P.S. In case someone is interested, here is the assembler code and the IR that
produced it.
Relevant LLVM-generated x86_64 assembler portion with -Os:
~~~
testq %r12, %r12
je LBB0_5
## BB#1:
movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
je LBB0_2
## BB#3:
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *(%rcx)
movq %rax, %rbx
LBB0_5:
~~~
Better/tighter assembler code would be (it saves two instructions and one jump):
~~~
testq %r12, %r12
je LBB0_5
movq -8(%r12), %rcx
movq (%rcx), %rax
movq -8(%rax), %rdx
andq %r15, %rdx
cmpq %r15, (%rax,%rdx)
jne LBB0_4
leaq 0(%rdx,%rax), %rcx
LBB0_4:
movq %r12, %rdi
movq %r15, %rsi
movq %r14, %rdx
callq *8(%rcx)
movq %rax, %rbx
LBB0_5:
~~~
This is the IR code that produces the above:
~~~
; Function Attrs: inlinehint nounwind
define internal %struct._foo* @call(%struct._foo* %foo, i8* %key, i8* %value) #2 {
%1 = alloca %struct._foo*, align 8
%2 = alloca %struct._foo*, align 8
%3 = alloca i8*, align 8
%4 = alloca i8*, align 8
%dispatch = alloca %struct._dispatch*, align 8
%table = alloca %struct._table*, align 8
%entrys = alloca %struct._entry*, align 8
%entry = alloca %struct._entry*, align 8
%offset = alloca i64, align 8
%mask = alloca i64, align 8
%f = alloca %struct._foo* (%struct._foo*, i8*, i8*)*, align 8
store %struct._foo* %foo, %struct._foo** %2, align 8
store i8* %key, i8** %3, align 8
store i8* %value, i8** %4, align 8
%5 = load %struct._foo** %2, align 8
%6 = icmp ne %struct._foo* %5, null
br i1 %6, label %9, label %7
; <label>:7 ; preds = %0
%8 = load %struct._foo** %2, align 8
store %struct._foo* %8, %struct._foo** %1
br label %50
; <label>:9 ; preds = %0
%10 = load %struct._foo** %2, align 8
%11 = call %struct._dispatch** @_dispatch_p_from_foo(%struct._foo* %10)
%12 = load %struct._dispatch** %11, align 8
store %struct._dispatch* %12, %struct._dispatch** %dispatch, align 8
%13 = load %struct._dispatch** %dispatch, align 8
%14 = getelementptr inbounds %struct._dispatch* %13, i32 0, i32 0
%15 = load %struct._entry** %14, align 8
store %struct._entry* %15, %struct._entry** %entrys, align 8
%16 = load %struct._entry** %entrys, align 8
%17 = call %struct._table* @_table_from_entrys(%struct._entry* %16)
store %struct._table* %17, %struct._table** %table, align 8
%18 = load %struct._table** %table, align 8
%19 = getelementptr inbounds %struct._table* %18, i32 0, i32 2
%20 = load i64* %19, align 8
store i64 %20, i64* %mask, align 8
%21 = load i8** %3, align 8
%22 = ptrtoint i8* %21 to i64
%23 = load i64* %mask, align 8
%24 = and i64 %22, %23
store i64 %24, i64* %offset, align 8
%25 = load i64* %offset, align 8
%26 = load %struct._entry** %entrys, align 8
%27 = bitcast %struct._entry* %26 to i8*
%28 = getelementptr inbounds i8* %27, i64 %25
%29 = bitcast i8* %28 to %struct._entry*
store %struct._entry* %29, %struct._entry** %entry, align 8
%30 = load %struct._entry** %entry, align 8
%31 = getelementptr inbounds %struct._entry* %30, i32 0, i32 0
%32 = load i8** %31, align 8
%33 = load i8** %3, align 8
%34 = icmp eq i8* %32, %33
br i1 %34, label %35, label %39
; <label>:35 ; preds = %9
%36 = load %struct._entry** %entry, align 8
%37 = getelementptr inbounds %struct._entry* %36, i32 0, i32 1
%38 = load %struct._foo* (%struct._foo*, i8*, i8*)** %37, align 8
br label %43
; <label>:39 ; preds = %9
%40 = load %struct._dispatch** %dispatch, align 8
%41 = getelementptr inbounds %struct._dispatch* %40, i32 0, i32 1
%42 = load %struct._foo* (%struct._foo*, i8*, i8*)** %41, align 8
br label %43
; <label>:43 ; preds = %39, %35
%44 = phi %struct._foo* (%struct._foo*, i8*, i8*)* [ %38, %35 ], [ %42, %39 ]
store %struct._foo* (%struct._foo*, i8*, i8*)* %44, %struct._foo* (%struct._foo*, i8*, i8*)** %f, align 8
%45 = load %struct._foo* (%struct._foo*, i8*, i8*)** %f, align 8
%46 = load %struct._foo** %2, align 8
%47 = load i8** %3, align 8
%48 = load i8** %4, align 8
%49 = call %struct._foo* %45(%struct._foo* %46, i8* %47, i8* %48)
store %struct._foo* %49, %struct._foo** %1
br label %50
; <label>:50 ; preds = %43, %7
%51 = load %struct._foo** %1
ret %struct._foo* %51
}
~~~
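
For context, here is roughly what the C source behind this IR looks like. I have
reconstructed it from the IR above, so the struct layouts, field names and the
two helper declarations are guesses that merely match the getelementptr indices,
not my real headers:
~~~
#include <stdint.h>

struct _foo;
typedef struct _foo *(*foo_call_t)(struct _foo *, void *, void *);

/* Layouts reconstructed from the getelementptr indices above; the real
   field names and the first two _table fields are unknown.              */
struct _entry    { void *key;             foo_call_t imp;      };
struct _dispatch { struct _entry *entrys; foo_call_t fallback; };
struct _table    { void *reserved[2];     uintptr_t  mask;     };

/* Helpers called from the IR; declared here only so the sketch compiles. */
extern struct _dispatch **_dispatch_p_from_foo(struct _foo *foo);
extern struct _table     *_table_from_entrys(struct _entry *entrys);

static inline struct _foo *call(struct _foo *foo, void *key, void *value)
{
    struct _dispatch *dispatch;
    struct _entry    *entrys, *entry;
    uintptr_t         offset, mask;
    foo_call_t        f;

    if (!foo)
        return foo;                      /* the testq/je LBB0_5 guard */

    dispatch = *_dispatch_p_from_foo(foo);
    entrys   = dispatch->entrys;
    mask     = _table_from_entrys(entrys)->mask;
    offset   = (uintptr_t) key & mask;
    entry    = (struct _entry *) ((char *) entrys + offset);

    /* both arms read a function pointer at offset 8 of their struct */
    f = (entry->key == key) ? entry->imp : dispatch->fallback;
    return (*f)(foo, key, value);
}
~~~
The part that matters for this thread is the final conditional: both arms load a
function pointer that sits at the same offset (8 bytes) in its struct, which is
what should allow the single `callq *8(%rcx)` shown above.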
------------------------------------------------------
When vanity and rivalry disappear, all the lines go
out of your stomach and you slow down and coast
slowly to a stop in the middle. -- DLR
Philip Reames
2015-Mar-03 18:49 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
You'll need to provide a bit more information to get any useful response. Questions:

1) What's your use case? Are you using clang to compile C code? Are you manually generating LLVM IR?
2) Are you tracking ToT? If you're not, you should be.

Depending on your use case, you might find this document useful:
http://llvm.org/docs/Frontend/PerformanceTips.html

On 03/03/2015 06:06 AM, Nat! wrote:
> Should I try to change the IR code somehow to get the code generator to produce
> better code, or should I rather go to the code generator and try to add an
> optimization pass?
Nat!
2015-Mar-03 19:20 UTC
[LLVMdev] Need a clue to improve the optimization of some C code
On 03.03.2015, at 19:49, Philip Reames <listmail at philipreames.com> wrote:

Hi Philip,
first, thanks for your response.

> You'll need to provide a bit more information to get any useful response. Questions:
> 1) What's your use case? Are you using clang to compile C code? Are you manually generating LLVM IR?

Yes, the "inline function C code" will be compiled by clang.

> 2) Are you tracking ToT? If you're not, you should be.

Currently the git repo doesn't build for me (libLLVMR600CodeGen fails).
Admittedly my IR output is from an older version - 3.6.

> Depending on your use case, you might find this document useful:
> http://llvm.org/docs/Frontend/PerformanceTips.html

Maybe, yes - if rewriting the IR is sensible at all, which is exactly what I am
wondering: is that a good/useful idea? I don't know.

I basically need to get LLVM to emit
~~~
cmpq %r15, (%rax,%rdx)
jne LBB0_4
leaq 0(%rdx,%rax), %rcx
LBB0_4:
callq *8(%rcx)
~~~
instead of
~~~
cmpq %r15, (%rax,%rdx)
je LBB0_2
addq $8, %rcx
jmp LBB0_4
LBB0_2:
leaq 8(%rdx,%rax), %rcx
LBB0_4:
callq *(%rcx)
~~~
If I can do this by rewriting the IR, that would be nice, because it hopefully
translates to other architectures, and I would dig into that. If in general the
approach is to keep the IR as it is but tweak the code generation, I'll try
that first.

I have only a coarse understanding of how LLVM optimization works. It would be
nice if I could avoid some common dead-ends right from the start.

Ciao
Nat!
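
P.S. To make it concrete, here is a small, self-contained sketch of the two
source-level shapes I am talking about. The struct layouts and names are made up
for illustration; the only assumption that matters is that the function pointer
sits at the same offset (8 bytes on x86_64) in both structs, which is what would
let the backend share a single `callq *8(%rcx)`:
~~~
#include <stddef.h>

struct _foo;
typedef struct _foo *(*foo_call_t)(struct _foo *, void *, void *);

/* Made-up layouts for illustration; what matters is that the function
   pointer lives at the same offset in both structs.                    */
struct _entry    { void *key;             foo_call_t imp;      };
struct _dispatch { struct _entry *entrys; foo_call_t fallback; };

/* current shape: the function pointer is loaded on either side of the
   branch, so the backend materializes two different slot addresses     */
static foo_call_t pick_branchy(struct _entry *entry,
                               struct _dispatch *dispatch, void *key)
{
    if (entry->key == key)
        return entry->imp;
    return dispatch->fallback;
}

/* desired shape: select the base pointer first, then do one load at the
   shared offset - this is what the tighter callq *8(%rcx) sequence does */
static foo_call_t pick_select(struct _entry *entry,
                              struct _dispatch *dispatch, void *key)
{
    void *base = (entry->key == key) ? (void *) entry : (void *) dispatch;

    return *(foo_call_t *) ((char *) base + offsetof(struct _entry, imp));
}
~~~
Whether a base-pointer select like pick_select() is something the optimizer
should be expected to form on its own, or something I should hand it in the IR,
is exactly my question.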
------------------------------------------------------
Lemmings will hurl themselves at projects, promising the world and
then getting bored within a couple of days as they realise that
enthusiasm, good will, and desperation do not produce much code by
themselves. -- H. Suleiman