thr3ads.net - llvm dev - [llvm-dev] Memory Store/Load Optimization Issue (Emulating stack) [Feb 2016]

If this information is useful, please help other people find it:
Share via:

Paul Peet via llvm-dev

2016-Feb-10 20:18 UTC

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

Thank you for the hint.

I adjusted the code and it works:

The code after replacing inttoptr with getelementptr:

define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
entry:
  ; push foo (On "stack")
  %sp_1 = getelementptr i8, i8* %sp, i32 -4
  %sp_1_ptr = bitcast i8* %sp_1 to i32*
  store i32 %foo, i32* %sp_1_ptr, align 4

  ; push bar
  %sp_2 = getelementptr i8, i8* %sp_1, i32 -4
  %sp_2_ptr = bitcast i8* %sp_2 to i32*
  store i32 %bar, i32* %sp_2_ptr, align 4

  ; val1 = pop (val1 = bar)
  %sp_3_ptr = bitcast i8* %sp_2 to i32*
  %val1 = load i32, i32* %sp_3_ptr, align 4
  %sp_3 = getelementptr i8, i8* %sp_2, i32 4

  ; val2 = pop (val2 = foo)
  %sp_4_ptr = bitcast i8* %sp_3 to i32*
  %val2 = load i32, i32* %sp_4_ptr, align 4
  %sp_4 = getelementptr i8, i8* %sp_3, i32 4

  %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %val1, 0
  %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %val2, 1
  %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp_4, 2

  ret { i32, i32, i8* } %ret_3
}

After optimization ("opt -instcombine ./code.ll -S")

define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
entry:
  %sp_1 = getelementptr i8, i8* %sp, i64 -4
  %sp_1_ptr = bitcast i8* %sp_1 to i32*
  store i32 %foo, i32* %sp_1_ptr, align 4
  %sp_2 = getelementptr i8, i8* %sp, i64 -8
  %sp_2_ptr = bitcast i8* %sp_2 to i32*
  store i32 %bar, i32* %sp_2_ptr, align 4
  %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %bar, 0
  %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %foo, 1
  %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp, 2
  ret { i32, i32, i8* } %ret_3
}

My only questions are now:
- How is it that inttoptr cannot provide that specific alias information so
it can optimize that store/load away ?
- Might it be possible to get inttoptr providing such alias analysis ?
- I came across MemorySSA while browsing though the llvm source. Is it
possible that one can use MemorySSA to do such optimization without alias
analysis ?
- Where do I have to look in the source which is doing this kind of
optimization (Is it instcombine which uses lib/Analysis/Loads.cpp ?)

Regards,
Paul


2016-02-10 0:26 GMT+01:00 Philip Reames <listmail at philipreames.com>:
> Two points:
> - Using inttoptr is a mistake here.  GEPs are strongly preferred and
> provide strictly more aliasing information to the optimizer.
> - The zext is a bit weird.  I'm not sure where that came from, but
I'd not
> bother looking into until the preceding point is addressed.
>
> In general, you may find these docs useful:
> http://llvm.org/docs/Frontend/PerformanceTips.html
>
> Philip
>
>
>
> On 02/08/2016 06:54 AM, Paul Peet via llvm-dev wrote:
>
> Hello,
>
> I am trying to emulate the "stack" as like on x86 when using
push/pop so
> afterwards I can use LLVM's optimizer passes to simplify (reduce junk)
the
> code.
>
> The LLVM IR code:
>
> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) {
>   ; push foo (On "stack")
>   %sp_1 = sub i32 %sp, 4
>   %sp_1_ptr = inttoptr i32 %sp_1 to i32*
>   store i32 %foo, i32* %sp_1_ptr, align 4
>
>   ; push bar
>   %sp_2 = sub i32 %sp_1, 4
>   %sp_2_ptr = inttoptr i32 %sp_2 to i32*
>   store i32 %bar, i32* %sp_2_ptr, align 4
>
>   ; val1 = pop (val1 = bar)
>   %sp_3_ptr = inttoptr i32 %sp_2 to i32*
>   %val1 = load i32, i32* %sp_3_ptr, align 4
>   %sp_3 = add i32 %sp_2, 4
>
>   ; val2 = pop (val2 = foo)
>   %sp_4_ptr = inttoptr i32 %sp_3 to i32*
>   %val2 = load i32, i32* %sp_4_ptr, align 4
>   %sp_4 = add i32 %sp_3, 4
>
>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %val1, 0
>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1
>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp_4, 2
>
>   ret { i32, i32, i32 } %ret_3
> }
>
> This code will "push" two values onto the stack and pop them in
reverse
> order so afterwards "foo" and "bar" will be swapped and
returned back.
>
> After running this through "opt -O2 ./test.ll", I am getting
this:
>
> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) #0 {
>   %sp_1 = add i32 %sp, -4
>   %1 = zext i32 %sp_1 to i64
>   %sp_1_ptr = inttoptr i64 %1 to i32*
>   store i32 %foo, i32* %sp_1_ptr, align 4
>   %sp_2 = add i32 %sp, -8
>   %2 = zext i32 %sp_2 to i64
>   %sp_2_ptr = inttoptr i64 %2 to i32*
>   store i32 %bar, i32* %sp_2_ptr, align 4
>   %val2 = load i32, i32* %sp_1_ptr, align 4
>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %bar, 0 ; Swapped
>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1; Not Swapped
> (Not optimized; Should be %foo)
>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp, 2
>   ret { i32, i32, i32 } %ret_3
> }
>
> As you can see that the IR has got additional code, eg. zext. But the main
> problem here is that val2 hasn't been optimized.
> Could anyone show me some hints what is preventing the second val from
> being optimized? (My guess would be the zext because I am using %sp as a
> 32bit pointer although the "target" is 64bit).
>
> Regards,
> Paul
>
>
> _______________________________________________
> LLVM Developers mailing listllvm-dev at
lists.llvm.orghttp://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20160210/df898102/attachment.html>

Daniel Berlin via llvm-dev

2016-Feb-10 20:24 UTC

head link

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

On Wed, Feb 10, 2016 at 12:18 PM, Paul Peet via llvm-dev <
llvm-dev at lists.llvm.org> wrote:
> Thank you for the hint.
>
> I adjusted the code and it works:
>
> The code after replacing inttoptr with getelementptr:
>
> define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
> entry:
>   ; push foo (On "stack")
>   %sp_1 = getelementptr i8, i8* %sp, i32 -4
>   %sp_1_ptr = bitcast i8* %sp_1 to i32*
>   store i32 %foo, i32* %sp_1_ptr, align 4
>
>   ; push bar
>   %sp_2 = getelementptr i8, i8* %sp_1, i32 -4
>   %sp_2_ptr = bitcast i8* %sp_2 to i32*
>   store i32 %bar, i32* %sp_2_ptr, align 4
>
>   ; val1 = pop (val1 = bar)
>   %sp_3_ptr = bitcast i8* %sp_2 to i32*
>   %val1 = load i32, i32* %sp_3_ptr, align 4
>   %sp_3 = getelementptr i8, i8* %sp_2, i32 4
>
>   ; val2 = pop (val2 = foo)
>   %sp_4_ptr = bitcast i8* %sp_3 to i32*
>   %val2 = load i32, i32* %sp_4_ptr, align 4
>   %sp_4 = getelementptr i8, i8* %sp_3, i32 4
>
>   %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %val1, 0
>   %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %val2, 1
>   %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp_4, 2
>
>   ret { i32, i32, i8* } %ret_3
> }
>
> After optimization ("opt -instcombine ./code.ll -S")
>
> define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
> entry:
>   %sp_1 = getelementptr i8, i8* %sp, i64 -4
>   %sp_1_ptr = bitcast i8* %sp_1 to i32*
>   store i32 %foo, i32* %sp_1_ptr, align 4
>   %sp_2 = getelementptr i8, i8* %sp, i64 -8
>   %sp_2_ptr = bitcast i8* %sp_2 to i32*
>   store i32 %bar, i32* %sp_2_ptr, align 4
>   %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %bar, 0
>   %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %foo, 1
>   %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp, 2
>   ret { i32, i32, i8* } %ret_3
> }
>
> My only questions are now:
> - How is it that inttoptr cannot provide that specific alias information
> so it can optimize that store/load away ?
>Because nothing tracks what happens to the ints, and what happens when they
are converted back to pointers and whether it's sane :)

http://llvm.org/docs/GetElementPtr.html#how-is-gep-different-from-ptrtoint-arithmetic-and-inttoptr

- Might it be possible to get inttoptr providing such alias analysis
?>It doesn't make a lot of sense to try in most cases.
Most of the cases ptrtoint/inttoptr is useful are those where you want to
do crazy things to the pointer.


> - I came across MemorySSA while browsing though the llvm source. Is it
> possible that one can use MemorySSA to do such optimization without alias
> analysis ?
>
MemorySSA relies on alias analysis to generate the SSA form.

> - Where do I have to look in the source which is doing this kind of
> optimization (Is it instcombine which uses lib/Analysis/Loads.cpp ?)
>
> It's probably a combination of opts. The most likely candidate is -gvn,but  I would look at the pass dumps after each opt
> Regards,
> Paul
>
>
> 2016-02-10 0:26 GMT+01:00 Philip Reames <listmail at
philipreames.com>:
>
>> Two points:
>> - Using inttoptr is a mistake here.  GEPs are strongly preferred and
>> provide strictly more aliasing information to the optimizer.
>> - The zext is a bit weird.  I'm not sure where that came from, but
I'd
>> not bother looking into until the preceding point is addressed.
>>
>> In general, you may find these docs useful:
>> http://llvm.org/docs/Frontend/PerformanceTips.html
>>
>> Philip
>>
>>
>>
>> On 02/08/2016 06:54 AM, Paul Peet via llvm-dev wrote:
>>
>> Hello,
>>
>> I am trying to emulate the "stack" as like on x86 when using
push/pop so
>> afterwards I can use LLVM's optimizer passes to simplify (reduce
junk) the
>> code.
>>
>> The LLVM IR code:
>>
>> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) {
>>   ; push foo (On "stack")
>>   %sp_1 = sub i32 %sp, 4
>>   %sp_1_ptr = inttoptr i32 %sp_1 to i32*
>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>
>>   ; push bar
>>   %sp_2 = sub i32 %sp_1, 4
>>   %sp_2_ptr = inttoptr i32 %sp_2 to i32*
>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>
>>   ; val1 = pop (val1 = bar)
>>   %sp_3_ptr = inttoptr i32 %sp_2 to i32*
>>   %val1 = load i32, i32* %sp_3_ptr, align 4
>>   %sp_3 = add i32 %sp_2, 4
>>
>>   ; val2 = pop (val2 = foo)
>>   %sp_4_ptr = inttoptr i32 %sp_3 to i32*
>>   %val2 = load i32, i32* %sp_4_ptr, align 4
>>   %sp_4 = add i32 %sp_3, 4
>>
>>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %val1, 0
>>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1
>>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp_4, 2
>>
>>   ret { i32, i32, i32 } %ret_3
>> }
>>
>> This code will "push" two values onto the stack and pop them
in reverse
>> order so afterwards "foo" and "bar" will be swapped
and returned back.
>>
>> After running this through "opt -O2 ./test.ll", I am getting
this:
>>
>> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) #0 {
>>   %sp_1 = add i32 %sp, -4
>>   %1 = zext i32 %sp_1 to i64
>>   %sp_1_ptr = inttoptr i64 %1 to i32*
>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>   %sp_2 = add i32 %sp, -8
>>   %2 = zext i32 %sp_2 to i64
>>   %sp_2_ptr = inttoptr i64 %2 to i32*
>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>   %val2 = load i32, i32* %sp_1_ptr, align 4
>>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %bar, 0 ; Swapped
>>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1; Not
>> Swapped (Not optimized; Should be %foo)
>>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp, 2
>>   ret { i32, i32, i32 } %ret_3
>> }
>>
>> As you can see that the IR has got additional code, eg. zext. But the
>> main problem here is that val2 hasn't been optimized.
>> Could anyone show me some hints what is preventing the second val from
>> being optimized? (My guess would be the zext because I am using %sp as
a
>> 32bit pointer although the "target" is 64bit).
>>
>> Regards,
>> Paul
>>
>>
>> _______________________________________________
>> LLVM Developers mailing listllvm-dev at
lists.llvm.orghttp://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>>
>>
>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20160210/988dd066/attachment.html>

Hal Finkel via llvm-dev

2016-Feb-10 20:29 UTC

head link

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

----- Original Message ----- 
> From: "Daniel Berlin via llvm-dev" <llvm-dev at
lists.llvm.org>
> To: "Paul Peet" <paulpeet17 at gmail.com>
> Cc: "llvm-dev" <llvm-dev at lists.llvm.org>
> Sent: Wednesday, February 10, 2016 2:24:01 PM
> Subject: Re: [llvm-dev] Memory Store/Load Optimization Issue
> (Emulating stack)
> On Wed, Feb 10, 2016 at 12:18 PM, Paul Peet via llvm-dev <
> llvm-dev at lists.llvm.org > wrote:
> > Thank you for the hint.
> 
> > I adjusted the code and it works:
> 
> > The code after replacing inttoptr with getelementptr:
> 
> > define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
> 
> > entry:
> 
> > ; push foo (On "stack")
> 
> > %sp_1 = getelementptr i8, i8* %sp, i32 -4
> 
> > %sp_1_ptr = bitcast i8* %sp_1 to i32*
> 
> > store i32 %foo, i32* %sp_1_ptr, align 4
> 
> > ; push bar
> 
> > %sp_2 = getelementptr i8, i8* %sp_1, i32 -4
> 
> > %sp_2_ptr = bitcast i8* %sp_2 to i32*
> 
> > store i32 %bar, i32* %sp_2_ptr, align 4
> 
> > ; val1 = pop (val1 = bar)
> 
> > %sp_3_ptr = bitcast i8* %sp_2 to i32*
> 
> > %val1 = load i32, i32* %sp_3_ptr, align 4
> 
> > %sp_3 = getelementptr i8, i8* %sp_2, i32 4
> 
> > ; val2 = pop (val2 = foo)
> 
> > %sp_4_ptr = bitcast i8* %sp_3 to i32*
> 
> > %val2 = load i32, i32* %sp_4_ptr, align 4
> 
> > %sp_4 = getelementptr i8, i8* %sp_3, i32 4
> 
> > %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %val1, 0
> 
> > %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %val2, 1
> 
> > %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp_4, 2
> 
> > ret { i32, i32, i8* } %ret_3
> 
> > }
> 
> > After optimization ("opt -instcombine ./code.ll -S")
> 
> > define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
> 
> > entry:
> 
> > %sp_1 = getelementptr i8, i8* %sp, i64 -4
> 
> > %sp_1_ptr = bitcast i8* %sp_1 to i32*
> 
> > store i32 %foo, i32* %sp_1_ptr, align 4
> 
> > %sp_2 = getelementptr i8, i8* %sp, i64 -8
> 
> > %sp_2_ptr = bitcast i8* %sp_2 to i32*
> 
> > store i32 %bar, i32* %sp_2_ptr, align 4
> 
> > %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %bar, 0
> 
> > %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %foo, 1
> 
> > %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp, 2
> 
> > ret { i32, i32, i8* } %ret_3
> 
> > }
> 
> > My only questions are now:
> 
> > - How is it that inttoptr cannot provide that specific alias
> > information so it can optimize that store/load away ?
> 
> Because nothing tracks what happens to the ints, and what happens
> when they are converted back to pointers and whether it's sane :)
>
http://llvm.org/docs/GetElementPtr.html#how-is-gep-different-from-ptrtoint-arithmetic-and-inttoptr
> > - Might it be possible to get inttoptr providing such alias
> > analysis
> > ?
> 
> It doesn't make a lot of sense to try in most cases.
> Most of the cases ptrtoint/inttoptr is useful are those where you
> want to do crazy things to the pointer.
> > - I came across MemorySSA while browsing though the llvm source. Is
> > it possible that one can use MemorySSA to do such optimization
> > without alias analysis ?
> 
> MemorySSA relies on alias analysis to generate the SSA form.
> > - Where do I have to look in the source which is doing this kind of
> > optimization (Is it instcombine which uses lib/Analysis/Loads.cpp
> > ?)
> 
> It's probably a combination of opts. The most likely candidate is
> -gvn, but I would look at the pass dumps after each opt
I recommend running your IR through opt with -print-after-all, and you'll
see what optimization passes are doing what.

 -Hal
> > Regards,
> 
> > Paul
> 
> > 2016-02-10 0:26 GMT+01:00 Philip Reames < listmail at
philipreames.com
> > >
> > :
> 
> > > Two points:
> > 
> 
> > > - Using inttoptr is a mistake here. GEPs are strongly preferred
> > > and
> > > provide strictly more aliasing information to the optimizer.
> > 
> 
> > > - The zext is a bit weird. I'm not sure where that came from,
but
> > > I'd
> > > not bother looking into until the preceding point is addressed.
> > 
> 
> > > In general, you may find these docs useful:
> > 
> 
> > > http://llvm.org/docs/Frontend/PerformanceTips.html
> > 
> 
> > > Philip
> > 
> 
> > > On 02/08/2016 06:54 AM, Paul Peet via llvm-dev wrote:
> > 
> 
> > > > Hello,
> > > 
> > 
> 
> > > > I am trying to emulate the "stack" as like on x86
when using
> > > > push/pop
> > > > so afterwards I can use LLVM's optimizer passes to
simplify
> > > > (reduce
> > > > junk) the code.
> > > 
> > 
> 
> > > > The LLVM IR code:
> > > 
> > 
> 
> > > > define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp)
{
> > > 
> > 
> 
> > > > ; push foo (On "stack")
> > > 
> > 
> 
> > > > %sp_1 = sub i32 %sp, 4
> > > 
> > 
> 
> > > > %sp_1_ptr = inttoptr i32 %sp_1 to i32*
> > > 
> > 
> 
> > > > store i32 %foo, i32* %sp_1_ptr, align 4
> > > 
> > 
> 
> > > > ; push bar
> > > 
> > 
> 
> > > > %sp_2 = sub i32 %sp_1, 4
> > > 
> > 
> 
> > > > %sp_2_ptr = inttoptr i32 %sp_2 to i32*
> > > 
> > 
> 
> > > > store i32 %bar, i32* %sp_2_ptr, align 4
> > > 
> > 
> 
> > > > ; val1 = pop (val1 = bar)
> > > 
> > 
> 
> > > > %sp_3_ptr = inttoptr i32 %sp_2 to i32*
> > > 
> > 
> 
> > > > %val1 = load i32, i32* %sp_3_ptr, align 4
> > > 
> > 
> 
> > > > %sp_3 = add i32 %sp_2, 4
> > > 
> > 
> 
> > > > ; val2 = pop (val2 = foo)
> > > 
> > 
> 
> > > > %sp_4_ptr = inttoptr i32 %sp_3 to i32*
> > > 
> > 
> 
> > > > %val2 = load i32, i32* %sp_4_ptr, align 4
> > > 
> > 
> 
> > > > %sp_4 = add i32 %sp_3, 4
> > > 
> > 
> 
> > > > %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %val1, 0
> > > 
> > 
> 
> > > > %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1
> > > 
> > 
> 
> > > > %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp_4, 2
> > > 
> > 
> 
> > > > ret { i32, i32, i32 } %ret_3
> > > 
> > 
> 
> > > > }
> > > 
> > 
> 
> > > > This code will "push" two values onto the stack
and pop them in
> > > > reverse order so afterwards "foo" and
"bar" will be swapped and
> > > > returned back.
> > > 
> > 
> 
> > > > After running this through "opt -O2 ./test.ll", I
am getting
> > > > this:
> > > 
> > 
> 
> > > > define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp)
#0
> > > > {
> > > 
> > 
> 
> > > > %sp_1 = add i32 %sp, -4
> > > 
> > 
> 
> > > > %1 = zext i32 %sp_1 to i64
> > > 
> > 
> 
> > > > %sp_1_ptr = inttoptr i64 %1 to i32*
> > > 
> > 
> 
> > > > store i32 %foo, i32* %sp_1_ptr, align 4
> > > 
> > 
> 
> > > > %sp_2 = add i32 %sp, -8
> > > 
> > 
> 
> > > > %2 = zext i32 %sp_2 to i64
> > > 
> > 
> 
> > > > %sp_2_ptr = inttoptr i64 %2 to i32*
> > > 
> > 
> 
> > > > store i32 %bar, i32* %sp_2_ptr, align 4
> > > 
> > 
> 
> > > > %val2 = load i32, i32* %sp_1_ptr, align 4
> > > 
> > 
> 
> > > > %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %bar, 0 ;
> > > > Swapped
> > > 
> > 
> 
> > > > %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1;
> > > > Not
> > > > Swapped (Not optimized; Should be %foo)
> > > 
> > 
> 
> > > > %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp, 2
> > > 
> > 
> 
> > > > ret { i32, i32, i32 } %ret_3
> > > 
> > 
> 
> > > > }
> > > 
> > 
> 
> > > > As you can see that the IR has got additional code, eg.
zext.
> > > > But
> > > > the
> > > > main problem here is that val2 hasn't been optimized.
> > > 
> > 
> 
> > > > Could anyone show me some hints what is preventing the
second
> > > > val
> > > > from being optimized? (My guess would be the zext because I
am
> > > > using
> > > > %sp as a 32bit pointer although the "target" is
64bit).
> > > 
> > 
> 
> > > > Regards,
> > > 
> > 
> 
> > > > Paul
> > > 
> > 
> 
> > > > _______________________________________________
> > > 
> > 
> 
> > > > LLVM Developers mailing list llvm-dev at lists.llvm.org
> > > > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
> > > 
> > 
> 
> > _______________________________________________
> 
> > LLVM Developers mailing list
> 
> > llvm-dev at lists.llvm.org
> 
> > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
> 
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
-- 

-- 
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory

Paul Peet via llvm-dev

2016-Feb-10 21:13 UTC

head link

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

Thanks for the answers. Although I am not sure if I've understood the docs
about how inttoptr/ptrtointr are different when compared to gep.
It says: "It’s invalid to take a GEP from one object, address into a
different separately allocated object, and dereference it.".
To go back to my intention why I am doing this, I would like to
"emulate"
some x86 instructions with llvm-ir but as far as I understand that aliasing
rule, I am not sure if I am breaking that rule.

For example when translating this x86 code to llvm ir:

  push eax
  add esp, 2
  push ecx
...

  ; push foo (On "stack")
  %sp_1 = getelementptr i8, i8* %sp, i32 -4
  %sp_1_ptr = bitcast i8* %sp_1 to i32*
  store i32 %foo, i32* %sp_1_ptr, align 4

  %sp_x = getelementptr i8, i8* %sp_1, i32 2

  ; push bar
  %sp_2 = getelementptr i8, i8* %sp_x, i32 -4
  %sp_2_ptr = bitcast i8* %sp_2 to i32*
  store i32 %bar, i32* %sp_2_ptr, align 4

Both objects (eax, ecx) will overlap because of the size difference (eax i32).
What are the consequences when doing this. Will this break alias
analysis for the further instructions?

2016-02-10 21:24 GMT+01:00 Daniel Berlin <dberlin at dberlin.org>:
>
>
> On Wed, Feb 10, 2016 at 12:18 PM, Paul Peet via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
>
>> Thank you for the hint.
>>
>> I adjusted the code and it works:
>>
>> The code after replacing inttoptr with getelementptr:
>>
>> define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
>> entry:
>>   ; push foo (On "stack")
>>   %sp_1 = getelementptr i8, i8* %sp, i32 -4
>>   %sp_1_ptr = bitcast i8* %sp_1 to i32*
>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>
>>   ; push bar
>>   %sp_2 = getelementptr i8, i8* %sp_1, i32 -4
>>   %sp_2_ptr = bitcast i8* %sp_2 to i32*
>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>
>>   ; val1 = pop (val1 = bar)
>>   %sp_3_ptr = bitcast i8* %sp_2 to i32*
>>   %val1 = load i32, i32* %sp_3_ptr, align 4
>>   %sp_3 = getelementptr i8, i8* %sp_2, i32 4
>>
>>   ; val2 = pop (val2 = foo)
>>   %sp_4_ptr = bitcast i8* %sp_3 to i32*
>>   %val2 = load i32, i32* %sp_4_ptr, align 4
>>   %sp_4 = getelementptr i8, i8* %sp_3, i32 4
>>
>>   %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %val1, 0
>>   %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %val2, 1
>>   %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp_4, 2
>>
>>   ret { i32, i32, i8* } %ret_3
>> }
>>
>> After optimization ("opt -instcombine ./code.ll -S")
>>
>> define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
>> entry:
>>   %sp_1 = getelementptr i8, i8* %sp, i64 -4
>>   %sp_1_ptr = bitcast i8* %sp_1 to i32*
>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>   %sp_2 = getelementptr i8, i8* %sp, i64 -8
>>   %sp_2_ptr = bitcast i8* %sp_2 to i32*
>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>   %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %bar, 0
>>   %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %foo, 1
>>   %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp, 2
>>   ret { i32, i32, i8* } %ret_3
>> }
>>
>> My only questions are now:
>> - How is it that inttoptr cannot provide that specific alias
information
>> so it can optimize that store/load away ?
>>
> Because nothing tracks what happens to the ints, and what happens when
> they are converted back to pointers and whether it's sane :)
>
>
http://llvm.org/docs/GetElementPtr.html#how-is-gep-different-from-ptrtoint-arithmetic-and-inttoptr
>
> - Might it be possible to get inttoptr providing such alias analysis ?
>>
> It doesn't make a lot of sense to try in most cases.
> Most of the cases ptrtoint/inttoptr is useful are those where you want to
> do crazy things to the pointer.
>
>
>
>> - I came across MemorySSA while browsing though the llvm source. Is it
>> possible that one can use MemorySSA to do such optimization without
alias
>> analysis ?
>>
>
> MemorySSA relies on alias analysis to generate the SSA form.
>
>
>> - Where do I have to look in the source which is doing this kind of
>> optimization (Is it instcombine which uses lib/Analysis/Loads.cpp ?)
>>
>> It's probably a combination of opts. The most likely candidate is
-gvn,
> but  I would look at the pass dumps after each opt
>
>> Regards,
>> Paul
>>
>>
>> 2016-02-10 0:26 GMT+01:00 Philip Reames <listmail at
philipreames.com>:
>>
>>> Two points:
>>> - Using inttoptr is a mistake here.  GEPs are strongly preferred
and
>>> provide strictly more aliasing information to the optimizer.
>>> - The zext is a bit weird.  I'm not sure where that came from,
but I'd
>>> not bother looking into until the preceding point is addressed.
>>>
>>> In general, you may find these docs useful:
>>> http://llvm.org/docs/Frontend/PerformanceTips.html
>>>
>>> Philip
>>>
>>>
>>>
>>> On 02/08/2016 06:54 AM, Paul Peet via llvm-dev wrote:
>>>
>>> Hello,
>>>
>>> I am trying to emulate the "stack" as like on x86 when
using push/pop so
>>> afterwards I can use LLVM's optimizer passes to simplify
(reduce junk) the
>>> code.
>>>
>>> The LLVM IR code:
>>>
>>> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) {
>>>   ; push foo (On "stack")
>>>   %sp_1 = sub i32 %sp, 4
>>>   %sp_1_ptr = inttoptr i32 %sp_1 to i32*
>>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>>
>>>   ; push bar
>>>   %sp_2 = sub i32 %sp_1, 4
>>>   %sp_2_ptr = inttoptr i32 %sp_2 to i32*
>>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>>
>>>   ; val1 = pop (val1 = bar)
>>>   %sp_3_ptr = inttoptr i32 %sp_2 to i32*
>>>   %val1 = load i32, i32* %sp_3_ptr, align 4
>>>   %sp_3 = add i32 %sp_2, 4
>>>
>>>   ; val2 = pop (val2 = foo)
>>>   %sp_4_ptr = inttoptr i32 %sp_3 to i32*
>>>   %val2 = load i32, i32* %sp_4_ptr, align 4
>>>   %sp_4 = add i32 %sp_3, 4
>>>
>>>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %val1, 0
>>>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1
>>>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp_4, 2
>>>
>>>   ret { i32, i32, i32 } %ret_3
>>> }
>>>
>>> This code will "push" two values onto the stack and pop
them in reverse
>>> order so afterwards "foo" and "bar" will be
swapped and returned back.
>>>
>>> After running this through "opt -O2 ./test.ll", I am
getting this:
>>>
>>> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) #0 {
>>>   %sp_1 = add i32 %sp, -4
>>>   %1 = zext i32 %sp_1 to i64
>>>   %sp_1_ptr = inttoptr i64 %1 to i32*
>>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>>   %sp_2 = add i32 %sp, -8
>>>   %2 = zext i32 %sp_2 to i64
>>>   %sp_2_ptr = inttoptr i64 %2 to i32*
>>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>>   %val2 = load i32, i32* %sp_1_ptr, align 4
>>>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %bar, 0 ;
Swapped
>>>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1; Not
>>> Swapped (Not optimized; Should be %foo)
>>>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp, 2
>>>   ret { i32, i32, i32 } %ret_3
>>> }
>>>
>>> As you can see that the IR has got additional code, eg. zext. But
the
>>> main problem here is that val2 hasn't been optimized.
>>> Could anyone show me some hints what is preventing the second val
from
>>> being optimized? (My guess would be the zext because I am using %sp
as a
>>> 32bit pointer although the "target" is 64bit).
>>>
>>> Regards,
>>> Paul
>>>
>>>
>>> _______________________________________________
>>> LLVM Developers mailing listllvm-dev at
lists.llvm.orghttp://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>>
>>>
>>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20160210/668ac98a/attachment.html>

Paul Peet via llvm-dev

2016-Feb-12 11:21 UTC

head link

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

Hi again,

So I finally gave up on trying to get through the converting (x86' push pop
mov add) because it deals a lot with crazy pointer arithmetics and sonce
inttoptr and ptrtoint doesn't provide any alias analysis information.
Daniel, you said it doesn't make much sense to provide it but in my cases
it is actually very much needed, you didn't say it wasn't possible to
provide it but it is possible right? Could you guide me through where I
should look to implement such analysis? Would it be complex/non-trivial to
implement such thing?

2016-02-10 21:24 GMT+01:00 Daniel Berlin <dberlin at dberlin.org>:
>
>
> On Wed, Feb 10, 2016 at 12:18 PM, Paul Peet via llvm-dev <
> llvm-dev at lists.llvm.org> wrote:
>
>> Thank you for the hint.
>>
>> I adjusted the code and it works:
>>
>> The code after replacing inttoptr with getelementptr:
>>
>> define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
>> entry:
>>   ; push foo (On "stack")
>>   %sp_1 = getelementptr i8, i8* %sp, i32 -4
>>   %sp_1_ptr = bitcast i8* %sp_1 to i32*
>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>
>>   ; push bar
>>   %sp_2 = getelementptr i8, i8* %sp_1, i32 -4
>>   %sp_2_ptr = bitcast i8* %sp_2 to i32*
>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>
>>   ; val1 = pop (val1 = bar)
>>   %sp_3_ptr = bitcast i8* %sp_2 to i32*
>>   %val1 = load i32, i32* %sp_3_ptr, align 4
>>   %sp_3 = getelementptr i8, i8* %sp_2, i32 4
>>
>>   ; val2 = pop (val2 = foo)
>>   %sp_4_ptr = bitcast i8* %sp_3 to i32*
>>   %val2 = load i32, i32* %sp_4_ptr, align 4
>>   %sp_4 = getelementptr i8, i8* %sp_3, i32 4
>>
>>   %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %val1, 0
>>   %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %val2, 1
>>   %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp_4, 2
>>
>>   ret { i32, i32, i8* } %ret_3
>> }
>>
>> After optimization ("opt -instcombine ./code.ll -S")
>>
>> define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
>> entry:
>>   %sp_1 = getelementptr i8, i8* %sp, i64 -4
>>   %sp_1_ptr = bitcast i8* %sp_1 to i32*
>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>   %sp_2 = getelementptr i8, i8* %sp, i64 -8
>>   %sp_2_ptr = bitcast i8* %sp_2 to i32*
>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>   %ret_1 = insertvalue { i32, i32, i8* } undef, i32 %bar, 0
>>   %ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %foo, 1
>>   %ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp, 2
>>   ret { i32, i32, i8* } %ret_3
>> }
>>
>> My only questions are now:
>> - How is it that inttoptr cannot provide that specific alias
information
>> so it can optimize that store/load away ?
>>
> Because nothing tracks what happens to the ints, and what happens when
> they are converted back to pointers and whether it's sane :)
>
>
http://llvm.org/docs/GetElementPtr.html#how-is-gep-different-from-ptrtoint-arithmetic-and-inttoptr
>
> - Might it be possible to get inttoptr providing such alias analysis ?
>>
> It doesn't make a lot of sense to try in most cases.
> Most of the cases ptrtoint/inttoptr is useful are those where you want to
> do crazy things to the pointer.
>
>
>
>> - I came across MemorySSA while browsing though the llvm source. Is it
>> possible that one can use MemorySSA to do such optimization without
alias
>> analysis ?
>>
>
> MemorySSA relies on alias analysis to generate the SSA form.
>
>
>> - Where do I have to look in the source which is doing this kind of
>> optimization (Is it instcombine which uses lib/Analysis/Loads.cpp ?)
>>
>> It's probably a combination of opts. The most likely candidate is
-gvn,
> but  I would look at the pass dumps after each opt
>
>> Regards,
>> Paul
>>
>>
>> 2016-02-10 0:26 GMT+01:00 Philip Reames <listmail at
philipreames.com>:
>>
>>> Two points:
>>> - Using inttoptr is a mistake here.  GEPs are strongly preferred
and
>>> provide strictly more aliasing information to the optimizer.
>>> - The zext is a bit weird.  I'm not sure where that came from,
but I'd
>>> not bother looking into until the preceding point is addressed.
>>>
>>> In general, you may find these docs useful:
>>> http://llvm.org/docs/Frontend/PerformanceTips.html
>>>
>>> Philip
>>>
>>>
>>>
>>> On 02/08/2016 06:54 AM, Paul Peet via llvm-dev wrote:
>>>
>>> Hello,
>>>
>>> I am trying to emulate the "stack" as like on x86 when
using push/pop so
>>> afterwards I can use LLVM's optimizer passes to simplify
(reduce junk) the
>>> code.
>>>
>>> The LLVM IR code:
>>>
>>> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) {
>>>   ; push foo (On "stack")
>>>   %sp_1 = sub i32 %sp, 4
>>>   %sp_1_ptr = inttoptr i32 %sp_1 to i32*
>>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>>
>>>   ; push bar
>>>   %sp_2 = sub i32 %sp_1, 4
>>>   %sp_2_ptr = inttoptr i32 %sp_2 to i32*
>>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>>
>>>   ; val1 = pop (val1 = bar)
>>>   %sp_3_ptr = inttoptr i32 %sp_2 to i32*
>>>   %val1 = load i32, i32* %sp_3_ptr, align 4
>>>   %sp_3 = add i32 %sp_2, 4
>>>
>>>   ; val2 = pop (val2 = foo)
>>>   %sp_4_ptr = inttoptr i32 %sp_3 to i32*
>>>   %val2 = load i32, i32* %sp_4_ptr, align 4
>>>   %sp_4 = add i32 %sp_3, 4
>>>
>>>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %val1, 0
>>>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1
>>>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp_4, 2
>>>
>>>   ret { i32, i32, i32 } %ret_3
>>> }
>>>
>>> This code will "push" two values onto the stack and pop
them in reverse
>>> order so afterwards "foo" and "bar" will be
swapped and returned back.
>>>
>>> After running this through "opt -O2 ./test.ll", I am
getting this:
>>>
>>> define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) #0 {
>>>   %sp_1 = add i32 %sp, -4
>>>   %1 = zext i32 %sp_1 to i64
>>>   %sp_1_ptr = inttoptr i64 %1 to i32*
>>>   store i32 %foo, i32* %sp_1_ptr, align 4
>>>   %sp_2 = add i32 %sp, -8
>>>   %2 = zext i32 %sp_2 to i64
>>>   %sp_2_ptr = inttoptr i64 %2 to i32*
>>>   store i32 %bar, i32* %sp_2_ptr, align 4
>>>   %val2 = load i32, i32* %sp_1_ptr, align 4
>>>   %ret_1 = insertvalue { i32, i32, i32 } undef, i32 %bar, 0 ;
Swapped
>>>   %ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1; Not
>>> Swapped (Not optimized; Should be %foo)
>>>   %ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp, 2
>>>   ret { i32, i32, i32 } %ret_3
>>> }
>>>
>>> As you can see that the IR has got additional code, eg. zext. But
the
>>> main problem here is that val2 hasn't been optimized.
>>> Could anyone show me some hints what is preventing the second val
from
>>> being optimized? (My guess would be the zext because I am using %sp
as a
>>> 32bit pointer although the "target" is 64bit).
>>>
>>> Regards,
>>> Paul
>>>
>>>
>>> _______________________________________________
>>> LLVM Developers mailing listllvm-dev at
lists.llvm.orghttp://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>>
>>>
>>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>>
>>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20160212/12bd77f6/attachment.html>

llvm dev - Feb 2016 - Memory Store/Load Optimization Issue (Emulating stack)

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)

[llvm-dev] Memory Store/Load Optimization Issue (Emulating stack)