I noticed that fourinarow is one of the programs in which LLVM is much slower than GCC, so I decided to take a look and see why that is so. The program has many loops that look like this:

#define ROWS 6
#define COLS 7

void init_board(char b[COLS][ROWS+1])
{
   int i,j;

   for (i=0;i<COLS;i++)
     for (j=0;j<ROWS;j++)
       b[i][j]='.';

   for (i=0;i<COLS;i++)
     b[i][ROWS]=0;
}

This generates the following X86 code:

        .text
        .align  16
        .globl  init_board
        .type   init_board, @function
init_board:
        subl $4, %esp
        movl %esi, (%esp)
        movl 8(%esp), %eax
        movl $0, %ecx
.LBBinit_board_1:                               # loopexit.1
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        movb $46, (%esi)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 1(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 2(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 3(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 4(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 5(%esi), %edx
        movb $46, (%edx)
        incl %ecx
        cmpl $7, %ecx
        jne .LBBinit_board_1                    # loopexit.1
.LBBinit_board_2:                               # return
        movb $0, 6(%eax)
        movb $0, 13(%eax)
        movb $0, 20(%eax)
        movb $0, 27(%eax)
        movb $0, 34(%eax)
        movb $0, 41(%eax)
        movb $0, 48(%eax)
        movl (%esp), %esi
        addl $4, %esp
        ret

The code generated by GCC is much faster. LLVM does unroll the loop, which GCC does not, but the addressing code is a constant repetition of these three instructions:

        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi

All but the first occurrence do nothing but put the same address into %esi that was already there to begin with, and do it with an expensive multiply at that. GCC correctly creates a suitable induction variable and strength-reduces the multiplication down to addition.

It turns out that using the new selection dag code generator, the outer loop is unrolled as well and nothing is left but a bunch of movb instructions. Much, much faster. But still, the optimization passes shouldn't be leaving so much garbage around for instruction selection to clean up.
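To illustrate the strength reduction GCC performs, here is a rough C-level sketch of the transformed first loop (an illustration of the transformation only, not GCC's actual output; the row pointer is an invented name):

#define ROWS 6
#define COLS 7

/* Strength-reduced form: the per-iteration i*(ROWS+1) multiply is
 * replaced by a running row pointer bumped by a constant add. */
void init_board_sr(char b[COLS][ROWS+1])
{
   char *row = &b[0][0];
   int i, j;

   for (i = 0; i < COLS; i++, row += ROWS+1)    /* add replaces imull */
     for (j = 0; j < ROWS; j++)
       row[j] = '.';

   for (i = 0; i < COLS; i++)
     b[i][ROWS] = 0;
}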
At first, I thought all that was needed was another round of basic block optimizations after the loop was unrolled, until I saw the actual LLVM bytecode:

void %init_board([7 x sbyte]* %b) {
entry:
        br label %loopexit.1

loopexit.1:             ; preds = %loopexit.1, %entry
        %indvar14 = phi uint [ 0, %entry ], [ %indvar.next15, %loopexit.1 ]    ; <uint> [#uses=7]
        %tmp.10 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 0         ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10
        %tmp.10.1 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 1       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.1
        %tmp.10.2 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 2       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.2
        %tmp.10.3 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 3       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.3
        %tmp.10.4 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 4       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.4
        %tmp.10.5 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 5       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.5
        %indvar.next15 = add uint %indvar14, 1          ; <uint> [#uses=2]
        %exitcond16 = seteq uint %indvar.next15, 7      ; <bool> [#uses=1]
        br bool %exitcond16, label %return, label %loopexit.1

return:         ; preds = %loopexit.1
        %tmp.19 = getelementptr [7 x sbyte]* %b, int 0, int 6   ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19
        %tmp.19.1 = getelementptr [7 x sbyte]* %b, int 1, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.1
        %tmp.19.2 = getelementptr [7 x sbyte]* %b, int 2, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.2
        %tmp.19.3 = getelementptr [7 x sbyte]* %b, int 3, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.3
        %tmp.19.4 = getelementptr [7 x sbyte]* %b, int 4, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.4
        %tmp.19.5 = getelementptr [7 x sbyte]* %b, int 5, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.5
        %tmp.19.6 = getelementptr [7 x sbyte]* %b, int 6, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.6
        ret void
}

Now the problem is obvious. A two-dimensional array access is being performed by a single instruction. The arithmetic needed to address the element is implicit, and therefore inaccessible to optimizations. The redundant calculations cannot be eliminated, nor can strength reduction be performed. getelementptr needs to be broken down into its constituent operations at some point.

Or am I missing something? Is this breaking down and reapplying of optimizations part of the selection dag process? I am somewhat surprised that using it affected which loops got unrolled.
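For what it's worth, each getelementptr above implicitly performs the address arithmetic below; written out as explicit C (a sketch with invented names), the shared multiply is plain to see:

/* A sketch of the arithmetic hidden inside
 *   getelementptr [7 x sbyte]* %b, uint %indvar14, int j
 * Every unrolled store repeats the indvar14*7 part, but because it is
 * implicit in the getelementptr, no pass can eliminate the redundancy. */
char *gep_2d(char (*b)[7], unsigned indvar14, int j)
{
   return (char *)b + indvar14 * 7 + j;   /* row scale, then column offset */
}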
When I increased COLS to the point where the loop could no longer be unrolled, the selection dag code generator generated effectively the same code as the default X86 code generator: lots of redundant imul/movl/addl sequences. It can't clean it up either. Only unrolling all nested loops permits it to be optimized away, regardless of code generator.

Jeff Cohen wrote:
> I noticed that fourinarow is one of the programs in which LLVM is much
> slower than GCC, so I decided to take a look and see why that is so.
> The program has many loops that look like this:
> [...]
Jeff,

This is an important find. I started looking at fourinarow and other programs that don't do well compared to GCC. I think there are probably only a couple of things remaining for LLVM to smoke GCC on all tests (floating point is another one). I know that Chris has been looking for good analyses like the one you've provided. Could you put this into a bugzilla for us? That way it definitely gets on the "to do" list.

Thanks,

Reid.

On Mon, 2005-02-21 at 19:17, Jeff Cohen wrote:
> When I increased COLS to the point where the loop could no longer be
> unrolled, the selection dag code generator generated effectively the
> same code as the default X86 code generator.
> [...]
> Now the problem is obvious. A two dimensional array access is being
> performed by a single instruction. The arithmetic needed to address
> the element is implicit, and therefore inaccessible to optimizations.
> The redundant calculations cannot be eliminated, nor can strength
> reduction be performed. getelementptr needs to be broken down into
> its constituent operations at some point.

Jeff,

This is exactly right -- any multi-dimensional array indexing operations need to be decomposed into a sequence of single-index operations so that they can be exposed to GCSE and LICM. This transformation obscures the indexing behavior of the code, so the right place to do it is within each back-end, on LLVM code just before instruction selection.

There is a pass called DecomposeMultiDimRefs (which seems to have been incorrectly moved to lib/Target/SparcV9/) which does this transformation. It's used by the SparcV9 back end before instruction selection but is not specific to Sparc. Running this, followed by LICM and GCSE, should address this problem.

--Vikram
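In C terms, the decomposition splits the one-step b[i][j] address computation into two single-index steps, so each piece becomes visible to LICM and GCSE separately (a rough sketch with invented names, not the pass's actual output):

#define ROWS 6
#define COLS 7

void init_board_decomposed(char b[COLS][ROWS+1])
{
   int i, j;
   for (i = 0; i < COLS; i++) {
      /* Step 1: index by i only. This value is invariant in the j loop,
       * so LICM can hoist it, and repeated copies are exposed to GCSE. */
      char *row = (char *)&b[i];
      /* Step 2: index by j only -- a simple add off the row pointer. */
      for (j = 0; j < ROWS; j++)
         row[j] = '.';
   }
}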
On Mon, 21 Feb 2005, Jeff Cohen wrote:

> I noticed that fourinarow is one of the programs in which LLVM is much
> slower than GCC, so I decided to take a look and see why that is so.
> The program has many loops that look like this:
>
> #define ROWS 6
> #define COLS 7
>
> void init_board(char b[COLS][ROWS+1])
> {
>    int i,j;
>
>    for (i=0;i<COLS;i++)
>      for (j=0;j<ROWS;j++)
>        b[i][j]='.';
>    for (i=0;i<COLS;i++)
>      b[i][ROWS]=0;
> }
>
> This generates the following X86 code:
>
>        imull $7, %ecx, %edx
>        movl %eax, %esi
>        addl %edx, %esi
>        movb $46, (%esi)
>        imull $7, %ecx, %edx
>        movl %eax, %esi
>        addl %edx, %esi
>        leal 1(%esi), %edx

... (many many copies of this, see the end of the email for full output) ...

> The code generated by GCC is much faster. LLVM does unroll the loop,
> which GCC does not, but the addressing code is a constant repetition
> of these three instructions:
>
>        imull $7, %ecx, %edx
>        movl %eax, %esi
>        addl %edx, %esi

Yup, this is an issue of simple "peephole expansion" of the getelementptr instruction.

> All but the first occurrence do nothing but put the same address into
> %esi that was already there to begin with, and do it with an expensive
> multiply at that. GCC correctly creates a suitable induction variable
> and strength-reduces the multiplication down to addition.

This is called loop strength reduction.

> It turns out that using the new selection dag code generator, the outer
> loop is unrolled as well and nothing is left but a bunch of movb
> instructions. Much, much faster.

For the record, instead of the ugly code above, the new isel generates:

init_board:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, 0
.LBBinit_board_1:       # loopexit.1
        imul %EDX, %ECX, 7
        mov BYTE PTR [%EAX + %EDX], 46
        mov BYTE PTR [%EAX + %EDX + 1], 46
        mov BYTE PTR [%EAX + %EDX + 2], 46
        mov BYTE PTR [%EAX + %EDX + 3], 46
        mov BYTE PTR [%EAX + %EDX + 4], 46
        mov BYTE PTR [%EAX + %EDX + 5], 46
        inc %ECX
        cmp %ECX, 7
        jne .LBBinit_board_1    # loopexit.1
.LBBinit_board_2:       # return
        mov BYTE PTR [%EAX + 6], 0
        mov BYTE PTR [%EAX + 13], 0
        mov BYTE PTR [%EAX + 20], 0
        mov BYTE PTR [%EAX + 27], 0
        mov BYTE PTR [%EAX + 34], 0
        mov BYTE PTR [%EAX + 41], 0
        mov BYTE PTR [%EAX + 48], 0
        ret

This code is almost "perfect" with the exception of the multiply by 7 inside of the loop. Loop strength reduction would transform this into an add like GCC produces.

> But still, the optimization passes shouldn't be leaving so much garbage
> around for instruction selection to clean up. At first, I thought all
> that was needed was another round of basic block optimizations after
> the loop was unrolled, until I saw the actual LLVM bytecode:

There are two issues here. First, LLVM code is NOT designed for trivial peephole code generation. This is *possible*, but not going to give you good quality code (see above ;-) ). The new isel uses an extremely light-weight DAG-based optimizer that both simplifies instruction selectors and optimizes the code before selection. LLVM code is meant to be a host to mid-level and interprocedural optimizations, and I think it does that job well.

> Now the problem is obvious. A two-dimensional array access is being
> performed by a single instruction. The arithmetic needed to address the
> element is implicit, and therefore inaccessible to optimizations. The
> redundant calculations cannot be eliminated, nor can strength reduction
> be performed.
> getelementptr needs to be broken down into its constituent operations
> at some point.

Actually, strength reduction can be trivially done with getelementptr instructions; that was part of their design. Part of the problem is that we have an induction variable canonicalization pass which UNDOES strength reduction to make loop analysis simpler, but we don't have strength reduction to redo it. Thus if you compile a loop like this (complex double chosen to show a size that X86 can scale "for free"):

#include <complex.h>
void test(unsigned N, complex double *P) {
  for (; N != 0; --N)
    *P++ = 0;
}

We compile it to this LLVM code:

void %test(uint %N, "complex long double"* %P) {
entry:
        %tmp.15 = seteq uint %N, 0              ; <bool> [#uses=1]
        br bool %tmp.15, label %return, label %no_exit

no_exit:                ; preds = %no_exit, %entry
        %indvar = phi uint [ %indvar.next, %no_exit ], [ 0, %entry ]   ; <uint> [#uses=3]
        %tmp.4 = getelementptr "complex long double"* %P, uint %indvar, uint 0 ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.4
        %tmp.5 = getelementptr "complex long double"* %P, uint %indvar, uint 1 ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.5
        %indvar.next = add uint %indvar, 1      ; <uint> [#uses=2]
        %exitcond = seteq uint %indvar.next, %N ; <bool> [#uses=1]
        br bool %exitcond, label %return, label %no_exit

return:         ; preds = %no_exit, %entry
        ret void
}

Note the accesses to P are using P[indvar].{real|imag}, not incrementing P. Compile this to X86, and you get this loop (using the simple instruction selector):

.LBBtest_1:     # no_exit
        mov %ESI, %EDX
        shl %ESI, 4             --> scale by sizeof(complex double)
        mov %EDI, %ECX
        add %EDI, %ESI
        mov DWORD PTR [%EDI], 0
        mov DWORD PTR [%EDI + 4], 0
        mov %ESI, %EDX
        shl %ESI, 4             --> scale by sizeof(complex double)
        mov %EDI, %ECX
        add %EDI, %ESI
        lea %ESI, DWORD PTR [%EDI + 8]
        mov DWORD PTR [%ESI], 0
        mov DWORD PTR [%ESI + 4], 0
        inc %EDX
        cmp %EDX, %EAX
        jne .LBBtest_1  # no_exit

Now, if you disable the -indvars pass, you'll get this loop instead (note the pointer increment of P is retained):

void %test(uint %N, "complex long double"* %P) {
entry:
        %tmp.15 = seteq uint %N, 0              ; <bool> [#uses=1]
        br bool %tmp.15, label %return, label %no_exit

no_exit:                ; preds = %no_exit, %entry
        %P_addr.0.0 = phi "complex long double"* [ %inc, %no_exit ], [ %P, %entry ]    ; <"complex long double"*> [#uses=3]
        %N_addr.0.0 = phi uint [ %dec, %no_exit ], [ %N, %entry ]      ; <uint> [#uses=1]
        %inc = getelementptr "complex long double"* %P_addr.0.0, int 1 ; <"complex long double"*> [#uses=1]
        %tmp.4 = getelementptr "complex long double"* %P_addr.0.0, int 0, uint 0       ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.4
        %tmp.5 = getelementptr "complex long double"* %P_addr.0.0, int 0, uint 1       ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.5
        %dec = add uint %N_addr.0.0, 4294967295 ; <uint> [#uses=2]
        %tmp.1 = seteq uint %dec, 0             ; <bool> [#uses=1]
        br bool %tmp.1, label %return, label %no_exit

return:         ; preds = %no_exit, %entry
        ret void
}

which codegens to this loop (simple isel again):

.LBBtest_1:     # no_exit
        lea %EDX, DWORD PTR [%ECX + 16]
        mov DWORD PTR [%ECX], 0
        mov DWORD PTR [%ECX + 4], 0
        mov DWORD PTR [%ECX + 8], 0
        mov DWORD PTR [%ECX + 12], 0
        dec %EAX
        test %EAX, %EAX
        mov %ECX, %EDX
        jne .LBBtest_1  # no_exit

I think you'll agree that this loop is much nicer, though there is some weird register shuffling going on between ECX and EDX. This is the code we would generate if we did loop strength reduction on the LLVM level.
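In C terms, the two IR versions above are just the indexed and pointer-bumping formulations of the same loop (a sketch of the concept, not compiler output):

#include <complex.h>

/* Indexed form, what -indvars canonicalizes to: each store address is
 * P + i*sizeof(complex double), a scale per iteration. */
void test_indexed(unsigned N, complex double *P) {
  unsigned i;
  for (i = 0; i != N; ++i)
    P[i] = 0;
}

/* Pointer-increment form, what loop strength reduction would restore:
 * the address advances by a constant add per iteration. */
void test_bumped(unsigned N, complex double *P) {
  for (; N != 0; --N)
    *P++ = 0;
}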
Note that on X86 this isn't a HUGE problem in all cases: it can scale by 2, 4, and 8 for "free". The problem is much worse on RISC targets.

> Or am I missing something? Is this breaking down and reapplying of
> optimizations part of the selection dag process? I am somewhat
> surprised that using it affected which loops got unrolled.

I'm not sure what you mean. The LLVM code input to the selection dag isel is exactly the same as that input to the simple isel, and it does no loop unrolling. The full code produced by the simple isel (for Jeff's example) is attached below for reference. Contrast this to the pattern isel output at the top.

init_board:
        sub %ESP, 4
        mov DWORD PTR [%ESP], %ESI
        mov %EAX, DWORD PTR [%ESP + 8]
        mov %ECX, 0
.LBBinit_board_1:       # loopexit.1
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        mov BYTE PTR [%ESI], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 1]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 2]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 3]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 4]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 5]
        mov BYTE PTR [%EDX], 46
        inc %ECX
        cmp %ECX, 7
        jne .LBBinit_board_1    # loopexit.1
.LBBinit_board_2:       # return
        mov BYTE PTR [%EAX + 6], 0
        mov BYTE PTR [%EAX + 13], 0
        mov BYTE PTR [%EAX + 20], 0
        mov BYTE PTR [%EAX + 27], 0
        mov BYTE PTR [%EAX + 34], 0
        mov BYTE PTR [%EAX + 41], 0
        mov BYTE PTR [%EAX + 48], 0
        mov %ESI, DWORD PTR [%ESP]
        add %ESP, 4
        ret

-Chris

-- 
http://nondot.org/sabre/ http://llvm.cs.uiuc.edu/
Sorry, I thought I was running the selection dag isel, but I screwed up when trying out the really big array. You're right, it does clean it up except for the multiplication.

So LoopStrengthReduce is not ready for prime time and doesn't actually get used? I might consider whipping it into shape. Does it still have to handle getelementptr in its full generality?

Chris Lattner wrote:
> Actually, strength reduction can be trivially done with getelementptr
> instructions; that was part of their design. Part of the problem is
> that we have an induction variable canonicalization pass which UNDOES
> strength reduction to make loop analysis simpler, but we don't have
> strength reduction to redo it.
> [...]