I noticed that fourinarow is one of the programs in which LLVM is much slower than GCC, so I decided to take a look and see why that is so. The program has many loops that look like this:

#define ROWS 6
#define COLS 7

void init_board(char b[COLS][ROWS+1])
{
   int i,j;

   for (i=0;i<COLS;i++)
     for (j=0;j<ROWS;j++)
       b[i][j]='.';

   for (i=0;i<COLS;i++)
     b[i][ROWS]=0;
}

This generates the following X86 code:

        .text
        .align  16
        .globl  init_board
        .type   init_board, @function
init_board:
        subl $4, %esp
        movl %esi, (%esp)
        movl 8(%esp), %eax
        movl $0, %ecx
.LBBinit_board_1:                               # loopexit.1
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        movb $46, (%esi)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 1(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 2(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 3(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 4(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 5(%esi), %edx
        movb $46, (%edx)
        incl %ecx
        cmpl $7, %ecx
        jne .LBBinit_board_1                    # loopexit.1
.LBBinit_board_2:                               # return
        movb $0, 6(%eax)
        movb $0, 13(%eax)
        movb $0, 20(%eax)
        movb $0, 27(%eax)
        movb $0, 34(%eax)
        movb $0, 41(%eax)
        movb $0, 48(%eax)
        movl (%esp), %esi
        addl $4, %esp
        ret

The code generated by GCC is much faster. LLVM does unroll the loop, which GCC does not, but the addressing code is a constant repetition of these three instructions:

        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi

All but the first occurrence do nothing but put the same address into %esi that was already there to begin with, and do it with an expensive multiply at that. GCC correctly creates a suitable induction variable and strength-reduces the multiplication down to addition.

It turns out that using the new selection dag code generator, the outer loop is unrolled as well and nothing is left but a bunch of movb instructions. Much, much faster. But still, the optimization passes shouldn't be leaving so much garbage around for instruction selection to clean up.
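To illustrate the strength reduction GCC performs, here is a rough C-level sketch of the transformed first loop (an illustration of the transformation only, not GCC's actual output; the row pointer is an invented name):

#define ROWS 6
#define COLS 7

/* Strength-reduced form: the per-iteration i*(ROWS+1) multiply is
 * replaced by a running row pointer bumped by a constant add. */
void init_board_sr(char b[COLS][ROWS+1])
{
   char *row = &b[0][0];
   int i, j;

   for (i = 0; i < COLS; i++, row += ROWS+1)    /* add replaces imull */
     for (j = 0; j < ROWS; j++)
       row[j] = '.';

   for (i = 0; i < COLS; i++)
     b[i][ROWS] = 0;
}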
At first, I thought all that was needed was another round of basic block optimizations after the loop was unrolled, until I saw the actual LLVM bytecode:

void %init_board([7 x sbyte]* %b) {
entry:
        br label %loopexit.1

loopexit.1:             ; preds = %loopexit.1, %entry
        %indvar14 = phi uint [ 0, %entry ], [ %indvar.next15, %loopexit.1 ]    ; <uint> [#uses=7]
        %tmp.10 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 0         ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10
        %tmp.10.1 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 1       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.1
        %tmp.10.2 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 2       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.2
        %tmp.10.3 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 3       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.3
        %tmp.10.4 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 4       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.4
        %tmp.10.5 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 5       ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.5
        %indvar.next15 = add uint %indvar14, 1          ; <uint> [#uses=2]
        %exitcond16 = seteq uint %indvar.next15, 7      ; <bool> [#uses=1]
        br bool %exitcond16, label %return, label %loopexit.1

return:         ; preds = %loopexit.1
        %tmp.19 = getelementptr [7 x sbyte]* %b, int 0, int 6   ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19
        %tmp.19.1 = getelementptr [7 x sbyte]* %b, int 1, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.1
        %tmp.19.2 = getelementptr [7 x sbyte]* %b, int 2, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.2
        %tmp.19.3 = getelementptr [7 x sbyte]* %b, int 3, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.3
        %tmp.19.4 = getelementptr [7 x sbyte]* %b, int 4, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.4
        %tmp.19.5 = getelementptr [7 x sbyte]* %b, int 5, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.5
        %tmp.19.6 = getelementptr [7 x sbyte]* %b, int 6, int 6 ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.6
        ret void
}

Now the problem is obvious. A two-dimensional array access is being performed by a single instruction. The arithmetic needed to address the element is implicit, and therefore inaccessible to optimizations. The redundant calculations cannot be eliminated, nor can strength reduction be performed. getelementptr needs to be broken down into its constituent operations at some point.

Or am I missing something? Is this breaking down and reapplying of optimizations part of the selection dag process? I am somewhat surprised that using it affected which loops got unrolled.
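For what it's worth, each getelementptr above implicitly performs the address arithmetic below; written out as explicit C (a sketch with invented names), the shared multiply is plain to see:

/* A sketch of the arithmetic hidden inside
 *   getelementptr [7 x sbyte]* %b, uint %indvar14, int j
 * Every unrolled store repeats the indvar14*7 part, but because it is
 * implicit in the getelementptr, no pass can eliminate the redundancy. */
char *gep_2d(char (*b)[7], unsigned indvar14, int j)
{
   return (char *)b + indvar14 * 7 + j;   /* row scale, then column offset */
}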
When I increased COLS to the point where the loop could no longer be unrolled, the selection dag code generator generated effectively the same code as the default X86 code generator: lots of redundant imul/movl/addl sequences. It can't clean it up either. Only unrolling all nested loops permits it to be optimized away, regardless of code generator.

Jeff Cohen wrote:
> I noticed that fourinarow is one of the programs in which LLVM is much
> slower than GCC, so I decided to take a look and see why that is so.
> The program has many loops that look like this:
> [...]
Jeff,

This is an important find. I started looking at fourinarow and other programs that don't do well compared to GCC. I think there are probably only a couple of things remaining for LLVM to smoke GCC on all tests (floating point is another one). I know that Chris has been looking for good analyses like the one you've provided. Could you put this into a bugzilla for us? That way it definitely gets on the "to do" list.

Thanks,

Reid.

On Mon, 2005-02-21 at 19:17, Jeff Cohen wrote:
> When I increased COLS to the point where the loop could no longer be
> unrolled, the selection dag code generator generated effectively the
> same code as the default X86 code generator.
> [...]
> Now the problem is obvious. A two dimensional array access is being
> performed by a single instruction. The arithmetic needed to address
> the element is implicit, and therefore inaccessible to optimizations.
> The redundant calculations cannot be eliminated, nor can strength
> reduction be performed. getelementptr needs to be broken down into
> its constituent operations at some point.

Jeff,

This is exactly right -- any multi-dimensional array indexing operations need to be decomposed into a sequence of single-index operations so that they can be exposed to GCSE and LICM. This transformation obscures the indexing behavior of the code, so the right place to do it is within each back-end, on LLVM code just before instruction selection.

There is a pass called DecomposeMultiDimRefs (which seems to have been incorrectly moved to lib/Target/SparcV9/) which does this transformation. It's used by the SparcV9 back end before instruction selection but is not specific to Sparc. Running this, followed by LICM and GCSE, should address this problem.

--Vikram
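In C terms, the decomposition splits the one-step b[i][j] address computation into two single-index steps, so each piece becomes visible to LICM and GCSE separately (a rough sketch with invented names, not the pass's actual output):

#define ROWS 6
#define COLS 7

void init_board_decomposed(char b[COLS][ROWS+1])
{
   int i, j;
   for (i = 0; i < COLS; i++) {
      /* Step 1: index by i only. This value is invariant in the j loop,
       * so LICM can hoist it, and repeated copies are exposed to GCSE. */
      char *row = (char *)&b[i];
      /* Step 2: index by j only -- a simple add off the row pointer. */
      for (j = 0; j < ROWS; j++)
         row[j] = '.';
   }
}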
On Mon, 21 Feb 2005, Jeff Cohen wrote:

> I noticed that fourinarow is one of the programs in which LLVM is much
> slower than GCC, so I decided to take a look and see why that is so.
> The program has many loops that look like this:
>
> #define ROWS 6
> #define COLS 7
>
> void init_board(char b[COLS][ROWS+1])
> {
>    int i,j;
>
>    for (i=0;i<COLS;i++)
>      for (j=0;j<ROWS;j++)
>        b[i][j]='.';
>    for (i=0;i<COLS;i++)
>      b[i][ROWS]=0;
> }
>
> This generates the following X86 code:
>
>        imull $7, %ecx, %edx
>        movl %eax, %esi
>        addl %edx, %esi
>        movb $46, (%esi)
>        imull $7, %ecx, %edx
>        movl %eax, %esi
>        addl %edx, %esi
>        leal 1(%esi), %edx

... (many many copies of this, see the end of the email for full output) ...

> The code generated by GCC is much faster. LLVM does unroll the loop,
> which GCC does not, but the addressing code is a constant repetition
> of these three instructions:
>
>        imull $7, %ecx, %edx
>        movl %eax, %esi
>        addl %edx, %esi

Yup, this is an issue of simple "peephole expansion" of the getelementptr instruction.

> All but the first occurrence do nothing but put the same address into
> %esi that was already there to begin with, and do it with an expensive
> multiply at that. GCC correctly creates a suitable induction variable
> and strength-reduces the multiplication down to addition.

This is called loop strength reduction.

> It turns out that using the new selection dag code generator, the outer
> loop is unrolled as well and nothing is left but a bunch of movb
> instructions. Much, much faster.

For the record, instead of the ugly code above, the new isel generates:

init_board:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, 0
.LBBinit_board_1:       # loopexit.1
        imul %EDX, %ECX, 7
        mov BYTE PTR [%EAX + %EDX], 46
        mov BYTE PTR [%EAX + %EDX + 1], 46
        mov BYTE PTR [%EAX + %EDX + 2], 46
        mov BYTE PTR [%EAX + %EDX + 3], 46
        mov BYTE PTR [%EAX + %EDX + 4], 46
        mov BYTE PTR [%EAX + %EDX + 5], 46
        inc %ECX
        cmp %ECX, 7
        jne .LBBinit_board_1    # loopexit.1
.LBBinit_board_2:       # return
        mov BYTE PTR [%EAX + 6], 0
        mov BYTE PTR [%EAX + 13], 0
        mov BYTE PTR [%EAX + 20], 0
        mov BYTE PTR [%EAX + 27], 0
        mov BYTE PTR [%EAX + 34], 0
        mov BYTE PTR [%EAX + 41], 0
        mov BYTE PTR [%EAX + 48], 0
        ret

This code is almost "perfect" with the exception of the multiply by 7 inside of the loop. Loop strength reduction would transform this into an add like GCC produces.

> But still, the optimization passes shouldn't be leaving so much garbage
> around for instruction selection to clean up. At first, I thought all
> that was needed was another round of basic block optimizations after
> the loop was unrolled, until I saw the actual LLVM bytecode:

There are two issues here. First, LLVM code is NOT designed for trivial peephole code generation. This is *possible*, but not going to give you good quality code (see above ;-) ). The new isel uses an extremely light-weight DAG-based optimizer that both simplifies instruction selectors and optimizes the code before selection. LLVM code is meant to be a host to mid-level and interprocedural optimizations, and I think it does that job well.

> Now the problem is obvious. A two-dimensional array access is being
> performed by a single instruction. The arithmetic needed to address the
> element is implicit, and therefore inaccessible to optimizations. The
> redundant calculations cannot be eliminated, nor can strength reduction
> be performed.
> getelementptr needs to be broken down into its constituent operations
> at some point.

Actually, strength reduction can be trivially done with getelementptr instructions; that was part of their design. Part of the problem is that we have an induction variable canonicalization pass which UNDOES strength reduction to make loop analysis simpler, but we don't have strength reduction to redo it. Thus if you compile a loop like this (complex double chosen to show a size that X86 can scale "for free"):

#include <complex.h>
void test(unsigned N, complex double *P) {
  for (; N != 0; --N)
    *P++ = 0;
}

We compile it to this LLVM code:

void %test(uint %N, "complex long double"* %P) {
entry:
        %tmp.15 = seteq uint %N, 0              ; <bool> [#uses=1]
        br bool %tmp.15, label %return, label %no_exit

no_exit:                ; preds = %no_exit, %entry
        %indvar = phi uint [ %indvar.next, %no_exit ], [ 0, %entry ]   ; <uint> [#uses=3]
        %tmp.4 = getelementptr "complex long double"* %P, uint %indvar, uint 0 ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.4
        %tmp.5 = getelementptr "complex long double"* %P, uint %indvar, uint 1 ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.5
        %indvar.next = add uint %indvar, 1      ; <uint> [#uses=2]
        %exitcond = seteq uint %indvar.next, %N ; <bool> [#uses=1]
        br bool %exitcond, label %return, label %no_exit

return:         ; preds = %no_exit, %entry
        ret void
}

Note the accesses to P are using P[indvar].{real|imag}, not incrementing P. Compile this to X86, and you get this loop (using the simple instruction selector):

.LBBtest_1:     # no_exit
        mov %ESI, %EDX
        shl %ESI, 4             --> scale by sizeof(complex double)
        mov %EDI, %ECX
        add %EDI, %ESI
        mov DWORD PTR [%EDI], 0
        mov DWORD PTR [%EDI + 4], 0
        mov %ESI, %EDX
        shl %ESI, 4             --> scale by sizeof(complex double)
        mov %EDI, %ECX
        add %EDI, %ESI
        lea %ESI, DWORD PTR [%EDI + 8]
        mov DWORD PTR [%ESI], 0
        mov DWORD PTR [%ESI + 4], 0
        inc %EDX
        cmp %EDX, %EAX
        jne .LBBtest_1  # no_exit

Now, if you disable the -indvars pass, you'll get this loop instead (note the pointer increment of P is retained):

void %test(uint %N, "complex long double"* %P) {
entry:
        %tmp.15 = seteq uint %N, 0              ; <bool> [#uses=1]
        br bool %tmp.15, label %return, label %no_exit

no_exit:                ; preds = %no_exit, %entry
        %P_addr.0.0 = phi "complex long double"* [ %inc, %no_exit ], [ %P, %entry ]    ; <"complex long double"*> [#uses=3]
        %N_addr.0.0 = phi uint [ %dec, %no_exit ], [ %N, %entry ]      ; <uint> [#uses=1]
        %inc = getelementptr "complex long double"* %P_addr.0.0, int 1 ; <"complex long double"*> [#uses=1]
        %tmp.4 = getelementptr "complex long double"* %P_addr.0.0, int 0, uint 0       ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.4
        %tmp.5 = getelementptr "complex long double"* %P_addr.0.0, int 0, uint 1       ; <double*> [#uses=1]
        store double 0.000000e+00, double* %tmp.5
        %dec = add uint %N_addr.0.0, 4294967295 ; <uint> [#uses=2]
        %tmp.1 = seteq uint %dec, 0             ; <bool> [#uses=1]
        br bool %tmp.1, label %return, label %no_exit

return:         ; preds = %no_exit, %entry
        ret void
}

which codegens to this loop (simple isel again):

.LBBtest_1:     # no_exit
        lea %EDX, DWORD PTR [%ECX + 16]
        mov DWORD PTR [%ECX], 0
        mov DWORD PTR [%ECX + 4], 0
        mov DWORD PTR [%ECX + 8], 0
        mov DWORD PTR [%ECX + 12], 0
        dec %EAX
        test %EAX, %EAX
        mov %ECX, %EDX
        jne .LBBtest_1  # no_exit

I think you'll agree that this loop is much nicer, though there is some weird register shuffling going on between ECX and EDX. This is the code we would generate if we did loop strength reduction on the LLVM level.
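In C terms, the two IR versions above are just the indexed and pointer-bumping formulations of the same loop (a sketch of the concept, not compiler output):

#include <complex.h>

/* Indexed form, what -indvars canonicalizes to: each store address is
 * P + i*sizeof(complex double), a scale per iteration. */
void test_indexed(unsigned N, complex double *P) {
  unsigned i;
  for (i = 0; i != N; ++i)
    P[i] = 0;
}

/* Pointer-increment form, what loop strength reduction would restore:
 * the address advances by a constant add per iteration. */
void test_bumped(unsigned N, complex double *P) {
  for (; N != 0; --N)
    *P++ = 0;
}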
Note that on X86 this isn't a HUGE problem in all cases: it can scale by 2, 4, and 8 for "free". The problem is much worse on RISC targets.

> Or am I missing something? Is this breaking down and reapplying of
> optimizations part of the selection dag process? I am somewhat
> surprised that using it affected which loops got unrolled.

I'm not sure what you mean. The LLVM code input to the selection dag isel is exactly the same as that input to the simple isel, and it does no loop unrolling. The full code produced by the simple isel (for Jeff's example) is attached below for reference. Contrast this to the pattern isel output at the top.

init_board:
        sub %ESP, 4
        mov DWORD PTR [%ESP], %ESI
        mov %EAX, DWORD PTR [%ESP + 8]
        mov %ECX, 0
.LBBinit_board_1:       # loopexit.1
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        mov BYTE PTR [%ESI], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 1]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 2]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 3]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 4]
        mov BYTE PTR [%EDX], 46
        imul %EDX, %ECX, 7
        mov %ESI, %EAX
        add %ESI, %EDX
        lea %EDX, DWORD PTR [%ESI + 5]
        mov BYTE PTR [%EDX], 46
        inc %ECX
        cmp %ECX, 7
        jne .LBBinit_board_1    # loopexit.1
.LBBinit_board_2:       # return
        mov BYTE PTR [%EAX + 6], 0
        mov BYTE PTR [%EAX + 13], 0
        mov BYTE PTR [%EAX + 20], 0
        mov BYTE PTR [%EAX + 27], 0
        mov BYTE PTR [%EAX + 34], 0
        mov BYTE PTR [%EAX + 41], 0
        mov BYTE PTR [%EAX + 48], 0
        mov %ESI, DWORD PTR [%ESP]
        add %ESP, 4
        ret

-Chris

-- 
http://nondot.org/sabre/ http://llvm.cs.uiuc.edu/
Sorry, I thought I was running the selection dag isel, but I screwed up when trying out the really big array. You're right, it does clean it up except for the multiplication.

So LoopStrengthReduce is not ready for prime time and doesn't actually get used? I might consider whipping it into shape. Does it still have to handle getelementptr in its full generality?

Chris Lattner wrote:
> Actually, strength reduction can be trivially done with getelementptr
> instructions; that was part of their design. Part of the problem is
> that we have an induction variable canonicalization pass which UNDOES
> strength reduction to make loop analysis simpler, but we don't have
> strength reduction to redo it.
> [...]