Joan Lluch via llvm-dev
2019-Aug-08  15:18 UTC
[llvm-dev] Suboptimal code generated by clang+llc in quite a common scenario (?)
I found something that I don't quite understand when compiling a common piece of
code using the -Os flag.
I found it while testing my own backend, but then I dug deeper and found that at
least x86 is affected as well. This is the code in question:
char pp[3];
char *scscx = pp;
int tst( char i, char j, char k )
{
  scscx[0] = i;
  scscx[1] = j; 
  scscx[2] = k;
  return 0;
}
The above gets compiled for the x86 architecture like this:
; Function Attrs: nofree norecurse nounwind optsize uwtable
define i32 @tst(i8 signext %i, i8 signext %j, i8 signext %k) local_unnamed_addr
#1 {
entry:
  %0 = load i8*, i8** @scscx, align 8, !tbaa !11
  store i8 %i, i8* %0, align 1, !tbaa !13
  %1 = load i8*, i8** @scscx, align 8, !tbaa !11
  %arrayidx1 = getelementptr inbounds i8, i8* %1, i64 1
  store i8 %j, i8* %arrayidx1, align 1, !tbaa !13
  %2 = load i8*, i8** @scscx, align 8, !tbaa !11
  %arrayidx2 = getelementptr inbounds i8, i8* %2, i64 2
  store i8 %k, i8* %arrayidx2, align 1, !tbaa !13
  ret i32 0
}
According to that, the variable ‘scscx’ is loaded three times even though it is never
modified. The resulting assembly code is this:
	.globl	_tst
_tst:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%esi
	.cfi_offset %esi, -12
	movb	16(%ebp), %al
	movb	12(%ebp), %cl
	movb	8(%ebp), %dl
	movl	_scscx, %esi
	movb	%dl, (%esi)
	movl	_scscx, %edx
	movb	%cl, 1(%edx)
	movl	_scscx, %ecx
	movb	%al, 2(%ecx)
	xorl	%eax, %eax
	popl	%esi
	popl	%ebp
	retl
	.cfi_endproc
	.comm	_pp,3,0
	.section	__DATA,__data
	.globl	_scscx
	.p2align	3
_scscx:
	.long	_pp
Again, the _scscx is loaded three times instead of reusing a register, which is
suboptimal.
NOW, if I replace the original code by this:
int pp[3];
int *scscx = pp;
int tst( int i, int j, int k )
{
  scscx[0] = i;
  scscx[1] = j; 
  scscx[2] = k;
  return 0;
}
I get the following:
; Function Attrs: nofree norecurse nounwind optsize uwtable
define i32 @tst(i32 %i, i32 %j, i32 %k) local_unnamed_addr #1 {
entry:
  %0 = load i32*, i32** @scscx, align 8, !tbaa !11
  store i32 %i, i32* %0, align 4, !tbaa !13
  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1
  store i32 %j, i32* %arrayidx1, align 4, !tbaa !13
  %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 2
  store i32 %k, i32* %arrayidx2, align 4, !tbaa !13
  ret i32 0
}
	.globl	_tst
_tst:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%esi
	.cfi_offset %esi, -12
	movl	16(%ebp), %eax
	movl	12(%ebp), %ecx
	movl	8(%ebp), %edx
	movl	_scscx, %esi
	movl	%edx, (%esi)
	movl	%ecx, 4(%esi)
	movl	%eax, 8(%esi)
	xorl	%eax, %eax
	popl	%esi
	popl	%ebp
	retl
	.cfi_endproc
	.comm	_pp,12,2
	.section	__DATA,__data
	.globl	_scscx
	.p2align	3
_scscx:
	.long	_pp
In this case the compiler optimises the load of _scscx into a register and
reuses its value instead of loading the variable multiple times. This results in
cleaner and more optimal code, especially when compared with the first case.
I would like to understand why this happens, and whether there’s a way (or
workaround) to improve it?
Should I file a bug report for that?
Thanks.
Joan
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20190808/2352599a/attachment-0001.html>
Michael Kruse via llvm-dev
2019-Aug-08  15:50 UTC
[llvm-dev] Suboptimal code generated by clang+llc in quite a common scenario (?)
Hi, char* scscx is an universal pointer and may point to anything, including itself. That is, scscx might point to itself: scscx = (char*)&scscx; such that scscx[0] = ... changes the address scscx point to. A pointer to (int*) in contrast is only allowed to point to integers in memory, it is not an universal pointer. In particular, when accessing it the compiler can assume that it is not aliasing with something that is of type char*. For more details, see e.g. Wikipedia [1] or Stackoverflow [2] [1] https://en.wikipedia.org/wiki/Pointer_aliasing#Aliasing_and_re-ordering [2] https://stackoverflow.com/questions/98650/what-is-the-strict-aliasing-rule Michael Am Do., 8. Aug. 2019 um 10:19 Uhr schrieb Joan Lluch via llvm-dev <llvm-dev at lists.llvm.org>:> > I found a something that I quite not understand when compiling a common piece of code using the -Os flags. > I found it while testing my own backend but then I got deeper and found that at least the x86 is affected as well. This is the referred code: > > char pp[3]; > char *scscx = pp; > int tst( char i, char j, char k ) > { > scscx[0] = i; > scscx[1] = j; > scscx[2] = k; > return 0; > } > > The above gets compiled for the x86 architecture like this: > > ; Function Attrs: nofree norecurse nounwind optsize uwtable > define i32 @tst(i8 signext %i, i8 signext %j, i8 signext %k) local_unnamed_addr #1 { > entry: > %0 = load i8*, i8** @scscx, align 8, !tbaa !11 > store i8 %i, i8* %0, align 1, !tbaa !13 > %1 = load i8*, i8** @scscx, align 8, !tbaa !11 > %arrayidx1 = getelementptr inbounds i8, i8* %1, i64 1 > store i8 %j, i8* %arrayidx1, align 1, !tbaa !13 > %2 = load i8*, i8** @scscx, align 8, !tbaa !11 > %arrayidx2 = getelementptr inbounds i8, i8* %2, i64 2 > store i8 %k, i8* %arrayidx2, align 1, !tbaa !13 > ret i32 0 > } > > According to that, the variable ‘scscx’ is loaded three times despite it’s never modified. 
The resulting assembly code is this: > > .globl _tst > _tst: > .cfi_startproc > pushl %ebp > .cfi_def_cfa_offset 8 > .cfi_offset %ebp, -8 > movl %esp, %ebp > .cfi_def_cfa_register %ebp > pushl %esi > .cfi_offset %esi, -12 > movb 16(%ebp), %al > movb 12(%ebp), %cl > movb 8(%ebp), %dl > movl _scscx, %esi > movb %dl, (%esi) > movl _scscx, %edx > movb %cl, 1(%edx) > movl _scscx, %ecx > movb %al, 2(%ecx) > xorl %eax, %eax > popl %esi > popl %ebp > retl > .cfi_endproc > > .comm _pp,3,0 > .section __DATA,__data > .globl _scscx > .p2align 3 > _scscx: > .long _pp > > > Again, the _scscx is loaded three times instead of reusing a register, which is suboptimal. > > > NOW, if I replace the original code by this: > > int pp[3]; > int *scscx = pp; > int tst( int i, int j, int k ) > { > scscx[0] = i; > scscx[1] = j; > scscx[2] = k; > return 0; > } > > I get the following: > > > ; Function Attrs: nofree norecurse nounwind optsize uwtable > define i32 @tst(i32 %i, i32 %j, i32 %k) local_unnamed_addr #1 { > entry: > %0 = load i32*, i32** @scscx, align 8, !tbaa !11 > store i32 %i, i32* %0, align 4, !tbaa !13 > %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1 > store i32 %j, i32* %arrayidx1, align 4, !tbaa !13 > %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 2 > store i32 %k, i32* %arrayidx2, align 4, !tbaa !13 > ret i32 0 > } > > > .globl _tst > _tst: > .cfi_startproc > pushl %ebp > .cfi_def_cfa_offset 8 > .cfi_offset %ebp, -8 > movl %esp, %ebp > .cfi_def_cfa_register %ebp > pushl %esi > .cfi_offset %esi, -12 > movl 16(%ebp), %eax > movl 12(%ebp), %ecx > movl 8(%ebp), %edx > movl _scscx, %esi > movl %edx, (%esi) > movl %ecx, 4(%esi) > movl %eax, 8(%esi) > xorl %eax, %eax > popl %esi > popl %ebp > retl > .cfi_endproc > > .comm _pp,12,2 > .section __DATA,__data > .globl _scscx > .p2align 3 > _scscx: > .long _pp > > > In this case the compiler optimises the load of _scscx into a register and reuses its value instead of loading the variable multiple times. 
This results in a cleaner and more optimal code, specially when compared with the first case. > > I would like to understand why this happens, and whether there’s a way (or workaround) to improve it? > > Should I file a bug report for that? > > Thanks. > > Joan > > > > > > > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Alex Brachet-Mialot via llvm-dev
2019-Aug-08  16:07 UTC
[llvm-dev] Suboptimal code generated by clang+llc in quite a common scenario (?)
This might not be the workaround you want because it is only available in C, but you can use restrict to allow such optimizations. https://godbolt.org/z/2gQ26f Alex On Thu, Aug 8, 2019 at 11:50 AM Michael Kruse via llvm-dev < llvm-dev at lists.llvm.org> wrote:> Hi, > > char* scscx is an universal pointer and may point to anything, > including itself. That is, scscx might point to itself: > > scscx = (char*)&scscx; > > such that > > scscx[0] = ... > > changes the address scscx point to. A pointer to (int*) in contrast is > only allowed to point to integers in memory, it is not an universal > pointer. In particular, when accessing it the compiler can assume that > it is not aliasing with something that is of type char*. > > For more details, see e.g. Wikipedia [1] or Stackoverflow [2] > > [1] > https://en.wikipedia.org/wiki/Pointer_aliasing#Aliasing_and_re-ordering > [2] > https://stackoverflow.com/questions/98650/what-is-the-strict-aliasing-rule > > Michael > > > Am Do., 8. Aug. 2019 um 10:19 Uhr schrieb Joan Lluch via llvm-dev > <llvm-dev at lists.llvm.org>: > > > > I found a something that I quite not understand when compiling a common > piece of code using the -Os flags. > > I found it while testing my own backend but then I got deeper and found > that at least the x86 is affected as well. 
This is the referred code: > > > > char pp[3]; > > char *scscx = pp; > > int tst( char i, char j, char k ) > > { > > scscx[0] = i; > > scscx[1] = j; > > scscx[2] = k; > > return 0; > > } > > > > The above gets compiled for the x86 architecture like this: > > > > ; Function Attrs: nofree norecurse nounwind optsize uwtable > > define i32 @tst(i8 signext %i, i8 signext %j, i8 signext %k) > local_unnamed_addr #1 { > > entry: > > %0 = load i8*, i8** @scscx, align 8, !tbaa !11 > > store i8 %i, i8* %0, align 1, !tbaa !13 > > %1 = load i8*, i8** @scscx, align 8, !tbaa !11 > > %arrayidx1 = getelementptr inbounds i8, i8* %1, i64 1 > > store i8 %j, i8* %arrayidx1, align 1, !tbaa !13 > > %2 = load i8*, i8** @scscx, align 8, !tbaa !11 > > %arrayidx2 = getelementptr inbounds i8, i8* %2, i64 2 > > store i8 %k, i8* %arrayidx2, align 1, !tbaa !13 > > ret i32 0 > > } > > > > According to that, the variable ‘scscx’ is loaded three times despite > it’s never modified. The resulting assembly code is this: > > > > .globl _tst > > _tst: > > .cfi_startproc > > pushl %ebp > > .cfi_def_cfa_offset 8 > > .cfi_offset %ebp, -8 > > movl %esp, %ebp > > .cfi_def_cfa_register %ebp > > pushl %esi > > .cfi_offset %esi, -12 > > movb 16(%ebp), %al > > movb 12(%ebp), %cl > > movb 8(%ebp), %dl > > movl _scscx, %esi > > movb %dl, (%esi) > > movl _scscx, %edx > > movb %cl, 1(%edx) > > movl _scscx, %ecx > > movb %al, 2(%ecx) > > xorl %eax, %eax > > popl %esi > > popl %ebp > > retl > > .cfi_endproc > > > > .comm _pp,3,0 > > .section __DATA,__data > > .globl _scscx > > .p2align 3 > > _scscx: > > .long _pp > > > > > > Again, the _scscx is loaded three times instead of reusing a register, > which is suboptimal. 
> > > > > > NOW, if I replace the original code by this: > > > > int pp[3]; > > int *scscx = pp; > > int tst( int i, int j, int k ) > > { > > scscx[0] = i; > > scscx[1] = j; > > scscx[2] = k; > > return 0; > > } > > > > I get the following: > > > > > > ; Function Attrs: nofree norecurse nounwind optsize uwtable > > define i32 @tst(i32 %i, i32 %j, i32 %k) local_unnamed_addr #1 { > > entry: > > %0 = load i32*, i32** @scscx, align 8, !tbaa !11 > > store i32 %i, i32* %0, align 4, !tbaa !13 > > %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1 > > store i32 %j, i32* %arrayidx1, align 4, !tbaa !13 > > %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 2 > > store i32 %k, i32* %arrayidx2, align 4, !tbaa !13 > > ret i32 0 > > } > > > > > > .globl _tst > > _tst: > > .cfi_startproc > > pushl %ebp > > .cfi_def_cfa_offset 8 > > .cfi_offset %ebp, -8 > > movl %esp, %ebp > > .cfi_def_cfa_register %ebp > > pushl %esi > > .cfi_offset %esi, -12 > > movl 16(%ebp), %eax > > movl 12(%ebp), %ecx > > movl 8(%ebp), %edx > > movl _scscx, %esi > > movl %edx, (%esi) > > movl %ecx, 4(%esi) > > movl %eax, 8(%esi) > > xorl %eax, %eax > > popl %esi > > popl %ebp > > retl > > .cfi_endproc > > > > .comm _pp,12,2 > > .section __DATA,__data > > .globl _scscx > > .p2align 3 > > _scscx: > > .long _pp > > > > > > In this case the compiler optimises the load of _scscx into a register > and reuses its value instead of loading the variable multiple times. This > results in a cleaner and more optimal code, specially when compared with > the first case. > > > > I would like to understand why this happens, and whether there’s a way > (or workaround) to improve it? > > > > Should I file a bug report for that? > > > > Thanks. 
> > > > Joan > > > > > > > > > > > > > > > > > > _______________________________________________ > > LLVM Developers mailing list > > llvm-dev at lists.llvm.org > > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190808/7278d9f6/attachment.html>
Reasonably Related Threads
- Suboptimal code generated by clang+llc in quite a common scenario (?)
- [LLVMdev] What's the Alias Analysis does clang use ?
- [LLVMdev] What's the Alias Analysis does clang use ?
- LLVM-IR store-load propagation
- [LLVMdev] [Vectorization] Mis match in code generated