thr3ads.net - llvm dev - [llvm-dev] Bug in loop stores optimization [Jul 2018]

If this information is useful, please help other people find it:
Share via:

sunwenbo (A) via llvm-dev

2018-Jul-31 12:40 UTC

[llvm-dev] Bug in loop stores optimization

Hi folks,

I have a very simple piece of C code, shown below:

/*
 * Reproducer for the optimization issue discussed in this thread.
 *
 * Gathers four ints from `in` at a stride of 8 elements (in[0], in[8],
 * in[16], in[24]) into a local array, combines them into four results held
 * in a second local array, and then copies those four results to four
 * consecutive ints starting at `out`.
 *
 * in:  source buffer; elements 0, 8, 16 and 24 are read.
 * out: destination buffer; four consecutive ints are written.
 */
void foo(int* in, int* out)
{

    int  inArray[4], outArray[4];
    int i;

    /* Strided gather: inArray[i] = in[8*i] for i = 0..3. */
    for(i=0; i<4; i++)
    {
      inArray[i] = in[8*i];
    }

    /* Each result multiplies one pair of operands and adds the rest; the
     * first three also add a small constant (0, 1, 2). Note that the last
     * one multiplies inArray[3] by 3 instead of adding a constant. */
    outArray[0] = inArray[0] *  inArray[1] + inArray[2] + inArray[3] + 0;
    outArray[1] = inArray[0] +  inArray[1] * inArray[2] + inArray[3] + 1;
    outArray[2] = inArray[0] +  inArray[1] + inArray[2] * inArray[3] + 2;
    outArray[3] = inArray[0] +  inArray[1] + inArray[2] + inArray[3] * 3;

    /* Copy-out loop. This is the loop the post reports as being turned
     * into a memcpy by the loop-idiom pass. */
    for (i=0; i<4; i++)
    {
      *out++ = outArray[i];
    }
}

The LLVM IR generated with the -O2 option is:

define void @foo(i32* nocapture readonly %in, i32* nocapture %out)
local_unnamed_addr #0 {
entry:
  %outArray = alloca [4 x i32], align 16
  %0 = bitcast [4 x i32]* %outArray to i8*
  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #2
  %1 = load i32, i32* %in, align 4, !tbaa !2
  %arrayidx.1 = getelementptr inbounds i32, i32* %in, i64 8
  %2 = load i32, i32* %arrayidx.1, align 4, !tbaa !2
  %arrayidx.2 = getelementptr inbounds i32, i32* %in, i64 16
  %3 = load i32, i32* %arrayidx.2, align 4, !tbaa !2
  %arrayidx.3 = getelementptr inbounds i32, i32* %in, i64 24
  %4 = load i32, i32* %arrayidx.3, align 4, !tbaa !2
  %out56 = bitcast i32* %out to i8*
  %mul5 = mul nsw i32 %2, %1
  %add = add nsw i32 %mul5, %3
  %add8 = add nsw i32 %add, %4
  %arrayidx10 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0,
i64 0
  store i32 %add8, i32* %arrayidx10, align 16, !tbaa !2
  %mul14 = mul nsw i32 %3, %2
  %add15 = add i32 %1, 1
  %add17 = add i32 %add15, %mul14
  %add18 = add i32 %add17, %4
  %arrayidx19 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0,
i64 1
  store i32 %add18, i32* %arrayidx19, align 4, !tbaa !2
  %add22 = add nsw i32 %2, %1
  %mul25 = mul nsw i32 %4, %3
  %add26 = add i32 %add22, 2
  %add27 = add i32 %add26, %mul25
  %arrayidx28 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0,
i64 2
  store i32 %add27, i32* %arrayidx28, align 8, !tbaa !2
  %add33 = add nsw i32 %add22, %3
  %mul35 = mul nsw i32 %4, 3
  %add36 = add nsw i32 %add33, %mul35
  %arrayidx37 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0,
i64 3
  store i32 %add36, i32* %arrayidx37, align 4, !tbaa !2
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out56, i8* nonnull %0, i64 16, i32
4, i1 false)
  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #2
  ret void
}


The stack allocation (%outArray = alloca [4 x i32]) is unnecessary and should
be optimized away in this case. I did some investigation and found that the
`loop-idiom` pass transformed
    for (i=0; i<4; i++)
    {
      *out++ = outArray[i];
    }
into a memcpy, which blocked further optimization of the local array (outArray).

I am not sure whether it's an LLVM bug. Manually unrolling the 2nd loop would
bypass this problem, but I would prefer to fix it in the compiler.
Any suggestions for a potential solution to this issue?

Thx,
Wenbo
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20180731/b199430d/attachment.html>

Michael Kruse via llvm-dev

2018-Jul-31 17:01 UTC

head link

[llvm-dev] Bug in loop stores optimization

This might be a pass ordering problem. -loop-idiom is scheduled before
-loop-unroll (after full unrolling of both loops, the array might be
SROA'ed out), and hence gets executed first.

For -loop-idiom, the conversion seems beneficial, but the pass does not see
that the loop could be eliminated entirely later.

Michael


2018-07-31 7:40 GMT-05:00 sunwenbo (A) via llvm-dev <llvm-dev at lists.llvm.org>:
> Hi folks,
>
>
>
> I have a very simple c code as below:
>
>
>
> void foo(int* in, int* out)
>
> {
>
>
>
>     int  inArray[4], outArray[4];
>
>     int i;
>
>
>
>     for(i=0; i<4; i++)
>
>     {
>
>       inArray[i] = in[8*i];
>
>     }
>
>
>
>     outArray[0] = inArray[0] *  inArray[1] + inArray[2] + inArray[3] + 0;
>
>     outArray[1] = inArray[0] +  inArray[1] * inArray[2] + inArray[3] + 1;
>
>     outArray[2] = inArray[0] +  inArray[1] + inArray[2] * inArray[3] + 2;
>
>     outArray[3] = inArray[0] +  inArray[1] + inArray[2] + inArray[3] * 3;
>
>
>
>     for (i=0; i<4; i++)
>
>     {
>
>       *out++ = outArray[i];
>
>     }
>
> }
>
>
>
> The llvm IR code with O2 option is generated as:
>
>
>
> define void @foo(i32* nocapture readonly %in, i32* nocapture %out)
> local_unnamed_addr #0 {
>
> entry:
>
>   %outArray = alloca [4 x i32], align 16
>
>   %0 = bitcast [4 x i32]* %outArray to i8*
>
>   call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #2
>
>   %1 = load i32, i32* %in, align 4, !tbaa !2
>
>   %arrayidx.1 = getelementptr inbounds i32, i32* %in, i64 8
>
>   %2 = load i32, i32* %arrayidx.1, align 4, !tbaa !2
>
>   %arrayidx.2 = getelementptr inbounds i32, i32* %in, i64 16
>
>   %3 = load i32, i32* %arrayidx.2, align 4, !tbaa !2
>
>   %arrayidx.3 = getelementptr inbounds i32, i32* %in, i64 24
>
>   %4 = load i32, i32* %arrayidx.3, align 4, !tbaa !2
>
>   %out56 = bitcast i32* %out to i8*
>
>   %mul5 = mul nsw i32 %2, %1
>
>   %add = add nsw i32 %mul5, %3
>
>   %add8 = add nsw i32 %add, %4
>
>   %arrayidx10 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64
> 0, i64 0
>
>   store i32 %add8, i32* %arrayidx10, align 16, !tbaa !2
>
>   %mul14 = mul nsw i32 %3, %2
>
>   %add15 = add i32 %1, 1
>
>   %add17 = add i32 %add15, %mul14
>
>   %add18 = add i32 %add17, %4
>
>   %arrayidx19 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64
> 0, i64 1
>
>   store i32 %add18, i32* %arrayidx19, align 4, !tbaa !2
>
>   %add22 = add nsw i32 %2, %1
>
>   %mul25 = mul nsw i32 %4, %3
>
>   %add26 = add i32 %add22, 2
>
>   %add27 = add i32 %add26, %mul25
>
>   %arrayidx28 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64
> 0, i64 2
>
>   store i32 %add27, i32* %arrayidx28, align 8, !tbaa !2
>
>   %add33 = add nsw i32 %add22, %3
>
>   %mul35 = mul nsw i32 %4, 3
>
>   %add36 = add nsw i32 %add33, %mul35
>
>   %arrayidx37 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64
> 0, i64 3
>
>   store i32 %add36, i32* %arrayidx37, align 4, !tbaa !2
>
>   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out56, i8* nonnull %0, i64 16,
> i32 4, i1 false)
>
>   call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #2
>
>   ret void
>
> }
>
>
>
>
>
> Stack allocation (%outArray = alloca [4 x i32]) is unnecessary and should be
> optimized for this case. I did some investigation on it and found
> `loop-idiom` pass transformed
>
>     for (i=0; i<4; i++)
>
>     {
>
>       *out++ = outArray[i];
>
>     }
>
> to memcpy, which blocked further optimization on local array (outArray).
>
>
>
> I am not sure whether it’s a llvm bug. Manually unroll the 2nd loop would
> bypass this problem, but I prefer to fix it in compiler.
>
> Any suggestion on potential solution to this issue?
>
>
>
> Thx,
>
> Wenbo
>
>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>

llvm dev - Jul 2018 - Bug in loop stores optimization

[llvm-dev] Bug in loop stores optimization

[llvm-dev] Bug in loop stores optimization