sunwenbo (A) via llvm-dev
2018-Jul-31 12:40 UTC
[llvm-dev] Bug in loop stores optimization
Hi folks, I have a very simple c code as below: void foo(int* in, int* out) { int inArray[4], outArray[4]; int i; for(i=0; i<4; i++) { inArray[i] = in[8*i]; } outArray[0] = inArray[0] * inArray[1] + inArray[2] + inArray[3] + 0; outArray[1] = inArray[0] + inArray[1] * inArray[2] + inArray[3] + 1; outArray[2] = inArray[0] + inArray[1] + inArray[2] * inArray[3] + 2; outArray[3] = inArray[0] + inArray[1] + inArray[2] + inArray[3] * 3; for (i=0; i<4; i++) { *out++ = outArray[i]; } } The llvm IR code with O2 option is generated as: define void @foo(i32* nocapture readonly %in, i32* nocapture %out) local_unnamed_addr #0 { entry: %outArray = alloca [4 x i32], align 16 %0 = bitcast [4 x i32]* %outArray to i8* call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #2 %1 = load i32, i32* %in, align 4, !tbaa !2 %arrayidx.1 = getelementptr inbounds i32, i32* %in, i64 8 %2 = load i32, i32* %arrayidx.1, align 4, !tbaa !2 %arrayidx.2 = getelementptr inbounds i32, i32* %in, i64 16 %3 = load i32, i32* %arrayidx.2, align 4, !tbaa !2 %arrayidx.3 = getelementptr inbounds i32, i32* %in, i64 24 %4 = load i32, i32* %arrayidx.3, align 4, !tbaa !2 %out56 = bitcast i32* %out to i8* %mul5 = mul nsw i32 %2, %1 %add = add nsw i32 %mul5, %3 %add8 = add nsw i32 %add, %4 %arrayidx10 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0, i64 0 store i32 %add8, i32* %arrayidx10, align 16, !tbaa !2 %mul14 = mul nsw i32 %3, %2 %add15 = add i32 %1, 1 %add17 = add i32 %add15, %mul14 %add18 = add i32 %add17, %4 %arrayidx19 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0, i64 1 store i32 %add18, i32* %arrayidx19, align 4, !tbaa !2 %add22 = add nsw i32 %2, %1 %mul25 = mul nsw i32 %4, %3 %add26 = add i32 %add22, 2 %add27 = add i32 %add26, %mul25 %arrayidx28 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0, i64 2 store i32 %add27, i32* %arrayidx28, align 8, !tbaa !2 %add33 = add nsw i32 %add22, %3 %mul35 = mul nsw i32 %4, 3 %add36 = add nsw i32 %add33, %mul35 
%arrayidx37 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 0, i64 3 store i32 %add36, i32* %arrayidx37, align 4, !tbaa !2 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out56, i8* nonnull %0, i64 16, i32 4, i1 false) call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #2 ret void } The stack allocation (%outArray = alloca [4 x i32]) is unnecessary and should be optimized away in this case. I investigated and found that the `loop-idiom` pass transformed for (i=0; i<4; i++) { *out++ = outArray[i]; } into a memcpy, which blocked further optimization of the local array (outArray). I am not sure whether this is an LLVM bug. Manually unrolling the 2nd loop would bypass this problem, but I would prefer to fix it in the compiler. Any suggestions for a potential solution to this issue? Thx, Wenbo -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180731/b199430d/attachment.html>
Michael Kruse via llvm-dev
2018-Jul-31 17:01 UTC
[llvm-dev] Bug in loop stores optimization
This might be a pass-ordering problem. -loop-idiom runs before -loop-unroll (after full unrolling of both loops, the array might be SROA'ed out), hence it gets executed first. To -loop-idiom, the conversion seems beneficial, but it does not see that the loop could be eliminated entirely later. Michael 2018-07-31 7:40 GMT-05:00 sunwenbo (A) via llvm-dev <llvm-dev at lists.llvm.org>:> Hi folks, > > > > I have a very simple c code as below: > > > > void foo(int* in, int* out) > > { > > > > int inArray[4], outArray[4]; > > int i; > > > > for(i=0; i<4; i++) > > { > > inArray[i] = in[8*i]; > > } > > > > outArray[0] = inArray[0] * inArray[1] + inArray[2] + inArray[3] + 0; > > outArray[1] = inArray[0] + inArray[1] * inArray[2] + inArray[3] + 1; > > outArray[2] = inArray[0] + inArray[1] + inArray[2] * inArray[3] + 2; > > outArray[3] = inArray[0] + inArray[1] + inArray[2] + inArray[3] * 3; > > > > for (i=0; i<4; i++) > > { > > *out++ = outArray[i]; > > } > > } > > > > The llvm IR code with O2 option is generated as: > > > > define void @foo(i32* nocapture readonly %in, i32* nocapture %out) > local_unnamed_addr #0 { > > entry: > > %outArray = alloca [4 x i32], align 16 > > %0 = bitcast [4 x i32]* %outArray to i8* > > call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #2 > > %1 = load i32, i32* %in, align 4, !tbaa !2 > > %arrayidx.1 = getelementptr inbounds i32, i32* %in, i64 8 > > %2 = load i32, i32* %arrayidx.1, align 4, !tbaa !2 > > %arrayidx.2 = getelementptr inbounds i32, i32* %in, i64 16 > > %3 = load i32, i32* %arrayidx.2, align 4, !tbaa !2 > > %arrayidx.3 = getelementptr inbounds i32, i32* %in, i64 24 > > %4 = load i32, i32* %arrayidx.3, align 4, !tbaa !2 > > %out56 = bitcast i32* %out to i8* > > %mul5 = mul nsw i32 %2, %1 > > %add = add nsw i32 %mul5, %3 > > %add8 = add nsw i32 %add, %4 > > %arrayidx10 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 > 0, i64 0 > > store i32 %add8, i32* %arrayidx10, align 16, !tbaa !2 > > %mul14 = mul nsw i32 %3, %2 > 
> %add15 = add i32 %1, 1 > > %add17 = add i32 %add15, %mul14 > > %add18 = add i32 %add17, %4 > > %arrayidx19 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 > 0, i64 1 > > store i32 %add18, i32* %arrayidx19, align 4, !tbaa !2 > > %add22 = add nsw i32 %2, %1 > > %mul25 = mul nsw i32 %4, %3 > > %add26 = add i32 %add22, 2 > > %add27 = add i32 %add26, %mul25 > > %arrayidx28 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 > 0, i64 2 > > store i32 %add27, i32* %arrayidx28, align 8, !tbaa !2 > > %add33 = add nsw i32 %add22, %3 > > %mul35 = mul nsw i32 %4, 3 > > %add36 = add nsw i32 %add33, %mul35 > > %arrayidx37 = getelementptr inbounds [4 x i32], [4 x i32]* %outArray, i64 > 0, i64 3 > > store i32 %add36, i32* %arrayidx37, align 4, !tbaa !2 > > call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out56, i8* nonnull %0, i64 16, > i32 4, i1 false) > > call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #2 > > ret void > > } > > > > > > Stack allocation (%outArray = alloca [4 x i32]) is unnecessary and should be > optimized for this case. I did some investigation on it and found > `loop-idiom` pass transformed > > for (i=0; i<4; i++) > > { > > *out++ = outArray[i]; > > } > > to memcpy, which blocked further optimization on local array (outArray). > > > > I am not sure whether it’s a llvm bug. Manually unroll the 2nd loop would > bypass this problem, but I prefer to fix it in compiler. > > Any suggestion on potential solution to this issue? > > > > Thx, > > Wenbo > > > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev >