Verifying function running passes ... LV: Checking a loop in "bar" LV: Found a loop: L0 LV: Found an induction variable. LV: We need to do 0 pointer comparisons. LV: Checking memory dependencies LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float* %arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2) LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1) LV: Distance for store float %11, float* %12 to store float %10, float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>) Non-consecutive pointer access LV: We don't need a runtime memory check. LV: Can't vectorize due to memory conflicts LV: Not vectorizing. Here the input IR: define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias %arg3, float* noalias %arg4) { entrypoint: br label %L0 L0: ; preds = %L0, %entrypoint %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ] %1 = add nsw i32 %0, 256 %2 = sext i32 %0 to i64 %3 = getelementptr float* %arg3, i64 %2 %4 = load float* %3, align 4 %5 = sext i32 %1 to i64 %6 = getelementptr float* %arg3, i64 %5 %7 = load float* %6, align 4 %8 = getelementptr float* %arg4, i64 %2 %9 = load float* %8, align 4 %10 = getelementptr float* %arg4, i64 %5 %11 = load float* %10, align 4 %12 = fadd float %11, %7 %13 = fadd float %9, %4 %14 = getelementptr float* %arg2, i64 %2 store float %13, float* %14, align 4 %15 = getelementptr float* %arg2, i64 %5 store float %12, float* %15, align 4 %16 = add nsw i32 %0, 1 %17 = icmp slt i32 %16, %arg1 br i1 %17, label %L0, label %L1 L1: ; preds = %L0 ret void } This function is IMO equivalent to void main(int start, int end, float * restrict c, float * restrict a, float * restrict b) { const int width = 256; for (int i = start ; i < end ; ++i ) { c[ i ] = a[ i ] + b[ i ]; c[ width + i ] = a[ width + i ] + b[ width + i ]; } } With 
this version, the vectorizer doesn't complain about a bad stride and can parallelize the loop. Here is the output from "clang -emit-llvm -S loop.c" which can be parallelized: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable define void @bar(float* noalias %c, float* noalias %a, float* noalias %b, i32 %start, i32 %end) #0 { entry: %c.addr = alloca float*, align 8 %a.addr = alloca float*, align 8 %b.addr = alloca float*, align 8 %start.addr = alloca i32, align 4 %end.addr = alloca i32, align 4 %width = alloca i32, align 4 %i = alloca i32, align 4 store float* %c, float** %c.addr, align 8 store float* %a, float** %a.addr, align 8 store float* %b, float** %b.addr, align 8 store i32 %start, i32* %start.addr, align 4 store i32 %end, i32* %end.addr, align 4 store i32 256, i32* %width, align 4 %0 = load i32* %start.addr, align 4 store i32 %0, i32* %i, align 4 br label %for.cond for.cond: ; preds = %for.inc, %entry %1 = load i32* %i, align 4 %2 = load i32* %end.addr, align 4 %cmp = icmp slt i32 %1, %2 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %3 = load i32* %i, align 4 %idxprom = sext i32 %3 to i64 %4 = load float** %a.addr, align 8 %arrayidx = getelementptr inbounds float* %4, i64 %idxprom %5 = load float* %arrayidx, align 4 %6 = load i32* %i, align 4 %idxprom1 = sext i32 %6 to i64 %7 = load float** %b.addr, align 8 %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1 %8 = load float* %arrayidx2, align 4 %add = fadd float %5, %8 %9 = load i32* %i, align 4 %idxprom3 = sext i32 %9 to i64 %10 = load float** %c.addr, align 8 %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3 store float %add, float* %arrayidx4, align 4 %11 = load i32* %i, align 4 %add5 = add nsw i32 256, %11 %idxprom6 = sext i32 %add5 to i64 %12 = load float** %a.addr, 
align 8 %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6 %13 = load float* %arrayidx7, align 4 %14 = load i32* %i, align 4 %add8 = add nsw i32 256, %14 %idxprom9 = sext i32 %add8 to i64 %15 = load float** %b.addr, align 8 %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9 %16 = load float* %arrayidx10, align 4 %add11 = fadd float %13, %16 %17 = load i32* %i, align 4 %add12 = add nsw i32 256, %17 %idxprom13 = sext i32 %add12 to i64 %18 = load float** %c.addr, align 8 %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13 store float %add11, float* %arrayidx14, align 4 br label %for.inc for.inc: ; preds = %for.body %19 = load i32* %i, align 4 %inc = add nsw i32 %19, 1 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0} !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"} Any ideas why the vectorizer doesn't like my code? Frank
Frank, It looks like the loop vectorizer is unable to tell that the two stores in your code never overlap. This is probably because of the sign-extend in your code. Can you extend the indices to 64 bits? Thanks, Nadav On Oct 28, 2013, at 1:38 PM, Frank Winter <fwinter at jlab.org> wrote:> Verifying function > running passes ... > LV: Checking a loop in "bar" > LV: Found a loop: L0 > LV: Found an induction variable. > LV: We need to do 0 pointer comparisons. > LV: Checking memory dependencies > LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float* %arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2) > LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1) > LV: Distance for store float %11, float* %12 to store float %10, float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>) > Non-consecutive pointer access > LV: We don't need a runtime memory check. > LV: Can't vectorize due to memory conflicts > LV: Not vectorizing. 
> > Here the input IR: > > define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias %arg3, float* noalias %arg4) { > entrypoint: > br label %L0 > > L0: ; preds = %L0, %entrypoint > %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ] > %1 = add nsw i32 %0, 256 > %2 = sext i32 %0 to i64 > %3 = getelementptr float* %arg3, i64 %2 > %4 = load float* %3, align 4 > %5 = sext i32 %1 to i64 > %6 = getelementptr float* %arg3, i64 %5 > %7 = load float* %6, align 4 > %8 = getelementptr float* %arg4, i64 %2 > %9 = load float* %8, align 4 > %10 = getelementptr float* %arg4, i64 %5 > %11 = load float* %10, align 4 > %12 = fadd float %11, %7 > %13 = fadd float %9, %4 > %14 = getelementptr float* %arg2, i64 %2 > store float %13, float* %14, align 4 > %15 = getelementptr float* %arg2, i64 %5 > store float %12, float* %15, align 4 > %16 = add nsw i32 %0, 1 > %17 = icmp slt i32 %16, %arg1 > br i1 %17, label %L0, label %L1 > > L1: ; preds = %L0 > ret void > } > > This function is IMO equivalent to > > void main(int start, int end, float * restrict c, float * restrict a, float * restrict b) > { > const int width = 256; > for (int i = start ; i < end ; ++i ) { > c[ i ] = a[ i ] + b[ i ]; > c[ width + i ] = a[ width + i ] + b[ width + i ]; > } > } > > With this version, the vectorizer doesnt complain about a bad stride and can parallelize the loop. 
> > Here the output from "clang -emit-llvm -S loop.c" which can be parallelized: > > > > target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" > target triple = "x86_64-unknown-linux-gnu" > > ; Function Attrs: nounwind uwtable > define void @bar(float* noalias %c, float* noalias %a, float* noalias %b, i32 %start, i32 %end) #0 { > entry: > %c.addr = alloca float*, align 8 > %a.addr = alloca float*, align 8 > %b.addr = alloca float*, align 8 > %start.addr = alloca i32, align 4 > %end.addr = alloca i32, align 4 > %width = alloca i32, align 4 > %i = alloca i32, align 4 > store float* %c, float** %c.addr, align 8 > store float* %a, float** %a.addr, align 8 > store float* %b, float** %b.addr, align 8 > store i32 %start, i32* %start.addr, align 4 > store i32 %end, i32* %end.addr, align 4 > store i32 256, i32* %width, align 4 > %0 = load i32* %start.addr, align 4 > store i32 %0, i32* %i, align 4 > br label %for.cond > > for.cond: ; preds = %for.inc, %entry > %1 = load i32* %i, align 4 > %2 = load i32* %end.addr, align 4 > %cmp = icmp slt i32 %1, %2 > br i1 %cmp, label %for.body, label %for.end > > for.body: ; preds = %for.cond > %3 = load i32* %i, align 4 > %idxprom = sext i32 %3 to i64 > %4 = load float** %a.addr, align 8 > %arrayidx = getelementptr inbounds float* %4, i64 %idxprom > %5 = load float* %arrayidx, align 4 > %6 = load i32* %i, align 4 > %idxprom1 = sext i32 %6 to i64 > %7 = load float** %b.addr, align 8 > %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1 > %8 = load float* %arrayidx2, align 4 > %add = fadd float %5, %8 > %9 = load i32* %i, align 4 > %idxprom3 = sext i32 %9 to i64 > %10 = load float** %c.addr, align 8 > %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3 > store float %add, float* %arrayidx4, align 4 > %11 = load i32* %i, align 4 > %add5 = add nsw i32 256, %11 > %idxprom6 = sext i32 %add5 to i64 > %12 = load 
float** %a.addr, align 8 > %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6 > %13 = load float* %arrayidx7, align 4 > %14 = load i32* %i, align 4 > %add8 = add nsw i32 256, %14 > %idxprom9 = sext i32 %add8 to i64 > %15 = load float** %b.addr, align 8 > %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9 > %16 = load float* %arrayidx10, align 4 > %add11 = fadd float %13, %16 > %17 = load i32* %i, align 4 > %add12 = add nsw i32 256, %17 > %idxprom13 = sext i32 %add12 to i64 > %18 = load float** %c.addr, align 8 > %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13 > store float %add11, float* %arrayidx14, align 4 > br label %for.inc > > for.inc: ; preds = %for.body > %19 = load i32* %i, align 4 > %inc = add nsw i32 %19, 1 > store i32 %inc, i32* %i, align 4 > br label %for.cond > > for.end: ; preds = %for.cond > ret void > } > > attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } > > !llvm.ident = !{!0} > > !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"} > > > Any ideas why the vectotizer doesn't like my code? > > Frank > > > > > _______________________________________________ > LLVM Developers mailing list > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Hi Nadav, right! The sign-extend was the problem. Hmm.. Is this a bug or a feature? Frank On 28/10/13 16:58, Nadav Rotem wrote:> Frank, > > It looks like the loop vectorizer is unable to tell that the two stores in your code never overlap. This is probably because of the sign-extend in your code. Can you extend the indices to 64bit ? > > Thanks, > Nadav > > On Oct 28, 2013, at 1:38 PM, Frank Winter <fwinter at jlab.org> wrote: > >> Verifying function >> running passes ... >> LV: Checking a loop in "bar" >> LV: Found a loop: L0 >> LV: Found an induction variable. >> LV: We need to do 0 pointer comparisons. >> LV: Checking memory dependencies >> LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float* %arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2) >> LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1) >> LV: Distance for store float %11, float* %12 to store float %10, float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>) >> Non-consecutive pointer access >> LV: We don't need a runtime memory check. >> LV: Can't vectorize due to memory conflicts >> LV: Not vectorizing. 
>> >> Here the input IR: >> >> define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias %arg3, float* noalias %arg4) { >> entrypoint: >> br label %L0 >> >> L0: ; preds = %L0, %entrypoint >> %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ] >> %1 = add nsw i32 %0, 256 >> %2 = sext i32 %0 to i64 >> %3 = getelementptr float* %arg3, i64 %2 >> %4 = load float* %3, align 4 >> %5 = sext i32 %1 to i64 >> %6 = getelementptr float* %arg3, i64 %5 >> %7 = load float* %6, align 4 >> %8 = getelementptr float* %arg4, i64 %2 >> %9 = load float* %8, align 4 >> %10 = getelementptr float* %arg4, i64 %5 >> %11 = load float* %10, align 4 >> %12 = fadd float %11, %7 >> %13 = fadd float %9, %4 >> %14 = getelementptr float* %arg2, i64 %2 >> store float %13, float* %14, align 4 >> %15 = getelementptr float* %arg2, i64 %5 >> store float %12, float* %15, align 4 >> %16 = add nsw i32 %0, 1 >> %17 = icmp slt i32 %16, %arg1 >> br i1 %17, label %L0, label %L1 >> >> L1: ; preds = %L0 >> ret void >> } >> >> This function is IMO equivalent to >> >> void main(int start, int end, float * restrict c, float * restrict a, float * restrict b) >> { >> const int width = 256; >> for (int i = start ; i < end ; ++i ) { >> c[ i ] = a[ i ] + b[ i ]; >> c[ width + i ] = a[ width + i ] + b[ width + i ]; >> } >> } >> >> With this version, the vectorizer doesnt complain about a bad stride and can parallelize the loop. 
>> >> Here the output from "clang -emit-llvm -S loop.c" which can be parallelized: >> >> >> >> target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" >> target triple = "x86_64-unknown-linux-gnu" >> >> ; Function Attrs: nounwind uwtable >> define void @bar(float* noalias %c, float* noalias %a, float* noalias %b, i32 %start, i32 %end) #0 { >> entry: >> %c.addr = alloca float*, align 8 >> %a.addr = alloca float*, align 8 >> %b.addr = alloca float*, align 8 >> %start.addr = alloca i32, align 4 >> %end.addr = alloca i32, align 4 >> %width = alloca i32, align 4 >> %i = alloca i32, align 4 >> store float* %c, float** %c.addr, align 8 >> store float* %a, float** %a.addr, align 8 >> store float* %b, float** %b.addr, align 8 >> store i32 %start, i32* %start.addr, align 4 >> store i32 %end, i32* %end.addr, align 4 >> store i32 256, i32* %width, align 4 >> %0 = load i32* %start.addr, align 4 >> store i32 %0, i32* %i, align 4 >> br label %for.cond >> >> for.cond: ; preds = %for.inc, %entry >> %1 = load i32* %i, align 4 >> %2 = load i32* %end.addr, align 4 >> %cmp = icmp slt i32 %1, %2 >> br i1 %cmp, label %for.body, label %for.end >> >> for.body: ; preds = %for.cond >> %3 = load i32* %i, align 4 >> %idxprom = sext i32 %3 to i64 >> %4 = load float** %a.addr, align 8 >> %arrayidx = getelementptr inbounds float* %4, i64 %idxprom >> %5 = load float* %arrayidx, align 4 >> %6 = load i32* %i, align 4 >> %idxprom1 = sext i32 %6 to i64 >> %7 = load float** %b.addr, align 8 >> %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1 >> %8 = load float* %arrayidx2, align 4 >> %add = fadd float %5, %8 >> %9 = load i32* %i, align 4 >> %idxprom3 = sext i32 %9 to i64 >> %10 = load float** %c.addr, align 8 >> %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3 >> store float %add, float* %arrayidx4, align 4 >> %11 = load i32* %i, align 4 >> %add5 = add nsw i32 256, 
%11 >> %idxprom6 = sext i32 %add5 to i64 >> %12 = load float** %a.addr, align 8 >> %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6 >> %13 = load float* %arrayidx7, align 4 >> %14 = load i32* %i, align 4 >> %add8 = add nsw i32 256, %14 >> %idxprom9 = sext i32 %add8 to i64 >> %15 = load float** %b.addr, align 8 >> %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9 >> %16 = load float* %arrayidx10, align 4 >> %add11 = fadd float %13, %16 >> %17 = load i32* %i, align 4 >> %add12 = add nsw i32 256, %17 >> %idxprom13 = sext i32 %add12 to i64 >> %18 = load float** %c.addr, align 8 >> %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13 >> store float %add11, float* %arrayidx14, align 4 >> br label %for.inc >> >> for.inc: ; preds = %for.body >> %19 = load i32* %i, align 4 >> %inc = add nsw i32 %19, 1 >> store i32 %inc, i32* %i, align 4 >> br label %for.cond >> >> for.end: ; preds = %for.cond >> ret void >> } >> >> attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } >> >> !llvm.ident = !{!0} >> >> !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"} >> >> >> Any ideas why the vectotizer doesn't like my code? >> >> Frank >> >> >> >> >> _______________________________________________ >> LLVM Developers mailing list >> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu >> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev