Verifying function
running passes ...
LV: Checking a loop in "bar"
LV: Found a loop: L0
LV: Found an induction variable.
LV: We need to do 0 pointer comparisons.
LV: Checking memory dependencies
LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float*
%arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to
i64)) + %arg2)
LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink
Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) +
%arg2)(Induction step: 1)
LV: Distance for store float %11, float* %12 to store float %10,
float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) +
{(-4
* (sext i32 %arg0 to i64)),+,-4}<%L0>)
Non-consecutive pointer access
LV: We don't need a runtime memory check.
LV: Can't vectorize due to memory conflicts
LV: Not vectorizing.
Here is the input IR:
define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float*
noalias %arg3, float* noalias %arg4) {
entrypoint:
br label %L0
L0: ; preds = %L0, %entrypoint
%0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ]
%1 = add nsw i32 %0, 256
%2 = sext i32 %0 to i64
%3 = getelementptr float* %arg3, i64 %2
%4 = load float* %3, align 4
%5 = sext i32 %1 to i64
%6 = getelementptr float* %arg3, i64 %5
%7 = load float* %6, align 4
%8 = getelementptr float* %arg4, i64 %2
%9 = load float* %8, align 4
%10 = getelementptr float* %arg4, i64 %5
%11 = load float* %10, align 4
%12 = fadd float %11, %7
%13 = fadd float %9, %4
%14 = getelementptr float* %arg2, i64 %2
store float %13, float* %14, align 4
%15 = getelementptr float* %arg2, i64 %5
store float %12, float* %15, align 4
%16 = add nsw i32 %0, 1
%17 = icmp slt i32 %16, %arg1
br i1 %17, label %L0, label %L1
L1: ; preds = %L0
ret void
}
This function is IMO equivalent to
void main(int start, int end, float * restrict c, float * restrict a,
float * restrict b)
{
const int width = 256;
for (int i = start ; i < end ; ++i ) {
c[ i ] = a[ i ] + b[ i ];
c[ width + i ] = a[ width + i ] + b[ width + i ];
}
}
With this version, the vectorizer doesn't complain about a bad stride and
can parallelize the loop.
Here is the output from "clang -emit-llvm -S loop.c" which can be
parallelized:
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define void @bar(float* noalias %c, float* noalias %a, float* noalias
%b, i32 %start, i32 %end) #0 {
entry:
%c.addr = alloca float*, align 8
%a.addr = alloca float*, align 8
%b.addr = alloca float*, align 8
%start.addr = alloca i32, align 4
%end.addr = alloca i32, align 4
%width = alloca i32, align 4
%i = alloca i32, align 4
store float* %c, float** %c.addr, align 8
store float* %a, float** %a.addr, align 8
store float* %b, float** %b.addr, align 8
store i32 %start, i32* %start.addr, align 4
store i32 %end, i32* %end.addr, align 4
store i32 256, i32* %width, align 4
%0 = load i32* %start.addr, align 4
store i32 %0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%1 = load i32* %i, align 4
%2 = load i32* %end.addr, align 4
%cmp = icmp slt i32 %1, %2
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%3 = load i32* %i, align 4
%idxprom = sext i32 %3 to i64
%4 = load float** %a.addr, align 8
%arrayidx = getelementptr inbounds float* %4, i64 %idxprom
%5 = load float* %arrayidx, align 4
%6 = load i32* %i, align 4
%idxprom1 = sext i32 %6 to i64
%7 = load float** %b.addr, align 8
%arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1
%8 = load float* %arrayidx2, align 4
%add = fadd float %5, %8
%9 = load i32* %i, align 4
%idxprom3 = sext i32 %9 to i64
%10 = load float** %c.addr, align 8
%arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3
store float %add, float* %arrayidx4, align 4
%11 = load i32* %i, align 4
%add5 = add nsw i32 256, %11
%idxprom6 = sext i32 %add5 to i64
%12 = load float** %a.addr, align 8
%arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6
%13 = load float* %arrayidx7, align 4
%14 = load i32* %i, align 4
%add8 = add nsw i32 256, %14
%idxprom9 = sext i32 %add8 to i64
%15 = load float** %b.addr, align 8
%arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9
%16 = load float* %arrayidx10, align 4
%add11 = fadd float %13, %16
%17 = load i32* %i, align 4
%add12 = add nsw i32 256, %17
%idxprom13 = sext i32 %add12 to i64
%18 = load float** %c.addr, align 8
%arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13
store float %add11, float* %arrayidx14, align 4
br label %for.inc
for.inc: ; preds = %for.body
%19 = load i32* %i, align 4
%inc = add nsw i32 %19, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
attributes #0 = { nounwind uwtable
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false"
"no-nans-fp-math"="false"
"stack-protector-buffer-size"="8"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"}
Any ideas why the vectorizer doesn't like my code?
Frank
Frank, It looks like the loop vectorizer is unable to tell that the two stores in your code never overlap. This is probably because of the sign-extend in your code. Can you extend the indices to 64bit ? Thanks, Nadav On Oct 28, 2013, at 1:38 PM, Frank Winter <fwinter at jlab.org> wrote:> Verifying function > running passes ... > LV: Checking a loop in "bar" > LV: Found a loop: L0 > LV: Found an induction variable. > LV: We need to do 0 pointer comparisons. > LV: Checking memory dependencies > LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float* %arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2) > LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1) > LV: Distance for store float %11, float* %12 to store float %10, float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>) > Non-consecutive pointer access > LV: We don't need a runtime memory check. > LV: Can't vectorize due to memory conflicts > LV: Not vectorizing. 
> > Here the input IR: > > define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias %arg3, float* noalias %arg4) { > entrypoint: > br label %L0 > > L0: ; preds = %L0, %entrypoint > %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ] > %1 = add nsw i32 %0, 256 > %2 = sext i32 %0 to i64 > %3 = getelementptr float* %arg3, i64 %2 > %4 = load float* %3, align 4 > %5 = sext i32 %1 to i64 > %6 = getelementptr float* %arg3, i64 %5 > %7 = load float* %6, align 4 > %8 = getelementptr float* %arg4, i64 %2 > %9 = load float* %8, align 4 > %10 = getelementptr float* %arg4, i64 %5 > %11 = load float* %10, align 4 > %12 = fadd float %11, %7 > %13 = fadd float %9, %4 > %14 = getelementptr float* %arg2, i64 %2 > store float %13, float* %14, align 4 > %15 = getelementptr float* %arg2, i64 %5 > store float %12, float* %15, align 4 > %16 = add nsw i32 %0, 1 > %17 = icmp slt i32 %16, %arg1 > br i1 %17, label %L0, label %L1 > > L1: ; preds = %L0 > ret void > } > > This function is IMO equivalent to > > void main(int start, int end, float * restrict c, float * restrict a, float * restrict b) > { > const int width = 256; > for (int i = start ; i < end ; ++i ) { > c[ i ] = a[ i ] + b[ i ]; > c[ width + i ] = a[ width + i ] + b[ width + i ]; > } > } > > With this version, the vectorizer doesnt complain about a bad stride and can parallelize the loop. 
> > Here the output from "clang -emit-llvm -S loop.c" which can be parallelized: > > > > target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" > target triple = "x86_64-unknown-linux-gnu" > > ; Function Attrs: nounwind uwtable > define void @bar(float* noalias %c, float* noalias %a, float* noalias %b, i32 %start, i32 %end) #0 { > entry: > %c.addr = alloca float*, align 8 > %a.addr = alloca float*, align 8 > %b.addr = alloca float*, align 8 > %start.addr = alloca i32, align 4 > %end.addr = alloca i32, align 4 > %width = alloca i32, align 4 > %i = alloca i32, align 4 > store float* %c, float** %c.addr, align 8 > store float* %a, float** %a.addr, align 8 > store float* %b, float** %b.addr, align 8 > store i32 %start, i32* %start.addr, align 4 > store i32 %end, i32* %end.addr, align 4 > store i32 256, i32* %width, align 4 > %0 = load i32* %start.addr, align 4 > store i32 %0, i32* %i, align 4 > br label %for.cond > > for.cond: ; preds = %for.inc, %entry > %1 = load i32* %i, align 4 > %2 = load i32* %end.addr, align 4 > %cmp = icmp slt i32 %1, %2 > br i1 %cmp, label %for.body, label %for.end > > for.body: ; preds = %for.cond > %3 = load i32* %i, align 4 > %idxprom = sext i32 %3 to i64 > %4 = load float** %a.addr, align 8 > %arrayidx = getelementptr inbounds float* %4, i64 %idxprom > %5 = load float* %arrayidx, align 4 > %6 = load i32* %i, align 4 > %idxprom1 = sext i32 %6 to i64 > %7 = load float** %b.addr, align 8 > %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1 > %8 = load float* %arrayidx2, align 4 > %add = fadd float %5, %8 > %9 = load i32* %i, align 4 > %idxprom3 = sext i32 %9 to i64 > %10 = load float** %c.addr, align 8 > %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3 > store float %add, float* %arrayidx4, align 4 > %11 = load i32* %i, align 4 > %add5 = add nsw i32 256, %11 > %idxprom6 = sext i32 %add5 to i64 > %12 = load 
float** %a.addr, align 8 > %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6 > %13 = load float* %arrayidx7, align 4 > %14 = load i32* %i, align 4 > %add8 = add nsw i32 256, %14 > %idxprom9 = sext i32 %add8 to i64 > %15 = load float** %b.addr, align 8 > %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9 > %16 = load float* %arrayidx10, align 4 > %add11 = fadd float %13, %16 > %17 = load i32* %i, align 4 > %add12 = add nsw i32 256, %17 > %idxprom13 = sext i32 %add12 to i64 > %18 = load float** %c.addr, align 8 > %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13 > store float %add11, float* %arrayidx14, align 4 > br label %for.inc > > for.inc: ; preds = %for.body > %19 = load i32* %i, align 4 > %inc = add nsw i32 %19, 1 > store i32 %inc, i32* %i, align 4 > br label %for.cond > > for.end: ; preds = %for.cond > ret void > } > > attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } > > !llvm.ident = !{!0} > > !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"} > > > Any ideas why the vectotizer doesn't like my code? > > Frank > > > > > _______________________________________________ > LLVM Developers mailing list > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
Hi Nadav, right! The sign-extend was the problem. Hmm.. Is this a bug or a feature? Frank On 28/10/13 16:58, Nadav Rotem wrote:> Frank, > > It looks like the loop vectorizer is unable to tell that the two stores in your code never overlap. This is probably because of the sign-extend in your code. Can you extend the indices to 64bit ? > > Thanks, > Nadav > > On Oct 28, 2013, at 1:38 PM, Frank Winter <fwinter at jlab.org> wrote: > >> Verifying function >> running passes ... >> LV: Checking a loop in "bar" >> LV: Found a loop: L0 >> LV: Found an induction variable. >> LV: We need to do 0 pointer comparisons. >> LV: Checking memory dependencies >> LV: Bad stride - Not an AddRecExpr pointer %13 = getelementptr float* %arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2) >> LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1) >> LV: Distance for store float %11, float* %12 to store float %10, float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>) >> Non-consecutive pointer access >> LV: We don't need a runtime memory check. >> LV: Can't vectorize due to memory conflicts >> LV: Not vectorizing. 
>> >> Here the input IR: >> >> define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias %arg3, float* noalias %arg4) { >> entrypoint: >> br label %L0 >> >> L0: ; preds = %L0, %entrypoint >> %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ] >> %1 = add nsw i32 %0, 256 >> %2 = sext i32 %0 to i64 >> %3 = getelementptr float* %arg3, i64 %2 >> %4 = load float* %3, align 4 >> %5 = sext i32 %1 to i64 >> %6 = getelementptr float* %arg3, i64 %5 >> %7 = load float* %6, align 4 >> %8 = getelementptr float* %arg4, i64 %2 >> %9 = load float* %8, align 4 >> %10 = getelementptr float* %arg4, i64 %5 >> %11 = load float* %10, align 4 >> %12 = fadd float %11, %7 >> %13 = fadd float %9, %4 >> %14 = getelementptr float* %arg2, i64 %2 >> store float %13, float* %14, align 4 >> %15 = getelementptr float* %arg2, i64 %5 >> store float %12, float* %15, align 4 >> %16 = add nsw i32 %0, 1 >> %17 = icmp slt i32 %16, %arg1 >> br i1 %17, label %L0, label %L1 >> >> L1: ; preds = %L0 >> ret void >> } >> >> This function is IMO equivalent to >> >> void main(int start, int end, float * restrict c, float * restrict a, float * restrict b) >> { >> const int width = 256; >> for (int i = start ; i < end ; ++i ) { >> c[ i ] = a[ i ] + b[ i ]; >> c[ width + i ] = a[ width + i ] + b[ width + i ]; >> } >> } >> >> With this version, the vectorizer doesnt complain about a bad stride and can parallelize the loop. 
>> >> Here the output from "clang -emit-llvm -S loop.c" which can be parallelized: >> >> >> >> target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" >> target triple = "x86_64-unknown-linux-gnu" >> >> ; Function Attrs: nounwind uwtable >> define void @bar(float* noalias %c, float* noalias %a, float* noalias %b, i32 %start, i32 %end) #0 { >> entry: >> %c.addr = alloca float*, align 8 >> %a.addr = alloca float*, align 8 >> %b.addr = alloca float*, align 8 >> %start.addr = alloca i32, align 4 >> %end.addr = alloca i32, align 4 >> %width = alloca i32, align 4 >> %i = alloca i32, align 4 >> store float* %c, float** %c.addr, align 8 >> store float* %a, float** %a.addr, align 8 >> store float* %b, float** %b.addr, align 8 >> store i32 %start, i32* %start.addr, align 4 >> store i32 %end, i32* %end.addr, align 4 >> store i32 256, i32* %width, align 4 >> %0 = load i32* %start.addr, align 4 >> store i32 %0, i32* %i, align 4 >> br label %for.cond >> >> for.cond: ; preds = %for.inc, %entry >> %1 = load i32* %i, align 4 >> %2 = load i32* %end.addr, align 4 >> %cmp = icmp slt i32 %1, %2 >> br i1 %cmp, label %for.body, label %for.end >> >> for.body: ; preds = %for.cond >> %3 = load i32* %i, align 4 >> %idxprom = sext i32 %3 to i64 >> %4 = load float** %a.addr, align 8 >> %arrayidx = getelementptr inbounds float* %4, i64 %idxprom >> %5 = load float* %arrayidx, align 4 >> %6 = load i32* %i, align 4 >> %idxprom1 = sext i32 %6 to i64 >> %7 = load float** %b.addr, align 8 >> %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1 >> %8 = load float* %arrayidx2, align 4 >> %add = fadd float %5, %8 >> %9 = load i32* %i, align 4 >> %idxprom3 = sext i32 %9 to i64 >> %10 = load float** %c.addr, align 8 >> %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3 >> store float %add, float* %arrayidx4, align 4 >> %11 = load i32* %i, align 4 >> %add5 = add nsw i32 256, 
%11 >> %idxprom6 = sext i32 %add5 to i64 >> %12 = load float** %a.addr, align 8 >> %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6 >> %13 = load float* %arrayidx7, align 4 >> %14 = load i32* %i, align 4 >> %add8 = add nsw i32 256, %14 >> %idxprom9 = sext i32 %add8 to i64 >> %15 = load float** %b.addr, align 8 >> %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9 >> %16 = load float* %arrayidx10, align 4 >> %add11 = fadd float %13, %16 >> %17 = load i32* %i, align 4 >> %add12 = add nsw i32 256, %17 >> %idxprom13 = sext i32 %add12 to i64 >> %18 = load float** %c.addr, align 8 >> %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13 >> store float %add11, float* %arrayidx14, align 4 >> br label %for.inc >> >> for.inc: ; preds = %for.body >> %19 = load i32* %i, align 4 >> %inc = add nsw i32 %19, 1 >> store i32 %inc, i32* %i, align 4 >> br label %for.cond >> >> for.end: ; preds = %for.cond >> ret void >> } >> >> attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } >> >> !llvm.ident = !{!0} >> >> !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"} >> >> >> Any ideas why the vectotizer doesn't like my code? >> >> Frank >> >> >> >> >> _______________________________________________ >> LLVM Developers mailing list >> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu >> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev