thr3ads.net - llvm dev - [LLVMdev] loop vectorizer says Bad stride [Oct 2013]

If this information is useful, please help other people find it:
Share via:

Frank Winter

2013-Oct-28 20:38 UTC

[LLVMdev] loop vectorizer says Bad stride

Verifying function
running passes ...
LV: Checking a loop in "bar"
LV: Found a loop: L0
LV: Found an induction variable.
LV: We need to do 0 pointer comparisons.
LV: Checking memory dependencies
LV: Bad stride - Not an AddRecExpr pointer   %13 = getelementptr float* 
%arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to 
i64)) + %arg2)
LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink 
Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + 
%arg2)(Induction step: 1)
LV: Distance for   store float %11, float* %12 to   store float %10, 
float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) +
{(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>)
Non-consecutive pointer access
LV: We don't need a runtime memory check.
LV: Can't vectorize due to memory conflicts
LV: Not vectorizing.

Here the input IR:

define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* 
noalias %arg3, float* noalias %arg4) {
entrypoint:
   br label %L0

L0:                                               ; preds = %L0, %entrypoint
   %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ]
   %1 = add nsw i32 %0, 256
   %2 = sext i32 %0 to i64
   %3 = getelementptr float* %arg3, i64 %2
   %4 = load float* %3, align 4
   %5 = sext i32 %1 to i64
   %6 = getelementptr float* %arg3, i64 %5
   %7 = load float* %6, align 4
   %8 = getelementptr float* %arg4, i64 %2
   %9 = load float* %8, align 4
   %10 = getelementptr float* %arg4, i64 %5
   %11 = load float* %10, align 4
   %12 = fadd float %11, %7
   %13 = fadd float %9, %4
   %14 = getelementptr float* %arg2, i64 %2
   store float %13, float* %14, align 4
   %15 = getelementptr float* %arg2, i64 %5
   store float %12, float* %15, align 4
   %16 = add nsw i32 %0, 1
   %17 = icmp slt i32 %16, %arg1
   br i1 %17, label %L0, label %L1

L1:                                               ; preds = %L0
   ret void
}

This function is IMO equivalent to

void main(int start, int end, float * restrict c, float * restrict a, 
float * restrict b)
{
   const int width = 256;
   for (int i = start ; i < end ; ++i ) {
     c[ i ]         = a[ i ]         + b[ i ];
     c[ width + i ] = a[ width + i ] + b[ width + i ];
   }
}

With this version, the vectorizer doesn't complain about a bad stride and 
can parallelize the loop.

Here the output from "clang -emit-llvm -S loop.c" which can be
parallelized:



target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nounwind uwtable
define void @bar(float* noalias %c, float* noalias %a, float* noalias 
%b, i32 %start, i32 %end) #0 {
entry:
   %c.addr = alloca float*, align 8
   %a.addr = alloca float*, align 8
   %b.addr = alloca float*, align 8
   %start.addr = alloca i32, align 4
   %end.addr = alloca i32, align 4
   %width = alloca i32, align 4
   %i = alloca i32, align 4
   store float* %c, float** %c.addr, align 8
   store float* %a, float** %a.addr, align 8
   store float* %b, float** %b.addr, align 8
   store i32 %start, i32* %start.addr, align 4
   store i32 %end, i32* %end.addr, align 4
   store i32 256, i32* %width, align 4
   %0 = load i32* %start.addr, align 4
   store i32 %0, i32* %i, align 4
   br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
   %1 = load i32* %i, align 4
   %2 = load i32* %end.addr, align 4
   %cmp = icmp slt i32 %1, %2
   br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
   %3 = load i32* %i, align 4
   %idxprom = sext i32 %3 to i64
   %4 = load float** %a.addr, align 8
   %arrayidx = getelementptr inbounds float* %4, i64 %idxprom
   %5 = load float* %arrayidx, align 4
   %6 = load i32* %i, align 4
   %idxprom1 = sext i32 %6 to i64
   %7 = load float** %b.addr, align 8
   %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1
   %8 = load float* %arrayidx2, align 4
   %add = fadd float %5, %8
   %9 = load i32* %i, align 4
   %idxprom3 = sext i32 %9 to i64
   %10 = load float** %c.addr, align 8
   %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3
   store float %add, float* %arrayidx4, align 4
   %11 = load i32* %i, align 4
   %add5 = add nsw i32 256, %11
   %idxprom6 = sext i32 %add5 to i64
   %12 = load float** %a.addr, align 8
   %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6
   %13 = load float* %arrayidx7, align 4
   %14 = load i32* %i, align 4
   %add8 = add nsw i32 256, %14
   %idxprom9 = sext i32 %add8 to i64
   %15 = load float** %b.addr, align 8
   %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9
   %16 = load float* %arrayidx10, align 4
   %add11 = fadd float %13, %16
   %17 = load i32* %i, align 4
   %add12 = add nsw i32 256, %17
   %idxprom13 = sext i32 %add12 to i64
   %18 = load float** %c.addr, align 8
   %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13
   store float %add11, float* %arrayidx14, align 4
   br label %for.inc

for.inc:                                          ; preds = %for.body
   %19 = load i32* %i, align 4
   %inc = add nsw i32 %19, 1
   store i32 %inc, i32* %i, align 4
   br label %for.cond

for.end:                                          ; preds = %for.cond
   ret void
}

attributes #0 = { nounwind uwtable
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false"
"no-nans-fp-math"="false"
"stack-protector-buffer-size"="8"
"unsafe-fp-math"="false"
"use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"}


Any ideas why the vectorizer doesn't like my code?

Frank

Nadav Rotem

2013-Oct-28 20:58 UTC

head link

[LLVMdev] loop vectorizer says Bad stride

Frank, 

It looks like the loop vectorizer is unable to tell that the two stores in your
code never overlap. This is probably because of the sign-extend in your code.
Can you extend the indices to 64 bits?

Thanks,
Nadav

On Oct 28, 2013, at 1:38 PM, Frank Winter <fwinter at jlab.org> wrote:
> Verifying function
> running passes ...
> LV: Checking a loop in "bar"
> LV: Found a loop: L0
> LV: Found an induction variable.
> LV: We need to do 0 pointer comparisons.
> LV: Checking memory dependencies
> LV: Bad stride - Not an AddRecExpr pointer   %13 = getelementptr float*
%arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to
i64)) + %arg2)
> LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) + %arg2),+,4}<%L0>Sink
Scev: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) +
%arg2)(Induction step: 1)
> LV: Distance for   store float %11, float* %12 to   store float %10, float*
%13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) + {(-4 *
(sext i32 %arg0 to i64)),+,-4}<%L0>)
> Non-consecutive pointer access
> LV: We don't need a runtime memory check.
> LV: Can't vectorize due to memory conflicts
> LV: Not vectorizing.
> 
> Here the input IR:
> 
> define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float* noalias
%arg3, float* noalias %arg4) {
> entrypoint:
>  br label %L0
> 
> L0:                                               ; preds = %L0,
%entrypoint
>  %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ]
>  %1 = add nsw i32 %0, 256
>  %2 = sext i32 %0 to i64
>  %3 = getelementptr float* %arg3, i64 %2
>  %4 = load float* %3, align 4
>  %5 = sext i32 %1 to i64
>  %6 = getelementptr float* %arg3, i64 %5
>  %7 = load float* %6, align 4
>  %8 = getelementptr float* %arg4, i64 %2
>  %9 = load float* %8, align 4
>  %10 = getelementptr float* %arg4, i64 %5
>  %11 = load float* %10, align 4
>  %12 = fadd float %11, %7
>  %13 = fadd float %9, %4
>  %14 = getelementptr float* %arg2, i64 %2
>  store float %13, float* %14, align 4
>  %15 = getelementptr float* %arg2, i64 %5
>  store float %12, float* %15, align 4
>  %16 = add nsw i32 %0, 1
>  %17 = icmp slt i32 %16, %arg1
>  br i1 %17, label %L0, label %L1
> 
> L1:                                               ; preds = %L0
>  ret void
> }
> 
> This function is IMO equivalent to
> 
> void main(int start, int end, float * restrict c, float * restrict a, float
* restrict b)
> {
>  const int width = 256;
>  for (int i = start ; i < end ; ++i ) {
>    c[ i ]         = a[ i ]         + b[ i ];
>    c[ width + i ] = a[ width + i ] + b[ width + i ];
>  }
> }
> 
> With this version, the vectorizer doesnt complain about a bad stride and
can parallelize the loop.
> 
> Here the output from "clang -emit-llvm -S loop.c" which can be
parallelized:
> 
> 
> 
> target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
> 
> ; Function Attrs: nounwind uwtable
> define void @bar(float* noalias %c, float* noalias %a, float* noalias %b,
i32 %start, i32 %end) #0 {
> entry:
>  %c.addr = alloca float*, align 8
>  %a.addr = alloca float*, align 8
>  %b.addr = alloca float*, align 8
>  %start.addr = alloca i32, align 4
>  %end.addr = alloca i32, align 4
>  %width = alloca i32, align 4
>  %i = alloca i32, align 4
>  store float* %c, float** %c.addr, align 8
>  store float* %a, float** %a.addr, align 8
>  store float* %b, float** %b.addr, align 8
>  store i32 %start, i32* %start.addr, align 4
>  store i32 %end, i32* %end.addr, align 4
>  store i32 256, i32* %width, align 4
>  %0 = load i32* %start.addr, align 4
>  store i32 %0, i32* %i, align 4
>  br label %for.cond
> 
> for.cond:                                         ; preds = %for.inc,
%entry
>  %1 = load i32* %i, align 4
>  %2 = load i32* %end.addr, align 4
>  %cmp = icmp slt i32 %1, %2
>  br i1 %cmp, label %for.body, label %for.end
> 
> for.body:                                         ; preds = %for.cond
>  %3 = load i32* %i, align 4
>  %idxprom = sext i32 %3 to i64
>  %4 = load float** %a.addr, align 8
>  %arrayidx = getelementptr inbounds float* %4, i64 %idxprom
>  %5 = load float* %arrayidx, align 4
>  %6 = load i32* %i, align 4
>  %idxprom1 = sext i32 %6 to i64
>  %7 = load float** %b.addr, align 8
>  %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1
>  %8 = load float* %arrayidx2, align 4
>  %add = fadd float %5, %8
>  %9 = load i32* %i, align 4
>  %idxprom3 = sext i32 %9 to i64
>  %10 = load float** %c.addr, align 8
>  %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3
>  store float %add, float* %arrayidx4, align 4
>  %11 = load i32* %i, align 4
>  %add5 = add nsw i32 256, %11
>  %idxprom6 = sext i32 %add5 to i64
>  %12 = load float** %a.addr, align 8
>  %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6
>  %13 = load float* %arrayidx7, align 4
>  %14 = load i32* %i, align 4
>  %add8 = add nsw i32 256, %14
>  %idxprom9 = sext i32 %add8 to i64
>  %15 = load float** %b.addr, align 8
>  %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9
>  %16 = load float* %arrayidx10, align 4
>  %add11 = fadd float %13, %16
>  %17 = load i32* %i, align 4
>  %add12 = add nsw i32 256, %17
>  %idxprom13 = sext i32 %add12 to i64
>  %18 = load float** %c.addr, align 8
>  %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13
>  store float %add11, float* %arrayidx14, align 4
>  br label %for.inc
> 
> for.inc:                                          ; preds = %for.body
>  %19 = load i32* %i, align 4
>  %inc = add nsw i32 %19, 1
>  store i32 %inc, i32* %i, align 4
>  br label %for.cond
> 
> for.end:                                          ; preds = %for.cond
>  ret void
> }
> 
> attributes #0 = { nounwind uwtable
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false"
"no-nans-fp-math"="false"
"stack-protector-buffer-size"="8"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
> 
> !llvm.ident = !{!0}
> 
> !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"}
> 
> 
> Any ideas why the vectotizer doesn't like my code?
> 
> Frank
> 
> 
> 
> 
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

Frank Winter

2013-Oct-28 23:31 UTC

head link

[LLVMdev] loop vectorizer says Bad stride

Hi Nadav,

Right! The sign-extend was the problem. Hmm... Is this a bug or a feature?

Frank


On 28/10/13 16:58, Nadav Rotem wrote:
> Frank,
>
> It looks like the loop vectorizer is unable to tell that the two stores in
your code never overlap. This is probably because of the sign-extend in your
code. Can you extend the indices to 64bit ?
>
> Thanks,
> Nadav
>
> On Oct 28, 2013, at 1:38 PM, Frank Winter <fwinter at jlab.org>
wrote:
>
>> Verifying function
>> running passes ...
>> LV: Checking a loop in "bar"
>> LV: Found a loop: L0
>> LV: Found an induction variable.
>> LV: We need to do 0 pointer comparisons.
>> LV: Checking memory dependencies
>> LV: Bad stride - Not an AddRecExpr pointer   %13 = getelementptr float*
%arg2, i32 %1 SCEV: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to
i64)) + %arg2)
>> LV: Src Scev: {((4 * (sext i32 %arg0 to i64)) +
%arg2),+,4}<%L0>Sink Scev: ((4 * (sext i32 {(256 +
%arg0),+,1}<nw><%L0> to i64)) + %arg2)(Induction step: 1)
>> LV: Distance for   store float %11, float* %12 to   store float %10,
float* %13: ((4 * (sext i32 {(256 + %arg0),+,1}<nw><%L0> to i64)) +
{(-4 * (sext i32 %arg0 to i64)),+,-4}<%L0>)
>> Non-consecutive pointer access
>> LV: We don't need a runtime memory check.
>> LV: Can't vectorize due to memory conflicts
>> LV: Not vectorizing.
>>
>> Here the input IR:
>>
>> define void @bar(i32 %arg0, i32 %arg1, float* noalias %arg2, float*
noalias %arg3, float* noalias %arg4) {
>> entrypoint:
>>   br label %L0
>>
>> L0:                                               ; preds = %L0,
%entrypoint
>>   %0 = phi i32 [ %16, %L0 ], [ %arg0, %entrypoint ]
>>   %1 = add nsw i32 %0, 256
>>   %2 = sext i32 %0 to i64
>>   %3 = getelementptr float* %arg3, i64 %2
>>   %4 = load float* %3, align 4
>>   %5 = sext i32 %1 to i64
>>   %6 = getelementptr float* %arg3, i64 %5
>>   %7 = load float* %6, align 4
>>   %8 = getelementptr float* %arg4, i64 %2
>>   %9 = load float* %8, align 4
>>   %10 = getelementptr float* %arg4, i64 %5
>>   %11 = load float* %10, align 4
>>   %12 = fadd float %11, %7
>>   %13 = fadd float %9, %4
>>   %14 = getelementptr float* %arg2, i64 %2
>>   store float %13, float* %14, align 4
>>   %15 = getelementptr float* %arg2, i64 %5
>>   store float %12, float* %15, align 4
>>   %16 = add nsw i32 %0, 1
>>   %17 = icmp slt i32 %16, %arg1
>>   br i1 %17, label %L0, label %L1
>>
>> L1:                                               ; preds = %L0
>>   ret void
>> }
>>
>> This function is IMO equivalent to
>>
>> void main(int start, int end, float * restrict c, float * restrict a,
float * restrict b)
>> {
>>   const int width = 256;
>>   for (int i = start ; i < end ; ++i ) {
>>     c[ i ]         = a[ i ]         + b[ i ];
>>     c[ width + i ] = a[ width + i ] + b[ width + i ];
>>   }
>> }
>>
>> With this version, the vectorizer doesnt complain about a bad stride
and can parallelize the loop.
>>
>> Here the output from "clang -emit-llvm -S loop.c" which can
be parallelized:
>>
>>
>>
>> target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
>> target triple = "x86_64-unknown-linux-gnu"
>>
>> ; Function Attrs: nounwind uwtable
>> define void @bar(float* noalias %c, float* noalias %a, float* noalias
%b, i32 %start, i32 %end) #0 {
>> entry:
>>   %c.addr = alloca float*, align 8
>>   %a.addr = alloca float*, align 8
>>   %b.addr = alloca float*, align 8
>>   %start.addr = alloca i32, align 4
>>   %end.addr = alloca i32, align 4
>>   %width = alloca i32, align 4
>>   %i = alloca i32, align 4
>>   store float* %c, float** %c.addr, align 8
>>   store float* %a, float** %a.addr, align 8
>>   store float* %b, float** %b.addr, align 8
>>   store i32 %start, i32* %start.addr, align 4
>>   store i32 %end, i32* %end.addr, align 4
>>   store i32 256, i32* %width, align 4
>>   %0 = load i32* %start.addr, align 4
>>   store i32 %0, i32* %i, align 4
>>   br label %for.cond
>>
>> for.cond:                                         ; preds = %for.inc,
%entry
>>   %1 = load i32* %i, align 4
>>   %2 = load i32* %end.addr, align 4
>>   %cmp = icmp slt i32 %1, %2
>>   br i1 %cmp, label %for.body, label %for.end
>>
>> for.body:                                         ; preds = %for.cond
>>   %3 = load i32* %i, align 4
>>   %idxprom = sext i32 %3 to i64
>>   %4 = load float** %a.addr, align 8
>>   %arrayidx = getelementptr inbounds float* %4, i64 %idxprom
>>   %5 = load float* %arrayidx, align 4
>>   %6 = load i32* %i, align 4
>>   %idxprom1 = sext i32 %6 to i64
>>   %7 = load float** %b.addr, align 8
>>   %arrayidx2 = getelementptr inbounds float* %7, i64 %idxprom1
>>   %8 = load float* %arrayidx2, align 4
>>   %add = fadd float %5, %8
>>   %9 = load i32* %i, align 4
>>   %idxprom3 = sext i32 %9 to i64
>>   %10 = load float** %c.addr, align 8
>>   %arrayidx4 = getelementptr inbounds float* %10, i64 %idxprom3
>>   store float %add, float* %arrayidx4, align 4
>>   %11 = load i32* %i, align 4
>>   %add5 = add nsw i32 256, %11
>>   %idxprom6 = sext i32 %add5 to i64
>>   %12 = load float** %a.addr, align 8
>>   %arrayidx7 = getelementptr inbounds float* %12, i64 %idxprom6
>>   %13 = load float* %arrayidx7, align 4
>>   %14 = load i32* %i, align 4
>>   %add8 = add nsw i32 256, %14
>>   %idxprom9 = sext i32 %add8 to i64
>>   %15 = load float** %b.addr, align 8
>>   %arrayidx10 = getelementptr inbounds float* %15, i64 %idxprom9
>>   %16 = load float* %arrayidx10, align 4
>>   %add11 = fadd float %13, %16
>>   %17 = load i32* %i, align 4
>>   %add12 = add nsw i32 256, %17
>>   %idxprom13 = sext i32 %add12 to i64
>>   %18 = load float** %c.addr, align 8
>>   %arrayidx14 = getelementptr inbounds float* %18, i64 %idxprom13
>>   store float %add11, float* %arrayidx14, align 4
>>   br label %for.inc
>>
>> for.inc:                                          ; preds = %for.body
>>   %19 = load i32* %i, align 4
>>   %inc = add nsw i32 %19, 1
>>   store i32 %inc, i32* %i, align 4
>>   br label %for.cond
>>
>> for.end:                                          ; preds = %for.cond
>>   ret void
>> }
>>
>> attributes #0 = { nounwind uwtable
"less-precise-fpmad"="false"
"no-frame-pointer-elim"="true"
"no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false"
"no-nans-fp-math"="false"
"stack-protector-buffer-size"="8"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
>>
>> !llvm.ident = !{!0}
>>
>> !0 = metadata !{metadata !"clang version 3.4 (trunk 193120)"}
>>
>>
>> Any ideas why the vectotizer doesn't like my code?
>>
>> Frank
>>
>>
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev

Possibly Parallel Threads

Search for more reasonably related threads

llvm dev - Oct 2013 - [LLVMdev] loop vectorizer says Bad stride

[LLVMdev] loop vectorizer says Bad stride

[LLVMdev] loop vectorizer says Bad stride

[LLVMdev] loop vectorizer says Bad stride

Possibly Parallel Threads