Hi Jyotirmoy Bhattacharya,
I've vectorized the outer loop of your example with RV, the Region
Vectorizer (https://github.com/cdl-saarland/rv). I've attached the full
IR. This is the code I got for the innermost loop:
for.body4.rv: ; preds = %for.body4.rv,
%for.body4.lr.ph.rv
%indvars.iv14 = phi i64 [ %0, %for.body4.lr.ph.rv ], [ %indvars.iv.next21,
%for.body4.rv ]
%u1.03315 = phi <4 x double> [ zeroinitializer, %for.body4.lr.ph.rv ],
[ %u0.03216, %for.body4.rv ]
%u0.03216 = phi <4 x double> [ zeroinitializer, %for.body4.lr.ph.rv ],
[ %add_SIMD, %for.body4.rv ]
%mul5_SIMD = fmul <4 x double> %mul_SIMD, %u0.03216
%sub_SIMD = fsub <4 x double> %mul5_SIMD, %u1.03315
%arrayidx717 = getelementptr inbounds double, double* %coeffs, i64
%indvars.iv14
%scal_load18 = load double, double* %arrayidx717, align 8
%.splatinsert19 = insertelement <4 x double> undef, double
%scal_load18, i32 0
%.splat20 = shufflevector <4 x double> %.splatinsert19, <4 x
double> undef, <4 x i32> zeroinitializer
%add_SIMD = fadd <4 x double> %sub_SIMD, %.splat20
%indvars.iv.next21 = add nsw i64 %indvars.iv14, -1
%cmp222 = icmp sgt i64 %indvars.iv14, 0
br i1 %cmp222, label %for.body4.rv, label %for.cond.cleanup3.loopexit.rv
}
To reproduce this get the release_38 branch of RV from github and do as
follows:
File cheby.c:
/*
 * Evaluate a Chebyshev series at m points using Clenshaw's recurrence.
 *
 *   coeffs  series coefficients; entries coeffs[0] .. coeffs[n] are read
 *   n       highest coefficient index visited by the inner loop
 *   xs      input points; entries xs[0] .. xs[m-1] are read
 *   ys      output values; entries ys[0] .. ys[m-1] are written
 *   m       number of points
 */
void cheby_eval(double *coeffs,int n,double *xs,double *ys,int m)
{
/* Ask the compiler to vectorize the outer loop over the points. */
#pragma omp simd
for (int i=0;i<m;i++){
double x = xs[i];
/* Recurrence state; all start at zero, so for n < 0 the result is 0.5*coeffs[0]. */
double u0=0,u1=0,u2=0;
/* Walk the coefficients from index n down to index 0. */
for (int k=n;k>=0;k--){
/* Shift the two previous values before computing the new one. */
u2 = u1;
u1 = u0;
u0 = 2*x*u1-u2+coeffs[k];
}
/* Combine the final recurrence values with coeffs[0]. */
ys[i] = 0.5*(coeffs[0]+u0-u2);
}
}
<EOF>
1. Compile to IR w/o any of LLVM's vectorizers:
clang -O3 -fno-vectorize -fno-slp-vectorize cheby.c -c -emit-llvm -S -o
cheby.ll
2. Run the IR through RV's cmd line vectorizer
./bin/rvTool -loopvec -w 4 -i cheby.ll -k cheby_eval -o cheby.rv.ll
I'd like to add your code to our test suite on github if that is ok with
you.
Please get in touch with me if you have any other outer loops that
could/should be vectorized.
Regards,
Simon
On 05/10/2017 04:09 PM, via llvm-dev wrote:
> I have the following C++ code that evaluates a Chebyshev polynomial using
> Clenshaw's algorithm
>
> void cheby_eval(double *coeffs,int n,double *xs,double *ys,int m)
> {
> #pragma omp simd
> for (int i=0;i<m;i++){
> double x = xs[i];
> double u0=0,u1=0,u2=0;
> for (int k=n;k>=0;k--){
> u2 = u1;
> u1 = u0;
> u0 = 2*x*u1-u2+coeffs[k];
> }
> ys[i] = 0.5*(coeffs[0]+u0-u2);
> }
> }
>
> I'm hoping for an autovectorization of the outer loop so that the inner
> loop operates on vectors.
>
> When compiled with
>
> clang++ -O3 -march=haswell -Rpass-analysis=loop-vectorize -S chebyshev.cc
>
> using clang++ 3.8.1-23, no vectorization happens and I get the message
>
> chebyshev.cc:19:18: remark: loop not vectorized: cannot identify array
> bounds
> [-Rpass-analysis=loop-vectorize]
> ys[i] = 0.5*(coeffs[0]+u0-u2);
> ^
> chebyshev.cc:21:1: remark: loop not vectorized: value that could not be
> identified as reduction is used outside the loop
> [-Rpass-analysis=loop-vectorize]
>
>
> On the same code icc vectorizes the outer loop as expected.
>
> I was wondering if there are small ways in which I can change my code to
> help LLVM's autovectorizer to succeed. I would also appreciate any
> pointers
> to documentation or LLVM source that can help me better understand how
> autovectorization of outer loops works.
>
> Regards,
> Jyotirmoy Bhattacharya
>
> PS. The interesting part of icc's assembler output is
>
> ..B1.4: # Preds ..B1.8 ..B1.3
> xorl %r15d, %r15d #14.5
> xorl %ebx, %ebx #14.21
> testq %rsi, %rsi #14.21
> vmovupd (%rdx,%r9,8), %ymm3 #12.16
> vxorpd %ymm5, %ymm5, %ymm5 #13.14
> vmovdqa %ymm1, %ymm4 #13.19
> vmovdqa %ymm1, %ymm2 #13.24
> jl ..B1.8 # Prob 2% #14.21
>
> ..B1.5: # Preds ..B1.4
> vaddpd %ymm3, %ymm3, %ymm3 #17.14
>
> ..B1.6: # Preds ..B1.6 ..B1.5
> vmovapd %ymm4, %ymm2 #20.3
> incq %r15 #14.5
> vmovapd %ymm5, %ymm4 #20.3
> vfmsub213pd %ymm2, %ymm3, %ymm5 #17.19
> vbroadcastsd (%r11,%rbx,8), %ymm6 #17.22
> decq %rbx
> vaddpd %ymm5, %ymm6, %ymm5 #17.22
> cmpq %r10, %r15 #14.5
> jb ..B1.6 # Prob 82% #14.5
>
> ..B1.8: # Preds ..B1.6 ..B1.4
> vbroadcastsd (%rdi), %ymm3 #19.18
> vaddpd %ymm3, %ymm5, %ymm4 #19.28
> vsubpd %ymm2, %ymm4, %ymm2 #19.31
> vmulpd %ymm2, %ymm0, %ymm5 #19.31
> vmovupd %ymm5, (%rcx,%r9,8) #19.5
> addq $4, %r9 #11.3
> cmpq %r8, %r9 #11.3
> jb ..B1.4 # Prob 82% #11
> -------------- next part --------------
> An HTML attachment was scrubbed...
> URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170510/9a48b564/attachment-0001.html>
>
> ------------------------------
>
>
--
Simon Moll
Researcher / PhD Student
Compiler Design Lab (Prof. Hack)
Saarland University, Computer Science
Building E1.3, Room 4.31
Tel. +49 (0)681 302-57521 : moll at cs.uni-saarland.de
Fax. +49 (0)681 302-3065 : http://compilers.cs.uni-saarland.de/people/moll
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20170516/aa1ee5a2/attachment.html>
-------------- next part --------------
; ModuleID = 'build/test_052_clenshaw-loop.ll'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Outer-loop-vectorized form of cheby_eval with a vector width of 4.
;
; Each trip of %for.body.rv handles four consecutive elements of %xs / %ys
; as one <4 x double>. The inner coefficient loop (%for.body4.rv) is uniform
; across the lanes: it loads one scalar from %coeffs per trip and splats it.
;
;   %coeffs  scalar coefficient array (read only)
;   %n       highest coefficient index; the inner loop counts from %n down to 0
;   %xs      input points, loaded four at a time
;   %ys      output values, stored four at a time
;   %m       number of points
;
; NOTE(review): the outer loop steps by 4 and exits on equality with %m, and
; no scalar remainder loop is visible in this function. That looks correct
; only when %m is a positive multiple of 4 -- confirm before reusing as a test.
;
; NOTE(review): the %exitMask.* selects are not referenced anywhere else in
; this function; they are kept exactly as the tool emitted them.
;
; Function Attrs: norecurse nounwind uwtable
define void @cheby_eval(double* nocapture readonly %coeffs, i32 %n, double* nocapture readonly %xs, double* nocapture %ys, i32 %m) #0 {
entry:
  ; Skip everything when there are no points to process.
  %cmp36 = icmp sgt i32 %m, 0
  br i1 %cmp36, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  ; Loop-invariant setup: "n >= 0" guard for the inner loop and n widened to i64.
  %cmp231 = icmp sgt i32 %n, -1
  %0 = sext i32 %n to i64
  %.splatinsert = insertelement <4 x i1> undef, i1 %cmp231, i32 0
  %.splat = shufflevector <4 x i1> %.splatinsert, <4 x i1> undef, <4 x i32> zeroinitializer
  br label %for.body.rv

for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup3.rv
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body.rv:                                      ; preds = %for.body.lr.ph, %for.cond.cleanup3.rv
  ; Outer loop header; %indvars.iv381 is the index of the first of the four lanes.
  %indvars.iv381 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next399, %for.cond.cleanup3.rv ]
  %exitMask.for.body.false_SIMD = select <4 x i1> %.splat, <4 x i1> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
  br i1 %cmp231, label %for.body4.lr.ph.rv, label %for.cond.cleanup3.rv

for.body4.lr.ph.rv:                               ; preds = %for.body.rv
  ; Load four x values and precompute 2*x for the recurrence.
  %arrayidx2 = getelementptr inbounds double, double* %xs, i64 %indvars.iv381
  %vec_cast = bitcast double* %arrayidx2 to <4 x double>*
  %vec_load = load <4 x double>, <4 x double>* %vec_cast, align 8
  %mul_SIMD = fmul <4 x double> %vec_load, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  br label %for.body4.rv

for.cond.cleanup3.loopexit.rv:                    ; preds = %for.body4.rv
  ; Carry the last recurrence values out of the inner loop.
  %add.lcssa25 = phi <4 x double> [ %add_SIMD, %for.body4.rv ]
  %u1.033.lcssa26 = phi <4 x double> [ %u1.03315, %for.body4.rv ]
  br label %for.cond.cleanup3.rv

for.cond.cleanup3.rv:                             ; preds = %for.cond.cleanup3.loopexit.rv, %for.body.rv
  ; u2 and u0 are zero when the inner loop was skipped (edge from %for.body.rv).
  %u2.0.lcssa3 = phi <4 x double> [ zeroinitializer, %for.body.rv ], [ %u1.033.lcssa26, %for.cond.cleanup3.loopexit.rv ]
  %u0.0.lcssa4 = phi <4 x double> [ zeroinitializer, %for.body.rv ], [ %add.lcssa25, %for.cond.cleanup3.loopexit.rv ]
  ; ys[i..i+3] = 0.5 * (coeffs[0] + u0 - u2), with coeffs[0] splat across the lanes.
  %scal_load = load double, double* %coeffs, align 8
  %.splatinsert5 = insertelement <4 x double> undef, double %scal_load, i32 0
  %.splat6 = shufflevector <4 x double> %.splatinsert5, <4 x double> undef, <4 x i32> zeroinitializer
  %add9_SIMD = fadd <4 x double> %u0.0.lcssa4, %.splat6
  %sub10_SIMD = fsub <4 x double> %add9_SIMD, %u2.0.lcssa3
  %mul11_SIMD = fmul <4 x double> %sub10_SIMD, <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
  %arrayidx137 = getelementptr inbounds double, double* %ys, i64 %indvars.iv381
  %vec_cast8 = bitcast double* %arrayidx137 to <4 x double>*
  store <4 x double> %mul11_SIMD, <4 x double>* %vec_cast8, align 8
  ; Advance by the vector width and test for the end of the outer loop.
  %indvars.iv.next399 = add nuw nsw i64 %indvars.iv381, 4
  %lftr.wideiv10 = trunc i64 %indvars.iv.next399 to i32
  %exitcond11 = icmp eq i32 %lftr.wideiv10, %m
  %.splatinsert12 = insertelement <4 x i1> undef, i1 %exitcond11, i32 0
  %.splat13 = shufflevector <4 x i1> %.splatinsert12, <4 x i1> undef, <4 x i32> zeroinitializer
  %exitMask.for.cond.cleanup3.false_SIMD = select <4 x i1> %.splat13, <4 x i1> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
  br i1 %exitcond11, label %for.cond.cleanup.loopexit, label %for.body.rv

for.body4.rv:                                     ; preds = %for.body4.rv, %for.body4.lr.ph.rv
  ; Inner loop: k runs from n down to 0; %u0.03216 / %u1.03315 hold the two
  ; previous recurrence values for the four lanes.
  %indvars.iv14 = phi i64 [ %0, %for.body4.lr.ph.rv ], [ %indvars.iv.next21, %for.body4.rv ]
  %u1.03315 = phi <4 x double> [ zeroinitializer, %for.body4.lr.ph.rv ], [ %u0.03216, %for.body4.rv ]
  %u0.03216 = phi <4 x double> [ zeroinitializer, %for.body4.lr.ph.rv ], [ %add_SIMD, %for.body4.rv ]
  ; new u0 = (2*x)*u0 - u1 + coeffs[k]
  %mul5_SIMD = fmul <4 x double> %mul_SIMD, %u0.03216
  %sub_SIMD = fsub <4 x double> %mul5_SIMD, %u1.03315
  %arrayidx717 = getelementptr inbounds double, double* %coeffs, i64 %indvars.iv14
  %scal_load18 = load double, double* %arrayidx717, align 8
  %.splatinsert19 = insertelement <4 x double> undef, double %scal_load18, i32 0
  %.splat20 = shufflevector <4 x double> %.splatinsert19, <4 x double> undef, <4 x i32> zeroinitializer
  %add_SIMD = fadd <4 x double> %sub_SIMD, %.splat20
  ; Count down; the loop continues while the index just processed was > 0.
  %indvars.iv.next21 = add nsw i64 %indvars.iv14, -1
  %cmp222 = icmp sgt i64 %indvars.iv14, 0
  %.splatinsert23 = insertelement <4 x i1> undef, i1 %cmp222, i32 0
  %.splat24 = shufflevector <4 x i1> %.splatinsert23, <4 x i1> undef, <4 x i32> zeroinitializer
  %exitMask.for.body4.true_SIMD = select <4 x i1> %.splat24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> zeroinitializer
  %exitMask.for.body4.false_SIMD = select <4 x i1> %.splat24, <4 x i1> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>
  br i1 %cmp222, label %for.body4.rv, label %for.cond.cleanup3.loopexit.rv
}
; Attribute group referenced as #0 by the function definition in this module.
attributes #0 = { norecurse nounwind uwtable
  "disable-tail-calls"="false"
  "less-precise-fpmad"="false"
  "no-frame-pointer-elim"="false"
  "no-infs-fp-math"="false"
  "no-nans-fp-math"="false"
  "stack-protector-buffer-size"="8"
  "target-cpu"="haswell"
  "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves"
  "unsafe-fp-math"="false"
  "use-soft-float"="false" }

; Producer identification. The string is kept on a single line so that no
; line breaks end up inside the metadata string.
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.1 (http://llvm.org/git/clang.git 07a6361e0f32f699d47c124106e7911b584974d4) (http://llvm.org/git/llvm.git 051e787f26dbfdc26cf61a57bc82ca00dcb812e8)"}