Hi, LLVM community: I found that basicaa does not seem to report must-not-alias (NoAlias) for __restrict__ arguments in C/C++. It only compares two pointers and the underlying objects they point to. I wonder how clang handles alias analysis for the C/C++ restrict keyword. Let's assume we compile the following code: $cat myalias.cc float foo(float * __restrict__ v0, float * __restrict__ v1, float * __restrict__ v2, float * __restrict__ t) { float res; for (int i=0; i<10; ++i) { float x = v0[1]; float y = v1[1]; float z = v2[1]; res = x * 0.67 + y * 0.17 + z * 0.16; t[i] = res; } return res; } $clang -emit-llvm -c myalias.cc -o myalias.bc Clearly, each argument has the 'noalias' attribute in the LLVM IR. I plan to use basicaa and licm to hoist all loads/stores of x/y/z and res ahead of the loop. $ opt -basicaa -licm -print-alias-sets myalias.bc -o myalias.opt.bc -stats -debug-only=licm LICM hoisting to entry: %1 = load float** %v0.addr, align 8 LICM hoisting to entry: %arrayidx = getelementptr inbounds float* %0, i64 1 LICM hoisting to entry: %3 = load float** %v1.addr, align 8 LICM hoisting to entry: %arrayidx1 = getelementptr inbounds float* %1, i64 1 LICM hoisting to entry: %5 = load float** %v2.addr, align 8 LICM hoisting to entry: %arrayidx2 = getelementptr inbounds float* %2, i64 1 LICM hoisting to entry: %12 = load float** %t.addr, align 8 Alias Set Tracker: 10 alias sets for 13 pointer values. 
AliasSet[0x1b7e800, 1] must alias, Mod/Ref Pointers: (float** %v0.addr, 8) AliasSet[0x1b7d7e0, 1] must alias, Mod/Ref Pointers: (float** %v1.addr, 8) AliasSet[0x1b7d8a0, 1] must alias, Mod/Ref Pointers: (float** %v2.addr, 8) AliasSet[0x1b912e0, 1] must alias, Mod/Ref Pointers: (float** %t.addr, 8) AliasSet[0x1b913a0, 1] must alias, Mod/Ref Pointers: (i32* %i, 4) AliasSet[0x1b91510, 4] may alias, Mod/Ref Pointers: (float* %arrayidx, 4), (float* %arrayidx1, 4), (float* %arrayidx2, 4), (float* %arrayidx9, 4) AliasSet[0x1b91590, 1] must alias, Mod/Ref Pointers: (float* %x, 4) AliasSet[0x1b91690, 1] must alias, Mod/Ref Pointers: (float* %y, 4) AliasSet[0x1b91790, 1] must alias, Mod/Ref Pointers: (float* %z, 4) AliasSet[0x1b91850, 1] must alias, Mod/Ref Pointers: (float* %res, 4) ===-------------------------------------------------------------------------== ... Statistics Collected ... ===-------------------------------------------------------------------------== 7 licm - Number of instructions hoisted out of loop 4 licm - Number of load insts hoisted or sunk 32 memory-builtins - Number of load instructions with unsolved size and offset The result is not good: the loop still contains redundant computation. Clearly, basicaa works out the wrong AliasSet, 'may alias' (arrayidx, arrayidx1, arrayidx2, arrayidx9). -tbaa doesn't help much here. Because v0, v1, v2, and t all have 'noalias' attributes, I think AA should know that %5, %6, and %7 load from distinct locations and that no instruction in the loop body modifies those locations. 
; ModuleID = '<stdin>' target datalayout "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define float @_Z3fooPfS_S_S_(float* noalias %v0, float* noalias %v1, float* noalias %v2, float* noalias %t) nounwind uwtable { entry: %v0.addr = alloca float*, align 8 %v1.addr = alloca float*, align 8 %v2.addr = alloca float*, align 8 %t.addr = alloca float*, align 8 %res = alloca float, align 4 %i = alloca i32, align 4 %x = alloca float, align 4 %y = alloca float, align 4 %z = alloca float, align 4 store float* %v0, float** %v0.addr, align 8 store float* %v1, float** %v1.addr, align 8 store float* %v2, float** %v2.addr, align 8 store float* %t, float** %t.addr, align 8 store i32 0, i32* %i, align 4 %0 = load float** %v0.addr, align 8 %arrayidx = getelementptr inbounds float* %0, i64 1 %1 = load float** %v1.addr, align 8 %arrayidx1 = getelementptr inbounds float* %1, i64 1 %2 = load float** %v2.addr, align 8 %arrayidx2 = getelementptr inbounds float* %2, i64 1 %3 = load float** %t.addr, align 8 br label %for.cond for.cond: ; preds = %for.inc, %entry %4 = load i32* %i, align 4 %cmp = icmp slt i32 %4, 10 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond %5 = load float* %arrayidx, align 4 store float %5, float* %x, align 4 %6 = load float* %arrayidx1, align 4 store float %6, float* %y, align 4 %7 = load float* %arrayidx2, align 4 store float %7, float* %z, align 4 %8 = load float* %x, align 4 %conv = fpext float %8 to double %mul = fmul double %conv, 6.700000e-01 %9 = load float* %y, align 4 %conv3 = fpext float %9 to double %mul4 = fmul double %conv3, 1.700000e-01 %add = fadd double %mul, %mul4 %10 = load float* %z, align 4 %conv5 = fpext float %10 to double %mul6 = fmul double %conv5, 1.600000e-01 %add7 = fadd double %add, %mul6 %conv8 = fptrunc double %add7 to float store float %conv8, float* %res, align 4 %11 = 
load float* %res, align 4 %12 = load i32* %i, align 4 %idxprom = sext i32 %12 to i64 %arrayidx9 = getelementptr inbounds float* %3, i64 %idxprom store float %11, float* %arrayidx9, align 4 br label %for.inc for.inc: ; preds = %for.body %13 = load i32* %i, align 4 %inc = add nsw i32 %13, 1 store i32 %inc, i32* %i, align 4 br label %for.cond for.end: ; preds = %for.cond %14 = load float* %res, align 4 ret float %14 } ps: "opt -O2" works. O2 involves too many optimizations. I don't think the redundant elimination is attributed to AA & LICM. thanks, --lx -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20131111/281c15a6/attachment.html>
Hi, Your problem is that the function arguments, which are marked as noalias, are not being directly used as the base objects of the array accesses: > %v0.addr = alloca float*, align 8 > %v1.addr = alloca float*, align 8 > %v2.addr = alloca float*, align 8 > %t.addr = alloca float*, align 8 ... > store float* %v0, float** %v0.addr, align 8 > store float* %v1, float** %v1.addr, align 8 > store float* %v2, float** %v2.addr, align 8 > store float* %t, float** %t.addr, align 8 If I had to guess, running -sroa (or -mem2reg) will clean this up for you. Just in case you don't know, running: opt -O2 -debug-pass=Arguments -S < /dev/null will give you a command line for opt, equivalent to -O2, with the passes named explicitly. This is helpful for investigating these kinds of questions. -Hal ----- Original Message ----- > > > Hi, LLVM community: > > > I found basicaa seems not to tell must-not-alias for __restrict__ > arguments in c/c++. It only compares two pointers and the underlying > objects they point to. I wonder how clang does alias analysis > for c/c++ keyword restrict. > > > let assume we compile the following code: > $cat myalias.cc > > float foo(float * __restrict__ v0, float * __restrict__ v1, float * > __restrict__ v2, float * __restrict__ t) { > float res; > for (int i=0; i<10; ++i) { > float x = v0[1]; > float y = v1[1]; > float z = v2[1]; > > > res = x * 0.67 + y * 0.17 + z * 0.16; > t[i] = res; > } > > > return res; > } > > > $clang -emit-llvm -c myalias.cc -o myalias.bc > > Clearly each argument has attribute 'noalias' in LLVM IR. I plan to > use basicaa and licm to hoist all load/store of x/y/z and res to > ahead of loop. 
> > > > $ opt -basicaa -licm -print-alias-sets myalias.bc -o myalias.opt.bc > -stats -debug-only=licm > LICM hoisting to entry: %1 = load float** %v0.addr, align 8 > LICM hoisting to entry: %arrayidx = getelementptr inbounds float* %0, > i64 1 > LICM hoisting to entry: %3 = load float** %v1.addr, align 8 > LICM hoisting to entry: %arrayidx1 = getelementptr inbounds float* > %1, i64 1 > LICM hoisting to entry: %5 = load float** %v2.addr, align 8 > LICM hoisting to entry: %arrayidx2 = getelementptr inbounds float* > %2, i64 1 > LICM hoisting to entry: %12 = load float** %t.addr, align 8 > Alias Set Tracker: 10 alias sets for 13 pointer values. > AliasSet[0x1b7e800, 1] must alias, Mod/Ref Pointers: (float** > %v0.addr, 8) > AliasSet[0x1b7d7e0, 1] must alias, Mod/Ref Pointers: (float** > %v1.addr, 8) > AliasSet[0x1b7d8a0, 1] must alias, Mod/Ref Pointers: (float** > %v2.addr, 8) > AliasSet[0x1b912e0, 1] must alias, Mod/Ref Pointers: (float** > %t.addr, 8) > AliasSet[0x1b913a0, 1] must alias, Mod/Ref Pointers: (i32* %i, 4) > AliasSet[0x1b91510, 4] may alias, Mod/Ref Pointers: (float* > %arrayidx, 4), (float* %arrayidx1, 4), (float* %arrayidx2, 4), > (float* %arrayidx9, 4) > AliasSet[0x1b91590, 1] must alias, Mod/Ref Pointers: (float* %x, 4) > AliasSet[0x1b91690, 1] must alias, Mod/Ref Pointers: (float* %y, 4) > AliasSet[0x1b91790, 1] must alias, Mod/Ref Pointers: (float* %z, 4) > AliasSet[0x1b91850, 1] must alias, Mod/Ref Pointers: (float* %res, 4) > > > ===-------------------------------------------------------------------------==> ... Statistics Collected ... > ===-------------------------------------------------------------------------==> > > 7 licm - Number of instructions hoisted out of loop > 4 licm - Number of load insts hoisted or sunk > 32 memory-builtins - Number of load instructions with unsolved size > and offset > > > > > The result is not good. The loop contains redundant computation. 
> Clearly, Basicaa works out wrong AliasSet 'may alias' (arrayidx, > arrayidx1, arrayidx2, arrayidx9). > -tbaa doesn't help much here. Because v0/v1, v2, and t are all have > 'noalias' attributes. I think AA should know that %5,%6,%7 are > loading from distinguishing locations > and no instruction in loop body will modify those places. > > > > ; ModuleID = '<stdin>' > target datalayout > "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" > target triple = "x86_64-unknown-linux-gnu" > > > define float @_Z3fooPfS_S_S_(float* noalias %v0, float* noalias %v1, > float* noalias %v2, float* noalias %t) nounwind uwtable { > entry: > %v0.addr = alloca float*, align 8 > %v1.addr = alloca float*, align 8 > %v2.addr = alloca float*, align 8 > %t.addr = alloca float*, align 8 > %res = alloca float, align 4 > %i = alloca i32, align 4 > %x = alloca float, align 4 > %y = alloca float, align 4 > %z = alloca float, align 4 > store float* %v0, float** %v0.addr, align 8 > store float* %v1, float** %v1.addr, align 8 > store float* %v2, float** %v2.addr, align 8 > store float* %t, float** %t.addr, align 8 > store i32 0, i32* %i, align 4 > %0 = load float** %v0.addr, align 8 > %arrayidx = getelementptr inbounds float* %0, i64 1 > %1 = load float** %v1.addr, align 8 > %arrayidx1 = getelementptr inbounds float* %1, i64 1 > %2 = load float** %v2.addr, align 8 > %arrayidx2 = getelementptr inbounds float* %2, i64 1 > %3 = load float** %t.addr, align 8 > br label %for.cond > > > for.cond: ; preds = %for.inc, %entry > %4 = load i32* %i, align 4 > %cmp = icmp slt i32 %4, 10 > br i1 %cmp, label %for.body, label %for.end > > > for.body: ; preds = %for.cond > %5 = load float* %arrayidx, align 4 > store float %5, float* %x, align 4 > %6 = load float* %arrayidx1, align 4 > store float %6, float* %y, align 4 > %7 = load float* %arrayidx2, align 4 > store float %7, float* %z, align 4 > %8 = load float* %x, 
align 4 > %conv = fpext float %8 to double > %mul = fmul double %conv, 6.700000e-01 > %9 = load float* %y, align 4 > %conv3 = fpext float %9 to double > %mul4 = fmul double %conv3, 1.700000e-01 > %add = fadd double %mul, %mul4 > %10 = load float* %z, align 4 > %conv5 = fpext float %10 to double > %mul6 = fmul double %conv5, 1.600000e-01 > %add7 = fadd double %add, %mul6 > %conv8 = fptrunc double %add7 to float > store float %conv8, float* %res, align 4 > %11 = load float* %res, align 4 > %12 = load i32* %i, align 4 > %idxprom = sext i32 %12 to i64 > %arrayidx9 = getelementptr inbounds float* %3, i64 %idxprom > store float %11, float* %arrayidx9, align 4 > br label %for.inc > > > for.inc: ; preds = %for.body > %13 = load i32* %i, align 4 > %inc = add nsw i32 %13, 1 > store i32 %inc, i32* %i, align 4 > br label %for.cond > > > for.end: ; preds = %for.cond > %14 = load float* %res, align 4 > ret float %14 > } > > > ps: "opt -O2" works. O2 involves too many optimizations. I don't > think the redundant elimination is attributed to AA & LICM. > > > thanks, > --lx > > > _______________________________________________ > LLVM Developers mailing list > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev >-- Hal Finkel Assistant Computational Scientist Leadership Computing Facility Argonne National Laboratory
Hi, Finkel, Thank you for the heads-up. Using your approach, I was able to narrow it down: the following combination of passes produces good code. opt -basicaa -mem2reg -loop-rotate -licm myalias.bc -o myalias.opt.bc -stats -debug-only=licm Further, I think scalar-evolution should be helpful for analyzing the loop induction variable, but it does not help in my case. thanks, --lx On Tue, Nov 12, 2013 at 11:34 AM, Hal Finkel <hfinkel at anl.gov> wrote: > Hi, > > Your problem is that the function arguments, which are makes as noalias, > are not being directly used as the base objects of the array accesses: > > > %v0.addr = alloca float*, align 8 > > %v1.addr = alloca float*, align 8 > > %v2.addr = alloca float*, align 8 > > %t.addr = alloca float*, align 8 > ... > > store float* %v0, float** %v0.addr, align 8 > > store float* %v1, float** %v1.addr, align 8 > > store float* %v2, float** %v2.addr, align 8 > > store float* %t, float** %t.addr, align 8 > > If I had to guess, running -sroa (or -mem2reg) will clean this up for you. > > Just in case you don't know, running: > > opt -O2 -debug-pass=Arguments -S < /dev/null > > will give you a command line for opt, equivalent to -O2, with the passes > named explicitly. This is helpful for investigating these kinds of > questions. > > -Hal > > ----- Original Message ----- > > > > > > Hi, LLVM community: > > > > > > I found basicaa seems not to tell must-not-alias for __restrict__ > > arguments in c/c++. It only compares two pointers and the underlying > > objects they point to. I wonder how clang does alias analysis > > for c/c++ keyword restrict. 
> > > > > > let assume we compile the following code: > > $cat myalias.cc > > > > float foo(float * __restrict__ v0, float * __restrict__ v1, float * > > __restrict__ v2, float * __restrict__ t) { > > float res; > > for (int i=0; i<10; ++i) { > > float x = v0[1]; > > float y = v1[1]; > > float z = v2[1]; > > > > > > res = x * 0.67 + y * 0.17 + z * 0.16; > > t[i] = res; > > } > > > > > > return res; > > } > > > > > > $clang -emit-llvm -c myalias.cc -o myalias.bc > > > > Clearly each argument has attribute 'noalias' in LLVM IR. I plan to > > use basicaa and licm to hoist all load/store of x/y/z and res to > > ahead of loop. > > > > > > > > $ opt -basicaa -licm -print-alias-sets myalias.bc -o myalias.opt.bc > > -stats -debug-only=licm > > LICM hoisting to entry: %1 = load float** %v0.addr, align 8 > > LICM hoisting to entry: %arrayidx = getelementptr inbounds float* %0, > > i64 1 > > LICM hoisting to entry: %3 = load float** %v1.addr, align 8 > > LICM hoisting to entry: %arrayidx1 = getelementptr inbounds float* > > %1, i64 1 > > LICM hoisting to entry: %5 = load float** %v2.addr, align 8 > > LICM hoisting to entry: %arrayidx2 = getelementptr inbounds float* > > %2, i64 1 > > LICM hoisting to entry: %12 = load float** %t.addr, align 8 > > Alias Set Tracker: 10 alias sets for 13 pointer values. 
> > AliasSet[0x1b7e800, 1] must alias, Mod/Ref Pointers: (float** > > %v0.addr, 8) > > AliasSet[0x1b7d7e0, 1] must alias, Mod/Ref Pointers: (float** > > %v1.addr, 8) > > AliasSet[0x1b7d8a0, 1] must alias, Mod/Ref Pointers: (float** > > %v2.addr, 8) > > AliasSet[0x1b912e0, 1] must alias, Mod/Ref Pointers: (float** > > %t.addr, 8) > > AliasSet[0x1b913a0, 1] must alias, Mod/Ref Pointers: (i32* %i, 4) > > AliasSet[0x1b91510, 4] may alias, Mod/Ref Pointers: (float* > > %arrayidx, 4), (float* %arrayidx1, 4), (float* %arrayidx2, 4), > > (float* %arrayidx9, 4) > > AliasSet[0x1b91590, 1] must alias, Mod/Ref Pointers: (float* %x, 4) > > AliasSet[0x1b91690, 1] must alias, Mod/Ref Pointers: (float* %y, 4) > > AliasSet[0x1b91790, 1] must alias, Mod/Ref Pointers: (float* %z, 4) > > AliasSet[0x1b91850, 1] must alias, Mod/Ref Pointers: (float* %res, 4) > > > > > > > ===-------------------------------------------------------------------------==> > ... Statistics Collected ... > > > ===-------------------------------------------------------------------------==> > > > > > 7 licm - Number of instructions hoisted out of loop > > 4 licm - Number of load insts hoisted or sunk > > 32 memory-builtins - Number of load instructions with unsolved size > > and offset > > > > > > > > > > The result is not good. The loop contains redundant computation. > > Clearly, Basicaa works out wrong AliasSet 'may alias' (arrayidx, > > arrayidx1, arrayidx2, arrayidx9). > > -tbaa doesn't help much here. Because v0/v1, v2, and t are all have > > 'noalias' attributes. I think AA should know that %5,%6,%7 are > > loading from distinguishing locations > > and no instruction in loop body will modify those places. 
> > > > > > > > ; ModuleID = '<stdin>' > > target datalayout > > > "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" > > target triple = "x86_64-unknown-linux-gnu" > > > > > > define float @_Z3fooPfS_S_S_(float* noalias %v0, float* noalias %v1, > > float* noalias %v2, float* noalias %t) nounwind uwtable { > > entry: > > %v0.addr = alloca float*, align 8 > > %v1.addr = alloca float*, align 8 > > %v2.addr = alloca float*, align 8 > > %t.addr = alloca float*, align 8 > > %res = alloca float, align 4 > > %i = alloca i32, align 4 > > %x = alloca float, align 4 > > %y = alloca float, align 4 > > %z = alloca float, align 4 > > store float* %v0, float** %v0.addr, align 8 > > store float* %v1, float** %v1.addr, align 8 > > store float* %v2, float** %v2.addr, align 8 > > store float* %t, float** %t.addr, align 8 > > store i32 0, i32* %i, align 4 > > %0 = load float** %v0.addr, align 8 > > %arrayidx = getelementptr inbounds float* %0, i64 1 > > %1 = load float** %v1.addr, align 8 > > %arrayidx1 = getelementptr inbounds float* %1, i64 1 > > %2 = load float** %v2.addr, align 8 > > %arrayidx2 = getelementptr inbounds float* %2, i64 1 > > %3 = load float** %t.addr, align 8 > > br label %for.cond > > > > > > for.cond: ; preds = %for.inc, %entry > > %4 = load i32* %i, align 4 > > %cmp = icmp slt i32 %4, 10 > > br i1 %cmp, label %for.body, label %for.end > > > > > > for.body: ; preds = %for.cond > > %5 = load float* %arrayidx, align 4 > > store float %5, float* %x, align 4 > > %6 = load float* %arrayidx1, align 4 > > store float %6, float* %y, align 4 > > %7 = load float* %arrayidx2, align 4 > > store float %7, float* %z, align 4 > > %8 = load float* %x, align 4 > > %conv = fpext float %8 to double > > %mul = fmul double %conv, 6.700000e-01 > > %9 = load float* %y, align 4 > > %conv3 = fpext float %9 to double > > %mul4 = fmul double %conv3, 1.700000e-01 > > %add = fadd double %mul, 
%mul4 > > %10 = load float* %z, align 4 > > %conv5 = fpext float %10 to double > > %mul6 = fmul double %conv5, 1.600000e-01 > > %add7 = fadd double %add, %mul6 > > %conv8 = fptrunc double %add7 to float > > store float %conv8, float* %res, align 4 > > %11 = load float* %res, align 4 > > %12 = load i32* %i, align 4 > > %idxprom = sext i32 %12 to i64 > > %arrayidx9 = getelementptr inbounds float* %3, i64 %idxprom > > store float %11, float* %arrayidx9, align 4 > > br label %for.inc > > > > > > for.inc: ; preds = %for.body > > %13 = load i32* %i, align 4 > > %inc = add nsw i32 %13, 1 > > store i32 %inc, i32* %i, align 4 > > br label %for.cond > > > > > > for.end: ; preds = %for.cond > > %14 = load float* %res, align 4 > > ret float %14 > > } > > > > > > ps: "opt -O2" works. O2 involves too many optimizations. I don't > > think the redundant elimination is attributed to AA & LICM. > > > > > > thanks, > > --lx > > > > > > _______________________________________________ > > LLVM Developers mailing list > > LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu > > http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev > > > > -- > Hal Finkel > Assistant Computational Scientist > Leadership Computing Facility > Argonne National Laboratory >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20131113/19d41ecd/attachment.html>