thr3ads.net - llvm dev - [llvm-dev] Loop Unroll [May 2020]

If this information is useful, please help other people find it:
Share via:

legend xx via llvm-dev

2020-May-26 17:55 UTC

[llvm-dev] Loop Unroll

Awesome, thanks!

Now I have another question. I have a matrix multiplication code. This is
my code:

#include <stdio.h>
#include <stdlib.h>

#define n 4

int main(int argc, char *argv[]) {
    int i, j, k;

    int A[n][n], B[n][n], C[n][n];
    for(i=0;i<n;i++){
        for(j=0;j<n;j++){
           A[i][j] = 1;
           B[i][j] = 2;
           C[i][j] = 0;
        }
    }

    for(i=0;i<n;i++){
        for(j=0;j<n;j++){
            for(k=0;k<n;k++){
                C[i][j]=(C[i][j]+(A[i][k]*B[k][j]));
            }
        }
    }

    return 0;
}


I tried the loop-unroll-and-jam pass on it. I ran:

$ clang -O0 -Xclang -disable-O0-optnone -emit-llvm mult.c -S -o mult.ll
$ opt -O0 -S -mem2reg -simplifycfg -view-cfg mult.ll -o multopt.ll
$ opt -O0 -S -mem2reg -simplifycfg --loop-unroll-and-jam
--unroll-and-jam-count=4 -simplifycfg -view-cfg mult.ll -o
mult-opt00-unroll4.ll

I get the same CFG in both cases (I attach it). Also, I tried with the -O1
level (opt -O1 -loop-unroll-and-jam --unroll-and-jam-count=4), but I didn't
get any difference.

Why does the --loop-unroll-and-jam pass not work?






El dom., 24 may. 2020 a las 14:36, Florian Hahn (<florian_hahn at
apple.com>)
escribió:
>
>
> On May 23, 2020, at 17:15, legend xx <legendaryxx7slh at gmail.com>
wrote:
>
> This is my example (for.c):
>
> #include <stdio.h>
>
> int add(int a, int b) {
>     return a + b;
> }
>
> int main() {
>    int a, b, c, d;
>    a = 5;
>    b = 15;
>    c = add(a, b);
>    d = 0;
>    for(int i=0;i<16;i++)
>        d = add(c, d);
> }
>
> I run:
> $ clang -O0 -Xclang -disable-O0-optnone -emit-llvm for.c -S -o forO0.ll
> $ opt -O0 -S --loop-unroll --unroll-count=4 -view-cfg forO0.ll -o
> for-opt00-unroll4.ll
>
> And this is the LLVM IR code that I get:
>
> ; ModuleID = 'forO0.ll'
> source_filename = "for.c"
> target datalayout =
> "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @add(i32 %a, i32 %b) #0 {
> entry:
>   %a.addr = alloca i32, align 4
>   %b.addr = alloca i32, align 4
>   store i32 %a, i32* %a.addr, align 4
>   store i32 %b, i32* %b.addr, align 4
>   %0 = load i32, i32* %a.addr, align 4
>   %1 = load i32, i32* %b.addr, align 4
>   %add = add nsw i32 %0, %1
>   ret i32 %add
> }
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @main() #0 {
> entry:
>   %retval = alloca i32, align 4
>   %a = alloca i32, align 4
>   %b = alloca i32, align 4
>   %c = alloca i32, align 4
>   %d = alloca i32, align 4
>   %i = alloca i32, align 4
>   store i32 0, i32* %retval, align 4
>   store i32 5, i32* %a, align 4
>   store i32 15, i32* %b, align 4
>   %0 = load i32, i32* %a, align 4
>   %1 = load i32, i32* %b, align 4
>   %call = call i32 @add(i32 %0, i32 %1)
>   store i32 %call, i32* %c, align 4
>   store i32 0, i32* %d, align 4
>   store i32 0, i32* %i, align 4
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.inc.3,
> %entry
>   %2 = load i32, i32* %i, align 4
>   %cmp = icmp slt i32 %2, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %3 = load i32, i32* %c, align 4
>   %4 = load i32, i32* %d, align 4
>   %call1 = call i32 @add(i32 %3, i32 %4)
>   store i32 %call1, i32* %d, align 4
>   br label %for.inc
>
> for.inc:                                          ; preds = %for.body
>   %5 = load i32, i32* %i, align 4
>   %inc = add nsw i32 %5, 1
>   store i32 %inc, i32* %i, align 4
>   %6 = load i32, i32* %i, align 4
>   %cmp.1 = icmp slt i32 %6, 16
>   br i1 %cmp.1, label %for.body.1, label %for.end
>
> for.end:                                          ; preds = %for.inc.2,
> %for.inc.1, %for.inc, %for.cond
>   %7 = load i32, i32* %d, align 4
>   %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x
> i8], [20 x i8]* @.str, i64 0, i64 0), i32 %7)
>   %8 = load i32, i32* %retval, align 4
>   ret i32 %8
>
> for.body.1:                                       ; preds = %for.inc
>   %9 = load i32, i32* %c, align 4
>   %10 = load i32, i32* %d, align 4
>   %call1.1 = call i32 @add(i32 %9, i32 %10)
>   store i32 %call1.1, i32* %d, align 4
>   br label %for.inc.1
>
> for.inc.1:                                        ; preds = %for.body.1
>   %11 = load i32, i32* %i, align 4
>   %inc.1 = add nsw i32 %11, 1
>   store i32 %inc.1, i32* %i, align 4
>   %12 = load i32, i32* %i, align 4
>   %cmp.2 = icmp slt i32 %12, 16
>   br i1 %cmp.2, label %for.body.2, label %for.end
>
> for.body.2:                                       ; preds = %for.inc.1
>   %13 = load i32, i32* %c, align 4
>   %14 = load i32, i32* %d, align 4
>   %call1.2 = call i32 @add(i32 %13, i32 %14)
>   store i32 %call1.2, i32* %d, align 4
>   br label %for.inc.2
>
> for.inc.2:                                        ; preds = %for.body.2
>   %15 = load i32, i32* %i, align 4
>   %inc.2 = add nsw i32 %15, 1
>   store i32 %inc.2, i32* %i, align 4
>   %16 = load i32, i32* %i, align 4
>   %cmp.3 = icmp slt i32 %16, 16
>   br i1 %cmp.3, label %for.body.3, label %for.end
>
> for.body.3:                                       ; preds = %for.inc.2
>   %17 = load i32, i32* %c, align 4
>   %18 = load i32, i32* %d, align 4
>   %call1.3 = call i32 @add(i32 %17, i32 %18)
>   store i32 %call1.3, i32* %d, align 4
>   br label %for.inc.3
>
> for.inc.3:                                        ; preds = %for.body.3
>   %19 = load i32, i32* %i, align 4
>   %inc.3 = add nsw i32 %19, 1
>   store i32 %inc.3, i32* %i, align 4
>   br label %for.cond, !llvm.loop !2
> }
>
> declare dso_local i32 @printf(i8*, ...) #1
>
> attributes #0 = { noinline nounwind uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false"
> "frame-pointer"="all"
> "less-precise-fpmad"="false"
> "min-legal-vector-width"="0"
> "no-infs-fp-math"="false"
> "no-jump-tables"="false"
> "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false"
> "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64"
> "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false"
> "use-soft-float"="false" }
> attributes #1 = {
> "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false"
> "frame-pointer"="all"
> "less-precise-fpmad"="false"
> "no-infs-fp-math"="false"
> "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false"
> "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64"
> "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false"
> "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0}
> !llvm.ident = !{!1}
>
> !0 = !{i32 1, !"wchar_size", i32 4}
> !1 = !{!"clang version 11.0.0
> (https://github.com/llvm/llvm-project.git
> a3485301d4870f57590d7b69eed7959134a694ab)"}
> !2 = distinct !{!2, !3}
> !3 = !{!"llvm.loop.unroll.disable"}
>
>
> So my problem is:
> With an unroll count of 4 on a loop with 16 iterations, I expected to see a
> single block with the increment i=i+4, four copies of each original
> instruction, and a condition that checks i<16. This is the intuitive
> code. However, the increment that I get is i=i+1 and there are only 4
> blocks.
>
> Do you know why this happens?
>
>
> I think loop-unroll works as expected in your example, as you can see the
> copies of the unrolled loop blocks (for.body.X, for.inc.X). The reason this
> is not simplified to the single block you are expecting is the input for
> -loop-unroll: -loop-unroll gets the IR without any optimizations (-O0).
>
> For the expected result, you need to run a few additional passes before
> -loop-unroll to promote some of the loads/stores to registers and simplify
> the CFG of the input.  Running `opt -mem2reg -simplifycfg -loop-unroll
> -unroll-count=4 forO0.ll -S` should give you something like
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body.3,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body.3 ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body.3 ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   br label %for.body.1
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
>
> for.body.1:                                       ; preds = %for.body
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   br label %for.body.2
>
> for.body.2:                                       ; preds = %for.body.1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   br label %for.body.3
>
> for.body.3:                                       ; preds = %for.body.2
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
> }
>
> Note that there are still 4 copies of the body instead of a single one.
> Like many passes in LLVM, the loop-unroll pass focuses on performing one
> transformation (duplicating the loop body a number of times) and relies on
> other passes to clean-up/simplify the result. To fold the 4 copies of the
> body into a single block, you need another round of CFG simplifications.
> Running `opt -mem2reg -simplifycfg -loop-unroll -unroll-count=4
> -simplifycfg forO0.ll -S` produces the code below, which is what you are
> looking for IIUC.
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
> }
>
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20200526/e9f9be51/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: cfgmain-cc1667.dot
Type: application/msword-template
Size: 5466 bytes
Desc: not available
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20200526/e9f9be51/attachment.bin>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: cfgmain-cce7ad.dot
Type: application/msword-template
Size: 5466 bytes
Desc: not available
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20200526/e9f9be51/attachment-0001.bin>

David Green via llvm-dev

2020-May-27 11:04 UTC

head link

[llvm-dev] Loop Unroll

Hello

You can add -debug to (usually) get some more information as to what is going
on. In this case, depending on your target, it won't actually tell you much
because turning on unrollandjam also requires a target option at the moment. Try
adding -allow-unroll-and-jam to override the target's preference and enable it.

It will then tell you that the inner loop is more complex than it was expecting.
Use loop rotate to turn that inner loop into a single block.

So:
clang -O0 -Xclang -disable-O0-optnone -g0  -emit-llvm -S -o unj.ll unj.c
opt -mem2reg -simplifycfg -loop-rotate -instcombine unj.ll -S -o unj2.ll
opt unj2.ll --loop-unroll-and-jam --unroll-and-jam-count=4 -allow-unroll-and-jam

Unfortunately it will then unroll and jam too much! The first i loop, the matrix
multiply j loop _and_ the matrix multiply i loop all get unrolled and jammed
when I tried it. Although I was using an n of 8, not 4. There is still an issue
in unroll and jam where the loop info is not updated correctly, which may then
cause it to crash. D80619 is a (second) fix for that, if we don't go with
another solution.

With a tripcount of 4 you will often end up just completely unrolling the loops.
Dave


From: llvm-dev <llvm-dev-bounces at lists.llvm.org> on behalf of legend xx
via llvm-dev <llvm-dev at lists.llvm.org>
Sent: 26 May 2020 18:55
To: Florian Hahn <florian_hahn at apple.com>
Cc: llvm-dev <llvm-dev at lists.llvm.org>
Subject: Re: [llvm-dev] Loop Unroll 
 
Awesome, thanks! 

Now I have another question. I have a matrix multiplication code. This is my
code:

#include <stdio.h>
#include <stdlib.h>

#define n 4

int main(int argc, char *argv[]) {
    int i, j, k;
   
    int A[n][n], B[n][n], C[n][n]; 
    for(i=0;i<n;i++){
        for(j=0;j<n;j++){
           A[i][j] = 1;
           B[i][j] = 2;
           C[i][j] = 0;
        }
    }  
  
    for(i=0;i<n;i++){
        for(j=0;j<n;j++){
            for(k=0;k<n;k++){
                C[i][j]=(C[i][j]+(A[i][k]*B[k][j]));
            }
        }
    }
 
    return 0;
}


I tried the loop-unroll-and-jam pass on it. I ran:

$ clang -O0 -Xclang -disable-O0-optnone -emit-llvm mult.c -S -o mult.ll
$ opt -O0 -S -mem2reg -simplifycfg -view-cfg mult.ll -o multopt.ll
$ opt -O0 -S -mem2reg -simplifycfg --loop-unroll-and-jam
--unroll-and-jam-count=4 -simplifycfg -view-cfg mult.ll -o mult-opt00-unroll4.ll

I get the same CFG in both cases (I attach it). Also, I tried with the -O1 level
(opt -O1 -loop-unroll-and-jam --unroll-and-jam-count=4), but I didn't get
any difference.

Why does the --loop-unroll-and-jam pass not work?






El dom., 24 may. 2020 a las 14:36, Florian Hahn (<florian_hahn at
apple.com>) escribió:


On May 23, 2020, at 17:15, legend xx <legendaryxx7slh at gmail.com> wrote:

This is my example (for.c):

#include <stdio.h>

int add(int a, int b) {
    return a + b;
}

int main() {
   int a, b, c, d;
   a = 5;
   b = 15;
   c = add(a, b);
   d = 0;
   for(int i=0;i<16;i++)
       d = add(c, d);
}

I run:
$ clang -O0 -Xclang -disable-O0-optnone -emit-llvm for.c -S -o forO0.ll
$ opt -O0 -S --loop-unroll --unroll-count=4 -view-cfg forO0.ll -o
for-opt00-unroll4.ll

And this is the LLVM IR code that I get: 

; ModuleID = 'forO0.ll'
source_filename = "for.c"
target datalayout =
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline nounwind uwtable
define dso_local i32 @add(i32 %a, i32 %b) #0 {
entry:
  %a.addr = alloca i32, align 4
  %b.addr = alloca i32, align 4
  store i32 %a, i32* %a.addr, align 4
  store i32 %b, i32* %b.addr, align 4
  %0 = load i32, i32* %a.addr, align 4
  %1 = load i32, i32* %b.addr, align 4
  %add = add nsw i32 %0, %1
  ret i32 %add
}

; Function Attrs: noinline nounwind uwtable
define dso_local i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  %a = alloca i32, align 4
  %b = alloca i32, align 4
  %c = alloca i32, align 4
  %d = alloca i32, align 4
  %i = alloca i32, align 4
  store i32 0, i32* %retval, align 4
  store i32 5, i32* %a, align 4
  store i32 15, i32* %b, align 4
  %0 = load i32, i32* %a, align 4
  %1 = load i32, i32* %b, align 4
  %call = call i32 @add(i32 %0, i32 %1)
  store i32 %call, i32* %c, align 4
  store i32 0, i32* %d, align 4
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc.3, %entry
  %2 = load i32, i32* %i, align 4
  %cmp = icmp slt i32 %2, 16
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %3 = load i32, i32* %c, align 4
  %4 = load i32, i32* %d, align 4
  %call1 = call i32 @add(i32 %3, i32 %4)
  store i32 %call1, i32* %d, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %5 = load i32, i32* %i, align 4
  %inc = add nsw i32 %5, 1
  store i32 %inc, i32* %i, align 4
  %6 = load i32, i32* %i, align 4
  %cmp.1 = icmp slt i32 %6, 16
  br i1 %cmp.1, label %for.body.1, label %for.end

for.end:                                          ; preds = %for.inc.2,
%for.inc.1, %for.inc, %for.cond
  %7 = load i32, i32* %d, align 4
  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x i8],
[20 x i8]* @.str, i64 0, i64 0), i32 %7)
  %8 = load i32, i32* %retval, align 4
  ret i32 %8

for.body.1:                                       ; preds = %for.inc
  %9 = load i32, i32* %c, align 4
  %10 = load i32, i32* %d, align 4
  %call1.1 = call i32 @add(i32 %9, i32 %10)
  store i32 %call1.1, i32* %d, align 4
  br label %for.inc.1

for.inc.1:                                        ; preds = %for.body.1
  %11 = load i32, i32* %i, align 4
  %inc.1 = add nsw i32 %11, 1
  store i32 %inc.1, i32* %i, align 4
  %12 = load i32, i32* %i, align 4
  %cmp.2 = icmp slt i32 %12, 16
  br i1 %cmp.2, label %for.body.2, label %for.end

for.body.2:                                       ; preds = %for.inc.1
  %13 = load i32, i32* %c, align 4
  %14 = load i32, i32* %d, align 4
  %call1.2 = call i32 @add(i32 %13, i32 %14)
  store i32 %call1.2, i32* %d, align 4
  br label %for.inc.2

for.inc.2:                                        ; preds = %for.body.2
  %15 = load i32, i32* %i, align 4
  %inc.2 = add nsw i32 %15, 1
  store i32 %inc.2, i32* %i, align 4
  %16 = load i32, i32* %i, align 4
  %cmp.3 = icmp slt i32 %16, 16
  br i1 %cmp.3, label %for.body.3, label %for.end

for.body.3:                                       ; preds = %for.inc.2
  %17 = load i32, i32* %c, align 4
  %18 = load i32, i32* %d, align 4
  %call1.3 = call i32 @add(i32 %17, i32 %18)
  store i32 %call1.3, i32* %d, align 4
  br label %for.inc.3

for.inc.3:                                        ; preds = %for.body.3
  %19 = load i32, i32* %i, align 4
  %inc.3 = add nsw i32 %19, 1
  store i32 %inc.3, i32* %i, align 4
  br label %for.cond, !llvm.loop !2
}

declare dso_local i32 @printf(i8*, ...) #1

attributes #0 = { noinline nounwind uwtable
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false"
"frame-pointer"="all"
"less-precise-fpmad"="false"
"min-legal-vector-width"="0"
"no-infs-fp-math"="false"
"no-jump-tables"="false"
"no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false"
"no-trapping-math"="false"
"stack-protector-buffer-size"="8"
"target-cpu"="x86-64"
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false"
"use-soft-float"="false" }
attributes #1 = {
"correctly-rounded-divide-sqrt-fp-math"="false"
"disable-tail-calls"="false"
"frame-pointer"="all"
"less-precise-fpmad"="false"
"no-infs-fp-math"="false"
"no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false"
"no-trapping-math"="false"
"stack-protector-buffer-size"="8"
"target-cpu"="x86-64"
"target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
"unsafe-fp-math"="false"
"use-soft-float"="false" }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git
a3485301d4870f57590d7b69eed7959134a694ab)"}
!2 = distinct !{!2, !3}
!3 = !{!"llvm.loop.unroll.disable"}


So my problem is:
With an unroll count of 4 on a loop with 16 iterations, I expected to see a
single block with the increment i=i+4, four copies of each original instruction,
and a condition that checks i<16. This is the intuitive code. However, the
increment that I get is i=i+1 and there are only 4 blocks.


Do you know why this happens?

I think loop-unroll works as expected in your example, as you can see the copies
of the unrolled loop blocks (for.body.X, for.inc.X). The reason this is not
simplified to the single block you are expecting is the input for -loop-unroll:
-loop-unroll gets the IR without any optimizations (-O0). 

For the expected result, you need to run a few additional passes before
-loop-unroll to promote some of the loads/stores to registers and simplify the
CFG of the input.  Running `opt -mem2reg -simplifycfg -loop-unroll
-unroll-count=4 forO0.ll -S` should give you something like

define i32 @main() #0 {
entry:
  %call = call i32 @add(i32 5, i32 15)
  br label %for.cond

for.cond:                                         ; preds = %for.body.3, %entry
  %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body.3 ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body.3 ]
  %cmp = icmp ult i32 %i.0, 16
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %call1 = call i32 @add(i32 %call, i32 %d.0)
  %inc = add nuw nsw i32 %i.0, 1
  br label %for.body.1

for.end:                                          ; preds = %for.cond
  ret i32 0

for.body.1:                                       ; preds = %for.body
  %call1.1 = call i32 @add(i32 %call, i32 %call1)
  %inc.1 = add nuw nsw i32 %inc, 1
  br label %for.body.2

for.body.2:                                       ; preds = %for.body.1
  %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
  %inc.2 = add nuw nsw i32 %inc.1, 1
  br label %for.body.3

for.body.3:                                       ; preds = %for.body.2
  %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
  %inc.3 = add nuw nsw i32 %inc.2, 1
  br label %for.cond, !llvm.loop !4
}

Note that there are still 4 copies of the body instead of a single one. Like
many passes in LLVM, the loop-unroll pass focuses on performing one
transformation (duplicating the loop body a number of times) and relies on other
passes to clean-up/simplify the result. To fold the 4 copies of the body into a
single block, you need another round of CFG simplifications. Running `opt
-mem2reg -simplifycfg -loop-unroll -unroll-count=4 -simplifycfg forO0.ll -S`
produces the code below, which is what you are looking for IIUC.

define i32 @main() #0 {
entry:
  %call = call i32 @add(i32 5, i32 15)
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body ]
  %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
  %cmp = icmp ult i32 %i.0, 16
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %call1 = call i32 @add(i32 %call, i32 %d.0)
  %inc = add nuw nsw i32 %i.0, 1
  %call1.1 = call i32 @add(i32 %call, i32 %call1)
  %inc.1 = add nuw nsw i32 %inc, 1
  %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
  %inc.2 = add nuw nsw i32 %inc.1, 1
  %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
  %inc.3 = add nuw nsw i32 %inc.2, 1
  br label %for.cond, !llvm.loop !4

for.end:                                          ; preds = %for.cond
  ret i32 0
}

legend xx via llvm-dev

2020-May-27 11:42 UTC

head link

[llvm-dev] Loop Unroll

Thanks for the tip!

What is the D80619 you mention? Where can I find it?

El mié., 27 may. 2020 a las 13:04, David Green (<David.Green at arm.com>)
escribió:
> Hello
>
> You can add -debug to (usually) get some more information as to what is
> going on. In this case, depending on your target, it won't actually tell
> you much because turning on unrollandjam also requires a target option at
> the moment. Try adding -allow-unroll-and-jam to override the target's
> preference and enable it.
>
> It will then tell you that the inner loop is more complex than it was
> expecting. Use loop rotate to turn that inner loop into a single block.
>
> So:
> clang -O0 -Xclang -disable-O0-optnone -g0  -emit-llvm -S -o unj.ll unj.c
> opt -mem2reg -simplifycfg -loop-rotate -instcombine unj.ll -S -o unj2.ll
> opt unj2.ll --loop-unroll-and-jam --unroll-and-jam-count=4
> -allow-unroll-and-jam
>
> Unfortunately it will then unroll and jam too much! The first i loop, the
> matrix multiply j loop _and_ the matrix multiply i loop all get unrolled
> and jammed when I tried it. Although I was using an n of 8, not 4. There is
> still an issue in unroll and jam where the loop info is not updated
> correctly, which may then cause it to crash. D80619 is a (second) fix for
> that, if we don't go with another solution.
>
> With a tripcount of 4 you will often end up just completely unrolling the
> loops.
> Dave
>
>
> From: llvm-dev <llvm-dev-bounces at lists.llvm.org> on behalf of legend xx
> via llvm-dev <llvm-dev at lists.llvm.org>
> Sent: 26 May 2020 18:55
> To: Florian Hahn <florian_hahn at apple.com>
> Cc: llvm-dev <llvm-dev at lists.llvm.org>
> Subject: Re: [llvm-dev] Loop Unroll
>
> Awesome, thanks!
>
> Now I have another question. I have a matrix multiplication code. This is
> my code:
>
> #include <stdio.h>
> #include <stdlib.h>
>
> #define n 4
>
> int main(int argc, char *argv[]) {
>     int i, j, k;
>
>     int A[n][n], B[n][n], C[n][n];
>     for(i=0;i<n;i++){
>         for(j=0;j<n;j++){
>            A[i][j] = 1;
>            B[i][j] = 2;
>            C[i][j] = 0;
>         }
>     }
>
>     for(i=0;i<n;i++){
>         for(j=0;j<n;j++){
>             for(k=0;k<n;k++){
>                 C[i][j]=(C[i][j]+(A[i][k]*B[k][j]));
>             }
>         }
>     }
>
>     return 0;
> }
>
>
> I tried the loop-unroll-and-jam pass on it. I ran:
>
> $ clang -O0 -Xclang -disable-O0-optnone -emit-llvm mult.c -S -o mult.ll
> $ opt -O0 -S -mem2reg -simplifycfg -view-cfg mult.ll -o multopt.ll
> $ opt -O0 -S -mem2reg -simplifycfg --loop-unroll-and-jam
> --unroll-and-jam-count=4 -simplifycfg -view-cfg mult.ll -o
> mult-opt00-unroll4.ll
>
> I get the same CFG in both cases (I attach it). Also, I tried with the -O1
> level (opt -O1 -loop-unroll-and-jam --unroll-and-jam-count=4), but I didn't
> get any difference.
>
> Why does the --loop-unroll-and-jam pass not work?
>
>
>
>
>
>
> El dom., 24 may. 2020 a las 14:36, Florian Hahn (<florian_hahn at
apple.com>)
> escribió:
>
>
> On May 23, 2020, at 17:15, legend xx <legendaryxx7slh at gmail.com>
wrote:
>
> This is my example (for.c):
>
> #include <stdio.h>
>
> int add(int a, int b) {
>     return a + b;
> }
>
> int main() {
>    int a, b, c, d;
>    a = 5;
>    b = 15;
>    c = add(a, b);
>    d = 0;
>    for(int i=0;i<16;i++)
>        d = add(c, d);
> }
>
> I run:
> $ clang -O0 -Xclang -disable-O0-optnone -emit-llvm for.c -S -o forO0.ll
> $ opt -O0 -S --loop-unroll --unroll-count=4 -view-cfg forO0.ll -o
> for-opt00-unroll4.ll
>
> And this is the LLVM IR code that I get:
>
> ; ModuleID = 'forO0.ll'
> source_filename = "for.c"
> target datalayout =
> "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @add(i32 %a, i32 %b) #0 {
> entry:
>   %a.addr = alloca i32, align 4
>   %b.addr = alloca i32, align 4
>   store i32 %a, i32* %a.addr, align 4
>   store i32 %b, i32* %b.addr, align 4
>   %0 = load i32, i32* %a.addr, align 4
>   %1 = load i32, i32* %b.addr, align 4
>   %add = add nsw i32 %0, %1
>   ret i32 %add
> }
>
> ; Function Attrs: noinline nounwind uwtable
> define dso_local i32 @main() #0 {
> entry:
>   %retval = alloca i32, align 4
>   %a = alloca i32, align 4
>   %b = alloca i32, align 4
>   %c = alloca i32, align 4
>   %d = alloca i32, align 4
>   %i = alloca i32, align 4
>   store i32 0, i32* %retval, align 4
>   store i32 5, i32* %a, align 4
>   store i32 15, i32* %b, align 4
>   %0 = load i32, i32* %a, align 4
>   %1 = load i32, i32* %b, align 4
>   %call = call i32 @add(i32 %0, i32 %1)
>   store i32 %call, i32* %c, align 4
>   store i32 0, i32* %d, align 4
>   store i32 0, i32* %i, align 4
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.inc.3,
> %entry
>   %2 = load i32, i32* %i, align 4
>   %cmp = icmp slt i32 %2, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %3 = load i32, i32* %c, align 4
>   %4 = load i32, i32* %d, align 4
>   %call1 = call i32 @add(i32 %3, i32 %4)
>   store i32 %call1, i32* %d, align 4
>   br label %for.inc
>
> for.inc:                                          ; preds = %for.body
>   %5 = load i32, i32* %i, align 4
>   %inc = add nsw i32 %5, 1
>   store i32 %inc, i32* %i, align 4
>   %6 = load i32, i32* %i, align 4
>   %cmp.1 = icmp slt i32 %6, 16
>   br i1 %cmp.1, label %for.body.1, label %for.end
>
> for.end:                                          ; preds = %for.inc.2,
> %for.inc.1, %for.inc, %for.cond
>   %7 = load i32, i32* %d, align 4
>   %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([20 x
> i8], [20 x i8]* @.str, i64 0, i64 0), i32 %7)
>   %8 = load i32, i32* %retval, align 4
>   ret i32 %8
>
> for.body.1:                                       ; preds = %for.inc
>   %9 = load i32, i32* %c, align 4
>   %10 = load i32, i32* %d, align 4
>   %call1.1 = call i32 @add(i32 %9, i32 %10)
>   store i32 %call1.1, i32* %d, align 4
>   br label %for.inc.1
>
> for.inc.1:                                        ; preds = %for.body.1
>   %11 = load i32, i32* %i, align 4
>   %inc.1 = add nsw i32 %11, 1
>   store i32 %inc.1, i32* %i, align 4
>   %12 = load i32, i32* %i, align 4
>   %cmp.2 = icmp slt i32 %12, 16
>   br i1 %cmp.2, label %for.body.2, label %for.end
>
> for.body.2:                                       ; preds = %for.inc.1
>   %13 = load i32, i32* %c, align 4
>   %14 = load i32, i32* %d, align 4
>   %call1.2 = call i32 @add(i32 %13, i32 %14)
>   store i32 %call1.2, i32* %d, align 4
>   br label %for.inc.2
>
> for.inc.2:                                        ; preds = %for.body.2
>   %15 = load i32, i32* %i, align 4
>   %inc.2 = add nsw i32 %15, 1
>   store i32 %inc.2, i32* %i, align 4
>   %16 = load i32, i32* %i, align 4
>   %cmp.3 = icmp slt i32 %16, 16
>   br i1 %cmp.3, label %for.body.3, label %for.end
>
> for.body.3:                                       ; preds = %for.inc.2
>   %17 = load i32, i32* %c, align 4
>   %18 = load i32, i32* %d, align 4
>   %call1.3 = call i32 @add(i32 %17, i32 %18)
>   store i32 %call1.3, i32* %d, align 4
>   br label %for.inc.3
>
> for.inc.3:                                        ; preds = %for.body.3
>   %19 = load i32, i32* %i, align 4
>   %inc.3 = add nsw i32 %19, 1
>   store i32 %inc.3, i32* %i, align 4
>   br label %for.cond, !llvm.loop !2
> }
>
> declare dso_local i32 @printf(i8*, ...) #1
>
> attributes #0 = { noinline nounwind uwtable
> "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false"
> "frame-pointer"="all"
> "less-precise-fpmad"="false"
> "min-legal-vector-width"="0"
> "no-infs-fp-math"="false"
> "no-jump-tables"="false"
> "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false"
> "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64"
> "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false"
> "use-soft-float"="false" }
> attributes #1 = {
> "correctly-rounded-divide-sqrt-fp-math"="false"
> "disable-tail-calls"="false"
> "frame-pointer"="all"
> "less-precise-fpmad"="false"
> "no-infs-fp-math"="false"
> "no-nans-fp-math"="false"
> "no-signed-zeros-fp-math"="false"
> "no-trapping-math"="false"
> "stack-protector-buffer-size"="8"
> "target-cpu"="x86-64"
> "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87"
> "unsafe-fp-math"="false"
> "use-soft-float"="false" }
>
> !llvm.module.flags = !{!0}
> !llvm.ident = !{!1}
>
> !0 = !{i32 1, !"wchar_size", i32 4}
> !1 = !{!"clang version 11.0.0
> (https://github.com/llvm/llvm-project.git
> a3485301d4870f57590d7b69eed7959134a694ab)"}
> !2 = distinct !{!2, !3}
> !3 = !{!"llvm.loop.unroll.disable"}
>
>
> So my problem is:
> With an unroll count of 4 on a loop with 16 iterations, I should see one
> single block with the increment i=i+4, four instructions for each original
> instruction, and a condition that checks whether i<16. This is the
> intuitive code. However, the increment that I get is i=i+1 and there are
> only 4 blocks.
>
>
> Do you know why this happens?
>
> I think loop-unroll works as expected in your example, as you can see the
> copies of the unrolled loop blocks (for.body.X, for.inc.X). The reason this
> is not simplified to the single block you are expecting is the input for
> -loop-unroll: -loop-unroll gets the IR without any optimizations (-O0).
>
> For the expected result, you need to run a few additional passes before
> -loop-unroll to promote some of the loads/stores to registers and simplify
> the CFG of the input.  Running `opt -mem2reg -simplifycfg -loop-unroll
> -unroll-count=4 forO0.ll -S` should give you something like
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body.3,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body.3 ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body.3 ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   br label %for.body.1
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
>
> for.body.1:                                       ; preds = %for.body
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   br label %for.body.2
>
> for.body.2:                                       ; preds = %for.body.1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   br label %for.body.3
>
> for.body.3:                                       ; preds = %for.body.2
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
> }
>
> Note that there are still 4 copies of the body instead of a single one.
> Like many passes in LLVM, the loop-unroll pass focuses on performing one
> transformation (duplicating the loop body a number of times) and relies on
> other passes to clean-up/simplify the result. To fold the 4 copies of the
> body into a single block, you need another round of CFG simplifications.
> Running `opt -mem2reg -simplifycfg -loop-unroll -unroll-count=4
> -simplifycfg forO0.ll -S` produces the code below, which is what you are
> looking for IIUC.
>
> define i32 @main() #0 {
> entry:
>   %call = call i32 @add(i32 5, i32 15)
>   br label %for.cond
>
> for.cond:                                         ; preds = %for.body,
> %entry
>   %d.0 = phi i32 [ 0, %entry ], [ %call1.3, %for.body ]
>   %i.0 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
>   %cmp = icmp ult i32 %i.0, 16
>   br i1 %cmp, label %for.body, label %for.end
>
> for.body:                                         ; preds = %for.cond
>   %call1 = call i32 @add(i32 %call, i32 %d.0)
>   %inc = add nuw nsw i32 %i.0, 1
>   %call1.1 = call i32 @add(i32 %call, i32 %call1)
>   %inc.1 = add nuw nsw i32 %inc, 1
>   %call1.2 = call i32 @add(i32 %call, i32 %call1.1)
>   %inc.2 = add nuw nsw i32 %inc.1, 1
>   %call1.3 = call i32 @add(i32 %call, i32 %call1.2)
>   %inc.3 = add nuw nsw i32 %inc.2, 1
>   br label %for.cond, !llvm.loop !4
>
> for.end:                                          ; preds = %for.cond
>   ret i32 0
> }
>-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20200527/5d48207e/attachment.html>

Florian Hahn via llvm-dev

2020-May-27 11:55 UTC

head link

[llvm-dev] Loop Unroll

> On May 26, 2020, at 18:55, legend xx <legendaryxx7slh at gmail.com>
wrote:
> 
> Awesome, thanks! 
> 
> Now I have another question. I have a matrix multiplication code. This is
> my code:
> 
On a related note, if you are looking to generate high-performance code for
matrix operations on matrices with known dimensions, you might be interested in
Clang’s matrix types extension
(https://clang.llvm.org/docs/MatrixTypes.html#matrixtypes). This should
be less fragile, because it does not rely on a number of passes lining up to
get good performance.

Please note that the implementation in Clang is still incomplete, but the
outstanding patches to implement it (almost) completely can be found starting at
https://reviews.llvm.org/D76791 and in the patches linked from it (a ‘stack’
in Phabricator terms).

Cheers,
Florian
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20200527/d6e6fde6/attachment.html>

llvm dev - May 2020 - Loop Unroll

[llvm-dev] Loop Unroll

[llvm-dev] Loop Unroll

[llvm-dev] Loop Unroll

[llvm-dev] Loop Unroll