thr3ads.net - llvm dev - [LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops [Jul 2014]

If this information is useful, please help other people find it:
Share via:

Dennis Luehring

2014-Jul-23 03:51 UTC

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

the very simple example

----
const int SIZE = 3;

int the_func(int* p_array)
{
    int dummy = 0;
#if defined(ITER)
    for(int* p = &p_array[0]; p < &p_array[SIZE]; ++p) dummy += *p;
#else
    for(int i = 0; i < SIZE; ++i) dummy += p_array[i];
#endif
    return dummy;
}

int main(int argc, char** argv)
{
    int* array = new int[SIZE];
    for(int i = 0; i < SIZE; ++i){ array[i] = *argv[i]; }
    int dummy = the_func(array);
    delete[] array;
    return dummy;
}
----

compiled with gcc 4.9.1 and clang 3.5

with clang3.5 + #define ITER the_func contains masses of code
the code in main is also sometimes different (not just inlined) to the_func

clang -DITER -O2
clang -DITER -O3

gives:

the_func:
      leaq    12(%rdi), %rcx
      leaq    4(%rdi), %rax
      cmpq    %rax, %rcx
      cmovaq    %rcx, %rax
      movq    %rdi, %rsi
      notq    %rsi
      addq    %rax, %rsi
      shrq    $2, %rsi
      incq    %rsi
      xorl    %edx, %edx
      movabsq    $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
      andq    %rsi, %rax
      pxor    %xmm0, %xmm0
      je    .LBB0_1
# BB#2:                                 # %vector.body.preheader
      leaq    (%rdi,%rax,4), %r8
      addq    $16, %rdi
      movq    %rsi, %rdx
      andq    $-8, %rdx
      pxor    %xmm0, %xmm0
      pxor    %xmm1, %xmm1
      .align    16, 0x90
.LBB0_3:                                # %vector.body
                                          # =>This Inner Loop Header: Depth=1
      movdqa    %xmm1, %xmm2
      movdqa    %xmm0, %xmm3
      movdqu    -16(%rdi), %xmm0
      movdqu    (%rdi), %xmm1
      paddd    %xmm3, %xmm0
      paddd    %xmm2, %xmm1
      addq    $32, %rdi
      addq    $-8, %rdx
      jne    .LBB0_3
# BB#4:
      movq    %r8, %rdi
      movq    %rax, %rdx
      jmp    .LBB0_5
.LBB0_1:
      pxor    %xmm1, %xmm1
.LBB0_5:                                # %middle.block
      paddd    %xmm1, %xmm0
      movdqa    %xmm0, %xmm1
      movhlps    %xmm1, %xmm1            # xmm1 = xmm1[1,1]
      paddd    %xmm0, %xmm1
      pshufd    $1, %xmm1, %xmm0        # xmm0 = xmm1[1,0,0,0]
      paddd    %xmm1, %xmm0
      movd    %xmm0, %eax
      cmpq    %rdx, %rsi
      je    .LBB0_7
      .align    16, 0x90
.LBB0_6:                                # %scalar.ph
                                          # =>This Inner Loop Header: Depth=1
      addl    (%rdi), %eax
      addq    $4, %rdi
      cmpq    %rcx, %rdi
      jb    .LBB0_6
.LBB0_7:                                # %._crit_edge
      retq

isn't that a little bit too long?

other better looking results:

clang -O2
clang -O3
gcc -O3
gcc -DITER -O3

gives:

the_func:
      movl    4(%rdi), %eax
      addl    (%rdi), %eax
      addl    8(%rdi), %eax
      ret(q)

looks good

gcc -DITER -O2

gives:

the_func:
      leaq    12(%rdi), %rdx
      xorl    %eax, %eax
.L2:
      addl    (%rdi), %eax
      addq    $4, %rdi
      cmpq    %rdx, %rdi
      jne    .L2
      rep ret

looks good


gcc4.9.1 seems to be more "stable" in its optimization for the_func
and main

Hal Finkel

2014-Jul-23 04:00 UTC

head link

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

Hi Dennis,

Can you please file a bug for this at http://llvm.org/bugs/ -- we should not be
vectorizing this loop of length 3.

 -Hal

----- Original Message -----
> From: "Dennis Luehring" <dl.soluz at gmx.net>
> To: llvmdev at cs.uiuc.edu
> Sent: Tuesday, July 22, 2014 10:51:54 PM
> Subject: [LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops
> 
> the clang 3.5 loop optimizer seems to jump in unintentional for
> simple loops
> 
> the very simple example
> 
> ----
> const int SIZE = 3;
> 
> int the_func(int* p_array)
> {
>     int dummy = 0;
> #if defined(ITER)
>     for(int* p = &p_array[0]; p < &p_array[SIZE]; ++p) dummy += *p;
> #else
>     for(int i = 0; i < SIZE; ++i) dummy += p_array[i];
> #endif
>     return dummy;
> }
> 
> int main(int argc, char** argv)
> {
>     int* array = new int[SIZE];
>     for(int i = 0; i < SIZE; ++i){ array[i] = *argv[i]; }
>     int dummy = the_func(array);
>     delete[] array;
>     return dummy;
> }
> ----
> 
> compiled with gcc 4.9.1 and clang 3.5
> 
> with clang3.5 + #define ITER the_func contains masses of code
> the code in main is also sometimes different (not just inlined) to
> the_func
> 
> clang -DITER -O2
> clang -DITER -O3
> 
> gives:
> 
> the_func:
>       leaq    12(%rdi), %rcx
>       leaq    4(%rdi), %rax
>       cmpq    %rax, %rcx
>       cmovaq    %rcx, %rax
>       movq    %rdi, %rsi
>       notq    %rsi
>       addq    %rax, %rsi
>       shrq    $2, %rsi
>       incq    %rsi
>       xorl    %edx, %edx
>       movabsq    $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
>       andq    %rsi, %rax
>       pxor    %xmm0, %xmm0
>       je    .LBB0_1
> # BB#2:                                 # %vector.body.preheader
>       leaq    (%rdi,%rax,4), %r8
>       addq    $16, %rdi
>       movq    %rsi, %rdx
>       andq    $-8, %rdx
>       pxor    %xmm0, %xmm0
>       pxor    %xmm1, %xmm1
>       .align    16, 0x90
> .LBB0_3:                                # %vector.body
>                                           # =>This Inner Loop Header: Depth=1
>       movdqa    %xmm1, %xmm2
>       movdqa    %xmm0, %xmm3
>       movdqu    -16(%rdi), %xmm0
>       movdqu    (%rdi), %xmm1
>       paddd    %xmm3, %xmm0
>       paddd    %xmm2, %xmm1
>       addq    $32, %rdi
>       addq    $-8, %rdx
>       jne    .LBB0_3
> # BB#4:
>       movq    %r8, %rdi
>       movq    %rax, %rdx
>       jmp    .LBB0_5
> .LBB0_1:
>       pxor    %xmm1, %xmm1
> .LBB0_5:                                # %middle.block
>       paddd    %xmm1, %xmm0
>       movdqa    %xmm0, %xmm1
>       movhlps    %xmm1, %xmm1            # xmm1 = xmm1[1,1]
>       paddd    %xmm0, %xmm1
>       pshufd    $1, %xmm1, %xmm0        # xmm0 = xmm1[1,0,0,0]
>       paddd    %xmm1, %xmm0
>       movd    %xmm0, %eax
>       cmpq    %rdx, %rsi
>       je    .LBB0_7
>       .align    16, 0x90
> .LBB0_6:                                # %scalar.ph
>                                           # =>This Inner Loop Header: Depth=1
>       addl    (%rdi), %eax
>       addq    $4, %rdi
>       cmpq    %rcx, %rdi
>       jb    .LBB0_6
> .LBB0_7:                                # %._crit_edge
>       retq
> 
> isn't that a little bit too long?
> 
> other better looking results:
> 
> clang -O2
> clang -O3
> gcc -O3
> gcc -DITER -O3
> 
> gives:
> 
> the_func:
>       movl    4(%rdi), %eax
>       addl    (%rdi), %eax
>       addl    8(%rdi), %eax
>       ret(q)
> 
> looks good
> 
> gcc -DITER -O2
> 
> gives:
> 
> the_func:
>       leaq    12(%rdi), %rdx
>       xorl    %eax, %eax
> .L2:
>       addl    (%rdi), %eax
>       addq    $4, %rdi
>       cmpq    %rdx, %rdi
>       jne    .L2
>       rep ret
> 
> looks good
> 
> 
> gcc4.9.1 seems to be more "stable" in its optimization for the_func
> and main
> 
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu         http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
> 
-- 
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory

Chandler Carruth

2014-Jul-23 04:00 UTC

head link

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

On Tue, Jul 22, 2014 at 8:51 PM, Dennis Luehring <dl.soluz at gmx.net>
wrote:
> with clang3.5 + #define ITER the_func contains masses of code
> the code in main is also sometimes different (not just inlined) to the_func
>
It looks like it has vectorized the code which is usually good? I'm
actually surprised it failed to vectorize when using the integer indices.

Did you benchmark the performance?
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20140722/fad33969/attachment.html>

Chandler Carruth

2014-Jul-23 04:03 UTC

head link

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

On Tue, Jul 22, 2014 at 9:00 PM, Hal Finkel <hfinkel at anl.gov> wrote:
> Hi Dennis,
>
> Can you please file a bug for this at http://llvm.org/bugs/ -- we should
> not be vectorizing this loop of length 3.
>
Ahh, we lose track of the constant trip count. Yea, that's pretty terrible.
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://lists.llvm.org/pipermail/llvm-dev/attachments/20140722/9a639c34/attachment.html>

Dennis Luehring

2014-Jul-23 12:41 UTC

head link

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

Am 23.07.2014 06:00, schrieb Hal Finkel:
> Hi Dennis,
>
> Can you please file a bug for this at http://llvm.org/bugs/ -- we should
> not be vectorizing this loop of length 3.
>
>   -Hal
filed http://llvm.org/bugs/show_bug.cgi?id=20409

Apparently Analogous Threads

Search for more apparently analogous threads

llvm dev - Jul 2014 - [LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

[LLVMdev] the clang 3.5 loop optimizer seems to jump in unintentional for simple loops

Apparently Analogous Threads