Hi,
Please consider the following loop:
using v4f32 = float __attribute__((__vector_size__(16)));
void fct6(v4f32 *x)
{
#pragma clang loop vectorize(enable)
  for (int i = 0; i < 256; ++i)
    x[i] = 7 * x[i];
}
After compiling it with:
clang++ -O3 -march=native -mtune=native \
-Rpass=loop-vectorize,slp-vectorize \
-Rpass-missed=loop-vectorize,slp-vectorize \
-Rpass-analysis=loop-vectorize,slp-vectorize \
-ffast-math -ffp-model=fast -ffp-exception-behavior=ignore \
-ffp-contract=fast -mrecip=all:0 \
-c -o vec.o vec.cc
I get the following codegen:
0000000000000160 <_Z4fct6PDv4_f>:
 160: 31 c0                xor    %eax,%eax
 162: c4 e2 79 18 05 00 00 vbroadcastss 0x0(%rip),%xmm0        # 16b
<_Z4fct6PDv4_f+0xb>
 169: 00 00
 16b: 0f 1f 44 00 00        nopl   0x0(%rax,%rax,1)
 170: c5 f8 59 0c 07        vmulps (%rdi,%rax,1),%xmm0,%xmm1
 175: c5 f8 29 0c 07        vmovaps %xmm1,(%rdi,%rax,1)
 17a: c5 f8 59 4c 07 10    vmulps 0x10(%rdi,%rax,1),%xmm0,%xmm1
 180: c5 f8 29 4c 07 10    vmovaps %xmm1,0x10(%rdi,%rax,1)
 186: c5 f8 59 4c 07 20    vmulps 0x20(%rdi,%rax,1),%xmm0,%xmm1
 18c: c5 f8 29 4c 07 20    vmovaps %xmm1,0x20(%rdi,%rax,1)
 192: c5 f8 59 4c 07 30    vmulps 0x30(%rdi,%rax,1),%xmm0,%xmm1
 198: c5 f8 29 4c 07 30    vmovaps %xmm1,0x30(%rdi,%rax,1)
 19e: c5 f8 59 4c 07 40    vmulps 0x40(%rdi,%rax,1),%xmm0,%xmm1
 1a4: c5 f8 29 4c 07 40    vmovaps %xmm1,0x40(%rdi,%rax,1)
 1aa: c5 f8 59 4c 07 50    vmulps 0x50(%rdi,%rax,1),%xmm0,%xmm1
 1b0: c5 f8 29 4c 07 50    vmovaps %xmm1,0x50(%rdi,%rax,1)
 1b6: c5 f8 59 4c 07 60    vmulps 0x60(%rdi,%rax,1),%xmm0,%xmm1
 1bc: c5 f8 29 4c 07 60    vmovaps %xmm1,0x60(%rdi,%rax,1)
 1c2: c5 f8 59 4c 07 70    vmulps 0x70(%rdi,%rax,1),%xmm0,%xmm1
 1c8: c5 f8 29 4c 07 70    vmovaps %xmm1,0x70(%rdi,%rax,1)
 1ce: 48 83 e8 80          sub    $0xffffffffffffff80,%rax
 1d2: 48 3d 00 10 00 00    cmp    $0x1000,%rax
 1d8: 75 96                jne    170 <_Z4fct6PDv4_f+0x10>
 1da: c3                    retq
My CPU is an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz, so AVX2 is available.
Should the compiler therefore understand the loop and upgrade the vector width?
On the other hand, if I write the following loop:
void fct7(float *x)
{
#pragma clang loop vectorize(enable)
  for (int i = 0; i < 4 * 256; ++i)
    x[i] = 7 * x[i];
}
It compiles it to:
00000000000001e0 <_Z4fct7Pf>:
 1e0: 31 c0                xor    %eax,%eax
 1e2: c4 e2 7d 18 05 00 00 vbroadcastss 0x0(%rip),%ymm0        # 1eb
<_Z4fct7Pf+0xb>
 1e9: 00 00
 1eb: 0f 1f 44 00 00        nopl   0x0(%rax,%rax,1)
 1f0: c5 fc 59 0c 87        vmulps (%rdi,%rax,4),%ymm0,%ymm1
 1f5: c5 fc 59 54 87 20    vmulps 0x20(%rdi,%rax,4),%ymm0,%ymm2
 1fb: c5 fc 59 5c 87 40    vmulps 0x40(%rdi,%rax,4),%ymm0,%ymm3
 201: c5 fc 59 64 87 60    vmulps 0x60(%rdi,%rax,4),%ymm0,%ymm4
 207: c5 fc 11 0c 87        vmovups %ymm1,(%rdi,%rax,4)
 20c: c5 fc 11 54 87 20    vmovups %ymm2,0x20(%rdi,%rax,4)
 212: c5 fc 11 5c 87 40    vmovups %ymm3,0x40(%rdi,%rax,4)
 218: c5 fc 11 64 87 60    vmovups %ymm4,0x60(%rdi,%rax,4)
 21e: c5 fc 59 8c 87 80 00 vmulps 0x80(%rdi,%rax,4),%ymm0,%ymm1
 225: 00 00
 227: c5 fc 59 94 87 a0 00 vmulps 0xa0(%rdi,%rax,4),%ymm0,%ymm2
 22e: 00 00
 230: c5 fc 59 9c 87 c0 00 vmulps 0xc0(%rdi,%rax,4),%ymm0,%ymm3
 237: 00 00
 239: c5 fc 59 a4 87 e0 00 vmulps 0xe0(%rdi,%rax,4),%ymm0,%ymm4
 240: 00 00
 242: c5 fc 11 8c 87 80 00 vmovups %ymm1,0x80(%rdi,%rax,4)
 249: 00 00
 24b: c5 fc 11 94 87 a0 00 vmovups %ymm2,0xa0(%rdi,%rax,4)
 252: 00 00
 254: c5 fc 11 9c 87 c0 00 vmovups %ymm3,0xc0(%rdi,%rax,4)
 25b: 00 00
 25d: c5 fc 11 a4 87 e0 00 vmovups %ymm4,0xe0(%rdi,%rax,4)
 264: 00 00
 266: 48 83 c0 40          add    $0x40,%rax
 26a: 48 3d 00 04 00 00    cmp    $0x400,%rax
 270: 0f 85 7a ff ff ff    jne    1f0 <_Z4fct7Pf+0x10>
 276: c5 f8 77              vzeroupper
 279: c3                    retq
This version uses wider vectors.
What do you think? Why not transform fct6's loop to use wider registers?
Regards,
-- 
Alexandre Bique
> On Sep 1, 2020, at 12:07, Alexandre Bique via llvm-dev <llvm-dev at lists.llvm.org> wrote: > > Hi, > > Please consider the following loop: > > using v4f32 = float __attribute__((__vector_size__(16))); > > void fct6(v4f32 *x) > { > #pragma clang loop vectorize(enable) > for (int i = 0; i < 256; ++i) > x[i] = 7 * x[i]; > } > > After compiling it with: > > clang++ -O3 -march=native -mtune=native \ > -Rpass=loop-vectorize,slp-vectorize > -Rpass-missed=loop-vectorize,slp-vectorize > -Rpass-analysis=loop-vectorize,slp-vectorize \ > -ffast-math -ffp-model=fast -ffp-exception-behavior=ignore > -ffp-contract=fast -mrecip=all:0 \ > -c -o vec.o vec.cc > > I get the following codegen: > > 0000000000000160 <_Z4fct6PDv4_f>: > 160: 31 c0 xor %eax,%eax > 162: c4 e2 79 18 05 00 00 vbroadcastss 0x0(%rip),%xmm0 # 16b > <_Z4fct6PDv4_f+0xb> > 169: 00 00 > 16b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) > 170: c5 f8 59 0c 07 vmulps (%rdi,%rax,1),%xmm0,%xmm1 > 175: c5 f8 29 0c 07 vmovaps %xmm1,(%rdi,%rax,1) > 17a: c5 f8 59 4c 07 10 vmulps 0x10(%rdi,%rax,1),%xmm0,%xmm1 > 180: c5 f8 29 4c 07 10 vmovaps %xmm1,0x10(%rdi,%rax,1) > 186: c5 f8 59 4c 07 20 vmulps 0x20(%rdi,%rax,1),%xmm0,%xmm1 > 18c: c5 f8 29 4c 07 20 vmovaps %xmm1,0x20(%rdi,%rax,1) > 192: c5 f8 59 4c 07 30 vmulps 0x30(%rdi,%rax,1),%xmm0,%xmm1 > 198: c5 f8 29 4c 07 30 vmovaps %xmm1,0x30(%rdi,%rax,1) > 19e: c5 f8 59 4c 07 40 vmulps 0x40(%rdi,%rax,1),%xmm0,%xmm1 > 1a4: c5 f8 29 4c 07 40 vmovaps %xmm1,0x40(%rdi,%rax,1) > 1aa: c5 f8 59 4c 07 50 vmulps 0x50(%rdi,%rax,1),%xmm0,%xmm1 > 1b0: c5 f8 29 4c 07 50 vmovaps %xmm1,0x50(%rdi,%rax,1) > 1b6: c5 f8 59 4c 07 60 vmulps 0x60(%rdi,%rax,1),%xmm0,%xmm1 > 1bc: c5 f8 29 4c 07 60 vmovaps %xmm1,0x60(%rdi,%rax,1) > 1c2: c5 f8 59 4c 07 70 vmulps 0x70(%rdi,%rax,1),%xmm0,%xmm1 > 1c8: c5 f8 29 4c 07 70 vmovaps %xmm1,0x70(%rdi,%rax,1) > 1ce: 48 83 e8 80 sub $0xffffffffffffff80,%rax > 1d2: 48 3d 00 10 00 00 cmp $0x1000,%rax > 1d8: 75 96 jne 170 <_Z4fct6PDv4_f+0x10> > 1da: c3 retq > > My CPU being 
Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz, I have AVX2. So > should the compiler understand the loop and upgrade the vector width? > > On the other hand if I do the following loop: > > void fct7(float *x) > { > #pragma clang loop vectorize(enable) > for (int i = 0; i < 4 * 256; ++i) > x[i] = 7 * x[i]; > } > > It compiles it to: > > 00000000000001e0 <_Z4fct7Pf>: > 1e0: 31 c0 xor %eax,%eax > 1e2: c4 e2 7d 18 05 00 00 vbroadcastss 0x0(%rip),%ymm0 # 1eb > <_Z4fct7Pf+0xb> > 1e9: 00 00 > 1eb: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) > 1f0: c5 fc 59 0c 87 vmulps (%rdi,%rax,4),%ymm0,%ymm1 > 1f5: c5 fc 59 54 87 20 vmulps 0x20(%rdi,%rax,4),%ymm0,%ymm2 > 1fb: c5 fc 59 5c 87 40 vmulps 0x40(%rdi,%rax,4),%ymm0,%ymm3 > 201: c5 fc 59 64 87 60 vmulps 0x60(%rdi,%rax,4),%ymm0,%ymm4 > 207: c5 fc 11 0c 87 vmovups %ymm1,(%rdi,%rax,4) > 20c: c5 fc 11 54 87 20 vmovups %ymm2,0x20(%rdi,%rax,4) > 212: c5 fc 11 5c 87 40 vmovups %ymm3,0x40(%rdi,%rax,4) > 218: c5 fc 11 64 87 60 vmovups %ymm4,0x60(%rdi,%rax,4) > 21e: c5 fc 59 8c 87 80 00 vmulps 0x80(%rdi,%rax,4),%ymm0,%ymm1 > 225: 00 00 > 227: c5 fc 59 94 87 a0 00 vmulps 0xa0(%rdi,%rax,4),%ymm0,%ymm2 > 22e: 00 00 > 230: c5 fc 59 9c 87 c0 00 vmulps 0xc0(%rdi,%rax,4),%ymm0,%ymm3 > 237: 00 00 > 239: c5 fc 59 a4 87 e0 00 vmulps 0xe0(%rdi,%rax,4),%ymm0,%ymm4 > 240: 00 00 > 242: c5 fc 11 8c 87 80 00 vmovups %ymm1,0x80(%rdi,%rax,4) > 249: 00 00 > 24b: c5 fc 11 94 87 a0 00 vmovups %ymm2,0xa0(%rdi,%rax,4) > 252: 00 00 > 254: c5 fc 11 9c 87 c0 00 vmovups %ymm3,0xc0(%rdi,%rax,4) > 25b: 00 00 > 25d: c5 fc 11 a4 87 e0 00 vmovups %ymm4,0xe0(%rdi,%rax,4) > 264: 00 00 > 266: 48 83 c0 40 add $0x40,%rax > 26a: 48 3d 00 04 00 00 cmp $0x400,%rax > 270: 0f 85 7a ff ff ff jne 1f0 <_Z4fct7Pf+0x10> > 276: c5 f8 77 vzeroupper > 279: c3 retq > > Which is using wider vectors. > > What do you think? 
Why not transform the fct6's loop to use wider registers?

The loop vectorizer does not really handle loops that already operate on vectors, so that is why the loop using v4f32 does not get widened. Arguably the user explicitly asked for 4xfloat vectors in the v4f32 version, so that is what gets generated.

(Those kinds of issues are better to discuss on https://bugs.llvm.org/ IMO, because it is easier to keep track of the progress on the issue.)

Cheers,
Florian
On Tue, Sep 1, 2020 at 5:10 PM Florian Hahn <florian_hahn at apple.com> wrote:
> The loop vectorizer does not really handle loops that already operate on vectors, so that is why the loop using v4f32 does not get widened.
>
> Arguably the user explicitly asked for 4xfloat vectors in the v4f32 version, so that is what gets generated.

In my case I have tons of legacy code written for SSE2 and if the compiler can make a better and correct version of it, why not?

> (Those kinds of issues are better to discuss on https://bugs.llvm.org/ IMO, because it is easier to keep track of the progress on the issue).

That is noted, but I can't think of it as a bug unless I understand the issue.

Thank you Florian,
-- 
Alexandre BIQUE