Hi,

Please consider the following loop:

using v4f32 = float __attribute__((__vector_size__(16)));

void fct6(v4f32 *x)
{
#pragma clang loop vectorize(enable)
  for (int i = 0; i < 256; ++i)
    x[i] = 7 * x[i];
}

After compiling it with:

clang++ -O3 -march=native -mtune=native \
  -Rpass=loop-vectorize,slp-vectorize \
  -Rpass-missed=loop-vectorize,slp-vectorize \
  -Rpass-analysis=loop-vectorize,slp-vectorize \
  -ffast-math -ffp-model=fast -ffp-exception-behavior=ignore \
  -ffp-contract=fast -mrecip=all:0 \
  -c -o vec.o vec.cc

I get the following codegen:

0000000000000160 <_Z4fct6PDv4_f>:
 160: 31 c0                   xor    %eax,%eax
 162: c4 e2 79 18 05 00 00    vbroadcastss 0x0(%rip),%xmm0   # 16b <_Z4fct6PDv4_f+0xb>
 169: 00 00
 16b: 0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
 170: c5 f8 59 0c 07          vmulps (%rdi,%rax,1),%xmm0,%xmm1
 175: c5 f8 29 0c 07          vmovaps %xmm1,(%rdi,%rax,1)
 17a: c5 f8 59 4c 07 10       vmulps 0x10(%rdi,%rax,1),%xmm0,%xmm1
 180: c5 f8 29 4c 07 10       vmovaps %xmm1,0x10(%rdi,%rax,1)
 186: c5 f8 59 4c 07 20       vmulps 0x20(%rdi,%rax,1),%xmm0,%xmm1
 18c: c5 f8 29 4c 07 20       vmovaps %xmm1,0x20(%rdi,%rax,1)
 192: c5 f8 59 4c 07 30       vmulps 0x30(%rdi,%rax,1),%xmm0,%xmm1
 198: c5 f8 29 4c 07 30       vmovaps %xmm1,0x30(%rdi,%rax,1)
 19e: c5 f8 59 4c 07 40       vmulps 0x40(%rdi,%rax,1),%xmm0,%xmm1
 1a4: c5 f8 29 4c 07 40       vmovaps %xmm1,0x40(%rdi,%rax,1)
 1aa: c5 f8 59 4c 07 50       vmulps 0x50(%rdi,%rax,1),%xmm0,%xmm1
 1b0: c5 f8 29 4c 07 50       vmovaps %xmm1,0x50(%rdi,%rax,1)
 1b6: c5 f8 59 4c 07 60       vmulps 0x60(%rdi,%rax,1),%xmm0,%xmm1
 1bc: c5 f8 29 4c 07 60       vmovaps %xmm1,0x60(%rdi,%rax,1)
 1c2: c5 f8 59 4c 07 70       vmulps 0x70(%rdi,%rax,1),%xmm0,%xmm1
 1c8: c5 f8 29 4c 07 70       vmovaps %xmm1,0x70(%rdi,%rax,1)
 1ce: 48 83 e8 80             sub    $0xffffffffffffff80,%rax
 1d2: 48 3d 00 10 00 00       cmp    $0x1000,%rax
 1d8: 75 96                   jne    170 <_Z4fct6PDv4_f+0x10>
 1da: c3                      retq

My CPU is an Intel(R) Core(TM) i7-6700K @ 4.00GHz, so I have AVX2.
Shouldn't the compiler understand the loop and upgrade the vector width?

On the other hand, if I write the following loop:

void fct7(float *x)
{
#pragma clang loop vectorize(enable)
  for (int i = 0; i < 4 * 256; ++i)
    x[i] = 7 * x[i];
}

it compiles to:

00000000000001e0 <_Z4fct7Pf>:
 1e0: 31 c0                   xor    %eax,%eax
 1e2: c4 e2 7d 18 05 00 00    vbroadcastss 0x0(%rip),%ymm0   # 1eb <_Z4fct7Pf+0xb>
 1e9: 00 00
 1eb: 0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
 1f0: c5 fc 59 0c 87          vmulps (%rdi,%rax,4),%ymm0,%ymm1
 1f5: c5 fc 59 54 87 20       vmulps 0x20(%rdi,%rax,4),%ymm0,%ymm2
 1fb: c5 fc 59 5c 87 40       vmulps 0x40(%rdi,%rax,4),%ymm0,%ymm3
 201: c5 fc 59 64 87 60       vmulps 0x60(%rdi,%rax,4),%ymm0,%ymm4
 207: c5 fc 11 0c 87          vmovups %ymm1,(%rdi,%rax,4)
 20c: c5 fc 11 54 87 20       vmovups %ymm2,0x20(%rdi,%rax,4)
 212: c5 fc 11 5c 87 40       vmovups %ymm3,0x40(%rdi,%rax,4)
 218: c5 fc 11 64 87 60       vmovups %ymm4,0x60(%rdi,%rax,4)
 21e: c5 fc 59 8c 87 80 00    vmulps 0x80(%rdi,%rax,4),%ymm0,%ymm1
 225: 00 00
 227: c5 fc 59 94 87 a0 00    vmulps 0xa0(%rdi,%rax,4),%ymm0,%ymm2
 22e: 00 00
 230: c5 fc 59 9c 87 c0 00    vmulps 0xc0(%rdi,%rax,4),%ymm0,%ymm3
 237: 00 00
 239: c5 fc 59 a4 87 e0 00    vmulps 0xe0(%rdi,%rax,4),%ymm0,%ymm4
 240: 00 00
 242: c5 fc 11 8c 87 80 00    vmovups %ymm1,0x80(%rdi,%rax,4)
 249: 00 00
 24b: c5 fc 11 94 87 a0 00    vmovups %ymm2,0xa0(%rdi,%rax,4)
 252: 00 00
 254: c5 fc 11 9c 87 c0 00    vmovups %ymm3,0xc0(%rdi,%rax,4)
 25b: 00 00
 25d: c5 fc 11 a4 87 e0 00    vmovups %ymm4,0xe0(%rdi,%rax,4)
 264: 00 00
 266: 48 83 c0 40             add    $0x40,%rax
 26a: 48 3d 00 04 00 00       cmp    $0x400,%rax
 270: 0f 85 7a ff ff ff       jne    1f0 <_Z4fct7Pf+0x10>
 276: c5 f8 77                vzeroupper
 279: c3                      retq

This version does use the wider ymm registers. What do you think?
Why not transform fct6's loop to use wider registers?

Regards,
--
Alexandre Bique
> On Sep 1, 2020, at 12:07, Alexandre Bique via llvm-dev <llvm-dev at lists.llvm.org> wrote:
>
> [fct6/fct7 source and disassembly quoted above, snipped]
>
> My CPU is an Intel(R) Core(TM) i7-6700K @ 4.00GHz, so I have AVX2.
> Shouldn't the compiler understand the loop and upgrade the vector width?
> [...]
> Why not transform fct6's loop to use wider registers?

The loop vectorizer does not really handle loops that already operate on vectors, so that is why the loop using v4f32 does not get widened.

Arguably, the user explicitly asked for 4xfloat vectors in the v4f32 version, so that is what gets generated.

(Those kinds of issues are better discussed on https://bugs.llvm.org/ IMO, because it is easier to keep track of progress on the issue there.)
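If rewriting the source is an option, here are two untested sketches of how to get the wider codegen today. Both assume the 256 v4f32 elements form one contiguous, suitably aligned buffer of 1024 floats, and the function names are made up for illustration:

using v4f32 = float __attribute__((__vector_size__(16))); // as in the original example

// Option 1: give the loop vectorizer a scalar loop it already knows how
// to widen, by processing the payload as plain floats. Accessing the
// elements of a vector_size type through a pointer to the element type
// should be fine under the vector extension's aliasing rules, as far as
// I know.
void fct6_scalar(v4f32 *x)
{
  float *f = reinterpret_cast<float *>(x);
#pragma clang loop vectorize(enable)
  for (int i = 0; i < 4 * 256; ++i)
    f[i] = 7 * f[i];
}

// Option 2: ask for the wider vectors explicitly. Note that the 32-byte
// vector type raises the alignment requirement on the buffer to 32 bytes.
using v8f32 = float __attribute__((__vector_size__(32)));

void fct6_wide(v8f32 *x)
{
  for (int i = 0; i < 128; ++i) // 128 x 8 floats == 256 x 4 floats
    x[i] = 7 * x[i];
}

Cheers,
Florian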
On Tue, Sep 1, 2020 at 5:10 PM Florian Hahn <florian_hahn at apple.com> wrote:
> The loop vectorizer does not really handle loops that already operate on vectors, so that is why the loop using v4f32 does not get widened.
>
> Arguably, the user explicitly asked for 4xfloat vectors in the v4f32 version, so that is what gets generated.

In my case I have tons of legacy code written for SSE2; if the compiler can produce a better, still-correct version of it, why not?

> (Those kinds of issues are better discussed on https://bugs.llvm.org/ IMO, because it is easier to keep track of progress on the issue there.)

Noted, but I can't report it as a bug until I understand the issue.
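For context, the legacy code looks roughly like this (a simplified, hypothetical stand-in, not the actual code): hand-written 128-bit intrinsics that pin everything to xmm operations, which, per your explanation, the vectorizer will not rewrite to 256-bit ops.

#include <immintrin.h>

// Hypothetical stand-in for the legacy SSE code: explicit 128-bit
// intrinsics fix the vector width at 4 floats per operation.
void scale_legacy(__m128 *x)
{
  const __m128 k = _mm_set1_ps(7.0f); // broadcast 7.0f to all 4 lanes
  for (int i = 0; i < 256; ++i)
    x[i] = _mm_mul_ps(x[i], k);
}

Thank you Florian,
--
Alexandre BIQUE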