Frank Winter
2015-Jul-29 20:02 UTC
[LLVMdev] x86-64 backend generates aligned ADDPS with unaligned address
When I compile attached IR with LLVM 3.6

  llc -march=x86-64 -o f.S f.ll

it generates an aligned ADDPS with unaligned address. See attached f.S, here an extract:

        addq    $12, %r9                # $12 is not a multiple of 4, thus for xmm0 this is unaligned
        xorl    %esi, %esi
        .align  16, 0x90
.LBB0_1:                                # %loop2
                                        # =>This Inner Loop Header: Depth=1
        movq    offset_array3(,%rsi,8), %rdi
        movq    offset_array2(,%rsi,8), %r10
        movss   -28(%rax), %xmm0
        movss   -8(%rax), %xmm1
        movss   -4(%rax), %xmm2
        unpcklps %xmm0, %xmm2           # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
        movss   (%rax), %xmm0
        unpcklps %xmm0, %xmm1           # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
        unpcklps %xmm2, %xmm1           # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
        addps   (%r9), %xmm1            # here, it gets used, causes a segfault

Frank

-------------- next part --------------
;; ModuleID = 'module'
target triple = "x86_64-unknown-linux-gnu"

@offset_array2 = internal constant [8 x i64] [i64 60, i64 4, i64 12, i64 20, i64 28, i64 36, i64 44, i64 52]
@offset_array3 = internal constant [8 x i64] [i64 12, i64 20, i64 28, i64 36, i64 44, i64 52, i64 60, i64 4]

declare float @sinf(float)
declare float @acosf(float)
declare float @asinf(float)
declare float @atanf(float)
declare float @ceilf(float)
declare float @floorf(float)
declare float @cosf(float)
declare float @coshf(float)
declare float @expf(float)
declare float @logf(float)
declare float @log10f(float)
declare float @sinhf(float)
declare float @tanf(float)
declare float @tanhf(float)
declare float @fabsf(float)
declare float @sqrtf(float)
declare float @powf(float, float)
declare float @atan2f(float, float)
declare double @sin(double)
declare double @acos(double)
declare double @asin(double)
declare double @atan(double)
declare double @ceil(double)
declare double @floor(double)
declare double @cos(double)
declare double @cosh(double)
declare double @exp(double)
declare double @log(double)
declare double @log10(double)
declare double @sinh(double)
declare double @tan(double)
declare double @tanh(double)
declare double @fabs(double)
declare double @sqrt(double)
declare double @pow(double, double)
declare double @atan2(double, double)

define void @func(i64 %lo, i64 %hi, float* %arg0, float* %arg1, float* %arg2, float* %arg3, float* %arg4) {
pre_loop3:
  br label %loop2

loop2:                                            ; preds = %loop2, %pre_loop3
  %0 = phi i64 [ 0, %pre_loop3 ], [ %42, %loop2 ]
  %1 = getelementptr [8 x i64]* @offset_array3, i64 0, i64 %0
  %2 = load i64* %1
  %3 = getelementptr [8 x i64]* @offset_array2, i64 0, i64 %0
  %4 = load i64* %3
  %5 = getelementptr float* %arg1, i64 %4
  %6 = bitcast float* %5 to <4 x float>*
  %7 = load <4 x float>* %6
  %8 = getelementptr float* %arg2, i64 %2
  %9 = bitcast float* %8 to <4 x float>*
  %10 = load <4 x float>* %9
  %11 = mul i64 %0, 8
  %12 = add i64 %11, 3                            ; <--------- this creates the unaligned address!!
  %13 = getelementptr float* %arg3, i64 %12
  %14 = bitcast float* %13 to <4 x float>*
  %15 = load <4 x float>* %14
  %16 = mul i64 %0, 8
  %17 = add i64 %16, 5
  %18 = getelementptr float* %arg4, i64 %17
  %19 = load float* %18
  %20 = mul i64 %0, 8
  %21 = add i64 %20, 6
  %22 = getelementptr float* %arg4, i64 %21
  %23 = load float* %22
  %24 = mul i64 %0, 8
  %25 = add i64 %24, 7
  %26 = getelementptr float* %arg4, i64 %25
  %27 = load float* %26
  %28 = mul i64 %0, 8
  %29 = getelementptr float* %arg4, i64 %28
  %30 = load float* %29
  %31 = insertelement <4 x float> undef, float %19, i32 0
  %32 = insertelement <4 x float> %31, float %23, i32 1
  %33 = insertelement <4 x float> %32, float %27, i32 2
  %34 = insertelement <4 x float> %33, float %30, i32 3
  %35 = mul i64 %0, 8
  %36 = add i64 %35, 4
  %37 = getelementptr float* %arg0, i64 %36
  %38 = fadd <4 x float> %34, %15
  %39 = fadd <4 x float> %38, %10
  %40 = fadd <4 x float> %39, %7
  %41 = bitcast float* %37 to <4 x float>*
  store <4 x float> %40, <4 x float>* %41
  %42 = add nsw i64 %0, 1
  %43 = icmp uge i64 %42, 8
  br i1 %43, label %exit_loop1, label %loop2

exit_loop1:                                       ; preds = %loop2
  br label %pre_loop

pre_loop:                                         ; preds = %exit_loop1
  br label %entrypoint

entrypoint:                                       ; preds = %vectorized
  ret void
}
-------------- next part --------------
        .text
        .file   "f.ll"
        .globl  func
        .align  16, 0x90
        .type   func,@function
func:                                   # @func
        .cfi_startproc
# BB#0:                                 # %pre_loop3
        movq    8(%rsp), %rax
        addq    $16, %rdx
        addq    $28, %rax
        addq    $12, %r9
        xorl    %esi, %esi
        .align  16, 0x90
.LBB0_1:                                # %loop2
                                        # =>This Inner Loop Header: Depth=1
        movq    offset_array3(,%rsi,8), %rdi
        movq    offset_array2(,%rsi,8), %r10
        movss   -28(%rax), %xmm0
        movss   -8(%rax), %xmm1
        movss   -4(%rax), %xmm2
        unpcklps %xmm0, %xmm2           # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
        movss   (%rax), %xmm0
        unpcklps %xmm0, %xmm1           # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
        unpcklps %xmm2, %xmm1           # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
        addps   (%r9), %xmm1
        addps   (%r8,%rdi,4), %xmm1
        addps   (%rcx,%r10,4), %xmm1
        movaps  %xmm1, (%rdx)
        incq    %rsi
        addq    $32, %rdx
        addq    $32, %rax
        addq    $32, %r9
        cmpq    $8, %rsi
        jb      .LBB0_1
# BB#2:                                 # %entrypoint
        retq
.Ltmp0:
        .size   func, .Ltmp0-func
        .cfi_endproc

        .type   offset_array2,@object   # @offset_array2
        .section        .rodata,"a",@progbits
        .align  16
offset_array2:
        .quad   60                      # 0x3c
        .quad   4                       # 0x4
        .quad   12                      # 0xc
        .quad   20                      # 0x14
        .quad   28                      # 0x1c
        .quad   36                      # 0x24
        .quad   44                      # 0x2c
        .quad   52                      # 0x34
        .size   offset_array2, 64

        .type   offset_array3,@object   # @offset_array3
        .align  16
offset_array3:
        .quad   12                      # 0xc
        .quad   20                      # 0x14
        .quad   28                      # 0x1c
        .quad   36                      # 0x24
        .quad   44                      # 0x2c
        .quad   52                      # 0x34
        .quad   60                      # 0x3c
        .quad   4                       # 0x4
        .size   offset_array3, 64

        .section        ".note.GNU-stack","",@progbits
Reid Kleckner
2015-Jul-29 20:54 UTC
[LLVMdev] x86-64 backend generates aligned ADDPS with unaligned address
This load instruction assumes the default ABI alignment for the <4 x float> type, which is 16:

  %15 = load <4 x float>* %14

You can set the alignment of loads to something lower than 16 in your frontend, and this will make LLVM use movups instructions:

  %15 = load <4 x float>* %14, align 4

If some LLVM mid-level pass is introducing this load without proving that the vector is 16-byte aligned, then that's a bug.

On Wed, Jul 29, 2015 at 1:02 PM, Frank Winter <fwinter at jlab.org> wrote:
> When I compile attached IR with LLVM 3.6
>
>   llc -march=x86-64 -o f.S f.ll
>
> it generates an aligned ADDPS with unaligned address. See attached f.S, here an extract:
>
>         addq    $12, %r9                # $12 is not a multiple of 4, thus for xmm0 this is unaligned
>         xorl    %esi, %esi
>         .align  16, 0x90
> .LBB0_1:                                # %loop2
>                                         # =>This Inner Loop Header: Depth=1
>         movq    offset_array3(,%rsi,8), %rdi
>         movq    offset_array2(,%rsi,8), %r10
>         movss   -28(%rax), %xmm0
>         movss   -8(%rax), %xmm1
>         movss   -4(%rax), %xmm2
>         unpcklps %xmm0, %xmm2           # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
>         movss   (%rax), %xmm0
>         unpcklps %xmm0, %xmm1           # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
>         unpcklps %xmm2, %xmm1           # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
>         addps   (%r9), %xmm1            # here, it gets used, causes a segfault
>
> Frank
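
Applied to the attached f.ll, that suggestion would amount to the following (a sketch, assuming %arg3 and %arg0 are not known to be 16-byte aligned):

  %15 = load <4 x float>* %14, align 4                  ; was: %15 = load <4 x float>* %14
  store <4 x float> %40, <4 x float>* %41, align 4       ; was: store <4 x float> %40, <4 x float>* %41

The loads through %arg1 and %arg2 (%7 and %10) would need the same treatment unless those base pointers are guaranteed to be 16-byte aligned; the offsets taken from offset_array2 and offset_array3 are all multiples of four floats, so they preserve whatever alignment the base pointers already have.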
Frank Winter
2015-Jul-29 21:01 UTC
[LLVMdev] x86-64 backend generates aligned ADDPS with unaligned address
No, I generated this IR myself. So I have to generate it with alignment info whenever the pointers are not default-ABI aligned. I wasn't aware of this. Thanks!

Frank

On 07/29/2015 04:54 PM, Reid Kleckner wrote:
> This load instruction assumes the default ABI alignment for the <4 x float> type, which is 16:
>
>   %15 = load <4 x float>* %14
>
> You can set the alignment of loads to something lower than 16 in your frontend, and this will make LLVM use movups instructions:
>
>   %15 = load <4 x float>* %14, align 4
>
> If some LLVM mid-level pass is introducing this load without proving that the vector is 16-byte aligned, then that's a bug.
>
> On Wed, Jul 29, 2015 at 1:02 PM, Frank Winter <fwinter at jlab.org> wrote:
>
>     When I compile attached IR with LLVM 3.6
>
>       llc -march=x86-64 -o f.S f.ll
>
>     it generates an aligned ADDPS with unaligned address. See attached f.S, here an extract:
>
>             addq    $12, %r9            # $12 is not a multiple of 4, thus for xmm0 this is unaligned
>             xorl    %esi, %esi
>             .align  16, 0x90
>     .LBB0_1:                            # %loop2
>                                         # =>This Inner Loop Header: Depth=1
>             movq    offset_array3(,%rsi,8), %rdi
>             movq    offset_array2(,%rsi,8), %r10
>             movss   -28(%rax), %xmm0
>             movss   -8(%rax), %xmm1
>             movss   -4(%rax), %xmm2
>             unpcklps %xmm0, %xmm2       # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
>             movss   (%rax), %xmm0
>             unpcklps %xmm0, %xmm1       # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
>             unpcklps %xmm2, %xmm1       # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
>             addps   (%r9), %xmm1        # here, it gets used, causes a segfault
>
>     Frank
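
A minimal self-contained sketch of the difference, using the LLVM 3.6 load syntax (the function and value names are hypothetical and not taken from the attached f.ll):

  define <4 x float> @sum_sketch(float* %p, float* %q) {
  entry:
    %p3 = getelementptr float* %p, i64 3      ; 12-byte offset: not 16-byte aligned even if %p is
    %vp = bitcast float* %p3 to <4 x float>*
    %a  = load <4 x float>* %vp, align 4      ; explicit align 4 lets the backend use an unaligned movups
    %vq = bitcast float* %q to <4 x float>*
    %b  = load <4 x float>* %vq               ; no align attribute: the ABI alignment of 16 is assumed
    %s  = fadd <4 x float> %a, %b             ; the assumed-aligned load may be folded into an aligned addps
    ret <4 x float> %s
  }

Running llc -march=x86-64 on this should show a movups for %a, while %b is free to be folded into an aligned addps memory operand and will fault at run time if %q is not actually 16-byte aligned.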