I attach the original IR and the generated assembly, where one can see
that the array elements are treated as 1 bit wide. Is this the intended
behavior? I doubt it, because I am passing in C pointers, and those have
byte granularity. (In C, a bit cannot be addressed with a pointer.)
Frank
On 07/27/2015 01:44 PM, Frank Winter wrote:
> I am running into a problem with 'i1*' as a function's argument which
> seems to have appeared since I switched to LLVM 3.6 (but it may have
> another cause, of course). If I look at the assembly that MCJIT
> generates for an x86-64 target, I see that the array 'i1*' is taken as
> a sequence of 1-bit-wide elements. (I guess that's correct.) However,
> I used to call the function from C passing in a 'bool*', which has
> 1-byte-wide elements, I guess. (I'm not sure if that's a compiler's choice.)
> Now, since I haven't changed my code in these parts but only made the
> transition from LLVM 3.4/3.5 -> 3.6, I wonder if the element width has
> changed when i1* is used as a function's argument..!?
>
> Thanks,
> Frank
>
-------------- next part --------------
; ModuleID = 'module'
target triple = "x86_64-unknown-linux-gnu"
; @main: four unrolled 4-lane blend steps covering elements 0..15.
;
; For each group offset g in {0, 4, 8, 12} the body does:
;   mask            = load <4 x i1> through %arg2 + g
;   arg0[g .. g+3]  = (arg1[g .. g+3] & sext(mask)) | (arg0[g .. g+3] & ~sext(mask))
; i.e. a lane takes the %arg1 value where its mask bit is 1 and keeps the
; %arg0 value where it is 0. The result is written back into %arg0.
;
; %lo and %hi are not referenced anywhere in this body.
; None of the loads/stores carries an explicit `align` attribute.
;
; NOTE(review): the mask is read with a single `load <4 x i1>` through a
; bitcast i1* pointer. In the assembly listing later in this file each such
; load becomes one byte load whose bits 0..3 supply the four lanes, while the
; GEP offsets 4/8/12 on %arg2 show up as byte offsets 4/8/12. A C caller that
; passes a `bool*` (one byte per element) therefore does not match how this
; vector load is lowered -- confirm against the LLVM LangRef for the in-memory
; layout of vectors with non-byte-sized elements.
define void @main(i64 %lo, i64 %hi, float* %arg0, float* %arg1, i1* %arg2) {
vectorized:
; ---- group 0: elements 0..3 ----
; %2 = 4-bit lane mask loaded through %arg2 + 0.
%0 = getelementptr i1* %arg2, i32 0
%1 = bitcast i1* %0 to <4 x i1>*
%2 = load <4 x i1>* %1
; %5 = arg1[0..3]
%3 = getelementptr float* %arg1, i32 0
%4 = bitcast float* %3 to <4 x float>*
%5 = load <4 x float>* %4
; %8 = arg0[0..3]
%6 = getelementptr float* %arg0, i32 0
%7 = bitcast float* %6 to <4 x float>*
%8 = load <4 x float>* %7
; %9 is the same address as %6; it is used as the store destination below.
%9 = getelementptr float* %arg0, i32 0
; Widen each i1 to an all-ones / all-zeros i32 lane.
%10 = sext <4 x i1> %2 to <4 x i32>
; %12 = arg1 bits kept where the mask is set.
%11 = bitcast <4 x float> %5 to <4 x i32>
%12 = and <4 x i32> %11, %10
; %13 = inverted mask; %15 = arg0 bits kept where the mask is clear.
%13 = xor <4 x i32> %10, <i32 -1, i32 -1, i32 -1, i32 -1>
%14 = bitcast <4 x float> %8 to <4 x i32>
%15 = and <4 x i32> %14, %13
; Merge both halves and store the blended lanes back to arg0[0..3].
%16 = or <4 x i32> %15, %12
%17 = bitcast <4 x i32> %16 to <4 x float>
%18 = bitcast float* %9 to <4 x float>*
store <4 x float> %17, <4 x float>* %18
; ---- group 1: elements 4..7 (same sequence, GEP index 4) ----
%19 = getelementptr i1* %arg2, i32 4
%20 = bitcast i1* %19 to <4 x i1>*
%21 = load <4 x i1>* %20
%22 = getelementptr float* %arg1, i32 4
%23 = bitcast float* %22 to <4 x float>*
%24 = load <4 x float>* %23
%25 = getelementptr float* %arg0, i32 4
%26 = bitcast float* %25 to <4 x float>*
%27 = load <4 x float>* %26
%28 = getelementptr float* %arg0, i32 4
%29 = sext <4 x i1> %21 to <4 x i32>
%30 = bitcast <4 x float> %24 to <4 x i32>
%31 = and <4 x i32> %30, %29
%32 = xor <4 x i32> %29, <i32 -1, i32 -1, i32 -1, i32 -1>
%33 = bitcast <4 x float> %27 to <4 x i32>
%34 = and <4 x i32> %33, %32
%35 = or <4 x i32> %34, %31
%36 = bitcast <4 x i32> %35 to <4 x float>
%37 = bitcast float* %28 to <4 x float>*
store <4 x float> %36, <4 x float>* %37
; ---- group 2: elements 8..11 (same sequence, GEP index 8) ----
%38 = getelementptr i1* %arg2, i32 8
%39 = bitcast i1* %38 to <4 x i1>*
%40 = load <4 x i1>* %39
%41 = getelementptr float* %arg1, i32 8
%42 = bitcast float* %41 to <4 x float>*
%43 = load <4 x float>* %42
%44 = getelementptr float* %arg0, i32 8
%45 = bitcast float* %44 to <4 x float>*
%46 = load <4 x float>* %45
%47 = getelementptr float* %arg0, i32 8
%48 = sext <4 x i1> %40 to <4 x i32>
%49 = bitcast <4 x float> %43 to <4 x i32>
%50 = and <4 x i32> %49, %48
%51 = xor <4 x i32> %48, <i32 -1, i32 -1, i32 -1, i32 -1>
%52 = bitcast <4 x float> %46 to <4 x i32>
%53 = and <4 x i32> %52, %51
%54 = or <4 x i32> %53, %50
%55 = bitcast <4 x i32> %54 to <4 x float>
%56 = bitcast float* %47 to <4 x float>*
store <4 x float> %55, <4 x float>* %56
; ---- group 3: elements 12..15 (same sequence, GEP index 12) ----
%57 = getelementptr i1* %arg2, i32 12
%58 = bitcast i1* %57 to <4 x i1>*
%59 = load <4 x i1>* %58
%60 = getelementptr float* %arg1, i32 12
%61 = bitcast float* %60 to <4 x float>*
%62 = load <4 x float>* %61
%63 = getelementptr float* %arg0, i32 12
%64 = bitcast float* %63 to <4 x float>*
%65 = load <4 x float>* %64
%66 = getelementptr float* %arg0, i32 12
%67 = sext <4 x i1> %59 to <4 x i32>
%68 = bitcast <4 x float> %62 to <4 x i32>
%69 = and <4 x i32> %68, %67
%70 = xor <4 x i32> %67, <i32 -1, i32 -1, i32 -1, i32 -1>
%71 = bitcast <4 x float> %65 to <4 x i32>
%72 = and <4 x i32> %71, %70
%73 = or <4 x i32> %72, %69
%74 = bitcast <4 x i32> %73 to <4 x float>
%75 = bitcast float* %66 to <4 x float>*
store <4 x float> %74, <4 x float>* %75
; Fall through to the single exit block.
br label %entrypoint
entrypoint: ; preds = %vectorized
ret void
}
-------------- next part --------------
# ----------------------------------------------------------------------
# main -- x86-64 code (AT&T / GAS syntax) generated for the IR function
#         @main(i64 %lo, i64 %hi, float* %arg0, float* %arg1, i1* %arg2)
#
# Register roles, assuming the System V AMD64 argument order implied by the
# x86_64-unknown-linux-gnu triple:
#   %rdx = %arg0  (blend destination and "mask clear" source; read + written)
#   %rcx = %arg1  ("mask set" source; read only)
#   %r8  = %arg2  (mask bytes; read only)
#   %rdi, %rsi    incoming %lo / %hi are never read; both are reused as scratch
#   %rax          scratch (mask byte)
#   %xmm0, %xmm1  scratch (lane mask / blended result)
#
# Shape: four identical straight-line groups, no loop, no stack use, no calls.
# Each group:
#   1. loads ONE byte from the mask pointer,
#   2. turns bits 0..3 of that byte into four 32-bit all-ones/all-zeros lanes,
#   3. computes (arg1 & mask) | (arg0 & ~mask) on 16 bytes and stores to arg0.
# The mask byte offsets are 0, 4, 8, 12; the float offsets are 0, 16, 32, 48.
#
# The float accesses use movdqa / memory-operand pand/pandn, which require
# 16-byte-aligned addresses.
# ----------------------------------------------------------------------
.text
.file "module"
.globl main
.align 16, 0x90
.type main,@function
main:
.cfi_startproc
# ---- group 0: mask byte at 0(%r8), floats at 0(%rcx) / 0(%rdx) ----
movzbl (%r8), %eax                      # rax = one mask byte, zero-extended
movq %rax, %rsi
shlq $62, %rsi                          # move bit 1 to bit 63 ...
sarq $63, %rsi                          # ... and smear it: rsi = bit1 ? -1 : 0
movq %rax, %rdi
shlq $63, %rdi                          # same for bit 0
sarq $63, %rdi                          # rdi = bit0 ? -1 : 0
movd %edi, %xmm0                        # lane 0 = bit 0 mask
pinsrd $1, %esi, %xmm0                  # lane 1 = bit 1 mask
movq %rax, %rsi
shlq $61, %rsi                          # bit 2
sarq $63, %rsi
pinsrd $2, %esi, %xmm0                  # lane 2 = bit 2 mask
shlq $60, %rax                          # bit 3 (rax no longer needed afterwards)
sarq $63, %rax
pinsrd $3, %eax, %xmm0                  # lane 3 = bit 3 mask
movdqa (%rcx), %xmm1                    # xmm1 = arg1[0..3]
pand %xmm0, %xmm1                       # xmm1 = arg1 & mask
pandn (%rdx), %xmm0                     # xmm0 = ~mask & arg0[0..3]
por %xmm1, %xmm0                        # blended lanes
movdqa %xmm0, (%rdx)                    # arg0[0..3] = blended lanes
# ---- group 1: mask byte at 4(%r8), floats at 16(%rcx) / 16(%rdx) ----
movzbl 4(%r8), %eax
movq %rax, %rsi
shlq $62, %rsi
sarq $63, %rsi
movq %rax, %rdi
shlq $63, %rdi
sarq $63, %rdi
movd %edi, %xmm0
pinsrd $1, %esi, %xmm0
movq %rax, %rsi
shlq $61, %rsi
sarq $63, %rsi
pinsrd $2, %esi, %xmm0
shlq $60, %rax
sarq $63, %rax
pinsrd $3, %eax, %xmm0
movdqa 16(%rcx), %xmm1
pand %xmm0, %xmm1
pandn 16(%rdx), %xmm0
por %xmm1, %xmm0
movdqa %xmm0, 16(%rdx)
# ---- group 2: mask byte at 8(%r8), floats at 32(%rcx) / 32(%rdx) ----
movzbl 8(%r8), %eax
movq %rax, %rsi
shlq $62, %rsi
sarq $63, %rsi
movq %rax, %rdi
shlq $63, %rdi
sarq $63, %rdi
movd %edi, %xmm0
pinsrd $1, %esi, %xmm0
movq %rax, %rsi
shlq $61, %rsi
sarq $63, %rsi
pinsrd $2, %esi, %xmm0
shlq $60, %rax
sarq $63, %rax
pinsrd $3, %eax, %xmm0
movdqa 32(%rcx), %xmm1
pand %xmm0, %xmm1
pandn 32(%rdx), %xmm0
por %xmm1, %xmm0
movdqa %xmm0, 32(%rdx)
# ---- group 3: mask byte at 12(%r8), floats at 48(%rcx) / 48(%rdx) ----
movzbl 12(%r8), %eax
movq %rax, %rsi
shlq $62, %rsi
sarq $63, %rsi
movq %rax, %rdi
shlq $63, %rdi
sarq $63, %rdi
movd %edi, %xmm0
pinsrd $1, %esi, %xmm0
movq %rax, %rsi
shlq $61, %rsi
sarq $63, %rsi
pinsrd $2, %esi, %xmm0
shlq $60, %rax
sarq $63, %rax
pinsrd $3, %eax, %xmm0
movdqa 48(%rcx), %xmm1
pand %xmm0, %xmm1
pandn 48(%rdx), %xmm0
por %xmm1, %xmm0
movdqa %xmm0, 48(%rdx)
retq
.Ltmp0:
.size main, .Ltmp0-main
.cfi_endproc
.section ".note.GNU-stack","",@progbits