MITSUNARI Shigeo via llvm-dev
2016-Nov-09 03:31 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
Hi all, opt and opt + llc generate the difference aarch64 asm code for the following LLVM code. Is it intended behavior? I expected (A) because I cast %p from i192* to i64*. The information is dropped by opt and 8-byte padding is inserted or I write a bad code? % cat a.ll define void @store0_to_p4(i192* %p) { %p1 = bitcast i192* %p to i64* %p2 = getelementptr i64, i64* %p1, i64 3 %p3 = getelementptr i64, i64* %p2, i64 1 store i64 0, i64* %p3 ret void } % llc-3.8 a.ll -O3 -o - -march=aarch64 store0_to_p4: str xzr, [x0, #32] ; (A) ret % opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=aarch64 store0_to_p4: str xzr, [x0, #40] ; (B) ret Yours, Shigeo
Mehdi Amini via llvm-dev
2016-Nov-09 05:04 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
Can you provide the full repro? Also what is the IR output of opt -O3? — Mehdi> On Nov 8, 2016, at 7:31 PM, MITSUNARI Shigeo via llvm-dev <llvm-dev at lists.llvm.org> wrote: > > Hi all, > opt and opt + llc generate the difference aarch64 asm code for the following LLVM code. > > Is it intended behavior? > I expected (A) because I cast %p from i192* to i64*. > The information is dropped by opt and 8-byte padding is inserted or I write a bad code? > > % cat a.ll > define void @store0_to_p4(i192* %p) > { > %p1 = bitcast i192* %p to i64* > %p2 = getelementptr i64, i64* %p1, i64 3 > %p3 = getelementptr i64, i64* %p2, i64 1 > store i64 0, i64* %p3 > ret void > } > > % llc-3.8 a.ll -O3 -o - -march=aarch64 > store0_to_p4: > str xzr, [x0, #32] ; (A) > ret > > % opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=aarch64 > store0_to_p4: > str xzr, [x0, #40] ; (B) > ret > > Yours, > Shigeo > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
Tim Northover via llvm-dev
2016-Nov-09 05:27 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
Hi Shigeo, On 8 November 2016 at 19:31, MITSUNARI Shigeo via llvm-dev <llvm-dev at lists.llvm.org> wrote: > opt and opt + llc generate the difference aarch64 asm code for the following LLVM code. This looks like it's because the IR doesn't contain a datalayout declaration, which affects how i192 is interpreted (particularly sizeof(i192) for GEP purposes). > Is it intended behavior? It'll disappear if you provide a correct datalayout; incorrect ones are unsupported in any configuration. Cheers. Tim.
MITSUNARI Shigeo via llvm-dev
2016-Nov-09 05:48 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
Hi Tim, ------------------------------------------- % cat a.ll define void @store0_to_p4(i192* %p) { %p1 = bitcast i192* %p to i64* ; (X) %p2 = getelementptr i64, i64* %p1, i64 3 ; (X) %p3 = getelementptr i64, i64* %p2, i64 1 store i64 0, i64* %p3 ret void } ------------------------------------------- % opt-3.8 -O3 a.ll -S ; ModuleID = 'a.ll' ; Function Attrs: norecurse nounwind define void @store0_to_p4(i192* nocapture %p) #0 { %p21 = getelementptr i192, i192* %p, i64 1 ; (Y) %p2 = bitcast i192* %p21 to i64* %p3 = getelementptr i64, i64* %p2, i64 1 store i64 0, i64* %p3, align 4 ret void } attributes #0 = { norecurse nounwind } ------------------------------------------->This looks like it's because the IR doesn't contain a datalayout >declaration, which affects how i192 is interpreted (particularly >sizeof(i192) for GEP purposes).I think that (X) != (Y) for aarch64, but I don't know how to provide a correct datalayout, then I will avoid using i192*. Thank you. Yours, Shigeo
Mehdi Amini via llvm-dev
2016-Nov-09 05:58 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
> On Nov 8, 2016, at 9:48 PM, MITSUNARI Shigeo via llvm-dev <llvm-dev at lists.llvm.org> wrote: > > Hi Tim, > > ------------------------------------------- > % cat a.ll > define void @store0_to_p4(i192* %p) > { > %p1 = bitcast i192* %p to i64* ; (X) > %p2 = getelementptr i64, i64* %p1, i64 3 ; (X) > %p3 = getelementptr i64, i64* %p2, i64 1 > store i64 0, i64* %p3 > ret void > } > ------------------------------------------- > % opt-3.8 -O3 a.ll -S > ; ModuleID = 'a.ll' > > ; Function Attrs: norecurse nounwind > define void @store0_to_p4(i192* nocapture %p) #0 { > %p21 = getelementptr i192, i192* %p, i64 1 ; (Y) > %p2 = bitcast i192* %p21 to i64* > %p3 = getelementptr i64, i64* %p2, i64 1 > store i64 0, i64* %p3, align 4 > ret void > } > > attributes #0 = { norecurse nounwind } > ------------------------------------------- > >> This looks like it's because the IR doesn't contain a datalayout >> declaration, which affects how i192 is interpreted (particularly >> sizeof(i192) for GEP purposes). > > I think that (X) != (Y) for aarch64, but I don't know how to provide > a correct datalayout, then I will avoid using i192*. That’s why I asked for the full repro by the way, I thought you were showing only part of the test case. The datalayout is *required* if you want to perform any transformation on the IR, otherwise you may have some surprises like this at codegen time. You should get it from the TargetMachine after you initialize a Target, and set it on the module from the beginning. 
If you just want to play with some IR, you can look it up in the source code, or in the test directory: $ git grep datalayout test/CodeGen/AArch64/ test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll:target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll:target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir: target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll:target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" … — Mehdi
MITSUNARI Shigeo via llvm-dev
2016-Nov-09 06:40 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
Hi Mehdi,>If you just want to play with some IR, you can look it up in the source code, or in the test directory: > >$ git grep datalayout test/CodeGen/AArch64/ >test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll:target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"Thank you for advice. I verified it by adding 'target datalayout="e-m:o-i64:64-i128:128-n32:64-S128:i192:192"' at the top of a.ll. % opt-3.7 -O3 a.ll -o - | llc-3.7 -O3 -o - -march=aarch64 store0_to_p4: str xzr, [x0, #32] ret Yours, Shigeo
Tom Stellard via llvm-dev
2016-Nov-09 15:53 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
On Wed, Nov 09, 2016 at 12:31:18PM +0900, MITSUNARI Shigeo via llvm-dev wrote:> Hi all, > opt and opt + llc generate the difference aarch64 asm code for the following LLVM code. > > Is it intended behavior? > I expected (A) because I cast %p from i192* to i64*. > The information is dropped by opt and 8-byte padding is inserted or I write a bad code? > > % cat a.ll > define void @store0_to_p4(i192* %p) > { > %p1 = bitcast i192* %p to i64* > %p2 = getelementptr i64, i64* %p1, i64 3 > %p3 = getelementptr i64, i64* %p2, i64 1 > store i64 0, i64* %p3 > ret void > } > > % llc-3.8 a.ll -O3 -o - -march=aarch64 > store0_to_p4: > str xzr, [x0, #32] ; (A) > ret > > % opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=aarch64Is your default target aarch64? Otherwise opt may be assuming a different target which might explain the difference. -Tom> store0_to_p4: > str xzr, [x0, #40] ; (B) > ret > > Yours, > Shigeo > _______________________________________________ > LLVM Developers mailing list > llvm-dev at lists.llvm.org > http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
MITSUNARI Shigeo via llvm-dev
2016-Nov-10 04:29 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
Hi Tom,>Is your default target aarch64? Otherwise opt may be assuming a different >target which might explain the difference.No, My target is x86-64, x86, arm, aarch64, ..., then I'll avoid using i192* and datalayout. Yours, Shigeo
Mehdi Amini via llvm-dev
2016-Nov-10 06:41 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
> On Nov 9, 2016, at 8:29 PM, MITSUNARI Shigeo via llvm-dev <llvm-dev at lists.llvm.org> wrote: > > Hi Tom, > >> Is your default target aarch64? Otherwise opt may be assuming a different >> target which might explain the difference. > > No, My target is x86-64, x86, arm, aarch64, ..., then I'll avoid using i192* and datalayout. There is nothing specific to i192. You will likely run into issues by not specifying the right datalayout. The optimizations will always run with a datalayout: if you don’t specify one there will be a default one, which can cause problems on some targets (like you saw on arm). For instance, the optimizer will assume a pointer size and optimize based on this. — Mehdi
Renato Golin via llvm-dev
2016-Nov-11 11:04 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
On 10 November 2016 at 04:29, MITSUNARI Shigeo via llvm-dev <llvm-dev at lists.llvm.org> wrote: >>Is your default target aarch64? Otherwise opt may be assuming a different >>target which might explain the difference. > > No, My target is x86-64, x86, arm, aarch64, ..., then I'll avoid using i192* and datalayout. I believe Tom's point was about the line: % opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=aarch64 If your host is x86_64, then the first call to opt will assume x86_64 unless you have a triple in the IR (which I believe you didn't). You can override with: % opt-3.8 -march=aarch64 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=aarch64 Or make sure your IR always has triple+layout. I'm not sure it would have made any difference on the i192* case, but it will have noticeable impact on more complicated (and more target specific) IR, so you should be careful. Also, don't assume that OPT+LLC == LLC, as you'll be running more of the same passes on the first case, which can, in rare cases, have an impact (for better or worse) on the code generated. I recommend you keep the passes to a minimum. Opt is a debug tool, not an optimiser. To generate target code, use llc directly, which will (should) have the same effect without command line flag duplication. Better still, use Clang, or make sure your own front-end uses the middle and back ends in a consistent way, and use it instead of llc. cheers, --renato
MITSUNARI Shigeo via llvm-dev
2016-Nov-12 02:05 UTC
[llvm-dev] Is the correct behavior of getelementptr i192* for opt + llc -march=aarch64?
Hi Mehdi,>> No, My target is x86-64, x86, arm, aarch64, ..., then I'll avoid using i192* and datalayout. > >There is nothing specific with i192. You will likely run into issues by not specifying the right datalayout. > >The optimizations will always run with a datalayout: if you don’t specify one there will be a default one, which can cause problems on some target (like you saw on arm). >For instance, the optimizer will assume a pointer size and optimize based on this.I write a code without i192* as the following, then I get what I wanted. I'll rewrite the other code like this. // load 192-bit data from %r2 define i192 @load192(i64* %r2) { %r3 = load i64, i64* %r2 %r4 = zext i64 %r3 to i128 %r6 = getelementptr i64, i64* %r2, i32 1 %r7 = load i64, i64* %r6 %r8 = zext i64 %r7 to i128 %r9 = shl i128 %r8, 64 %r10 = or i128 %r4, %r9 %r11 = zext i128 %r10 to i192 %r13 = getelementptr i64, i64* %r2, i32 2 %r14 = load i64, i64* %r13 %r15 = zext i64 %r14 to i192 %r16 = shl i192 %r15, 128 %r17 = or i192 %r11, %r16 ret i192 %r17 } /* struct i192_t { uint64_t v[3]; }; void add(i192_t *y, const i192_t* x) { *y = x[0] + x[1]; // pseudo code } */ define void @add(i64* noalias %r1, i64* noalias %r2) { %r3 = call i192 @load192(i64* %r2) %r5 = getelementptr i64, i64* %r2, i32 3 %r6 = call i192 @load192(i64* %r5) %r7 = add i192 %r3, %r6 %r9 = getelementptr i64, i64* %r1, i32 0 %r10 = trunc i192 %r7 to i64 store i64 %r10, i64* %r9 %r11 = lshr i192 %r7, 64 %r13 = getelementptr i64, i64* %r1, i32 1 %r14 = trunc i192 %r11 to i64 store i64 %r14, i64* %r13 %r15 = lshr i192 %r11, 64 %r17 = getelementptr i64, i64* %r1, i32 2 %r18 = trunc i192 %r15 to i64 store i64 %r18, i64* %r17 ret void } % opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=x86-64 add: movq 16(%rsi), %rax movq 24(%rsi), %rcx movq 32(%rsi), %rdx addq (%rsi), %rcx adcq 8(%rsi), %rdx adcq 40(%rsi), %rax movq %rcx, (%rdi) movq %rdx, 8(%rdi) movq %rax, 16(%rdi) retq % opt-3.8 -O3 a.ll -o - | llc-3.8 -O3 -o - -march=aarch64 add: 
ldp x8, x9, [x1] ldp x10, x11, [x1, #24] ldr x12, [x1, #16] ldr x13, [x1, #40] adds x8, x10, x8 adcs x9, x11, x9 stp x8, x9, [x0] adcs x8, x13, x12 str x8, [x0, #16] ret Yours, Shigeo