Jeroen Dobbelaere
2012-Feb-02 15:18 UTC
[LLVMdev] How to improve code generated for 'getelementptr' ?
Hi all, I am working on an llvm backend for a processor with a relatively simple instruction set. For small loops, the code that is produced depends heavily on how the loop is specified: the less information we provide to clang, the better the loop code becomes... Any idea how I can teach llvm that we don't have load/store instructions with a register index, so that it is more efficient to convert Init1 to incrementing a pointer instead of recomputing the address of 'data[i]' every time? The sample C code looks like: --- void Init1(int* data) { int i=0; for (i=0; i<100; ++i) { data[i]=1; } } void Init2(int* p, int* e) { while (p!=e) { *p++=4; } } --- This produces the following assembly code for -O3 (see below). NOTE: a branch instruction has a delay slot. For Init1, the loop body consists of 8 instructions. For Init2, it consists of 5 instructions, which is already much better (the optimal would use 4 instructions). Question: how can we teach llvm to produce code like Init2 for the input of Init1?
--- .text .globl _Init1 _Init1: ;; @Init1 ;; BB#0: ;; %entry ldi R1 , 0 ldi R2 , 1 ldi R3 , 100 _BB1_1: ;; %for.body ;; =>This Inner Loop Header: Depth=1 mov R4 , R1 add R1 , 1 mov R5 , R0 sll R4 , 2 cmpne R1 , R3 add R5 , R4 bcc24 _BB1_1 sw R2 , R5, 0 ;; BB#2: ;; %for.end b r15 nop .globl _Init2 _Init2: ;; @Init2 ;; BB#0: ;; %entry cmpeq R0 , R1 bcc24 _BB2_3 nop ;; BB#1: ldi R2 , 4 _BB2_2: ;; %while.body ;; =>This Inner Loop Header: Depth=1 sw R2 , R0, 0 add R0 , 4 cmpne R1 , R0 bcc24 _BB2_2 nop _BB2_3: ;; %while.end b r15 nop --- For reference, the .ll file: --- ; ModuleID = 'loop_test.c' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f64:32:32-v64:32:32-v128:32:32-n32-s0:32:32-a0:0:32-S32" target triple = "arch--" define void @Init1(i32* nocapture %data) nounwind { entry: br label %for.body for.body: ; preds = %for.body, %entry %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] %arrayidx = getelementptr inbounds i32* %data, i32 %i.01 store i32 1, i32* %arrayidx, align 4, !tbaa !0 %inc = add nsw i32 %i.01, 1 %exitcond = icmp eq i32 %inc, 100 br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body ret void } define void @Init2(i32* %p, i32* %e) nounwind { entry: %cmp1 = icmp eq i32* %p, %e br i1 %cmp1, label %while.end, label %while.body while.body: ; preds = %entry, %while.body %p.addr.02 = phi i32* [ %incdec.ptr, %while.body ], [ %p, %entry ] %incdec.ptr = getelementptr inbounds i32* %p.addr.02, i32 1 store i32 4, i32* %p.addr.02, align 4, !tbaa !0 %cmp = icmp eq i32* %incdec.ptr, %e br i1 %cmp, label %while.end, label %while.body while.end: ; preds = %while.body, %entry ret void } !0 = metadata !{metadata !"int", metadata !1} !1 = metadata !{metadata !"omnipotent char", metadata !2} !2 = metadata !{metadata !"Simple C/C++ TBAA", null} --- Greetings, Jeroen Dobbelaere
Possibly Parallel Threads
- arima problems when using argument fixed=
- Using Rails.Logger in a gem in Rails 3beta3
- [LLVMdev] LiveIntervals analysis problem
- LoopVectorizer -- generating bad and unhandled shufflevector sequence
- [LLVMdev] [llvm-commits] [PATCH] BasicBlock Autovectorization Pass