zan jyu Wong
2015-May-21 13:21 UTC
[LLVMdev] How can I remove these redundant copy between registers?
Hi, I've been working on a Blackfin backend (llvm-3.6.0) based on the previous one that was removed in llvm-3.1. llc generates code like this: 29 p1 = r2; 30 r5 = [p1]; 31 p1 = r2; 32 r6 = [p1 + 4]; 33 r5 = r6 + r5; 34 r6 = [p0 + -4]; 35 r5 *= r6; 36 p1 = r2; 37 r6 = [p1 + 8]; 38 p1 = r2; p1 and r2 are in different register classes. A p* register can be used to load/store values from memory, while an r* register cannot. As we can see, lines 31, 36, and 38 can be deleted. How can I configure llc to do this? Or do I have to write a custom pass to do this optimization? Any suggestion is welcome. Thanks, Huang -------------- next part -------------- An HTML attachment was scrubbed... URL: <lists.llvm.org/pipermail/llvm-dev/attachments/20150521/3f5110e8/attachment.html>
Samuel Crow
2015-May-21 16:24 UTC
[LLVMdev] How can I remove these redundant copy between registers?
On May 21, 2015, at 7:21 AM, zan jyu Wong wrote: > Hi, > > I've been working on a Blackfin backend (llvm-3.6.0) based on the previous one that was removed in llvm-3.1. > llc generates codes like this: > > 29 p1 = r2; > 30 r5 = [p1]; > 31 p1 = r2; > 32 r6 = [p1 + 4]; > 33 r5 = r6 + r5; > 34 r6 = [p0 + -4]; > 35 r5 *= r6; > 36 p1 = r2; > 37 r6 = [p1 + 8]; > 38 p1 = r2; > > p1 and r2 are in different register classes. > A p* register can be used for load/stroe values from memory while a r* register can not. > > As we can see, line 31, 36, 38 can be deleted. How can I configure llc to do this? Or do I have to write a custom pass to do this optimization? Any suggestion is welcome. > > Thanks, > > Huang Hello Huang, Silly as this may sound, did you run OPT on the bitcode first before using LLC? Cheers, Sam
zan jyu Wong
2015-May-22 02:26 UTC
[LLVMdev] How can I remove these redundant copy between registers?
Hi Sam, Thanks for your help. I had never noticed OPT before, and I tried to run it on the bitcode, but I still get the code listed above. FYI, I did the following: $ clang -c -m32 -O3 -emit-llvm ex11.c -o ex11.bc $ opt -S -gvn ex11.bc > ex11.ll $ llc -march=bfin ex11.ll Is there anything I'm missing? And the following is what I did before: $ clang -S -m32 -emit-llvm -O3 file.c -o file.ll $ llc -march=bfin file.ll Original C Source File: 1 typedef struct state { 2 int V[8][8]; 3 int *offset[8]; 4 } state_t; 5 6 void foo(state_t* state, int ch, int *buffer) 7 { 8 int *offset = state->offset[ch]; 9 10 int idx, i; 11 for (i = 0, idx = 0; i < 100; i++, idx += 5) { 12 //long long tmp = 0; 13 int tmp = 0; 14 for (int j = 0; j < 2; j++) { 15 tmp += state->V[ch][offset[i]+2*j+0]*buffer[idx + j]; 16 tmp += state->V[ch][offset[i]+2*j+1]*buffer[idx + j]; 17 } 18 19 // disable optimization 20 //volatile long long ret = tmp; 21 volatile int ret = tmp; 22 } 23 } .ll file after running OPT on the .bc file ; ModuleID = 'ex11.bc' target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" target triple = "i386-apple-macosx10.10.0" %struct.state = type { [8 x [8 x i32]], [8 x i32*] } ; Function Attrs: nounwind ssp define void @foo(%struct.state* nocapture readonly %state, i32 %ch, i32* nocapture readonly %buffer) #0 { entry: %ret = alloca i32, align 4 %arrayidx = getelementptr inbounds %struct.state* %state, i32 0, i32 1, i32 %ch %0 = load i32** %arrayidx, align 4, !tbaa !2 br label %for.cond3.preheader for.cond3.preheader: ; preds = %for.cond3.preheader, %entry %i.052 = phi i32 [ 0, %entry ], [ %inc27, %for.cond3.preheader ] %idx.051 = phi i32 [ 0, %entry ], [ %add28, %for.cond3.preheader ] %arrayidx6 = getelementptr inbounds i32* %0, i32 %i.052 %1 = load i32* %arrayidx6, align 4, !tbaa !6 %arrayidx9 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %1 %2 = load i32* %arrayidx9, align 4, !tbaa !6 %arrayidx11 = getelementptr inbounds i32* %buffer, i32 
%idx.051 %3 = load i32* %arrayidx11, align 4, !tbaa !6 %add17 = add nsw i32 %1, 1 %arrayidx20 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %add17 %4 = load i32* %arrayidx20, align 4, !tbaa !6 %tmp = add i32 %4, %2 %tmp48 = mul i32 %tmp, %3 %add.1 = add nsw i32 %1, 2 %arrayidx9.1 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %add.1 %5 = load i32* %arrayidx9.1, align 4, !tbaa !6 %add10.1 = add nuw nsw i32 %idx.051, 1 %arrayidx11.1 = getelementptr inbounds i32* %buffer, i32 %add10.1 %6 = load i32* %arrayidx11.1, align 4, !tbaa !6 %add17.1 = add nsw i32 %1, 3 %arrayidx20.1 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %add17.1 %7 = load i32* %arrayidx20.1, align 4, !tbaa !6 %tmp.1 = add i32 %7, %5 %tmp48.1 = mul i32 %tmp.1, %6 %add24.1 = add i32 %tmp48.1, %tmp48 store volatile i32 %add24.1, i32* %ret, align 4 %inc27 = add nuw nsw i32 %i.052, 1 %add28 = add nuw nsw i32 %idx.051, 5 %exitcond53 = icmp eq i32 %inc27, 100 br i1 %exitcond53, label %for.end29, label %for.cond3.preheader for.end29: ; preds %for.cond3.preheader ret void } attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.module.flags = !{!0} !llvm.ident = !{!1} !0 = !{i32 1, !"PIC Level", i32 2} !1 = !{!"clang version 3.6.0 (tags/RELEASE_360/final)"} !2 = !{!3, !3, i64 0} !3 = !{!"any pointer", !4, i64 0} !4 = !{!"omnipotent char", !5, i64 0} !5 = !{!"Simple C/C++ TBAA"} !6 = !{!7, !7, i64 0} !7 = !{!"int", !4, i64 0} And the generated .s file .text .macosx_version_min 10, 10 .file "ex11.ll" .globl foo .align 4 .type foo, at function foo: // @foo // BB#0: // %entry link 16; [fp - 4] = r4; [fp - 8] = r5; [fp - 12] = r6; r3 = r1 << 2; r4 = r0 + r3; r3 = 0 (x); r2 += 4; p0 = r4; r4 = [p0 + 256]; p0 = r2; LBB0_1: 
// %for.cond3.preheader // =>This Inner Loop Header: Depth=1 r2 = r1 << 5; r2 = r0 + r2; r5 = r4 + r3; p1 = r5; r5 = [p1]; r5 = r5 << 2; r2 = r2 + r5; p1 = r2; <-------------- r5 = [p1]; p1 = r2; <--------------- redundant copy r6 = [p1 + 4]; r5 = r6 + r5; r6 = [p0 + -4]; r5 *= r6; p1 = r2; <--------------- redundant copy r6 = [p1 + 8]; p1 = r2; <--------------- redundant copy r2 = [p1 + 12]; r2 = r2 + r6; r6 = [p0]; r2 *= r6; r2 = r2 + r5; [fp - 16] = r2; r2 = p0; r2 += 20; r3 += 4; r5 = 400 (z); cc = r3 == r5; p0 = r2; if !cc jump LBB0_1; jump LBB0_2; LBB0_2: // %for.end29 r6 = [fp - 12]; r5 = [fp - 8]; r4 = [fp - 4]; unlink; rts; Ltmp0: .size foo, Ltmp0-foo Huang On Fri, May 22, 2015 at 12:24 AM, Samuel Crow <samueldcrow at gmail.com> wrote:> > On May 21, 2015, at 7:21 AM, zan jyu Wong wrote: > > > Hi, > > > > I've been working on a Blackfin backend (llvm-3.6.0) based on the > previous one that was removed in llvm-3.1. > > llc generates codes like this: > > > > 29 p1 = r2; > > 30 r5 = [p1]; > > 31 p1 = r2; > > 32 r6 = [p1 + 4]; > > 33 r5 = r6 + r5; > > 34 r6 = [p0 + -4]; > > 35 r5 *= r6; > > 36 p1 = r2; > > 37 r6 = [p1 + 8]; > > 38 p1 = r2; > > > > p1 and r2 are in different register classes. > > A p* register can be used for load/stroe values from memory while a r* > register can not. > > > > As we can see, line 31, 36, 38 can be deleted. How can I configure llc > to do this? Or do I have to write a custom pass to do this optimization? > Any suggestion is welcome. > > > > Thanks, > > > > Huang > > Hello Huang, > > SIlly as this may sound, did you run OPT on the bitcode first before using > LLC? > > Cheers, > > Sam-------------- next part -------------- An HTML attachment was scrubbed... URL: <lists.llvm.org/pipermail/llvm-dev/attachments/20150522/71f36cf0/attachment.html>