The scoreboard hazard detector that I've added for the PPC 440 is not
detecting hazards as it should (which could certainly be my fault
somehow, but...). For example, it will produce a schedule that looks
like this:
SU(28): 0x127969b0: f64,ch = LFD 0x12793aa0, 0x1277b4f0,
0x127965b0<Mem:LD8[%scevgep100](tbaa=!"double")> [ORD=41]
[ID=28]
SU(46): 0x12796ab0: f64 = FADD 0x127969b0, 0x127968b0 [ORD=42] [ID=46]
SU(27): 0x12796cb0: ch = STFD 0x12796ab0, 0x12793aa0, 0x1277b3f0,
0x127969b0:1<Mem:ST8[%scevgep103](tbaa=!"double")> [ORD=46]
[ID=27]
SU(26): 0x127970b0: f64,ch = LFD 0x127941a0, 0x1277b4f0,
0x12796cb0<Mem:LD8[%scevgep94](align=16)(tbaa=!"double")>
[ORD=50]
[ID=26]
SU(47): 0x127972c0: f64 = FADD 0x127970b0, 0x127968b0 [ORD=51] [ID=47]
SU(25): 0x127974c0: ch = STFD 0x127972c0, 0x127941a0, 0x1277b3f0,
0x127970b0:1<Mem:ST8[%scevgep97](align=16)(tbaa=!"double")>
[ORD=55]
[ID=25]
In other words, it produces a set of load, add, store triples,
non-interleaved, in order. The problem is that the result of the load is
not immediately available, and neither is the result of the add. The
loads are covered by the itinerary:
InstrItinData<LdStLFD , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
InstrStage<1, [LRACC]>,
InstrStage<1, [AGEN]>,
InstrStage<1, [CRD]>,
InstrStage<2, [LWB]>],
[9, 5, 5],
[NoBypass, GPR_Bypass, GPR_Bypass]>,
the add is covered by the itinerary:
InstrItinData<FPGeneral , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
InstrStage<1, [FRACC]>,
InstrStage<1, [FEXE1]>,
InstrStage<1, [FEXE2]>,
InstrStage<1, [FEXE3]>,
InstrStage<1, [FEXE4]>,
InstrStage<1, [FEXE5]>,
InstrStage<1, [FEXE6]>,
InstrStage<1, [FWB]>],
[10, 4, 4],
[FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
the store is covered by:
InstrItinData<LdStUX , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
InstrStage<1, [LRACC]>,
InstrStage<1, [AGEN]>,
InstrStage<1, [CRD]>,
InstrStage<1, [LWB]>],
[8, 5, 5],
[NoBypass, GPR_Bypass, GPR_Bypass]>,
So, say that the load dispatches in cycle 1. According to the itinerary,
the result of the load is not available until cycle 9. The add
dispatches in the same cycle, or one cycle later. In the best case, it
dispatches one cycle later (in cycle 2). It expects to read its inputs 4
cycles later, in cycle 6. The input, however, will not be
available until cycle 9, yielding a 3-cycle stall. As far as I can tell
by looking at the debug output, no hazard was reported by the scoreboard
detector. Is this a bug, or am I doing something wrong?
I've attached a small test case; run it with llc -mcpu=440.
Thanks again,
Hal
--
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory
-------------- next part --------------
; ModuleID = 'tsc_s000.c'
; Module header: target description for 32-bit PowerPC Linux. The leading
; "E" in the datalayout string selects big-endian, and "p:32:32:32" gives
; 32-bit pointers.
target datalayout =
"E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
target triple = "powerpc-unknown-linux-gnu"
; Name string that @s000 passes to @init.
@.str = private unnamed_addr constant [6 x i8] c"s000 \00", align 1
; Arrays used by the kernel: @s000 reads @Y and writes @X in its inner loop.
; @Z, @U, @V, @aa, @bb and @cc are only passed to @dummy.
@Y = common global [16000 x double] zeroinitializer, align 16
@X = common global [16000 x double] zeroinitializer, align 16
@Z = common global [16000 x double] zeroinitializer, align 16
@U = common global [16000 x double] zeroinitializer, align 16
@V = common global [16000 x double] zeroinitializer, align 16
@aa = common global [256 x [256 x double]] zeroinitializer, align 16
@bb = common global [256 x [256 x double]] zeroinitializer, align 16
@cc = common global [256 x [256 x double]] zeroinitializer, align 16
; printf format string that @s000 uses to report the elapsed time.
@.str1 = private unnamed_addr constant [14 x i8] c"S000\09 %.2f
\09\09\00", align 1
; The globals from @array through @yy are not referenced by either function
; defined in this module.
@array = common global [65536 x double] zeroinitializer, align 16
@x = common global [16000 x double] zeroinitializer, align 16
@temp = common global double 0.000000e+00, align 8
@temp_int = common global i32 0, align 4
@a = common global [16000 x double] zeroinitializer, align 16
@b = common global [16000 x double] zeroinitializer, align 16
@c = common global [16000 x double] zeroinitializer, align 16
@d = common global [16000 x double] zeroinitializer, align 16
@e = common global [16000 x double] zeroinitializer, align 16
@tt = common global [256 x [256 x double]] zeroinitializer, align 16
@indx = common global [16000 x i32] zeroinitializer, align 16
@xx = common global double* null, align 4
@yy = common global double* null, align 4
; Table-header string that @main passes to @puts.
@str = internal unnamed_addr constant [29 x i8] c"Loop \09 Time(Sec) \09
Checksum \00"
; s000: timed kernel computing X[i] = Y[i] + 1.0 over 16000 doubles.
;
; Flow:
;   1. Calls @init with the "s000 " name string and records @clock().
;   2. Runs the outer loop (%nl.014) 400000 times.
;   3. Each outer iteration runs the inner loop (%i.013) from 0 to 16000 in
;      steps of 16. Its body is unrolled 16 times into load / fadd / store
;      triples, then @dummy is called with the global arrays.
;   4. After the outer loop, calls @clock() again, prints the tick
;      difference divided by 1e6 via @printf, calls @check(1), returns 0.
;
; This is the block whose load/add/store triples show up, non-interleaved,
; in the schedule discussed in the message above.
define i32 @s000() nounwind {
entry:
%call = tail call i32 bitcast (i32 (...)* @init to i32 (i8*)*)(i8*
getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0)) nounwind
; Start timestamp.
%call1 = tail call i32 @clock() nounwind
br label %for.cond2.preheader
for.cond2.preheader: ; preds = %for.end, %entry
; Outer repetition counter, 0 .. 399999.
%nl.014 = phi i32 [ 0, %entry ], [ %inc8, %for.end ]
br label %for.body4
for.body4: ; preds = %for.body4,
%for.cond2.preheader
; Inner index. It advances by 16 per iteration, so the `or i32 %i.013, k`
; instructions below (k = 1..15) produce the same value as i + k.
%i.013 = phi i32 [ 0, %for.cond2.preheader ], [ %inc.15, %for.body4 ]
; Unrolled element 0: X[i] = Y[i] + 1.0. Even-offset elements use align 16,
; odd-offset elements use align 8.
%arrayidx = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %i.013
%0 = load double* %arrayidx, align 16, !tbaa !0
%add = fadd double %0, 1.000000e+00
%arrayidx5 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %i.013
store double %add, double* %arrayidx5, align 16, !tbaa !0
; Unrolled elements 1..15 follow the same load / fadd / store pattern.
%inc15 = or i32 %i.013, 1
%arrayidx.1 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc15
%1 = load double* %arrayidx.1, align 8, !tbaa !0
%add.1 = fadd double %1, 1.000000e+00
%arrayidx5.1 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc15
store double %add.1, double* %arrayidx5.1, align 8, !tbaa !0
%inc.116 = or i32 %i.013, 2
%arrayidx.2 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.116
%2 = load double* %arrayidx.2, align 16, !tbaa !0
%add.2 = fadd double %2, 1.000000e+00
%arrayidx5.2 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.116
store double %add.2, double* %arrayidx5.2, align 16, !tbaa !0
%inc.217 = or i32 %i.013, 3
%arrayidx.3 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.217
%3 = load double* %arrayidx.3, align 8, !tbaa !0
%add.3 = fadd double %3, 1.000000e+00
%arrayidx5.3 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.217
store double %add.3, double* %arrayidx5.3, align 8, !tbaa !0
%inc.318 = or i32 %i.013, 4
%arrayidx.4 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.318
%4 = load double* %arrayidx.4, align 16, !tbaa !0
%add.4 = fadd double %4, 1.000000e+00
%arrayidx5.4 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.318
store double %add.4, double* %arrayidx5.4, align 16, !tbaa !0
%inc.419 = or i32 %i.013, 5
%arrayidx.5 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.419
%5 = load double* %arrayidx.5, align 8, !tbaa !0
%add.5 = fadd double %5, 1.000000e+00
%arrayidx5.5 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.419
store double %add.5, double* %arrayidx5.5, align 8, !tbaa !0
%inc.520 = or i32 %i.013, 6
%arrayidx.6 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.520
%6 = load double* %arrayidx.6, align 16, !tbaa !0
%add.6 = fadd double %6, 1.000000e+00
%arrayidx5.6 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.520
store double %add.6, double* %arrayidx5.6, align 16, !tbaa !0
%inc.621 = or i32 %i.013, 7
%arrayidx.7 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.621
%7 = load double* %arrayidx.7, align 8, !tbaa !0
%add.7 = fadd double %7, 1.000000e+00
%arrayidx5.7 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.621
store double %add.7, double* %arrayidx5.7, align 8, !tbaa !0
%inc.722 = or i32 %i.013, 8
%arrayidx.8 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.722
%8 = load double* %arrayidx.8, align 16, !tbaa !0
%add.8 = fadd double %8, 1.000000e+00
%arrayidx5.8 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.722
store double %add.8, double* %arrayidx5.8, align 16, !tbaa !0
%inc.823 = or i32 %i.013, 9
%arrayidx.9 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.823
%9 = load double* %arrayidx.9, align 8, !tbaa !0
%add.9 = fadd double %9, 1.000000e+00
%arrayidx5.9 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.823
store double %add.9, double* %arrayidx5.9, align 8, !tbaa !0
%inc.924 = or i32 %i.013, 10
%arrayidx.10 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32
%inc.924
%10 = load double* %arrayidx.10, align 16, !tbaa !0
%add.10 = fadd double %10, 1.000000e+00
%arrayidx5.10 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.924
store double %add.10, double* %arrayidx5.10, align 16, !tbaa !0
%inc.1025 = or i32 %i.013, 11
%arrayidx.11 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32
%inc.1025
%11 = load double* %arrayidx.11, align 8, !tbaa !0
%add.11 = fadd double %11, 1.000000e+00
%arrayidx5.11 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.1025
store double %add.11, double* %arrayidx5.11, align 8, !tbaa !0
%inc.1126 = or i32 %i.013, 12
%arrayidx.12 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32
%inc.1126
%12 = load double* %arrayidx.12, align 16, !tbaa !0
%add.12 = fadd double %12, 1.000000e+00
%arrayidx5.12 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.1126
store double %add.12, double* %arrayidx5.12, align 16, !tbaa !0
%inc.1227 = or i32 %i.013, 13
%arrayidx.13 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32
%inc.1227
%13 = load double* %arrayidx.13, align 8, !tbaa !0
%add.13 = fadd double %13, 1.000000e+00
%arrayidx5.13 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.1227
store double %add.13, double* %arrayidx5.13, align 8, !tbaa !0
%inc.1328 = or i32 %i.013, 14
%arrayidx.14 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32
%inc.1328
%14 = load double* %arrayidx.14, align 16, !tbaa !0
%add.14 = fadd double %14, 1.000000e+00
%arrayidx5.14 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.1328
store double %add.14, double* %arrayidx5.14, align 16, !tbaa !0
%inc.1429 = or i32 %i.013, 15
%arrayidx.15 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32
%inc.1429
%15 = load double* %arrayidx.15, align 8, !tbaa !0
%add.15 = fadd double %15, 1.000000e+00
%arrayidx5.15 = getelementptr inbounds [16000 x double]* @X, i32 0, i32
%inc.1429
store double %add.15, double* %arrayidx5.15, align 8, !tbaa !0
; Advance the inner index by the unroll factor and loop until 16000.
%inc.15 = add nsw i32 %i.013, 16
%exitcond.15 = icmp eq i32 %inc.15, 16000
br i1 %exitcond.15, label %for.end, label %for.body4
for.end: ; preds = %for.body4
; Pass the arrays to the external @dummy once per outer iteration.
; NOTE(review): presumably this stops the optimizer from eliminating the
; repeated inner loop. @dummy is not defined here, so confirm.
%call6 = tail call i32 @dummy(double* getelementptr inbounds ([16000 x
double]* @X, i32 0, i32 0), double* getelementptr inbounds ([16000 x double]*
@Y, i32 0, i32 0), double* getelementptr inbounds ([16000 x double]* @Z, i32 0,
i32 0), double* getelementptr inbounds ([16000 x double]* @U, i32 0, i32 0),
double* getelementptr inbounds ([16000 x double]* @V, i32 0, i32 0), [256 x
double]* getelementptr inbounds ([256 x [256 x double]]* @aa, i32 0, i32 0),
[256 x double]* getelementptr inbounds ([256 x [256 x double]]* @bb, i32 0, i32
0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @cc, i32 0,
i32 0), double 0.000000e+00) nounwind
%inc8 = add nsw i32 %nl.014, 1
%exitcond = icmp eq i32 %inc8, 400000
br i1 %exitcond, label %for.end9, label %for.cond2.preheader
for.end9: ; preds = %for.end
; End timestamp. The difference from %call1 is converted to double and
; divided by 1e6 before being printed with the @.str1 format.
%call10 = tail call i32 @clock() nounwind
%sub = sub nsw i32 %call10, %call1
%conv = sitofp i32 %sub to double
%div = fdiv double %conv, 1.000000e+06
%call11 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x
i8]* @.str1, i32 0, i32 0), double %div) nounwind
%call12 = tail call i32 bitcast (i32 (...)* @check to i32 (i32)*)(i32 1)
nounwind
ret i32 0
}
; External functions called by @s000. None of them is defined in this
; module. @init and @check are declared variadic and are called through
; bitcasts to (i8*) and (i32) signatures respectively.
declare i32 @init(...)
declare i32 @clock() nounwind
declare i32 @dummy(double*, double*, double*, double*, double*, [256 x double]*,
[256 x double]*, [256 x double]*, double)
declare i32 @printf(i8* nocapture, ...) nounwind
declare i32 @check(...)
; main: prints the table header string @str via @puts, runs the @s000
; kernel once (its return value is unused), and returns 0.
define i32 @main() nounwind {
entry:
%puts = tail call i32 @puts(i8* getelementptr inbounds ([29 x i8]* @str, i32
0, i32 0))
%call1 = tail call i32 @s000()
ret i32 0
}
; External @puts, called by @main to print the header line.
declare i32 @puts(i8* nocapture) nounwind
; TBAA type nodes: !0 ("double") is the tag attached to every load and store
; in @s000. Its parent is !1 ("omnipotent char"), whose parent is the
; root node !2.
!0 = metadata !{metadata !"double", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA", null}