Aaryaman Vasishta
2017-Jun-07 16:58 UTC
[Nouveau] [PATCH v2] nv110/exa: update sched codes
On Tue, Jun 6, 2017 at 7:15 AM, Samuel Pitoiset <samuel.pitoiset at gmail.com> wrote:> Nice work! > > See my comments below, and double-check if some of them can be applied to > the shaders I didn't review yet. > > I recommend you to test your work because if one sched code is wrong, you > are likely going to kill your card and reboot your box. :-) > > > On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote: > >> v2: Add missing delays >> >> This patch adds proper delays to maxwell exa shaders. rendercheck tests >> seem consistent with/without this patch. I haven't extensively tested >> them though. >> >> Trello: >> https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-wit >> h-proper-delays >> >> Signed-off-by: Aaryaman Vasishta <jem456.vasishta at gmail.com> >> --- >> src/shader/exac8nv110.fp | 10 +++++----- >> src/shader/exac8nv110.fpc | 18 +++++++++--------- >> src/shader/exacanv110.fp | 10 +++++----- >> src/shader/exacanv110.fpc | 18 +++++++++--------- >> src/shader/exacmnv110.fp | 10 +++++----- >> src/shader/exacmnv110.fpc | 18 +++++++++--------- >> src/shader/exas8nv110.fp | 6 +++--- >> src/shader/exas8nv110.fpc | 12 ++++++------ >> src/shader/exasanv110.fp | 10 +++++----- >> src/shader/exasanv110.fpc | 18 +++++++++--------- >> src/shader/exascnv110.fp | 6 +++--- >> src/shader/exascnv110.fpc | 10 +++++----- >> src/shader/videonv110.fp | 14 +++++++------- >> src/shader/videonv110.fpc | 26 +++++++++++++------------- >> 14 files changed, 93 insertions(+), 93 deletions(-) >> >> diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp >> index ce78036..1c4a4f1 100644 >> --- a/src/shader/exac8nv110.fp >> +++ b/src/shader/exac8nv110.fp >> @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 
0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr 0x1 wt >> 0x2) >> ipa $r2 a[0x90] $r0 0x0 0x1 >> tex nodep $r1 $r2 0x0 0x1 t2d 0x8 >> ipa $r3 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r2 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r2 0x0 0x0 t2d 0x8 >> > > Out of curiosity, what didn't you add a read-dep-bar on $r2:$r3 here?Missed it, thanks for pointing it out.> > > depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1) >> fmul ftz $r3 $r0 $r1 >> mov $r2 $r3 0xf >> > > You can stall for only one cycle here, but the 6 cycles on fmul is needed. > > mov $r1 $r3 0xf >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6) (st 0xf) (st 0x0) >> mov $r0 $r3 0xf >> > > Same here.> > exit >> #endif >> diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc >> index 4aa1368..46943b7 100644 >> --- a/src/shader/exac8nv110.fpc >> +++ b/src/shader/exac8nv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0x21e0072f, >> +0x005cbc03, >> 0x0007ff02, >> 0xe043ff89, >> 0x2ff70201, >> 0xc03a0014, >> 0x4007ff03, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0074f, >> +0x001fbc06, >> 0x0007ff02, >> 0xe043ff88, >> 0x2ff70200, >> 0xc03a0004, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfcc01fe6, >> +0x001f8400, >> 0x00170003, >> 0x5c681000, >> 0x00370002, >> 0x5c980780, >> 0x00370001, >> 0x5c980780, >> -0xfc0007e0, >> +0xfde007e6, >> 0x001f8000, >> 0x00370000, >> 0x5c980780, >> diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp >> index a70d5c5..d7c2867 100644 >> --- a/src/shader/exacanv110.fp >> +++ b/src/shader/exacanv110.fp >> @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 
0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2) >> ipa $r2 a[0x90] $r0 0x0 0x1 >> tex nodep $r4 $r2 0x0 0x1 t2d 0xf >> > > Please add a read-dep-bar and wait for it on the first fmul because $r2:$r3 > are re-used before $r4. Should be safer. > > ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) >> fmul ftz $r3 $r3 $r7 >> > > Why are you waiting on all barriers? Only $r3 is needed here. After adding a read-dep-bar and waiting on that over here, I wasn't able to pass the same number of `rendercheck -f a8r8g8b8` tests as before this patch. After a little trial and error I discovered that wt 0xc fixes it, which means that bars 3 and 4 were being used in this fmul somehow (assuming bars start from 1), which is odd because this fmul only uses $r3 and $r7, and I think it should wait on the read-dep-bar set on "tex nodep $r4 $r2 0x0 0x1 t2d 0xf" (I could be wrong though).
I'm kinda stumped on what's going on within this fmul that's causing this behavior.> > > fmul ftz $r2 $r2 $r6 >> fmul ftz $r1 $r1 $r5 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0) >> fmul ftz $r0 $r0 $r4 >> exit >> #endif >> diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc >> index 7c0ca5e..9cad139 100644 >> --- a/src/shader/exacanv110.fpc >> +++ b/src/shader/exacanv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x0008bc03, >> 0x0007ff02, >> 0xe043ff89, >> 0xaff70204, >> 0xc03a0017, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0274f, >> +0x001fbc06, >> 0x0007ff00, >> 0xe043ff88, >> 0xaff70000, >> 0xc03a0007, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc21ffe1, >> +0x001f8400, >> 0x00770303, >> 0x5c681000, >> 0x00670202, >> 0x5c681000, >> 0x00570101, >> 0x5c681000, >> -0xfc0007e0, >> +0xfde01fe1, >> 0x001f8000, >> 0x00470000, >> 0x5c681000, >> diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp >> index fe5c294..d717138 100644 >> --- a/src/shader/exacmnv110.fp >> +++ b/src/shader/exacmnv110.fp >> @@ -25,23 +25,23 @@ NV110FP_Composite[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2) >> ipa $r2 a[0x90] $r0 0x0 0x1 >> tex nodep $r4 $r2 0x0 0x1 t2d 0x8 >> ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 
0x0) (st 0x0) >> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) >> fmul ftz $r3 $r3 $r4 >> fmul ftz $r2 $r2 $r4 >> fmul ftz $r1 $r1 $r4 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0) >> fmul ftz $r0 $r0 $r4 >> exit >> #endif >> diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc >> index 9d62c1a..c150875 100644 >> --- a/src/shader/exacmnv110.fpc >> +++ b/src/shader/exacmnv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x0008bc03, >> 0x0007ff02, >> 0xe043ff89, >> 0x2ff70204, >> 0xc03a0014, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0274f, >> +0x001fbc06, >> 0x0007ff00, >> 0xe043ff88, >> 0xaff70000, >> 0xc03a0007, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc21ffe1, >> +0x001f8400, >> 0x00470303, >> 0x5c681000, >> 0x00470202, >> 0x5c681000, >> 0x00470101, >> 0x5c681000, >> -0xfc0007e0, >> +0xfde017e6, >> 0x001f8000, >> 0x00470000, >> 0x5c681000, >> diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp >> index 4fe2e19..a555beb 100644 >> --- a/src/shader/exas8nv110.fp >> +++ b/src/shader/exas8nv110.fp >> @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0x8 >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1) >> mov $r3 $r0 0xf >> mov $r2 $r0 0xf >> mov $r1 $r0 0xf >> > > This one looks good! 
> > > diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc >> index 1181c41..e58d168 100644 >> --- a/src/shader/exas8nv110.fpc >> +++ b/src/shader/exas8nv110.fpc >> @@ -1,21 +1,21 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x001fbc03, >> 0x0007ff00, >> 0xe043ff88, >> 0x2ff70000, >> 0xc03a0004, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc200fe1, >> +0x001f8400, >> 0x00070003, >> 0x5c980780, >> 0x00070002, >> diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp >> index 61374a6..ad7ca36 100644 >> --- a/src/shader/exasanv110.fp >> +++ b/src/shader/exasanv110.fp >> @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2) >> ipa $r2 a[0x80] $r0 0x0 0x1 >> tex nodep $r4 $r2 0x0 0x0 t2d 0x8 >> ipa $r1 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r0 a[0x90] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x1 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) >> fmul ftz $r3 $r3 $r4 >> fmul ftz $r2 $r2 $r4 >> fmul ftz $r1 $r1 $r4 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0) >> fmul ftz $r0 $r0 $r4 >> exit >> #endif >> diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc >> index 5516a03..1485f11 100644 >> --- a/src/shader/exasanv110.fpc >> +++ b/src/shader/exasanv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 
0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x0008bc03, >> 0x0007ff02, >> 0xe043ff88, >> 0x2ff70204, >> 0xc03a0004, >> 0x4007ff01, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0274f, >> +0x001fbc06, >> 0x0007ff00, >> 0xe043ff89, >> 0xaff70000, >> 0xc03a0017, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc21ffe1, >> +0x001f8400, >> 0x00470303, >> 0x5c681000, >> 0x00470202, >> 0x5c681000, >> 0x00470101, >> 0x5c681000, >> -0xfc0007e0, >> +0xfde017e1, >> 0x001f8000, >> 0x00470000, >> 0x5c681000, >> diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp >> index 90bbb55..86e14e8 100644 >> --- a/src/shader/exascnv110.fp >> +++ b/src/shader/exascnv110.fp >> @@ -25,14 +25,14 @@ NV110FP_Source[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf) (st 0x0) (st 0x0) >> > > Looks good. 
> > > exit >> #endif >> diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc >> index 2dba15d..1fef5d2 100644 >> --- a/src/shader/exascnv110.fpc >> +++ b/src/shader/exascnv110.fpc >> @@ -1,20 +1,20 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xfde0072f, >> +0x001fbc03, >> 0x0007ff00, >> 0xe043ff88, >> 0xaff70000, >> 0xc03a0007, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> +0xfc0007ef, >> 0x001f8000, >> 0x0007000f, >> 0xe3000000, >> diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp >> index 2728311..dd3816c 100644 >> --- a/src/shader/videonv110.fp >> +++ b/src/shader/videonv110.fp >> @@ -25,30 +25,30 @@ NV110FP_NV12[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) >> ipa pass $r2 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r2 $r2 >> ipa $r0 a[0x80] $r2 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1) >> ipa $r1 a[0x84] $r2 0x0 0x1 >> tex nodep $r4 $r0 0x0 0x0 t2d 0x8 >> tex nodep $r0 $r0 0x0 0x1 t2d 0xc >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6) >> depbar le 0x5 0x1 0x1 >> fmul ftz $r5 $r4 c0[0x0] >> fadd ftz $r3 $r5 c0[0x4] >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6) (st 0x6) (st 0xf) >> fadd ftz $r4 $r5 c0[0x8] >> fadd ftz $r5 $r5 c0[0xc] >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1) >> ffma ftz $r3 $r0 c0[0x10] $r3 >> ffma ftz $r4 $r0 c0[0x14] $r4 >> ffma ftz $r5 $r0 c0[0x18] $r5 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1) (st 0x1) (st 0x6) >> ffma ftz $r0 $r1 c0[0x1c] $r3 >> ffma ftz $r2 $r1 c0[0x24] $r5 >> ffma ftz $r1 $r1 c0[0x20] $r4 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf) (st 0x0) (st 0x0) >> exit >> #endif >> diff 
--git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc >> index 31d745a..8fbc246 100644 >> --- a/src/shader/videonv110.fpc >> +++ b/src/shader/videonv110.fpc >> @@ -1,52 +1,52 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff02, >> 0xe003ff87, >> 0x00470202, >> 0x50800000, >> 0x0027ff00, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x001cbc03, >> 0x4027ff01, >> 0xe043ff88, >> 0x2ff70004, >> 0xc03a0004, >> 0x2ff70000, >> 0xc03a0016, >> -0xfc0007e0, >> -0x001f8000, >> +0xfcc007ef, >> +0x001f9801, >> 0x34170001, >> 0xf0f00000, >> 0x00070405, >> 0x4c681000, >> 0x00170503, >> 0x4c581000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfcc007e6, >> +0x001fbc00, >> 0x00270504, >> 0x4c581000, >> 0x00370505, >> 0x4c581000, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc2017e6, >> +0x001f8400, >> 0x00470003, >> 0x49a00180, >> 0x00570004, >> 0x49a00200, >> 0x00670005, >> 0x49a00280, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc2007e1, >> +0x001f9800, >> 0x00770100, >> 0x49a00180, >> 0x00970102, >> 0x49a00280, >> 0x00870101, >> 0x49a00200, >> -0xfc0007e0, >> +0xfc0007ef, >> 0x001f8000, >> 0x0007000f, >> 0xe3000000, >> >As for your other comments, I have made the suggested changes. Thanks for your review! Cheers, Aaryaman -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.freedesktop.org/archives/nouveau/attachments/20170608/3a5ecc1a/attachment-0001.html>
On 06/07/2017 06:58 PM, Aaryaman Vasishta wrote:> > > On Tue, Jun 6, 2017 at 7:15 AM, Samuel Pitoiset > <samuel.pitoiset at gmail.com <mailto:samuel.pitoiset at gmail.com>> wrote: > > Nice work! > > See my comments below, and double-check if some of them can be > applied to the shaders I didn't review yet. > > I recommend you to test your work because if one sched code is > wrong, you are likely going to kill your card and reboot your box. :-) > > > On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote: > > v2: Add missing delays > > This patch adds proper delays to maxwell exa shaders. > rendercheck tests > seem consistent with/without this patch. I haven't extensively > tested > them though. > > Trello: > https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays > <https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays> > > Signed-off-by: Aaryaman Vasishta <jem456.vasishta at gmail.com > <mailto:jem456.vasishta at gmail.com>> > --- > src/shader/exac8nv110.fp | 10 +++++----- > src/shader/exac8nv110.fpc | 18 +++++++++--------- > src/shader/exacanv110.fp | 10 +++++----- > src/shader/exacanv110.fpc | 18 +++++++++--------- > src/shader/exacmnv110.fp | 10 +++++----- > src/shader/exacmnv110.fpc | 18 +++++++++--------- > src/shader/exas8nv110.fp | 6 +++--- > src/shader/exas8nv110.fpc | 12 ++++++------ > src/shader/exasanv110.fp | 10 +++++----- > src/shader/exasanv110.fpc | 18 +++++++++--------- > src/shader/exascnv110.fp | 6 +++--- > src/shader/exascnv110.fpc | 10 +++++----- > src/shader/videonv110.fp | 14 +++++++------- > src/shader/videonv110.fpc | 26 +++++++++++++------------- > 14 files changed, 93 insertions(+), 93 deletions(-) > > diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp > index ce78036..1c4a4f1 100644 > --- a/src/shader/exac8nv110.fp > +++ b/src/shader/exac8nv110.fp > @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = { > }; > #else > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 
0x1) (st 0xf wr 0x0 wt 0x1) > ipa pass $r0 a[0x7c] 0x0 0x0 0x1 > mufu rcp $r0 $r0 > ipa $r3 a[0x94] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr > 0x1 wt 0x2) > ipa $r2 a[0x90] $r0 0x0 0x1 > tex nodep $r1 $r2 0x0 0x1 t2d 0x8 > ipa $r3 a[0x84] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf) > ipa $r2 a[0x80] $r0 0x0 0x1 > tex nodep $r0 $r2 0x0 0x0 t2d 0x8 > > > Out of curiosity, why didn't you add a read-dep-bar on $r2:$r3 here? > > Missed it, thanks for pointing it out. You don't have to. 'tex' reads two sources ($r2:$r3) and writes into $r0, but as $r2:$r3 are NOT re-used before $r0 is read, you can assume that $r0 will be ready and don't need any read-dep-bar. > > > > depbar le 0x5 0x0 0x0 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1) > fmul ftz $r3 $r0 $r1 > mov $r2 $r3 0xf > > > You can stall for only one cycle here, but the 6 cycles on fmul are > needed. > > mov $r1 $r3 0xf > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x6) (st 0xf) (st 0x0) > mov $r0 $r3 0xf > > > Same here.
> > > > exit > #endif > diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc > index 4aa1368..46943b7 100644 > --- a/src/shader/exac8nv110.fpc > +++ b/src/shader/exac8nv110.fpc > @@ -1,36 +1,36 @@ > -0xfc0007e0, > -0x001f8000, > +0xe1a0070f, > +0x003c3c01, > 0xcff7ff00, > 0xe003ff87, > 0x00470000, > 0x50800000, > 0x4007ff03, > 0xe043ff89, > -0xfc0007e0, > -0x001f8000, > +0x21e0072f, > +0x005cbc03, > 0x0007ff02, > 0xe043ff89, > 0x2ff70201, > 0xc03a0014, > 0x4007ff03, > 0xe043ff88, > -0xfc0007e0, > -0x001f8000, > +0xe5e0074f, > +0x001fbc06, > 0x0007ff02, > 0xe043ff88, > 0x2ff70200, > 0xc03a0004, > 0x34070000, > 0xf0f00000, > -0xfc0007e0, > -0x001f8000, > +0xfcc01fe6, > +0x001f8400, > 0x00170003, > 0x5c681000, > 0x00370002, > 0x5c980780, > 0x00370001, > 0x5c980780, > -0xfc0007e0, > +0xfde007e6, > 0x001f8000, > 0x00370000, > 0x5c980780, > diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp > index a70d5c5..d7c2867 100644 > --- a/src/shader/exacanv110.fp > +++ b/src/shader/exacanv110.fp > @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = { > }; > #else > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) > ipa pass $r0 a[0x7c] 0x0 0x0 0x1 > mufu rcp $r0 $r0 > ipa $r3 a[0x94] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2) > ipa $r2 a[0x90] $r0 0x0 0x1 > tex nodep $r4 $r2 0x0 0x1 t2d 0xf > > > Please add a read-dep-bar and wait for on the first fmul because > $r2:$r3 are re-used before $r4. Should be safer. > > > > ipa $r1 a[0x84] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) > ipa $r0 a[0x80] $r0 0x0 0x1 > tex nodep $r0 $r0 0x0 0x0 t2d 0xf > depbar le 0x5 0x0 0x0 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) > fmul ftz $r3 $r3 $r7 > > > Why are you waiting all barriers? Only $r3 is needed here. 
> > After adding a read-dep-bar and waiting on that over here, I wasn't able > to pass the same number of `rendercheck -f a8r8g8b8` tests as before > this patch. After a little trial and error I discovered that wt 0xc > fixes it, which means that bar 3 and 4 were being used in this fmul > somehow (assuming bars start from 1), which is odd because this fmul > only uses $r3 and $r7, and I think it should wait on the read-dep-bar > set on "tex nodep $r4 $r2 0x0 0x1 t2d 0xf" (I could be wrong though). > I'm kinda stumped on what's going on within this fmul that's causing > this behavior.Because you are missing a read-dep-bar on the first 'tex' in this shader. Presumably, if you add one, you no longer need to wait for all bars. Samuel.> > > > fmul ftz $r2 $r2 $r6 > fmul ftz $r1 $r1 $r5 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0) > fmul ftz $r0 $r0 $r4 > exit > #endif > diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc > index 7c0ca5e..9cad139 100644 > --- a/src/shader/exacanv110.fpc > +++ b/src/shader/exacanv110.fpc > @@ -1,36 +1,36 @@ > -0xfc0007e0, > -0x001f8000, > +0xe1a0070f, > +0x003c3c01, > 0xcff7ff00, > 0xe003ff87, > 0x00470000, > 0x50800000, > 0x4007ff03, > 0xe043ff89, > -0xfc0007e0, > -0x001f8000, > +0xe1e0072f, > +0x0008bc03, > 0x0007ff02, > 0xe043ff89, > 0xaff70204, > 0xc03a0017, > 0x4007ff01, > 0xe043ff88, > -0xfc0007e0, > -0x001f8000, > +0xe5e0274f, > +0x001fbc06, > 0x0007ff00, > 0xe043ff88, > 0xaff70000, > 0xc03a0007, > 0x34070000, > 0xf0f00000, > -0xfc0007e0, > -0x001f8000, > +0xfc21ffe1, > +0x001f8400, > 0x00770303, > 0x5c681000, > 0x00670202, > 0x5c681000, > 0x00570101, > 0x5c681000, > -0xfc0007e0, > +0xfde01fe1, > 0x001f8000, > 0x00470000, > 0x5c681000, > diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp > index fe5c294..d717138 100644 > --- a/src/shader/exacmnv110.fp > +++ b/src/shader/exacmnv110.fp > @@ -25,23 +25,23 @@ NV110FP_Composite[] = { > }; > #else > -sched (st 0x0) (st 0x0) 
(st 0x0) > +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) > ipa pass $r0 a[0x7c] 0x0 0x0 0x1 > mufu rcp $r0 $r0 > ipa $r3 a[0x94] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2) > ipa $r2 a[0x90] $r0 0x0 0x1 > tex nodep $r4 $r2 0x0 0x1 t2d 0x8 > ipa $r1 a[0x84] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) > ipa $r0 a[0x80] $r0 0x0 0x1 > tex nodep $r0 $r0 0x0 0x0 t2d 0xf > depbar le 0x5 0x0 0x0 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) > fmul ftz $r3 $r3 $r4 > fmul ftz $r2 $r2 $r4 > fmul ftz $r1 $r1 $r4 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0) > fmul ftz $r0 $r0 $r4 > exit > #endif > diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc > index 9d62c1a..c150875 100644 > --- a/src/shader/exacmnv110.fpc > +++ b/src/shader/exacmnv110.fpc > @@ -1,36 +1,36 @@ > -0xfc0007e0, > -0x001f8000, > +0xe1a0070f, > +0x003c3c01, > 0xcff7ff00, > 0xe003ff87, > 0x00470000, > 0x50800000, > 0x4007ff03, > 0xe043ff89, > -0xfc0007e0, > -0x001f8000, > +0xe1e0072f, > +0x0008bc03, > 0x0007ff02, > 0xe043ff89, > 0x2ff70204, > 0xc03a0014, > 0x4007ff01, > 0xe043ff88, > -0xfc0007e0, > -0x001f8000, > +0xe5e0274f, > +0x001fbc06, > 0x0007ff00, > 0xe043ff88, > 0xaff70000, > 0xc03a0007, > 0x34070000, > 0xf0f00000, > -0xfc0007e0, > -0x001f8000, > +0xfc21ffe1, > +0x001f8400, > 0x00470303, > 0x5c681000, > 0x00470202, > 0x5c681000, > 0x00470101, > 0x5c681000, > -0xfc0007e0, > +0xfde017e6, > 0x001f8000, > 0x00470000, > 0x5c681000, > diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp > index 4fe2e19..a555beb 100644 > --- a/src/shader/exas8nv110.fp > +++ b/src/shader/exas8nv110.fp > @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = { > }; > #else > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) > ipa 
pass $r0 a[0x7c] 0x0 0x0 0x1 > mufu rcp $r0 $r0 > ipa $r1 a[0x84] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf) > ipa $r0 a[0x80] $r0 0x0 0x1 > tex nodep $r0 $r0 0x0 0x0 t2d 0x8 > depbar le 0x5 0x0 0x0 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1) > mov $r3 $r0 0xf > mov $r2 $r0 0xf > mov $r1 $r0 0xf > > > This one looks good! > > > diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc > index 1181c41..e58d168 100644 > --- a/src/shader/exas8nv110.fpc > +++ b/src/shader/exas8nv110.fpc > @@ -1,21 +1,21 @@ > -0xfc0007e0, > -0x001f8000, > +0xe1a0070f, > +0x003c3c01, > 0xcff7ff00, > 0xe003ff87, > 0x00470000, > 0x50800000, > 0x4007ff01, > 0xe043ff88, > -0xfc0007e0, > -0x001f8000, > +0xe1e0072f, > +0x001fbc03, > 0x0007ff00, > 0xe043ff88, > 0x2ff70000, > 0xc03a0004, > 0x34070000, > 0xf0f00000, > -0xfc0007e0, > -0x001f8000, > +0xfc200fe1, > +0x001f8400, > 0x00070003, > 0x5c980780, > 0x00070002, > diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp > index 61374a6..ad7ca36 100644 > --- a/src/shader/exasanv110.fp > +++ b/src/shader/exasanv110.fp > @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = { > }; > #else > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) > ipa pass $r0 a[0x7c] 0x0 0x0 0x1 > mufu rcp $r0 $r0 > ipa $r3 a[0x84] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2) > ipa $r2 a[0x80] $r0 0x0 0x1 > tex nodep $r4 $r2 0x0 0x0 t2d 0x8 > ipa $r1 a[0x94] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) > ipa $r0 a[0x90] $r0 0x0 0x1 > tex nodep $r0 $r0 0x0 0x1 t2d 0xf > depbar le 0x5 0x0 0x0 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) > fmul ftz $r3 $r3 $r4 > fmul ftz $r2 $r2 $r4 > fmul ftz $r1 $r1 $r4 > -sched (st 0x0) (st 0x0) 
(st 0x0) > +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0) > fmul ftz $r0 $r0 $r4 > exit > #endif > diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc > index 5516a03..1485f11 100644 > --- a/src/shader/exasanv110.fpc > +++ b/src/shader/exasanv110.fpc > @@ -1,36 +1,36 @@ > -0xfc0007e0, > -0x001f8000, > +0xe1a0070f, > +0x003c3c01, > 0xcff7ff00, > 0xe003ff87, > 0x00470000, > 0x50800000, > 0x4007ff03, > 0xe043ff88, > -0xfc0007e0, > -0x001f8000, > +0xe1e0072f, > +0x0008bc03, > 0x0007ff02, > 0xe043ff88, > 0x2ff70204, > 0xc03a0004, > 0x4007ff01, > 0xe043ff89, > -0xfc0007e0, > -0x001f8000, > +0xe5e0274f, > +0x001fbc06, > 0x0007ff00, > 0xe043ff89, > 0xaff70000, > 0xc03a0017, > 0x34070000, > 0xf0f00000, > -0xfc0007e0, > -0x001f8000, > +0xfc21ffe1, > +0x001f8400, > 0x00470303, > 0x5c681000, > 0x00470202, > 0x5c681000, > 0x00470101, > 0x5c681000, > -0xfc0007e0, > +0xfde017e1, > 0x001f8000, > 0x00470000, > 0x5c681000, > diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp > index 90bbb55..86e14e8 100644 > --- a/src/shader/exascnv110.fp > +++ b/src/shader/exascnv110.fp > @@ -25,14 +25,14 @@ NV110FP_Source[] = { > }; > #else > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) > ipa pass $r0 a[0x7c] 0x0 0x0 0x1 > mufu rcp $r0 $r0 > ipa $r1 a[0x84] $r0 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf) > ipa $r0 a[0x80] $r0 0x0 0x1 > tex nodep $r0 $r0 0x0 0x0 t2d 0xf > depbar le 0x5 0x0 0x0 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf) (st 0x0) (st 0x0) > > > Looks good. 
> > > exit > #endif > diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc > index 2dba15d..1fef5d2 100644 > --- a/src/shader/exascnv110.fpc > +++ b/src/shader/exascnv110.fpc > @@ -1,20 +1,20 @@ > -0xfc0007e0, > -0x001f8000, > +0xe1a0070f, > +0x003c3c01, > 0xcff7ff00, > 0xe003ff87, > 0x00470000, > 0x50800000, > 0x4007ff01, > 0xe043ff88, > -0xfc0007e0, > -0x001f8000, > +0xfde0072f, > +0x001fbc03, > 0x0007ff00, > 0xe043ff88, > 0xaff70000, > 0xc03a0007, > 0x34070000, > 0xf0f00000, > -0xfc0007e0, > +0xfc0007ef, > 0x001f8000, > 0x0007000f, > 0xe3000000, > diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp > index 2728311..dd3816c 100644 > --- a/src/shader/videonv110.fp > +++ b/src/shader/videonv110.fp > @@ -25,30 +25,30 @@ NV110FP_NV12[] = { > }; > #else > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1) > ipa pass $r2 a[0x7c] 0x0 0x0 0x1 > mufu rcp $r2 $r2 > ipa $r0 a[0x80] $r2 0x0 0x1 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1) > ipa $r1 a[0x84] $r2 0x0 0x1 > tex nodep $r4 $r0 0x0 0x0 t2d 0x8 > tex nodep $r0 $r0 0x0 0x1 t2d 0xc > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6) > depbar le 0x5 0x1 0x1 > fmul ftz $r5 $r4 c0[0x0] > fadd ftz $r3 $r5 c0[0x4] > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x6) (st 0x6) (st 0xf) > fadd ftz $r4 $r5 c0[0x8] > fadd ftz $r5 $r5 c0[0xc] > depbar le 0x5 0x0 0x0 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1) > ffma ftz $r3 $r0 c0[0x10] $r3 > ffma ftz $r4 $r0 c0[0x14] $r4 > ffma ftz $r5 $r0 c0[0x18] $r5 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0x1) (st 0x1) (st 0x6) > ffma ftz $r0 $r1 c0[0x1c] $r3 > ffma ftz $r2 $r1 c0[0x24] $r5 > ffma ftz $r1 $r1 c0[0x20] $r4 > -sched (st 0x0) (st 0x0) (st 0x0) > +sched (st 0xf) (st 0x0) (st 0x0) > exit > #endif > diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc > index 
31d745a..8fbc246 100644 > --- a/src/shader/videonv110.fpc > +++ b/src/shader/videonv110.fpc > @@ -1,52 +1,52 @@ > -0xfc0007e0, > -0x001f8000, > +0xe1a0070f, > +0x003c3c01, > 0xcff7ff02, > 0xe003ff87, > 0x00470202, > 0x50800000, > 0x0027ff00, > 0xe043ff88, > -0xfc0007e0, > -0x001f8000, > +0xe1e0072f, > +0x001cbc03, > 0x4027ff01, > 0xe043ff88, > 0x2ff70004, > 0xc03a0004, > 0x2ff70000, > 0xc03a0016, > -0xfc0007e0, > -0x001f8000, > +0xfcc007ef, > +0x001f9801, > 0x34170001, > 0xf0f00000, > 0x00070405, > 0x4c681000, > 0x00170503, > 0x4c581000, > -0xfc0007e0, > -0x001f8000, > +0xfcc007e6, > +0x001fbc00, > 0x00270504, > 0x4c581000, > 0x00370505, > 0x4c581000, > 0x34070000, > 0xf0f00000, > -0xfc0007e0, > -0x001f8000, > +0xfc2017e6, > +0x001f8400, > 0x00470003, > 0x49a00180, > 0x00570004, > 0x49a00200, > 0x00670005, > 0x49a00280, > -0xfc0007e0, > -0x001f8000, > +0xfc2007e1, > +0x001f9800, > 0x00770100, > 0x49a00180, > 0x00970102, > 0x49a00280, > 0x00870101, > 0x49a00200, > -0xfc0007e0, > +0xfc0007ef, > 0x001f8000, > 0x0007000f, > 0xe3000000, > > As for your other comments, I have made the suggested changes. > > Thanks for your review! > > Cheers, > Aaryaman
Aaryaman Vasishta
2017-Jun-08 15:19 UTC
[Nouveau] [PATCH v2] nv110/exa: update sched codes
On Thu, Jun 8, 2017 at 5:01 AM, Samuel Pitoiset <samuel.pitoiset at gmail.com> wrote:> > > On 06/07/2017 06:58 PM, Aaryaman Vasishta wrote: > >> >> >> On Tue, Jun 6, 2017 at 7:15 AM, Samuel Pitoiset < >> samuel.pitoiset at gmail.com <mailto:samuel.pitoiset at gmail.com>> wrote: >> >> Nice work! >> >> See my comments below, and double-check if some of them can be >> applied to the shaders I didn't review yet. >> >> I recommend you to test your work because if one sched code is >> wrong, you are likely going to kill your card and reboot your box. :-) >> >> >> On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote: >> >> v2: Add missing delays >> >> This patch adds proper delays to maxwell exa shaders. >> rendercheck tests >> seem consistent with/without this patch. I haven't extensively >> tested >> them though. >> >> Trello: >> https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-wit >> h-proper-delays >> <https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-wi >> th-proper-delays> >> >> Signed-off-by: Aaryaman Vasishta <jem456.vasishta at gmail.com >> <mailto:jem456.vasishta at gmail.com>> >> >> --- >> src/shader/exac8nv110.fp | 10 +++++----- >> src/shader/exac8nv110.fpc | 18 +++++++++--------- >> src/shader/exacanv110.fp | 10 +++++----- >> src/shader/exacanv110.fpc | 18 +++++++++--------- >> src/shader/exacmnv110.fp | 10 +++++----- >> src/shader/exacmnv110.fpc | 18 +++++++++--------- >> src/shader/exas8nv110.fp | 6 +++--- >> src/shader/exas8nv110.fpc | 12 ++++++------ >> src/shader/exasanv110.fp | 10 +++++----- >> src/shader/exasanv110.fpc | 18 +++++++++--------- >> src/shader/exascnv110.fp | 6 +++--- >> src/shader/exascnv110.fpc | 10 +++++----- >> src/shader/videonv110.fp | 14 +++++++------- >> src/shader/videonv110.fpc | 26 +++++++++++++------------- >> 14 files changed, 93 insertions(+), 93 deletions(-) >> >> diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp >> index ce78036..1c4a4f1 100644 >> --- a/src/shader/exac8nv110.fp >> +++ 
b/src/shader/exac8nv110.fp >> @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt >> 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr >> 0x1 wt 0x2) >> ipa $r2 a[0x90] $r0 0x0 0x1 >> tex nodep $r1 $r2 0x0 0x1 t2d 0x8 >> ipa $r3 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r2 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r2 0x0 0x0 t2d 0x8 >> >> >> Out of curiosity, why didn't you add a read-dep-bar on $r2:$r3 here? >> >> Missed it, thanks for pointing it out. >> > > You don't have to. 'tex' reads two sources ($r2:$r3) and writes into $r0, > but as $r2:$r3 are NOT re-used before $r0 is read, you can assume that $r0 > will be ready and don't need any read-dep-bar. Ah, so $r2:$r3, which are written by the two 'ipa' above it, have already been waited on in this tex, and both of them read $r0 so we can safely assume that since the two 'ipa' instructions are already waited on, $r0 will be ready? >> > >> >> >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1) >> fmul ftz $r3 $r0 $r1 >> mov $r2 $r3 0xf >> >> >> You can stall for only one cycle here, but the 6 cycles on fmul is >> needed. >> >> mov $r1 $r3 0xf >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6) (st 0xf) (st 0x0) >> mov $r0 $r3 0xf >> >> >> Same here. 
>> >> >> exit >> #endif >> diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc >> index 4aa1368..46943b7 100644 >> --- a/src/shader/exac8nv110.fpc >> +++ b/src/shader/exac8nv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0x21e0072f, >> +0x005cbc03, >> 0x0007ff02, >> 0xe043ff89, >> 0x2ff70201, >> 0xc03a0014, >> 0x4007ff03, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0074f, >> +0x001fbc06, >> 0x0007ff02, >> 0xe043ff88, >> 0x2ff70200, >> 0xc03a0004, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfcc01fe6, >> +0x001f8400, >> 0x00170003, >> 0x5c681000, >> 0x00370002, >> 0x5c980780, >> 0x00370001, >> 0x5c980780, >> -0xfc0007e0, >> +0xfde007e6, >> 0x001f8000, >> 0x00370000, >> 0x5c980780, >> diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp >> index a70d5c5..d7c2867 100644 >> --- a/src/shader/exacanv110.fp >> +++ b/src/shader/exacanv110.fp >> @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt >> 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd >> 0x2) >> ipa $r2 a[0x90] $r0 0x0 0x1 >> tex nodep $r4 $r2 0x0 0x1 t2d 0xf >> >> >> Please add a read-dep-bar and wait for on the first fmul because >> $r2:$r3 are re-used before $r4. Should be safer. 
>> >> >> ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) >> fmul ftz $r3 $r3 $r7 >> >> >> Why are you waiting on all barriers? Only $r3 is needed here. >> >> After adding a read-dep-bar and waiting on that over here, I wasn't able >> to pass the same number of `rendercheck -f a8r8g8b8` tests as before this >> patch. After a little trial and error I discovered that wt 0xc fixes it, >> which means that bar 3 and 4 were being used in this fmul somehow (assuming >> bars start from 1), which is odd because this fmul only uses $r3 and $r7, >> and I think it should wait on the read-dep-bar set on "tex nodep $r4 $r2 >> 0x0 0x1 t2d 0xf" (I could be wrong though). I'm kinda stumped on what's >> going on within this fmul that's causing this behavior. >> > > Because you are missing a read-dep-bar on the first 'tex' in this shader. > Presumably, if you add one, you no longer need to wait for all bars. I made some changes which you can check out here: <https://hastebin.com/cazirimeva.bash>. Here are my comments on the same: In 'sched (st 0x1 wt 0x2) (st 0x1) (st 0x1 wt 0x4)' I'm facing the same issue, even after waiting on the read-dep bar set on the first 'tex'. I replaced 'wt 0x2' with 'wt 0x4' so it waits on bar 3 instead of bar 2, which was set as the read-dep bar, and the above issue goes away. It's kinda odd to me because bar 3 is set for $r0 on the second 'tex' and that fmul doesn't seem to use it anywhere. Cheers, Aaryaman > > > Samuel. 
> > >> >> >> fmul ftz $r2 $r2 $r6 >> fmul ftz $r1 $r1 $r5 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0) >> fmul ftz $r0 $r0 $r4 >> exit >> #endif >> diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc >> index 7c0ca5e..9cad139 100644 >> --- a/src/shader/exacanv110.fpc >> +++ b/src/shader/exacanv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x0008bc03, >> 0x0007ff02, >> 0xe043ff89, >> 0xaff70204, >> 0xc03a0017, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0274f, >> +0x001fbc06, >> 0x0007ff00, >> 0xe043ff88, >> 0xaff70000, >> 0xc03a0007, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc21ffe1, >> +0x001f8400, >> 0x00770303, >> 0x5c681000, >> 0x00670202, >> 0x5c681000, >> 0x00570101, >> 0x5c681000, >> -0xfc0007e0, >> +0xfde01fe1, >> 0x001f8000, >> 0x00470000, >> 0x5c681000, >> diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp >> index fe5c294..d717138 100644 >> --- a/src/shader/exacmnv110.fp >> +++ b/src/shader/exacmnv110.fp >> @@ -25,23 +25,23 @@ NV110FP_Composite[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt >> 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd >> 0x2) >> ipa $r2 a[0x90] $r0 0x0 0x1 >> tex nodep $r4 $r2 0x0 0x1 t2d 0x8 >> ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) >> fmul ftz 
$r3 $r3 $r4 >> fmul ftz $r2 $r2 $r4 >> fmul ftz $r1 $r1 $r4 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0) >> fmul ftz $r0 $r0 $r4 >> exit >> #endif >> diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc >> index 9d62c1a..c150875 100644 >> --- a/src/shader/exacmnv110.fpc >> +++ b/src/shader/exacmnv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x0008bc03, >> 0x0007ff02, >> 0xe043ff89, >> 0x2ff70204, >> 0xc03a0014, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0274f, >> +0x001fbc06, >> 0x0007ff00, >> 0xe043ff88, >> 0xaff70000, >> 0xc03a0007, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc21ffe1, >> +0x001f8400, >> 0x00470303, >> 0x5c681000, >> 0x00470202, >> 0x5c681000, >> 0x00470101, >> 0x5c681000, >> -0xfc0007e0, >> +0xfde017e6, >> 0x001f8000, >> 0x00470000, >> 0x5c681000, >> diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp >> index 4fe2e19..a555beb 100644 >> --- a/src/shader/exas8nv110.fp >> +++ b/src/shader/exas8nv110.fp >> @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt >> 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0x8 >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1) >> mov $r3 $r0 0xf >> mov $r2 $r0 0xf >> mov $r1 $r0 0xf >> >> >> This one looks good! 
>> >> >> diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc >> index 1181c41..e58d168 100644 >> --- a/src/shader/exas8nv110.fpc >> +++ b/src/shader/exas8nv110.fpc >> @@ -1,21 +1,21 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x001fbc03, >> 0x0007ff00, >> 0xe043ff88, >> 0x2ff70000, >> 0xc03a0004, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc200fe1, >> +0x001f8400, >> 0x00070003, >> 0x5c980780, >> 0x00070002, >> diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp >> index 61374a6..ad7ca36 100644 >> --- a/src/shader/exasanv110.fp >> +++ b/src/shader/exasanv110.fp >> @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt >> 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r3 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd >> 0x2) >> ipa $r2 a[0x80] $r0 0x0 0x1 >> tex nodep $r4 $r2 0x0 0x0 t2d 0x8 >> ipa $r1 a[0x94] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf) >> ipa $r0 a[0x90] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x1 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1) >> fmul ftz $r3 $r3 $r4 >> fmul ftz $r2 $r2 $r4 >> fmul ftz $r1 $r1 $r4 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0) >> fmul ftz $r0 $r0 $r4 >> exit >> #endif >> diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc >> index 5516a03..1485f11 100644 >> --- a/src/shader/exasanv110.fpc >> +++ b/src/shader/exasanv110.fpc >> @@ -1,36 +1,36 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, 
>> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff03, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x0008bc03, >> 0x0007ff02, >> 0xe043ff88, >> 0x2ff70204, >> 0xc03a0004, >> 0x4007ff01, >> 0xe043ff89, >> -0xfc0007e0, >> -0x001f8000, >> +0xe5e0274f, >> +0x001fbc06, >> 0x0007ff00, >> 0xe043ff89, >> 0xaff70000, >> 0xc03a0017, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc21ffe1, >> +0x001f8400, >> 0x00470303, >> 0x5c681000, >> 0x00470202, >> 0x5c681000, >> 0x00470101, >> 0x5c681000, >> -0xfc0007e0, >> +0xfde017e1, >> 0x001f8000, >> 0x00470000, >> 0x5c681000, >> diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp >> index 90bbb55..86e14e8 100644 >> --- a/src/shader/exascnv110.fp >> +++ b/src/shader/exascnv110.fp >> @@ -25,14 +25,14 @@ NV110FP_Source[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt >> 0x1) >> ipa pass $r0 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r0 $r0 >> ipa $r1 a[0x84] $r0 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf) >> ipa $r0 a[0x80] $r0 0x0 0x1 >> tex nodep $r0 $r0 0x0 0x0 t2d 0xf >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf) (st 0x0) (st 0x0) >> >> >> Looks good. 
>> >> >> exit >> #endif >> diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc >> index 2dba15d..1fef5d2 100644 >> --- a/src/shader/exascnv110.fpc >> +++ b/src/shader/exascnv110.fpc >> @@ -1,20 +1,20 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff00, >> 0xe003ff87, >> 0x00470000, >> 0x50800000, >> 0x4007ff01, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xfde0072f, >> +0x001fbc03, >> 0x0007ff00, >> 0xe043ff88, >> 0xaff70000, >> 0xc03a0007, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> +0xfc0007ef, >> 0x001f8000, >> 0x0007000f, >> 0xe3000000, >> diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp >> index 2728311..dd3816c 100644 >> --- a/src/shader/videonv110.fp >> +++ b/src/shader/videonv110.fp >> @@ -25,30 +25,30 @@ NV110FP_NV12[] = { >> }; >> #else >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt >> 0x1) >> ipa pass $r2 a[0x7c] 0x0 0x0 0x1 >> mufu rcp $r2 $r2 >> ipa $r0 a[0x80] $r2 0x0 0x1 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1) >> ipa $r1 a[0x84] $r2 0x0 0x1 >> tex nodep $r4 $r0 0x0 0x0 t2d 0x8 >> tex nodep $r0 $r0 0x0 0x1 t2d 0xc >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6) >> depbar le 0x5 0x1 0x1 >> fmul ftz $r5 $r4 c0[0x0] >> fadd ftz $r3 $r5 c0[0x4] >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6) (st 0x6) (st 0xf) >> fadd ftz $r4 $r5 c0[0x8] >> fadd ftz $r5 $r5 c0[0xc] >> depbar le 0x5 0x0 0x0 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1) >> ffma ftz $r3 $r0 c0[0x10] $r3 >> ffma ftz $r4 $r0 c0[0x14] $r4 >> ffma ftz $r5 $r0 c0[0x18] $r5 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0x1) (st 0x1) (st 0x6) >> ffma ftz $r0 $r1 c0[0x1c] $r3 >> ffma ftz $r2 $r1 c0[0x24] $r5 >> ffma ftz $r1 $r1 c0[0x20] $r4 >> -sched (st 0x0) (st 0x0) (st 0x0) >> +sched (st 0xf) (st 0x0) (st 0x0) >> exit >> #endif >> 
diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc >> index 31d745a..8fbc246 100644 >> --- a/src/shader/videonv110.fpc >> +++ b/src/shader/videonv110.fpc >> @@ -1,52 +1,52 @@ >> -0xfc0007e0, >> -0x001f8000, >> +0xe1a0070f, >> +0x003c3c01, >> 0xcff7ff02, >> 0xe003ff87, >> 0x00470202, >> 0x50800000, >> 0x0027ff00, >> 0xe043ff88, >> -0xfc0007e0, >> -0x001f8000, >> +0xe1e0072f, >> +0x001cbc03, >> 0x4027ff01, >> 0xe043ff88, >> 0x2ff70004, >> 0xc03a0004, >> 0x2ff70000, >> 0xc03a0016, >> -0xfc0007e0, >> -0x001f8000, >> +0xfcc007ef, >> +0x001f9801, >> 0x34170001, >> 0xf0f00000, >> 0x00070405, >> 0x4c681000, >> 0x00170503, >> 0x4c581000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfcc007e6, >> +0x001fbc00, >> 0x00270504, >> 0x4c581000, >> 0x00370505, >> 0x4c581000, >> 0x34070000, >> 0xf0f00000, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc2017e6, >> +0x001f8400, >> 0x00470003, >> 0x49a00180, >> 0x00570004, >> 0x49a00200, >> 0x00670005, >> 0x49a00280, >> -0xfc0007e0, >> -0x001f8000, >> +0xfc2007e1, >> +0x001f9800, >> 0x00770100, >> 0x49a00180, >> 0x00970102, >> 0x49a00280, >> 0x00870101, >> 0x49a00200, >> -0xfc0007e0, >> +0xfc0007ef, >> 0x001f8000, >> 0x0007000f, >> 0xe3000000, >> >> As for your other comments, I have made the suggested changes. >> >> Thanks for your review! >> >> Cheers, >> Aaryaman >> >-------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.freedesktop.org/archives/nouveau/attachments/20170609/3c2b7626/attachment-0001.html>