Roland Scheidegger
2015-Feb-23 13:24 UTC
[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
Does this give correct results for special floats (0, infs)? We tried to improve (for single floats) x86 rcp in llvmpipe with newton-raphson, but unfortunately not being able to give correct results for these two cases (without even more additional code) meant it got all disabled in the end (you can still see that code in the driver) since the problems are at least as bad as those due to bad accuracy... Roland Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> > --- > > Not sure how many steps are needed for the necessary accuracy. Just > doing 2 because that seems like a reasonable number. > > .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++-- > 1 file changed, 39 insertions(+), 3 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > index 87e75e1..9767566 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) > bld.setPosition(i, false); > > // 1. Take the source and it up. > - Value *src[2], *dst[2], *def = i->getDef(0); > - bld.mkSplit(src, 4, i->getSrc(0)); > + Value *input = i->getSrc(0); > + Value *src[2], *dst[2], *guess, *def = i->getDef(0); > + bld.mkSplit(src, 4, input); > > // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. > dst[0] = bld.loadImm(NULL, 0); > @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) > > // 4. Recombine the two dst pieces back into the original destination. > bld.setPosition(i, true); > - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); > + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); > + > + // 5. Perform 2 Newton-Raphson steps > + if (i->op == OP_RCP) { > + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 > + Value *two = bld.getSSA(8); > + > + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); > + > + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); > + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); > + } else { > + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) > + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); > + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f)); > + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f)); > + > + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input); > + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) > + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, > + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), > + three_half)); > + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, > + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), > + three_half)); > + } > + > + bld.mkMov(def, guess); > } > > bool >
Ilia Mirkin
2015-Feb-23 15:40 UTC
[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
Oh right. I think the NVIDIA blob executes those steps conditionally based on the upper bits not being 0x7ff (== infinity/nan). I should do the same thing here. [FWIW I was able to test the nv50 code last night and that one's a total fail for rcp/rsq... will need to port that over to my nvc0 and debug there.] On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <sroland at vmware.com> wrote:> Does this give correct results for special floats (0, infs)? > We tried to improve (for single floats) x86 rcp in llvmpipe with > newton-raphson, but unfortunately not being able to give correct results > for these two cases (without even more additional code) meant it got all > disabled in the end (you can still see that code in the driver) since > the problems are at least as bad as those due to bad accuracy... > > Roland > > Am 23.02.2015 um 05:01 schrieb Ilia Mirkin: >> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> >> --- >> >> Not sure how many steps are needed for the necessary accuracy. Just >> doing 2 because that seems like a reasonable number. >> >> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++-- >> 1 file changed, 39 insertions(+), 3 deletions(-) >> >> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> index 87e75e1..9767566 100644 >> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >> bld.setPosition(i, false); >> >> // 1. Take the source and it up. >> - Value *src[2], *dst[2], *def = i->getDef(0); >> - bld.mkSplit(src, 4, i->getSrc(0)); >> + Value *input = i->getSrc(0); >> + Value *src[2], *dst[2], *guess, *def = i->getDef(0); >> + bld.mkSplit(src, 4, input); >> >> // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. >> dst[0] = bld.loadImm(NULL, 0); >> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >> >> // 4. Recombine the two dst pieces back into the original destination. >> bld.setPosition(i, true); >> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); >> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); >> + >> + // 5. Perform 2 Newton-Raphson steps >> + if (i->op == OP_RCP) { >> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 >> + Value *two = bld.getSSA(8); >> + >> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); >> + >> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); >> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); >> + } else { >> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) >> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); >> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f)); >> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f)); >> + >> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input); >> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) >> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), >> + three_half)); >> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, >> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), >> + three_half)); >> + } >> + >> + bld.mkMov(def, guess); >> } >> >> bool >> >
Ilia Mirkin
2015-Feb-23 20:23 UTC
[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
Just to follow up, this is the code the nvidia blob emits for rsq(). The argument is in c1[0] + c1[4]. I _think_ that c3 contains: PB: 0x7fffffff GF100_M2MF.DATA = 0x7fffffff PB: 0x7ff00000 GF100_M2MF.DATA = 0x7ff00000 PB: 0x00000000 GF100_M2MF.DATA = 0 PB: 0x3fe00000 GF100_M2MF.DATA = 0x3fe00000 PB: 0x3ff00000 GF100_M2MF.DATA = 0x3ff00000 00000000: 10005de4 28004400 mov b32 $r1 c1[0x4] 00000008: 10109c03 68004c00 and b32 $r2 $r1 c3[0x4] r2 = exponent bits 00000010: 00101c03 68004c00 and b32 $r0 $r1 c3[0x0] r0 = non-sign bits (of upper word) 00000018: 1021dc03 1a8e4c00 set $p0 0x1 ne u32 $r2 c3[0x4] p0 = exponent == 0x7ff (i.e. inf/nan) 00000020: 00001c43 68004400 or b32 $r0 $r0 c1[0x0] r0 = all non-sign bits or'd 00000028: fc001c04 20000000 selp b32 $r0 $r0 0x0 $p0 if (exponent == 0x7ff) r0 = all mantissa bits else r0 = 0 00000030: fc01dc23 190e0000 set $p0 0x1 eq s32 $r0 0x0 p0 = is input a nan (i.e. exponent = 0x7ff and mantissa bits set) 00000038: 00001de4 28004400 mov b32 $r0 c1[0x0] 00000040: 200081e7 40000001 $p0 bra allwarp 0x90 So all of the (not $p0) stuff happens for all non-nan's, including infinities. r0d = input 00000048: 0000a1e2 19ff0000 (not $p0) mov b32 $r2 0x7fc00000 00000050: fc0121e4 28000000 (not $p0) mov b32 $r4 0x0 00000058: 1020a103 68004400 (not $p0) and b32 $r2 $r2 not c1[0x4] 00000060: 00216002 08004000 (not $p0) add b32 $r5 $r2 0x100000 00000068: 0450e203 5800c000 (not $p0) shr u32 $r3 $r5 wrap 0x1 00000070: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d 00000078: fc00a1e4 28000000 (not $p0) mov b32 $r2 0x0 00000080: 0030e002 087fe000 (not $p0) add b32 $r3 $r3 0x1ff80000 I tried decoding this, but it's some crazy business -- futzing with the exponent, I think it's trying to compress more into the high 32-bits so that rsqrt64h below has more precision. 00000088: 00001de4 40000000 nop 00000090: fc011de4 28000000 B mov b32 $r4 0x0 00000098: 1c115c00 c8000000 rsqrt64h $r5 $r1 000000a0: 200081e7 40000001 $p0 bra allwarp 0xf0 000000a8: 00002001 5000cff8 (not $p0) mul rn f64 $r0d $r0d 0x3fe0000000000000 r0d = input * 0.5 000000b0: 1001a001 50000000 (not $p0) mul rn f64 $r6d $r0d $r4d r6d = 0.5 * input * guess 000000b8: 3061a201 20088c00 (not $p0) fma rn f64 $r6d neg $r6d $r4d c3[0xc] r6d = -0.5 * input * guess * guess + 0.5 (? actually 0.5000001190928742, i.e. 0x3fe000003ff00000... seems like a bug, but maybe it'll only read it as a f32, not f64 like I'm assuming) 000000c0: 18412001 20080000 (not $p0) fma rn f64 $r4d $r4d $r6d $r4d guess = guess * (0.5 - 0.5 * input * guess * guess) + guess i.e. newton-raphson step. [why didn't they just throw in a 1.5 instead of 0.5? who knows. maybe something to do with numeric stability. or perhaps a bug in their const upload logic which got papered over with the extra + guess.] 000000c8: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d 000000d0: 30002201 20088c00 (not $p0) fma rn f64 $r0d neg $r0d $r4d c3[0xc] 000000d8: 00402001 20080000 (not $p0) fma rn f64 $r0d $r4d $r0d $r4d 000000e0: 08012001 50000000 (not $p0) mul rn f64 $r4d $r0d $r2d And this is another step. Not sure why they're so careful to still go through the motions for infinity and only skip for nan -- seems like they could just as well skip it for inf as well, with less code. That's what I plan on doing. On Mon, Feb 23, 2015 at 10:40 AM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:> Oh right. I think the NVIDIA blob executes those steps conditionally > based on the upper bits not being 0x7ff (== infinity/nan). I should do > the same thing here. [FWIW I was able to test the nv50 code last night > and that one's a total fail for rcp/rsq... will need to port that over > to my nvc0 and debug there.] > > On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <sroland at vmware.com> wrote: >> Does this give correct results for special floats (0, infs)? >> We tried to improve (for single floats) x86 rcp in llvmpipe with >> newton-raphson, but unfortunately not being able to give correct results >> for these two cases (without even more additional code) meant it got all >> disabled in the end (you can still see that code in the driver) since >> the problems are at least as bad as those due to bad accuracy... >> >> Roland >> >> Am 23.02.2015 um 05:01 schrieb Ilia Mirkin: >>> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> >>> --- >>> >>> Not sure how many steps are needed for the necessary accuracy. Just >>> doing 2 because that seems like a reasonable number. >>> >>> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++-- >>> 1 file changed, 39 insertions(+), 3 deletions(-) >>> >>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >>> index 87e75e1..9767566 100644 >>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >>> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >>> bld.setPosition(i, false); >>> >>> // 1. Take the source and it up. >>> - Value *src[2], *dst[2], *def = i->getDef(0); >>> - bld.mkSplit(src, 4, i->getSrc(0)); >>> + Value *input = i->getSrc(0); >>> + Value *src[2], *dst[2], *guess, *def = i->getDef(0); >>> + bld.mkSplit(src, 4, input); >>> >>> // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. >>> dst[0] = bld.loadImm(NULL, 0); >>> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >>> >>> // 4. Recombine the two dst pieces back into the original destination. >>> bld.setPosition(i, true); >>> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); >>> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); >>> + >>> + // 5. Perform 2 Newton-Raphson steps >>> + if (i->op == OP_RCP) { >>> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 >>> + Value *two = bld.getSSA(8); >>> + >>> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); >>> + >>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); >>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); >>> + } else { >>> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) >>> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); >>> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f)); >>> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f)); >>> + >>> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input); >>> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) >>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), >>> + three_half)); >>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), >>> + three_half)); >>> + } >>> + >>> + bld.mkMov(def, guess); >>> } >>> >>> bool >>> >>
Possibly Parallel Threads
- [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
- [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
- [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)
- [PATCH] gm107/ir: fix loading z offset for layered 3d image bindings
- [PATCH 1/2] nv50/ir: fix s32 x s32 -> high s32 multiply logic