Ilia Mirkin
2015-Feb-23 04:01 UTC
[Nouveau] [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- Untested beyond compiling a few shaders to see if they look like they might work. nvdisasm agrees with envydis's decoding of these things. Will definitely get ahold of a G200 to run tests on before pushing this. .../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 94 ++++++++++++++++++--- .../nouveau/codegen/nv50_ir_lowering_nv50.cpp | 97 +++++++++++++++++++++- .../nouveau/codegen/nv50_ir_target_nv50.cpp | 2 +- src/gallium/drivers/nouveau/nv50/nv50_screen.c | 4 + 4 files changed, 185 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index b1e7409..7c6f7da 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -96,9 +96,12 @@ private: void emitUADD(const Instruction *); void emitAADD(const Instruction *); void emitFADD(const Instruction *); + void emitDADD(const Instruction *); void emitIMUL(const Instruction *); void emitFMUL(const Instruction *); + void emitDMUL(const Instruction *); void emitFMAD(const Instruction *); + void emitDMAD(const Instruction *); void emitIMAD(const Instruction *); void emitISAD(const Instruction *); @@ -923,11 +926,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i) assert(0); break; } - code[1] |= i->src(0).mod.abs() << 20; - code[1] |= i->src(0).mod.neg() << 26; - code[1] |= i->src(1).mod.abs() << 19; - code[1] |= i->src(1).mod.neg() << 27; } + + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; + code[1] |= i->src(1).mod.abs() << 19; + code[1] |= i->src(1).mod.neg() << 27; + emitForm_MAD(i); } @@ -963,6 +968,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) } void +CodeEmitterNV50::emitDMAD(const Instruction *i) +{ + const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg(); + const int neg_add = i->src(2).mod.neg(); + + assert(i->encSize == 8); + 
assert(!i->saturate); + + code[1] = 0x40000000; + code[0] = 0xe0000000; + + code[1] |= neg_mul << 26; + code[1] |= neg_add << 27; + + roundMode_MAD(i); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitFADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -997,6 +1022,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i) } void +CodeEmitterNV50::emitDADD(const Instruction *i) +{ + const int neg0 = i->src(0).mod.neg(); + const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); + + assert(!(i->src(0).mod | i->src(1).mod).abs()); + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x60000000; + code[0] = 0xe0000000; + + emitForm_ADD(i); + + code[1] |= neg0 << 26; + code[1] |= neg1 << 27; +} + +void CodeEmitterNV50::emitUADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -1090,6 +1134,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i) } void +CodeEmitterNV50::emitDMUL(const Instruction *i) +{ + const int neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x80000000; + code[0] = 0xe0000000; + + if (neg) + code[1] |= 0x08000000; + + roundMode_CVT(i->rnd); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitIMAD(const Instruction *i) { code[0] = 0x60000000; @@ -1150,9 +1213,11 @@ CodeEmitterNV50::emitSET(const Instruction *i) code[0] = 0x30000000; code[1] = 0x60000000; - emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); - switch (i->sType) { + case TYPE_F64: + code[0] = 0xe0000000; + code[1] = 0xe0000000; + break; case TYPE_F32: code[0] |= 0x80000000; break; case TYPE_S32: code[1] |= 0x0c000000; break; case TYPE_U32: code[1] |= 0x04000000; break; @@ -1162,6 +1227,9 @@ CodeEmitterNV50::emitSET(const Instruction *i) assert(0); break; } + + emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); + if (i->src(0).mod.neg()) code[1] |= 0x04000000; if (i->src(1).mod.neg()) code[1] |= 0x08000000; if (i->src(0).mod.abs()) code[1] |= 0x00100000; @@ -1725,7 
+1793,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) break; case OP_ADD: case OP_SUB: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDADD(insn); + else if (isFloatType(insn->dType)) emitFADD(insn); else if (insn->getDef(0)->reg.file == FILE_ADDRESS) emitAADD(insn); @@ -1733,14 +1803,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) emitUADD(insn); break; case OP_MUL: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMUL(insn); + else if (isFloatType(insn->dType)) emitFMUL(insn); else emitIMUL(insn); break; case OP_MAD: case OP_FMA: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMAD(insn); + else if (isFloatType(insn->dType)) emitFMAD(insn); else emitIMAD(insn); @@ -1912,7 +1986,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (info.minEncSize > 4) + if (info.minEncSize > 4 || i->dType == TYPE_F64) return 8; // check constraints on dst and src operands diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 1ad0860..d5dadc2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -314,6 +314,7 @@ private: void handleDIV(Instruction *); void handleMOD(Instruction *); void handleMUL(Instruction *); + void handleDRCPRSQ(Instruction *); void handleAddrDef(Instruction *); inline bool isARL(const Instruction *) const; @@ -552,6 +553,95 @@ NV50LegalizeSSA::handleMOD(Instruction *mod) mod->setSrc(1, m); } +void +NV50LegalizeSSA::handleDRCPRSQ(Instruction *i) +{ + /* We need to replace this instruction with a sequence that computes the + * appropriate function. As a first guess, we use the "quake" style + * approximation for RSQ: + * + * 0x5fe6eb50c7b537a9 - num >> 1 + * + * For RCP, we will then square it. 
+ */ + Value *abs, *guess, *parts[2], *input[2], *shr[4], *pred; + + bld.setPosition(i, false); + + abs = bld.mkOp1v(OP_ABS, TYPE_F64, bld.getSSA(8), i->getSrc(0)); + + parts[0] = bld.loadImm(NULL, 0xc7b537a9); + parts[1] = bld.loadImm(NULL, 0x5fe6eb50); + guess = bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), parts[0], parts[1]); + + bld.mkSplit(input, 4, abs); + shr[0] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[0], bld.mkImm(1)); + shr[1] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[1], bld.mkImm(1)); + + // If the bottom bit of the high word was set, set the high bit of the + // bottom word. + pred = bld.getSSA(1, FILE_FLAGS); + bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 1)) + ->setFlagsDef(0, pred); + shr[2] = bld.getSSA(4); shr[3] = bld.getSSA(4); + bld.mkOp2(OP_OR, TYPE_U32, shr[2], shr[0], bld.loadImm(NULL, 0x80000000)) + ->setPredicate(CC_S, pred); + bld.mkMov(shr[3], shr[0]) + ->setPredicate(CC_NS, pred); + shr[0] = bld.mkOp2v(OP_UNION, TYPE_U32, bld.getSSA(4), shr[2], shr[3]); + + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), shr[0], shr[1])); + + if (i->op == OP_RCP) { + Value *two = bld.getSSA(8), *neg = bld.getSSA(8), *copy = bld.getSSA(8); + + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); + + /* Square the guess first, since it was for RSQ */ + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess); + + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); + + // Restore the sign on the 
output + bld.mkSplit(input, 4, i->getSrc(0)); + bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 0x80000000)) + ->setFlagsDef(0, (pred = bld.getSSA(1, FILE_FLAGS))); + bld.mkOp1(OP_NEG, TYPE_F64, neg, guess) + ->setPredicate(CC_S, pred); + bld.mkMov(copy, guess) + ->setPredicate(CC_NS, pred); + guess = bld.mkOp2v(OP_UNION, TYPE_U64, bld.getSSA(8), neg, copy); + } else { + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f)); + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f)); + + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, abs); + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), + three_half)); + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), + three_half)); + } + + i->op = OP_MOV; + i->setSrc(0, guess); +} + + bool NV50LegalizeSSA::visit(BasicBlock *bb) { @@ -578,6 +668,11 @@ NV50LegalizeSSA::visit(BasicBlock *bb) case OP_MUL: handleMUL(insn); break; + case OP_RCP: + case OP_RSQ: + if (insn->dType == TYPE_F64) + handleDRCPRSQ(insn); + break; default: break; } @@ -1162,7 +1257,7 @@ NV50LoweringPreSSA::handleDIV(Instruction *i) bool NV50LoweringPreSSA::handleSQRT(Instruction *i) { - Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, + Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType, bld.getSSA(), i->getSrc(0)); i->op = OP_MUL; i->setSrc(1, rsq->getDef(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 178a167..f3d8733 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -388,7 +388,7 @@ TargetNV50::isAccessSupported(DataFile file, DataType ty) const bool TargetNV50::isOpSupported(operation op, DataType ty) const { - if (ty == TYPE_F64 && chipset < 0xa0) + if (ty == TYPE_F64 && chipset != 0xa0) return false; switch (op) { diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index ed07ba4..4532957 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -237,6 +237,8 @@ static int nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, enum pipe_shader_cap param) { + struct nouveau_device *dev = nouveau_screen(pscreen)->device; + switch (shader) { case PIPE_SHADER_VERTEX: case PIPE_SHADER_GEOMETRY: @@ -287,7 +289,9 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: return MIN2(32, PIPE_MAX_SAMPLERS); case PIPE_SHADER_CAP_DOUBLES: + return dev->chipset == 0xa0; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + return dev->chipset == 0xa0; case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: return 0; default: -- 2.0.5
Ilia Mirkin
2015-Feb-23 04:01 UTC
[Nouveau] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- Not sure how many steps are needed for the necessary accuracy. Just doing 2 because that seems like a reasonable number. .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 87e75e1..9767566 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) bld.setPosition(i, false); // 1. Take the source and it up. - Value *src[2], *dst[2], *def = i->getDef(0); - bld.mkSplit(src, 4, i->getSrc(0)); + Value *input = i->getSrc(0); + Value *src[2], *dst[2], *guess, *def = i->getDef(0); + bld.mkSplit(src, 4, input); // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. dst[0] = bld.loadImm(NULL, 0); @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) // 4. Recombine the two dst pieces back into the original destination. bld.setPosition(i, true); - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); + + // 5. 
Perform 2 Newton-Raphson steps + if (i->op == OP_RCP) { + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 + Value *two = bld.getSSA(8); + + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); + + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); + } else { + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f)); + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f)); + + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input); + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), + three_half)); + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), + three_half)); + } + + bld.mkMov(def, guess); } bool -- 2.0.5
Roland Scheidegger
2015-Feb-23 13:24 UTC
[Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
Does this give correct results for special floats (0, infs)? We tried to improve (for single floats) x86 rcp in llvmpipe with newton-raphson, but unfortunately not being able to give correct results for these two cases (without even more additional code) meant it got all disabled in the end (you can still see that code in the driver) since the problems are at least as bad as those due to bad accuracy... Roland Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> > --- > > Not sure how many steps are needed for the necessary accuracy. Just > doing 2 because that seems like a reasonable number. > > .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++-- > 1 file changed, 39 insertions(+), 3 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > index 87e75e1..9767566 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) > bld.setPosition(i, false); > > // 1. Take the source and it up. > - Value *src[2], *dst[2], *def = i->getDef(0); > - bld.mkSplit(src, 4, i->getSrc(0)); > + Value *input = i->getSrc(0); > + Value *src[2], *dst[2], *guess, *def = i->getDef(0); > + bld.mkSplit(src, 4, input); > > // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. > dst[0] = bld.loadImm(NULL, 0); > @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) > > // 4. Recombine the two dst pieces back into the original destination. > bld.setPosition(i, true); > - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); > + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); > + > + // 5. 
Perform 2 Newton-Raphson steps > + if (i->op == OP_RCP) { > + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 > + Value *two = bld.getSSA(8); > + > + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); > + > + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); > + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess))); > + } else { > + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) > + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); > + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f)); > + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f)); > + > + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input); > + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) > + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, > + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), > + three_half)); > + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, > + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input, > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess), > + three_half)); > + } > + > + bld.mkMov(def, guess); > } > > bool >
Apparently Analogous Threads
- [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
- [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
- [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)
- [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax
- [PATCH 1/2] nv50/ir: fix s32 x s32 -> high s32 multiply logic