Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- .../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 66 +++++++++++++++++++++- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index dfb093c..e38a3b8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -92,11 +92,14 @@ private: void emitUADD(const Instruction *); void emitFADD(const Instruction *); + void emitDADD(const Instruction *); void emitUMUL(const Instruction *); void emitFMUL(const Instruction *); + void emitDMUL(const Instruction *); void emitIMAD(const Instruction *); void emitISAD(const Instruction *); void emitFMAD(const Instruction *); + void emitDMAD(const Instruction *); void emitMADSP(const Instruction *); void emitNOT(Instruction *); @@ -523,6 +526,25 @@ CodeEmitterNVC0::emitFMAD(const Instruction *i) } void +CodeEmitterNVC0::emitDMAD(const Instruction *i) +{ + bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg(); + + emitForm_A(i, HEX64(20000000, 00000001)); + + if (i->src(2).mod.neg()) + code[0] |= 1 << 8; + + roundMode_A(i); + + if (neg1) + code[0] |= 1 << 9; + + assert(!i->saturate); + assert(!i->ftz); +} + +void CodeEmitterNVC0::emitFMUL(const Instruction *i) { bool neg = (i->src(0).mod ^ i->src(1).mod).neg(); @@ -557,6 +579,23 @@ CodeEmitterNVC0::emitFMUL(const Instruction *i) } void +CodeEmitterNVC0::emitDMUL(const Instruction *i) +{ + bool neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + emitForm_A(i, HEX64(50000000, 00000001)); + roundMode_A(i); + + if (neg) + code[0] |= 1 << 9; + + assert(!i->saturate); + assert(!i->ftz); + assert(!i->dnz); + assert(!i->postFactor); +} + +void CodeEmitterNVC0::emitUMUL(const Instruction *i) { if (i->encSize == 8) { @@ -619,6 +658,19 @@ CodeEmitterNVC0::emitFADD(const Instruction *i) } void +CodeEmitterNVC0::emitDADD(const Instruction *i) 
+{ + assert(i->encSize == 8); + emitForm_A(i, HEX64(48000000, 00000001)); + roundMode_A(i); + assert(!i->saturate); + assert(!i->ftz); + emitNegAbs12(i); + if (i->op == OP_SUB) + code[0] ^= 1 << 8; +} + +void CodeEmitterNVC0::emitUADD(const Instruction *i) { uint32_t addOp = 0; @@ -895,6 +947,8 @@ CodeEmitterNVC0::emitMINMAX(const Instruction *i) else if (!isFloatType(i->dType)) op |= isSignedType(i->dType) ? 0x23 : 0x03; + if (i->dType == TYPE_F64) + op |= 0x01; emitForm_A(i, op); emitNegAbs12(i); @@ -2242,20 +2296,26 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn) break; case OP_ADD: case OP_SUB: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDADD(insn); + else if (isFloatType(insn->dType)) emitFADD(insn); else emitUADD(insn); break; case OP_MUL: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMUL(insn); + else if (isFloatType(insn->dType)) emitFMUL(insn); else emitUMUL(insn); break; case OP_MAD: case OP_FMA: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMAD(insn); + else if (isFloatType(insn->dType)) emitFMAD(insn); else emitIMAD(insn); -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 02/11] gk110/ir: add emission of dadd/dmul/dmad opcodes
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 80 +++++++++++++++++++++- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index d8adc93..204d911 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -84,11 +84,14 @@ private: void emitUADD(const Instruction *); void emitFADD(const Instruction *); + void emitDADD(const Instruction *); void emitIMUL(const Instruction *); void emitFMUL(const Instruction *); + void emitDMUL(const Instruction *); void emitIMAD(const Instruction *); void emitISAD(const Instruction *); void emitFMAD(const Instruction *); + void emitDMAD(const Instruction *); void emitNOT(const Instruction *); void emitLogicOp(const Instruction *, uint8_t subOp); @@ -479,6 +482,28 @@ CodeEmitterGK110::emitFMAD(const Instruction *i) } void +CodeEmitterGK110::emitDMAD(const Instruction *i) +{ + assert(!i->saturate); + assert(!i->ftz); + + emitForm_21(i, 0x1b8, 0xb38); + + NEG_(34, 2); + RND_(36, F); + + bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg(); + + if (code[0] & 0x1) { + if (neg1) + code[1] ^= 1 << 27; + } else + if (neg1) { + code[1] |= 1 << 19; + } +} + +void CodeEmitterGK110::emitFMUL(const Instruction *i) { bool neg = (i->src(0).mod ^ i->src(1).mod).neg(); @@ -516,6 +541,29 @@ CodeEmitterGK110::emitFMUL(const Instruction *i) } void +CodeEmitterGK110::emitDMUL(const Instruction *i) +{ + bool neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + assert(!i->postFactor); + assert(!i->saturate); + assert(!i->ftz); + assert(!i->dnz); + + emitForm_21(i, 0x240, 0xc40); + + RND_(2a, F); + + if (code[0] & 0x1) { + if (neg) + code[1] ^= 1 << 27; + } else + if (neg) { + code[1] |= 1 << 19; + } +} + +void CodeEmitterGK110::emitIMUL(const Instruction *i) { assert(!i->src(0).mod.neg() && 
!i->src(1).mod.neg()); @@ -574,6 +622,26 @@ CodeEmitterGK110::emitFADD(const Instruction *i) } void +CodeEmitterGK110::emitDADD(const Instruction *i) +{ + assert(!i->saturate); + assert(!i->ftz); + + emitForm_21(i, 0x238, 0xc38); + RND_(2a, F); + ABS_(31, 0); + NEG_(33, 0); + if (code[0] & 0x1) { + modNegAbsF32_3b(i, 1); + if (i->op == OP_SUB) code[1] ^= 1 << 27; + } else { + NEG_(30, 1); + ABS_(34, 1); + if (i->op == OP_SUB) code[1] ^= 1 << 16; + } +} + +void CodeEmitterGK110::emitUADD(const Instruction *i) { uint8_t addOp = (i->src(0).mod.neg() << 1) | i->src(1).mod.neg(); @@ -1634,20 +1702,26 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) break; case OP_ADD: case OP_SUB: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDADD(insn); + else if (isFloatType(insn->dType)) emitFADD(insn); else emitUADD(insn); break; case OP_MUL: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMUL(insn); + else if (isFloatType(insn->dType)) emitFMUL(insn); else emitIMUL(insn); break; case OP_MAD: case OP_FMA: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMAD(insn); + else if (isFloatType(insn->dType)) emitFMAD(insn); else emitIMAD(insn); -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 03/11] gm107/ir: fix DMUL opcode encoding
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 944ceb2..9f4c435 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -924,15 +924,15 @@ CodeEmitterGM107::emitDMUL() { switch (insn->src(1).getFile()) { case FILE_GPR: - emitInsn(0x5c680000); + emitInsn(0x5c800000); emitGPR (0x14, insn->src(1)); break; case FILE_MEMORY_CONST: - emitInsn(0x4c680000); + emitInsn(0x4c800000); emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1)); break; case FILE_IMMEDIATE: - emitInsn(0x38680000); + emitInsn(0x38800000); emitIMMD(0x14, 19, insn->src(1)); break; default: -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 04/11] gm107/ir: fix DSET boolean float flag
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 9f4c435..73a65fa 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -1060,6 +1060,7 @@ CodeEmitterGM107::emitDSET() emitABS (0x36, insn->src(0)); emitNEG (0x35, insn->src(1)); + emitField(0x34, 1, insn->dType == TYPE_F32); emitCond4(0x30, insn->setCond); emitCC (0x2f); emitABS (0x2c, insn->src(1)); -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 05/11] gm107/ir: fix F2F flipped stype/dtype flags
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 73a65fa..3e1da7e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -731,8 +731,8 @@ CodeEmitterGM107::emitF2F() emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg()); emitFMZ (0x2c, 1); emitRND (0x27, rnd, 0x2a); - emitField(0x0a, 2, util_logbase2(typeSizeof(insn->dType))); - emitField(0x08, 2, util_logbase2(typeSizeof(insn->sType))); + emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType))); + emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType))); emitGPR (0x00, insn->def(0)); } -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 06/11] nvc0/ir: fix lowering of RSQ/RCP/SQRT/MOD to work with F64
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- src/gallium/drivers/nouveau/codegen/nv50_ir.h | 1 + .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 4 +- .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 4 +- .../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 4 +- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 43 +++++++++++++++++----- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 0ff5e5d..529dcb9 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -175,6 +175,7 @@ enum operation #define NV50_IR_SUBOP_MOV_FINAL 1 #define NV50_IR_SUBOP_EXTBF_REV 1 #define NV50_IR_SUBOP_BFIND_SAMT 1 +#define NV50_IR_SUBOP_RCPRSQ_64H 1 #define NV50_IR_SUBOP_PERMT_F4E 1 #define NV50_IR_SUBOP_PERMT_B4E 2 #define NV50_IR_SUBOP_PERMT_RC8 3 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index 204d911..674be69 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -1771,10 +1771,10 @@ CodeEmitterGK110::emitInstruction(Instruction *insn) emitCVT(insn); break; case OP_RSQ: - emitSFnOp(insn, 5); + emitSFnOp(insn, 5 + 2 * insn->subOp); break; case OP_RCP: - emitSFnOp(insn, 4); + emitSFnOp(insn, 4 + 2 * insn->subOp); break; case OP_LG2: emitSFnOp(insn, 3); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 3e1da7e..ee0487f 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -1265,8 +1265,8 @@ CodeEmitterGM107::emitMUFU() case OP_SIN: mufu = 1; break; case OP_EX2: mufu = 2; break; case OP_LG2: mufu = 3; break; - case OP_RCP: mufu = 4; break; - case OP_RSQ: mufu 
= 5; break; + case OP_RCP: mufu = 4 + 2 * insn->subOp; break; + case OP_RSQ: mufu = 5 + 2 * insn->subOp; break; default: assert(!"invalid mufu"); break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index e38a3b8..1a4f6e0 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -2365,10 +2365,10 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn) emitCVT(insn); break; case OP_RSQ: - emitSFnOp(insn, 5); + emitSFnOp(insn, 5 + 2 * insn->subOp); break; case OP_RCP: - emitSFnOp(insn, 4); + emitSFnOp(insn, 4 + 2 * insn->subOp); break; case OP_LG2: emitSFnOp(insn, 3); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 5dfb777..8ac3b26 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -70,7 +70,30 @@ NVC0LegalizeSSA::handleDIV(Instruction *i) void NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) { - // TODO + assert(i->dType == TYPE_F64); + // There are instructions that will compute the high 32 bits of the 64-bit + // float. We will just stick 0 in the bottom 32 bits. + + bld.setPosition(i, false); + + // 1. Take the source and split it up. + Value *src[2], *dst[2], *def = i->getDef(0); + bld.mkSplit(src, 4, i->getSrc(0)); + + // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. + dst[0] = bld.loadImm(NULL, 0); + dst[1] = bld.getSSA(); + + // 3. The new version of the instruction takes the high 32 bits of the + // source and outputs the high 32 bits of the destination. + i->setSrc(0, src[1]); + i->setDef(0, dst[1]); + i->setType(TYPE_F32); + i->subOp = NV50_IR_SUBOP_RCPRSQ_64H; + + // 4. Recombine the two dst pieces back into the original destination. 
+ bld.setPosition(i, true); + bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); } bool @@ -1520,7 +1543,7 @@ NVC0LoweringPass::handleDIV(Instruction *i) if (!isFloatType(i->dType)) return true; bld.setPosition(i, false); - Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1)); + Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1)); i->op = OP_MUL; i->setSrc(1, rcp->getDef(0)); return true; @@ -1529,13 +1552,13 @@ NVC0LoweringPass::handleDIV(Instruction *i) bool NVC0LoweringPass::handleMOD(Instruction *i) { - if (i->dType != TYPE_F32) + if (!isFloatType(i->dType)) return true; - LValue *value = bld.getScratch(); - bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1)); - bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value); - bld.mkOp1(OP_TRUNC, TYPE_F32, value, value); - bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value); + LValue *value = bld.getScratch(typeSizeof(i->dType)); + bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1)); + bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value); + bld.mkOp1(OP_TRUNC, i->dType, value, value); + bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value); i->op = OP_SUB; i->setSrc(1, value); return true; @@ -1544,8 +1567,8 @@ NVC0LoweringPass::handleMOD(Instruction *i) bool NVC0LoweringPass::handleSQRT(Instruction *i) { - Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, - bld.getSSA(), i->getSrc(0)); + Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType, + bld.getSSA(typeSizeof(i->dType)), i->getSrc(0)); i->op = OP_MUL; i->setSrc(1, rsq->getDef(0)); -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 07/11] nvc0/ir: no instruction can load a double immediate
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 817ceb8..7d4a859 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -337,6 +337,8 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, if (sf == FILE_IMMEDIATE) { Storage &reg = ld->getSrc(0)->asImm()->reg; + if (typeSizeof(i->sType) > 4) + return false; if (opInfo[i->op].immdBits != 0xffffffff) { if (i->sType == TYPE_F32) { if (reg.data.u32 & 0xfff) -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 08/11] nvc0/ir: handle zero and negative sqrt arguments
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- .../drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 8ac3b26..18e8e67 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1567,10 +1567,22 @@ NVC0LoweringPass::handleMOD(Instruction *i) bool NVC0LoweringPass::handleSQRT(Instruction *i) { - Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType, - bld.getSSA(typeSizeof(i->dType)), i->getSrc(0)); + Value *pred = bld.getSSA(1, FILE_PREDICATE); + Value *zero = bld.getSSA(); + Instruction *rsq; + + bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0)); + if (i->dType == TYPE_F64) + zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero); + bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); + bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred); + rsq = bld.mkOp1(OP_RSQ, i->dType, + bld.getSSA(typeSizeof(i->dType)), i->getSrc(0)); + rsq->setPredicate(CC_NOT_P, pred); i->op = OP_MUL; i->setSrc(1, rsq->getDef(0)); + i->setPredicate(CC_NOT_P, pred); + return true; } -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 09/11] nvc0/ir: add support for new TGSI double opcodes (v2)
v2: drop DDIV Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 196 +++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 9ee927f..028a17e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -441,6 +441,27 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_IBFE: case TGSI_OPCODE_IMSB: return nv50_ir::TYPE_S32; + case TGSI_OPCODE_D2F: + case TGSI_OPCODE_DABS: + case TGSI_OPCODE_DNEG: + case TGSI_OPCODE_DADD: + case TGSI_OPCODE_DMUL: + case TGSI_OPCODE_DMAX: + case TGSI_OPCODE_DMIN: + case TGSI_OPCODE_DSLT: + case TGSI_OPCODE_DSGE: + case TGSI_OPCODE_DSEQ: + case TGSI_OPCODE_DSNE: + case TGSI_OPCODE_DRCP: + case TGSI_OPCODE_DSQRT: + case TGSI_OPCODE_DMAD: + case TGSI_OPCODE_DFRAC: + case TGSI_OPCODE_DRSQ: + case TGSI_OPCODE_DTRUNC: + case TGSI_OPCODE_DCEIL: + case TGSI_OPCODE_DFLR: + case TGSI_OPCODE_DROUND: + return nv50_ir::TYPE_F64; default: return nv50_ir::TYPE_F32; } @@ -455,10 +476,17 @@ nv50_ir::DataType Instruction::inferDstType() const case TGSI_OPCODE_FSGE: case TGSI_OPCODE_FSLT: case TGSI_OPCODE_FSNE: + case TGSI_OPCODE_DSEQ: + case TGSI_OPCODE_DSGE: + case TGSI_OPCODE_DSLT: + case TGSI_OPCODE_DSNE: return nv50_ir::TYPE_U32; case TGSI_OPCODE_I2F: case TGSI_OPCODE_U2F: + case TGSI_OPCODE_D2F: return nv50_ir::TYPE_F32; + case TGSI_OPCODE_F2D: + return nv50_ir::TYPE_F64; default: return inferSrcType(); } @@ -473,6 +501,7 @@ nv50_ir::CondCode Instruction::getSetCond() const case TGSI_OPCODE_ISLT: case TGSI_OPCODE_USLT: case TGSI_OPCODE_FSLT: + case TGSI_OPCODE_DSLT: return CC_LT; case TGSI_OPCODE_SLE: return CC_LE; @@ -480,15 +509,18 @@ nv50_ir::CondCode Instruction::getSetCond() const case TGSI_OPCODE_ISGE: case TGSI_OPCODE_USGE: case TGSI_OPCODE_FSGE: + case 
TGSI_OPCODE_DSGE: return CC_GE; case TGSI_OPCODE_SGT: return CC_GT; case TGSI_OPCODE_SEQ: case TGSI_OPCODE_USEQ: case TGSI_OPCODE_FSEQ: + case TGSI_OPCODE_DSEQ: return CC_EQ; case TGSI_OPCODE_SNE: case TGSI_OPCODE_FSNE: + case TGSI_OPCODE_DSNE: return CC_NEU; case TGSI_OPCODE_USNE: return CC_NE; @@ -601,6 +633,25 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(USLT, SET); NV50_IR_OPCODE_CASE(USNE, SET); + NV50_IR_OPCODE_CASE(DABS, ABS); + NV50_IR_OPCODE_CASE(DNEG, NEG); + NV50_IR_OPCODE_CASE(DADD, ADD); + NV50_IR_OPCODE_CASE(DMUL, MUL); + NV50_IR_OPCODE_CASE(DMAX, MAX); + NV50_IR_OPCODE_CASE(DMIN, MIN); + NV50_IR_OPCODE_CASE(DSLT, SET); + NV50_IR_OPCODE_CASE(DSGE, SET); + NV50_IR_OPCODE_CASE(DSEQ, SET); + NV50_IR_OPCODE_CASE(DSNE, SET); + NV50_IR_OPCODE_CASE(DRCP, RCP); + NV50_IR_OPCODE_CASE(DSQRT, SQRT); + NV50_IR_OPCODE_CASE(DMAD, MAD); + NV50_IR_OPCODE_CASE(DRSQ, RSQ); + NV50_IR_OPCODE_CASE(DTRUNC, TRUNC); + NV50_IR_OPCODE_CASE(DCEIL, CEIL); + NV50_IR_OPCODE_CASE(DFLR, FLOOR); + NV50_IR_OPCODE_CASE(DROUND, CVT); + NV50_IR_OPCODE_CASE(IMUL_HI, MUL); NV50_IR_OPCODE_CASE(UMUL_HI, MUL); @@ -2880,6 +2931,151 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_INTERP_OFFSET: handleINTERP(dst0); break; + case TGSI_OPCODE_D2F: { + int pos = 0; + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + Value *dreg = getSSA(8); + src0 = fetchSrc(0, pos); + src1 = fetchSrc(0, pos + 1); + mkOp2(OP_MERGE, TYPE_U64, dreg, src0, src1); + mkCvt(OP_CVT, dstTy, dst0[c], srcTy, dreg); + pos += 2; + } + break; + } + case TGSI_OPCODE_F2D: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + Value *dreg = getSSA(8); + mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2)); + mkSplit(&dst0[c], 4, dreg); + c++; + } + break; + case TGSI_OPCODE_DABS: + case TGSI_OPCODE_DNEG: + case TGSI_OPCODE_DRCP: + case TGSI_OPCODE_DSQRT: + case TGSI_OPCODE_DRSQ: + case TGSI_OPCODE_DTRUNC: + case TGSI_OPCODE_DCEIL: + case TGSI_OPCODE_DFLR: + 
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + mkOp1(op, dstTy, dst, src0); + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_DFRAC: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + mkOp1(OP_FLOOR, TYPE_F64, dst, src0); + mkOp2(OP_SUB, TYPE_F64, dst, src0, dst); + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_DSLT: + case TGSI_OPCODE_DSGE: + case TGSI_OPCODE_DSEQ: + case TGSI_OPCODE_DSNE: { + int pos = 0; + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + Value *tmp[2]; + + src0 = getSSA(8); + src1 = getSSA(8); + tmp[0] = fetchSrc(0, pos); + tmp[1] = fetchSrc(0, pos + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + tmp[0] = fetchSrc(1, pos); + tmp[1] = fetchSrc(1, pos + 1); + mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]); + mkCmp(op, tgsi.getSetCond(), dstTy, dst0[c], srcTy, src0, src1); + pos += 2; + } + break; + } + case TGSI_OPCODE_DADD: + case TGSI_OPCODE_DMUL: + case TGSI_OPCODE_DMAX: + case TGSI_OPCODE_DMIN: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + src1 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + tmp[0] = fetchSrc(1, c); + tmp[1] = fetchSrc(1, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]); + mkOp2(op, dstTy, dst, src0, src1); + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_DMAD: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + src1 = getSSA(8); + src2 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + 
tmp[0] = fetchSrc(1, c); + tmp[1] = fetchSrc(1, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]); + tmp[0] = fetchSrc(2, c); + tmp[1] = fetchSrc(2, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src2, tmp[0], tmp[1]); + mkOp3(op, dstTy, dst, src0, src1, src2); + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_DROUND: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *dst = getSSA(8), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + mkCvt(OP_CVT, TYPE_F64, dst, TYPE_F64, src0) + ->rnd = ROUND_NI; + mkSplit(&dst0[c], 4, dst); + c++; + } + break; + case TGSI_OPCODE_DSSG: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = getSSA(8); + Value *dst = getSSA(8), *dstF32 = getSSA(), *tmp[2]; + tmp[0] = fetchSrc(0, c); + tmp[1] = fetchSrc(0, c + 1); + mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]); + + val0 = getScratch(); + val1 = getScratch(); + // The zero is wrong here since it's only 32-bit, but it works out in + // the end since it gets replaced with $r63. + mkCmp(OP_SET, CC_GT, TYPE_F32, val0, TYPE_F64, src0, zero); + mkCmp(OP_SET, CC_LT, TYPE_F32, val1, TYPE_F64, src0, zero); + mkOp2(OP_SUB, TYPE_F32, dstF32, val0, val1); + mkCvt(OP_CVT, TYPE_F64, dst, TYPE_F32, dstF32); + mkSplit(&dst0[c], 4, dst); + c++; + } + break; default: ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode()); assert(0); -- 2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 10/11] nvc0/ir: remove merge/split pairs to allow normal propagation to occur
Because the TGSI interface creates merges for each instruction source and then splits them back out, there are a lot of unnecessary merge/split pairs which do essentially nothing. The various modifier/etc propagation doesn't know how to walk through those, so just remove them when they're unnecessary. Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 62d2ef7..6a4ea4e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -118,6 +118,35 @@ CopyPropagation::visit(BasicBlock *bb) // ============================================================================ +class MergeSplits : public Pass +{ +private: + virtual bool visit(BasicBlock *); +}; + +// For SPLIT / MERGE pairs that operate on the same registers, replace the +// post-merge def with the SPLIT's source. +bool +MergeSplits::visit(BasicBlock *bb) +{ + Instruction *i, *next, *si; + + for (i = bb->getEntry(); i; i = next) { + next = i->next; + if (i->op != OP_MERGE || typeSizeof(i->dType) != 8) + continue; + si = i->getSrc(0)->getInsn(); + if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn()) + continue; + i->def(0).replace(si->getSrc(0), false); + delete_Instruction(prog, i); + } + + return true; +} + +// ============================================================================+ class LoadPropagation : public Pass { private: @@ -2662,6 +2691,7 @@ Program::optimizeSSA(int level) { RUN_PASS(1, DeadCodeElim, buryAll); RUN_PASS(1, CopyPropagation, run); + RUN_PASS(1, MergeSplits, run); RUN_PASS(2, GlobalCSE, run); RUN_PASS(1, LocalCSE, run); RUN_PASS(2, AlgebraicOpt, run); -- 2.0.5
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 8546ac8..686d892 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -291,9 +291,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_INTEGERS: return 1; case PIPE_SHADER_CAP_DOUBLES: - return 0; + return 1; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: return 0; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: -- 2.0.5
Apparently Analogous Threads
- [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)
- [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
- [PATCH 1/2] nv50/ir: fix s32 x s32 -> high s32 multiply logic
- [PATCH mesa 0/5] nouveau: codegen: Make use of double immediates
- [PATCH] gm107/ir: fix loading z offset for layered 3d image bindings