Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
.../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 66 +++++++++++++++++++++-
1 file changed, 63 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index dfb093c..e38a3b8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -92,11 +92,14 @@ private:
void emitUADD(const Instruction *);
void emitFADD(const Instruction *);
+ void emitDADD(const Instruction *);
void emitUMUL(const Instruction *);
void emitFMUL(const Instruction *);
+ void emitDMUL(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
void emitFMAD(const Instruction *);
+ void emitDMAD(const Instruction *);
void emitMADSP(const Instruction *);
void emitNOT(Instruction *);
@@ -523,6 +526,25 @@ CodeEmitterNVC0::emitFMAD(const Instruction *i)
}
void
+CodeEmitterNVC0::emitDMAD(const Instruction *i)
+{
+ bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ emitForm_A(i, HEX64(20000000, 00000001));
+
+ if (i->src(2).mod.neg())
+ code[0] |= 1 << 8;
+
+ roundMode_A(i);
+
+ if (neg1)
+ code[0] |= 1 << 9;
+
+ assert(!i->saturate);
+ assert(!i->ftz);
+}
+
+void
CodeEmitterNVC0::emitFMUL(const Instruction *i)
{
bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
@@ -557,6 +579,23 @@ CodeEmitterNVC0::emitFMUL(const Instruction *i)
}
void
+CodeEmitterNVC0::emitDMUL(const Instruction *i)
+{
+ bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ emitForm_A(i, HEX64(50000000, 00000001));
+ roundMode_A(i);
+
+ if (neg)
+ code[0] |= 1 << 9;
+
+ assert(!i->saturate);
+ assert(!i->ftz);
+ assert(!i->dnz);
+ assert(!i->postFactor);
+}
+
+void
CodeEmitterNVC0::emitUMUL(const Instruction *i)
{
if (i->encSize == 8) {
@@ -619,6 +658,19 @@ CodeEmitterNVC0::emitFADD(const Instruction *i)
}
void
+CodeEmitterNVC0::emitDADD(const Instruction *i)
+{
+ assert(i->encSize == 8);
+ emitForm_A(i, HEX64(48000000, 00000001));
+ roundMode_A(i);
+ assert(!i->saturate);
+ assert(!i->ftz);
+ emitNegAbs12(i);
+ if (i->op == OP_SUB)
+ code[0] ^= 1 << 8;
+}
+
+void
CodeEmitterNVC0::emitUADD(const Instruction *i)
{
uint32_t addOp = 0;
@@ -895,6 +947,8 @@ CodeEmitterNVC0::emitMINMAX(const Instruction *i)
else
if (!isFloatType(i->dType))
op |= isSignedType(i->dType) ? 0x23 : 0x03;
+ if (i->dType == TYPE_F64)
+ op |= 0x01;
emitForm_A(i, op);
emitNegAbs12(i);
@@ -2242,20 +2296,26 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
break;
case OP_ADD:
case OP_SUB:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDADD(insn);
+ else if (isFloatType(insn->dType))
emitFADD(insn);
else
emitUADD(insn);
break;
case OP_MUL:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMUL(insn);
+ else if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitUMUL(insn);
break;
case OP_MAD:
case OP_FMA:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMAD(insn);
+ else if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 02/11] gk110/ir: add emission of dadd/dmul/dmad opcodes
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
.../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 80 +++++++++++++++++++++-
1 file changed, 77 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index d8adc93..204d911 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -84,11 +84,14 @@ private:
void emitUADD(const Instruction *);
void emitFADD(const Instruction *);
+ void emitDADD(const Instruction *);
void emitIMUL(const Instruction *);
void emitFMUL(const Instruction *);
+ void emitDMUL(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
void emitFMAD(const Instruction *);
+ void emitDMAD(const Instruction *);
void emitNOT(const Instruction *);
void emitLogicOp(const Instruction *, uint8_t subOp);
@@ -479,6 +482,28 @@ CodeEmitterGK110::emitFMAD(const Instruction *i)
}
void
+CodeEmitterGK110::emitDMAD(const Instruction *i)
+{
+ assert(!i->saturate);
+ assert(!i->ftz);
+
+ emitForm_21(i, 0x1b8, 0xb38);
+
+ NEG_(34, 2);
+ RND_(36, F);
+
+ bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ if (code[0] & 0x1) {
+ if (neg1)
+ code[1] ^= 1 << 27;
+ } else
+ if (neg1) {
+ code[1] |= 1 << 19;
+ }
+}
+
+void
CodeEmitterGK110::emitFMUL(const Instruction *i)
{
bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
@@ -516,6 +541,29 @@ CodeEmitterGK110::emitFMUL(const Instruction *i)
}
void
+CodeEmitterGK110::emitDMUL(const Instruction *i)
+{
+ bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ assert(!i->postFactor);
+ assert(!i->saturate);
+ assert(!i->ftz);
+ assert(!i->dnz);
+
+ emitForm_21(i, 0x240, 0xc40);
+
+ RND_(2a, F);
+
+ if (code[0] & 0x1) {
+ if (neg)
+ code[1] ^= 1 << 27;
+ } else
+ if (neg) {
+ code[1] |= 1 << 19;
+ }
+}
+
+void
CodeEmitterGK110::emitIMUL(const Instruction *i)
{
assert(!i->src(0).mod.neg() && !i->src(1).mod.neg());
@@ -574,6 +622,26 @@ CodeEmitterGK110::emitFADD(const Instruction *i)
}
void
+CodeEmitterGK110::emitDADD(const Instruction *i)
+{
+ assert(!i->saturate);
+ assert(!i->ftz);
+
+ emitForm_21(i, 0x238, 0xc38);
+ RND_(2a, F);
+ ABS_(31, 0);
+ NEG_(33, 0);
+ if (code[0] & 0x1) {
+ modNegAbsF32_3b(i, 1);
+ if (i->op == OP_SUB) code[1] ^= 1 << 27;
+ } else {
+ NEG_(30, 1);
+ ABS_(34, 1);
+ if (i->op == OP_SUB) code[1] ^= 1 << 16;
+ }
+}
+
+void
CodeEmitterGK110::emitUADD(const Instruction *i)
{
uint8_t addOp = (i->src(0).mod.neg() << 1) |
i->src(1).mod.neg();
@@ -1634,20 +1702,26 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
break;
case OP_ADD:
case OP_SUB:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDADD(insn);
+ else if (isFloatType(insn->dType))
emitFADD(insn);
else
emitUADD(insn);
break;
case OP_MUL:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMUL(insn);
+ else if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitIMUL(insn);
break;
case OP_MAD:
case OP_FMA:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMAD(insn);
+ else if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 03/11] gm107/ir: fix DMUL opcode encoding
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 944ceb2..9f4c435 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -924,15 +924,15 @@ CodeEmitterGM107::emitDMUL()
{
switch (insn->src(1).getFile()) {
case FILE_GPR:
- emitInsn(0x5c680000);
+ emitInsn(0x5c800000);
emitGPR (0x14, insn->src(1));
break;
case FILE_MEMORY_CONST:
- emitInsn(0x4c680000);
+ emitInsn(0x4c800000);
emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1));
break;
case FILE_IMMEDIATE:
- emitInsn(0x38680000);
+ emitInsn(0x38800000);
emitIMMD(0x14, 19, insn->src(1));
break;
default:
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 04/11] gm107/ir: fix DSET boolean float flag
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 9f4c435..73a65fa 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1060,6 +1060,7 @@ CodeEmitterGM107::emitDSET()
emitABS (0x36, insn->src(0));
emitNEG (0x35, insn->src(1));
+ emitField(0x34, 1, insn->dType == TYPE_F32);
emitCond4(0x30, insn->setCond);
emitCC (0x2f);
emitABS (0x2c, insn->src(1));
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 05/11] gm107/ir: fix F2F flipped stype/dtype flags
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 73a65fa..3e1da7e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -731,8 +731,8 @@ CodeEmitterGM107::emitF2F()
emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
emitFMZ (0x2c, 1);
emitRND (0x27, rnd, 0x2a);
- emitField(0x0a, 2, util_logbase2(typeSizeof(insn->dType)));
- emitField(0x08, 2, util_logbase2(typeSizeof(insn->sType)));
+ emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
+ emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType)));
emitGPR (0x00, insn->def(0));
}
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 06/11] nvc0/ir: fix lowering of RSQ/RCP/SQRT/MOD to work with F64
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
src/gallium/drivers/nouveau/codegen/nv50_ir.h | 1 +
.../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 4 +-
.../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 4 +-
.../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 4 +-
.../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 43 +++++++++++++++++-----
5 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 0ff5e5d..529dcb9 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -175,6 +175,7 @@ enum operation
#define NV50_IR_SUBOP_MOV_FINAL 1
#define NV50_IR_SUBOP_EXTBF_REV 1
#define NV50_IR_SUBOP_BFIND_SAMT 1
+#define NV50_IR_SUBOP_RCPRSQ_64H 1
#define NV50_IR_SUBOP_PERMT_F4E 1
#define NV50_IR_SUBOP_PERMT_B4E 2
#define NV50_IR_SUBOP_PERMT_RC8 3
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 204d911..674be69 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1771,10 +1771,10 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
emitCVT(insn);
break;
case OP_RSQ:
- emitSFnOp(insn, 5);
+ emitSFnOp(insn, 5 + 2 * insn->subOp);
break;
case OP_RCP:
- emitSFnOp(insn, 4);
+ emitSFnOp(insn, 4 + 2 * insn->subOp);
break;
case OP_LG2:
emitSFnOp(insn, 3);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 3e1da7e..ee0487f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1265,8 +1265,8 @@ CodeEmitterGM107::emitMUFU()
case OP_SIN: mufu = 1; break;
case OP_EX2: mufu = 2; break;
case OP_LG2: mufu = 3; break;
- case OP_RCP: mufu = 4; break;
- case OP_RSQ: mufu = 5; break;
+ case OP_RCP: mufu = 4 + 2 * insn->subOp; break;
+ case OP_RSQ: mufu = 5 + 2 * insn->subOp; break;
default:
assert(!"invalid mufu");
break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index e38a3b8..1a4f6e0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2365,10 +2365,10 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
emitCVT(insn);
break;
case OP_RSQ:
- emitSFnOp(insn, 5);
+ emitSFnOp(insn, 5 + 2 * insn->subOp);
break;
case OP_RCP:
- emitSFnOp(insn, 4);
+ emitSFnOp(insn, 4 + 2 * insn->subOp);
break;
case OP_LG2:
emitSFnOp(insn, 3);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 5dfb777..8ac3b26 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -70,7 +70,30 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
- // TODO
+ assert(i->dType == TYPE_F64);
+ // There are instructions that will compute the high 32 bits of the 64-bit
+ // float. We will just stick 0 in the bottom 32 bits.
+
+ bld.setPosition(i, false);
+
+ // 1. Take the source and split it up.
+ Value *src[2], *dst[2], *def = i->getDef(0);
+ bld.mkSplit(src, 4, i->getSrc(0));
+
+ // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
+ dst[0] = bld.loadImm(NULL, 0);
+ dst[1] = bld.getSSA();
+
+ // 3. The new version of the instruction takes the high 32 bits of the
+ // source and outputs the high 32 bits of the destination.
+ i->setSrc(0, src[1]);
+ i->setDef(0, dst[1]);
+ i->setType(TYPE_F32);
+ i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
+
+ // 4. Recombine the two dst pieces back into the original destination.
+ bld.setPosition(i, true);
+ bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}
bool
@@ -1520,7 +1543,7 @@ NVC0LoweringPass::handleDIV(Instruction *i)
if (!isFloatType(i->dType))
return true;
bld.setPosition(i, false);
- Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(),
i->getSrc(1));
+ Instruction *rcp = bld.mkOp1(OP_RCP, i->dType,
bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
i->op = OP_MUL;
i->setSrc(1, rcp->getDef(0));
return true;
@@ -1529,13 +1552,13 @@ NVC0LoweringPass::handleDIV(Instruction *i)
bool
NVC0LoweringPass::handleMOD(Instruction *i)
{
- if (i->dType != TYPE_F32)
+ if (!isFloatType(i->dType))
return true;
- LValue *value = bld.getScratch();
- bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
- bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
- bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
- bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
+ LValue *value = bld.getScratch(typeSizeof(i->dType));
+ bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
+ bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
+ bld.mkOp1(OP_TRUNC, i->dType, value, value);
+ bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
i->op = OP_SUB;
i->setSrc(1, value);
return true;
@@ -1544,8 +1567,8 @@ NVC0LoweringPass::handleMOD(Instruction *i)
bool
NVC0LoweringPass::handleSQRT(Instruction *i)
{
- Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
- bld.getSSA(), i->getSrc(0));
+ Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType,
+ bld.getSSA(typeSizeof(i->dType)),
i->getSrc(0));
i->op = OP_MUL;
i->setSrc(1, rsq->getDef(0));
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 07/11] nvc0/ir: no instruction can load a double immediate
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 817ceb8..7d4a859 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -337,6 +337,8 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
if (sf == FILE_IMMEDIATE) {
Storage &reg = ld->getSrc(0)->asImm()->reg;
+ if (typeSizeof(i->sType) > 4)
+ return false;
if (opInfo[i->op].immdBits != 0xffffffff) {
if (i->sType == TYPE_F32) {
if (reg.data.u32 & 0xfff)
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 08/11] nvc0/ir: handle zero and negative sqrt arguments
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
.../drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 8ac3b26..18e8e67 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1567,10 +1567,22 @@ NVC0LoweringPass::handleMOD(Instruction *i)
bool
NVC0LoweringPass::handleSQRT(Instruction *i)
{
- Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType,
- bld.getSSA(typeSizeof(i->dType)),
i->getSrc(0));
+ Value *pred = bld.getSSA(1, FILE_PREDICATE);
+ Value *zero = bld.getSSA();
+ Instruction *rsq;
+
+ bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
+ if (i->dType == TYPE_F64)
+ zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
+ bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0),
zero);
+ bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P,
pred);
+ rsq = bld.mkOp1(OP_RSQ, i->dType,
+ bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
+ rsq->setPredicate(CC_NOT_P, pred);
i->op = OP_MUL;
i->setSrc(1, rsq->getDef(0));
+ i->setPredicate(CC_NOT_P, pred);
+
return true;
}
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 09/11] nvc0/ir: add support for new TGSI double opcodes (v2)
v2: drop DDIV
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
.../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 196 +++++++++++++++++++++
1 file changed, 196 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 9ee927f..028a17e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -441,6 +441,27 @@ nv50_ir::DataType Instruction::inferSrcType() const
case TGSI_OPCODE_IBFE:
case TGSI_OPCODE_IMSB:
return nv50_ir::TYPE_S32;
+ case TGSI_OPCODE_D2F:
+ case TGSI_OPCODE_DABS:
+ case TGSI_OPCODE_DNEG:
+ case TGSI_OPCODE_DADD:
+ case TGSI_OPCODE_DMUL:
+ case TGSI_OPCODE_DMAX:
+ case TGSI_OPCODE_DMIN:
+ case TGSI_OPCODE_DSLT:
+ case TGSI_OPCODE_DSGE:
+ case TGSI_OPCODE_DSEQ:
+ case TGSI_OPCODE_DSNE:
+ case TGSI_OPCODE_DRCP:
+ case TGSI_OPCODE_DSQRT:
+ case TGSI_OPCODE_DMAD:
+ case TGSI_OPCODE_DFRAC:
+ case TGSI_OPCODE_DRSQ:
+ case TGSI_OPCODE_DTRUNC:
+ case TGSI_OPCODE_DCEIL:
+ case TGSI_OPCODE_DFLR:
+ case TGSI_OPCODE_DROUND:
+ return nv50_ir::TYPE_F64;
default:
return nv50_ir::TYPE_F32;
}
@@ -455,10 +476,17 @@ nv50_ir::DataType Instruction::inferDstType() const
case TGSI_OPCODE_FSGE:
case TGSI_OPCODE_FSLT:
case TGSI_OPCODE_FSNE:
+ case TGSI_OPCODE_DSEQ:
+ case TGSI_OPCODE_DSGE:
+ case TGSI_OPCODE_DSLT:
+ case TGSI_OPCODE_DSNE:
return nv50_ir::TYPE_U32;
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_U2F:
+ case TGSI_OPCODE_D2F:
return nv50_ir::TYPE_F32;
+ case TGSI_OPCODE_F2D:
+ return nv50_ir::TYPE_F64;
default:
return inferSrcType();
}
@@ -473,6 +501,7 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_ISLT:
case TGSI_OPCODE_USLT:
case TGSI_OPCODE_FSLT:
+ case TGSI_OPCODE_DSLT:
return CC_LT;
case TGSI_OPCODE_SLE:
return CC_LE;
@@ -480,15 +509,18 @@ nv50_ir::CondCode Instruction::getSetCond() const
case TGSI_OPCODE_ISGE:
case TGSI_OPCODE_USGE:
case TGSI_OPCODE_FSGE:
+ case TGSI_OPCODE_DSGE:
return CC_GE;
case TGSI_OPCODE_SGT:
return CC_GT;
case TGSI_OPCODE_SEQ:
case TGSI_OPCODE_USEQ:
case TGSI_OPCODE_FSEQ:
+ case TGSI_OPCODE_DSEQ:
return CC_EQ;
case TGSI_OPCODE_SNE:
case TGSI_OPCODE_FSNE:
+ case TGSI_OPCODE_DSNE:
return CC_NEU;
case TGSI_OPCODE_USNE:
return CC_NE;
@@ -601,6 +633,25 @@ static nv50_ir::operation translateOpcode(uint opcode)
NV50_IR_OPCODE_CASE(USLT, SET);
NV50_IR_OPCODE_CASE(USNE, SET);
+ NV50_IR_OPCODE_CASE(DABS, ABS);
+ NV50_IR_OPCODE_CASE(DNEG, NEG);
+ NV50_IR_OPCODE_CASE(DADD, ADD);
+ NV50_IR_OPCODE_CASE(DMUL, MUL);
+ NV50_IR_OPCODE_CASE(DMAX, MAX);
+ NV50_IR_OPCODE_CASE(DMIN, MIN);
+ NV50_IR_OPCODE_CASE(DSLT, SET);
+ NV50_IR_OPCODE_CASE(DSGE, SET);
+ NV50_IR_OPCODE_CASE(DSEQ, SET);
+ NV50_IR_OPCODE_CASE(DSNE, SET);
+ NV50_IR_OPCODE_CASE(DRCP, RCP);
+ NV50_IR_OPCODE_CASE(DSQRT, SQRT);
+ NV50_IR_OPCODE_CASE(DMAD, MAD);
+ NV50_IR_OPCODE_CASE(DRSQ, RSQ);
+ NV50_IR_OPCODE_CASE(DTRUNC, TRUNC);
+ NV50_IR_OPCODE_CASE(DCEIL, CEIL);
+ NV50_IR_OPCODE_CASE(DFLR, FLOOR);
+ NV50_IR_OPCODE_CASE(DROUND, CVT);
+
NV50_IR_OPCODE_CASE(IMUL_HI, MUL);
NV50_IR_OPCODE_CASE(UMUL_HI, MUL);
@@ -2880,6 +2931,151 @@ Converter::handleInstruction(const struct
tgsi_full_instruction *insn)
case TGSI_OPCODE_INTERP_OFFSET:
handleINTERP(dst0);
break;
+ case TGSI_OPCODE_D2F: {
+ int pos = 0;
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ Value *dreg = getSSA(8);
+ src0 = fetchSrc(0, pos);
+ src1 = fetchSrc(0, pos + 1);
+ mkOp2(OP_MERGE, TYPE_U64, dreg, src0, src1);
+ mkCvt(OP_CVT, dstTy, dst0[c], srcTy, dreg);
+ pos += 2;
+ }
+ break;
+ }
+ case TGSI_OPCODE_F2D:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ Value *dreg = getSSA(8);
+ mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2));
+ mkSplit(&dst0[c], 4, dreg);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_DABS:
+ case TGSI_OPCODE_DNEG:
+ case TGSI_OPCODE_DRCP:
+ case TGSI_OPCODE_DSQRT:
+ case TGSI_OPCODE_DRSQ:
+ case TGSI_OPCODE_DTRUNC:
+ case TGSI_OPCODE_DCEIL:
+ case TGSI_OPCODE_DFLR:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ mkOp1(op, dstTy, dst, src0);
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_DFRAC:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ mkOp1(OP_FLOOR, TYPE_F64, dst, src0);
+ mkOp2(OP_SUB, TYPE_F64, dst, src0, dst);
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_DSLT:
+ case TGSI_OPCODE_DSGE:
+ case TGSI_OPCODE_DSEQ:
+ case TGSI_OPCODE_DSNE: {
+ int pos = 0;
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ Value *tmp[2];
+
+ src0 = getSSA(8);
+ src1 = getSSA(8);
+ tmp[0] = fetchSrc(0, pos);
+ tmp[1] = fetchSrc(0, pos + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ tmp[0] = fetchSrc(1, pos);
+ tmp[1] = fetchSrc(1, pos + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]);
+ mkCmp(op, tgsi.getSetCond(), dstTy, dst0[c], srcTy, src0, src1);
+ pos += 2;
+ }
+ break;
+ }
+ case TGSI_OPCODE_DADD:
+ case TGSI_OPCODE_DMUL:
+ case TGSI_OPCODE_DMAX:
+ case TGSI_OPCODE_DMIN:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ src1 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ tmp[0] = fetchSrc(1, c);
+ tmp[1] = fetchSrc(1, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]);
+ mkOp2(op, dstTy, dst, src0, src1);
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_DMAD:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ src1 = getSSA(8);
+ src2 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ tmp[0] = fetchSrc(1, c);
+ tmp[1] = fetchSrc(1, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]);
+ tmp[0] = fetchSrc(2, c);
+ tmp[1] = fetchSrc(2, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src2, tmp[0], tmp[1]);
+ mkOp3(op, dstTy, dst, src0, src1, src2);
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_DROUND:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *dst = getSSA(8), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+ mkCvt(OP_CVT, TYPE_F64, dst, TYPE_F64, src0)
+ ->rnd = ROUND_NI;
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
+ case TGSI_OPCODE_DSSG:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = getSSA(8);
+ Value *dst = getSSA(8), *dstF32 = getSSA(), *tmp[2];
+ tmp[0] = fetchSrc(0, c);
+ tmp[1] = fetchSrc(0, c + 1);
+ mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+
+ val0 = getScratch();
+ val1 = getScratch();
+ // The zero is wrong here since it's only 32-bit, but it works out in
+ // the end since it gets replaced with $r63.
+ mkCmp(OP_SET, CC_GT, TYPE_F32, val0, TYPE_F64, src0, zero);
+ mkCmp(OP_SET, CC_LT, TYPE_F32, val1, TYPE_F64, src0, zero);
+ mkOp2(OP_SUB, TYPE_F32, dstF32, val0, val1);
+ mkCvt(OP_CVT, TYPE_F64, dst, TYPE_F32, dstF32);
+ mkSplit(&dst0[c], 4, dst);
+ c++;
+ }
+ break;
default:
ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode());
assert(0);
--
2.0.5
Ilia Mirkin
2015-Feb-20 01:02 UTC
[Nouveau] [PATCH 10/11] nvc0/ir: remove merge/split pairs to allow normal propagation to occur
Because the TGSI interface creates merges for each instruction source
and then splits them back out, there are a lot of unnecessary
merge/split pairs which do essentially nothing. The various modifier/etc
propagation doesn't know how to walk through those, so just remove them
when they're unnecessary.
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 30 ++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 62d2ef7..6a4ea4e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -118,6 +118,35 @@ CopyPropagation::visit(BasicBlock *bb)
// ============================================================================
+class MergeSplits : public Pass
+{
+private:
+ virtual bool visit(BasicBlock *);
+};
+
+// For SPLIT / MERGE pairs that operate on the same registers, replace the
+// post-merge def with the SPLIT's source.
+bool
+MergeSplits::visit(BasicBlock *bb)
+{
+ Instruction *i, *next, *si;
+
+ for (i = bb->getEntry(); i; i = next) {
+ next = i->next;
+ if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
+ continue;
+ si = i->getSrc(0)->getInsn();
+ if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
+ continue;
+ i->def(0).replace(si->getSrc(0), false);
+ delete_Instruction(prog, i);
+ }
+
+ return true;
+}
+
+// ============================================================================
+
class LoadPropagation : public Pass
{
private:
@@ -2662,6 +2691,7 @@ Program::optimizeSSA(int level)
{
RUN_PASS(1, DeadCodeElim, buryAll);
RUN_PASS(1, CopyPropagation, run);
+ RUN_PASS(1, MergeSplits, run);
RUN_PASS(2, GlobalCSE, run);
RUN_PASS(1, LocalCSE, run);
RUN_PASS(2, AlgebraicOpt, run);
--
2.0.5
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 8546ac8..686d892 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -291,9 +291,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen,
unsigned shader,
case PIPE_SHADER_CAP_INTEGERS:
return 1;
case PIPE_SHADER_CAP_DOUBLES:
- return 0;
+ return 1;
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
- return 0;
+ return 1;
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
return 0;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
--
2.0.5
Seemingly Similar Threads
- [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)
- [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
- [PATCH 1/2] nv50/ir: fix s32 x s32 -> high s32 multiply logic
- [PATCH mesa 0/5] nouveau: codegen: Make use of double immediates
- [PATCH] gm107/ir: fix loading z offset for layered 3d image bindings