thr3ads.net - Nouveau - [Nouveau] [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax [Feb 2015]

If this information is useful, please help other people find it:
Share via:

Ilia Mirkin

2015-Feb-20 01:02 UTC

[Nouveau] [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp  | 66 +++++++++++++++++++++-
 1 file changed, 63 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index dfb093c..e38a3b8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -92,11 +92,14 @@ private:
 
    void emitUADD(const Instruction *);
    void emitFADD(const Instruction *);
+   void emitDADD(const Instruction *);
    void emitUMUL(const Instruction *);
    void emitFMUL(const Instruction *);
+   void emitDMUL(const Instruction *);
    void emitIMAD(const Instruction *);
    void emitISAD(const Instruction *);
    void emitFMAD(const Instruction *);
+   void emitDMAD(const Instruction *);
    void emitMADSP(const Instruction *);
 
    void emitNOT(Instruction *);
@@ -523,6 +526,25 @@ CodeEmitterNVC0::emitFMAD(const Instruction *i)
 }
 
 void
+CodeEmitterNVC0::emitDMAD(const Instruction *i)
+{
+   bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   emitForm_A(i, HEX64(20000000, 00000001));
+
+   if (i->src(2).mod.neg())
+      code[0] |= 1 << 8;
+
+   roundMode_A(i);
+
+   if (neg1)
+      code[0] |= 1 << 9;
+
+   assert(!i->saturate);
+   assert(!i->ftz);
+}
+
+void
 CodeEmitterNVC0::emitFMUL(const Instruction *i)
 {
    bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
@@ -557,6 +579,23 @@ CodeEmitterNVC0::emitFMUL(const Instruction *i)
 }
 
 void
+CodeEmitterNVC0::emitDMUL(const Instruction *i)
+{
+   bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   emitForm_A(i, HEX64(50000000, 00000001));
+   roundMode_A(i);
+
+   if (neg)
+      code[0] |= 1 << 9;
+
+   assert(!i->saturate);
+   assert(!i->ftz);
+   assert(!i->dnz);
+   assert(!i->postFactor);
+}
+
+void
 CodeEmitterNVC0::emitUMUL(const Instruction *i)
 {
    if (i->encSize == 8) {
@@ -619,6 +658,19 @@ CodeEmitterNVC0::emitFADD(const Instruction *i)
 }
 
 void
+CodeEmitterNVC0::emitDADD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   emitForm_A(i, HEX64(48000000, 00000001));
+   roundMode_A(i);
+   assert(!i->saturate);
+   assert(!i->ftz);
+   emitNegAbs12(i);
+   if (i->op == OP_SUB)
+      code[0] ^= 1 << 8;
+}
+
+void
 CodeEmitterNVC0::emitUADD(const Instruction *i)
 {
    uint32_t addOp = 0;
@@ -895,6 +947,8 @@ CodeEmitterNVC0::emitMINMAX(const Instruction *i)
    else
    if (!isFloatType(i->dType))
       op |= isSignedType(i->dType) ? 0x23 : 0x03;
+   if (i->dType == TYPE_F64)
+      op |= 0x01;
 
    emitForm_A(i, op);
    emitNegAbs12(i);
@@ -2242,20 +2296,26 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
       break;
    case OP_ADD:
    case OP_SUB:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDADD(insn);
+      else if (isFloatType(insn->dType))
          emitFADD(insn);
       else
          emitUADD(insn);
       break;
    case OP_MUL:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMUL(insn);
+      else if (isFloatType(insn->dType))
          emitFMUL(insn);
       else
          emitUMUL(insn);
       break;
    case OP_MAD:
    case OP_FMA:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMAD(insn);
+      else if (isFloatType(insn->dType))
          emitFMAD(insn);
       else
          emitIMAD(insn);
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 02/11] gk110/ir: add emission of dadd/dmul/dmad opcodes

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 80 +++++++++++++++++++++-
 1 file changed, 77 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index d8adc93..204d911 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -84,11 +84,14 @@ private:
 
    void emitUADD(const Instruction *);
    void emitFADD(const Instruction *);
+   void emitDADD(const Instruction *);
    void emitIMUL(const Instruction *);
    void emitFMUL(const Instruction *);
+   void emitDMUL(const Instruction *);
    void emitIMAD(const Instruction *);
    void emitISAD(const Instruction *);
    void emitFMAD(const Instruction *);
+   void emitDMAD(const Instruction *);
 
    void emitNOT(const Instruction *);
    void emitLogicOp(const Instruction *, uint8_t subOp);
@@ -479,6 +482,28 @@ CodeEmitterGK110::emitFMAD(const Instruction *i)
 }
 
 void
+CodeEmitterGK110::emitDMAD(const Instruction *i)
+{
+   assert(!i->saturate);
+   assert(!i->ftz);
+
+   emitForm_21(i, 0x1b8, 0xb38);
+
+   NEG_(34, 2);
+   RND_(36, F);
+
+   bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   if (code[0] & 0x1) {
+      if (neg1)
+         code[1] ^= 1 << 27;
+   } else
+   if (neg1) {
+      code[1] |= 1 << 19;
+   }
+}
+
+void
 CodeEmitterGK110::emitFMUL(const Instruction *i)
 {
    bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
@@ -516,6 +541,29 @@ CodeEmitterGK110::emitFMUL(const Instruction *i)
 }
 
 void
+CodeEmitterGK110::emitDMUL(const Instruction *i)
+{
+   bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   assert(!i->postFactor);
+   assert(!i->saturate);
+   assert(!i->ftz);
+   assert(!i->dnz);
+
+   emitForm_21(i, 0x240, 0xc40);
+
+   RND_(2a, F);
+
+   if (code[0] & 0x1) {
+      if (neg)
+         code[1] ^= 1 << 27;
+   } else
+   if (neg) {
+      code[1] |= 1 << 19;
+   }
+}
+
+void
 CodeEmitterGK110::emitIMUL(const Instruction *i)
 {
    assert(!i->src(0).mod.neg() && !i->src(1).mod.neg());
@@ -574,6 +622,26 @@ CodeEmitterGK110::emitFADD(const Instruction *i)
 }
 
 void
+CodeEmitterGK110::emitDADD(const Instruction *i)
+{
+   assert(!i->saturate);
+   assert(!i->ftz);
+
+   emitForm_21(i, 0x238, 0xc38);
+   RND_(2a, F);
+   ABS_(31, 0);
+   NEG_(33, 0);
+   if (code[0] & 0x1) {
+      modNegAbsF32_3b(i, 1);
+      if (i->op == OP_SUB) code[1] ^= 1 << 27;
+   } else {
+      NEG_(30, 1);
+      ABS_(34, 1);
+      if (i->op == OP_SUB) code[1] ^= 1 << 16;
+   }
+}
+
+void
 CodeEmitterGK110::emitUADD(const Instruction *i)
 {
    uint8_t addOp = (i->src(0).mod.neg() << 1) |
i->src(1).mod.neg();
@@ -1634,20 +1702,26 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
       break;
    case OP_ADD:
    case OP_SUB:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDADD(insn);
+      else if (isFloatType(insn->dType))
          emitFADD(insn);
       else
          emitUADD(insn);
       break;
    case OP_MUL:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMUL(insn);
+      else if (isFloatType(insn->dType))
          emitFMUL(insn);
       else
          emitIMUL(insn);
       break;
    case OP_MAD:
    case OP_FMA:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMAD(insn);
+      else if (isFloatType(insn->dType))
          emitFMAD(insn);
       else
          emitIMAD(insn);
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 03/11] gm107/ir: fix DMUL opcode encoding

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 944ceb2..9f4c435 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -924,15 +924,15 @@ CodeEmitterGM107::emitDMUL()
 {
    switch (insn->src(1).getFile()) {
    case FILE_GPR:
-      emitInsn(0x5c680000);
+      emitInsn(0x5c800000);
       emitGPR (0x14, insn->src(1));
       break;
    case FILE_MEMORY_CONST:
-      emitInsn(0x4c680000);
+      emitInsn(0x4c800000);
       emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1));
       break;
    case FILE_IMMEDIATE:
-      emitInsn(0x38680000);
+      emitInsn(0x38800000);
       emitIMMD(0x14, 19, insn->src(1));
       break;
    default:
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 04/11] gm107/ir: fix DSET boolean float flag

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 9f4c435..73a65fa 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1060,6 +1060,7 @@ CodeEmitterGM107::emitDSET()
 
    emitABS  (0x36, insn->src(0));
    emitNEG  (0x35, insn->src(1));
+   emitField(0x34, 1, insn->dType == TYPE_F32);
    emitCond4(0x30, insn->setCond);
    emitCC   (0x2f);
    emitABS  (0x2c, insn->src(1));
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 05/11] gm107/ir: fix F2F flipped stype/dtype flags

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 73a65fa..3e1da7e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -731,8 +731,8 @@ CodeEmitterGM107::emitF2F()
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
    emitFMZ  (0x2c, 1);
    emitRND  (0x27, rnd, 0x2a);
-   emitField(0x0a, 2, util_logbase2(typeSizeof(insn->dType)));
-   emitField(0x08, 2, util_logbase2(typeSizeof(insn->sType)));
+   emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
+   emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType)));
    emitGPR  (0x00, insn->def(0));
 }
 
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 06/11] nvc0/ir: fix lowering of RSQ/RCP/SQRT/MOD to work with F64

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.h      |  1 +
 .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp |  4 +-
 .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp |  4 +-
 .../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp  |  4 +-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 43 +++++++++++++++++-----
 5 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 0ff5e5d..529dcb9 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -175,6 +175,7 @@ enum operation
 #define NV50_IR_SUBOP_MOV_FINAL    1
 #define NV50_IR_SUBOP_EXTBF_REV    1
 #define NV50_IR_SUBOP_BFIND_SAMT   1
+#define NV50_IR_SUBOP_RCPRSQ_64H   1
 #define NV50_IR_SUBOP_PERMT_F4E    1
 #define NV50_IR_SUBOP_PERMT_B4E    2
 #define NV50_IR_SUBOP_PERMT_RC8    3
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 204d911..674be69 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1771,10 +1771,10 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
       emitCVT(insn);
       break;
    case OP_RSQ:
-      emitSFnOp(insn, 5);
+      emitSFnOp(insn, 5 + 2 * insn->subOp);
       break;
    case OP_RCP:
-      emitSFnOp(insn, 4);
+      emitSFnOp(insn, 4 + 2 * insn->subOp);
       break;
    case OP_LG2:
       emitSFnOp(insn, 3);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 3e1da7e..ee0487f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1265,8 +1265,8 @@ CodeEmitterGM107::emitMUFU()
    case OP_SIN: mufu = 1; break;
    case OP_EX2: mufu = 2; break;
    case OP_LG2: mufu = 3; break;
-   case OP_RCP: mufu = 4; break;
-   case OP_RSQ: mufu = 5; break;
+   case OP_RCP: mufu = 4 + 2 * insn->subOp; break;
+   case OP_RSQ: mufu = 5 + 2 * insn->subOp; break;
    default:
       assert(!"invalid mufu");
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index e38a3b8..1a4f6e0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2365,10 +2365,10 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
       emitCVT(insn);
       break;
    case OP_RSQ:
-      emitSFnOp(insn, 5);
+      emitSFnOp(insn, 5 + 2 * insn->subOp);
       break;
    case OP_RCP:
-      emitSFnOp(insn, 4);
+      emitSFnOp(insn, 4 + 2 * insn->subOp);
       break;
    case OP_LG2:
       emitSFnOp(insn, 3);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 5dfb777..8ac3b26 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -70,7 +70,30 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
 void
 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
 {
-   // TODO
+   assert(i->dType == TYPE_F64);
+   // There are instructions that will compute the high 32 bits of the 64-bit
+   // float. We will just stick 0 in the bottom 32 bits.
+
+   bld.setPosition(i, false);
+
+   // 1. Take the source and it up.
+   Value *src[2], *dst[2], *def = i->getDef(0);
+   bld.mkSplit(src, 4, i->getSrc(0));
+
+   // 2. We don't care about the low 32 bits of the destination. Stick a 0
in.
+   dst[0] = bld.loadImm(NULL, 0);
+   dst[1] = bld.getSSA();
+
+   // 3. The new version of the instruction takes the high 32 bits of the
+   // source and outputs the high 32 bits of the destination.
+   i->setSrc(0, src[1]);
+   i->setDef(0, dst[1]);
+   i->setType(TYPE_F32);
+   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
+
+   // 4. Recombine the two dst pieces back into the original destination.
+   bld.setPosition(i, true);
+   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
 }
 
 bool
@@ -1520,7 +1543,7 @@ NVC0LoweringPass::handleDIV(Instruction *i)
    if (!isFloatType(i->dType))
       return true;
    bld.setPosition(i, false);
-   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(),
i->getSrc(1));
+   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType,
bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
    i->op = OP_MUL;
    i->setSrc(1, rcp->getDef(0));
    return true;
@@ -1529,13 +1552,13 @@ NVC0LoweringPass::handleDIV(Instruction *i)
 bool
 NVC0LoweringPass::handleMOD(Instruction *i)
 {
-   if (i->dType != TYPE_F32)
+   if (!isFloatType(i->dType))
       return true;
-   LValue *value = bld.getScratch();
-   bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
-   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
-   bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
-   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
+   LValue *value = bld.getScratch(typeSizeof(i->dType));
+   bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
+   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
+   bld.mkOp1(OP_TRUNC, i->dType, value, value);
+   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
    i->op = OP_SUB;
    i->setSrc(1, value);
    return true;
@@ -1544,8 +1567,8 @@ NVC0LoweringPass::handleMOD(Instruction *i)
 bool
 NVC0LoweringPass::handleSQRT(Instruction *i)
 {
-   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
-                                bld.getSSA(), i->getSrc(0));
+   Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType,
+                                bld.getSSA(typeSizeof(i->dType)),
i->getSrc(0));
    i->op = OP_MUL;
    i->setSrc(1, rsq->getDef(0));
 
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 07/11] nvc0/ir: no instruction can load a double immediate

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 817ceb8..7d4a859 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -337,6 +337,8 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
    if (sf == FILE_IMMEDIATE) {
       Storage &reg = ld->getSrc(0)->asImm()->reg;
 
+      if (typeSizeof(i->sType) > 4)
+         return false;
       if (opInfo[i->op].immdBits != 0xffffffff) {
          if (i->sType == TYPE_F32) {
             if (reg.data.u32 & 0xfff)
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 08/11] nvc0/ir: handle zero and negative sqrt arguments

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp    | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 8ac3b26..18e8e67 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1567,10 +1567,22 @@ NVC0LoweringPass::handleMOD(Instruction *i)
 bool
 NVC0LoweringPass::handleSQRT(Instruction *i)
 {
-   Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType,
-                                bld.getSSA(typeSizeof(i->dType)),
i->getSrc(0));
+   Value *pred = bld.getSSA(1, FILE_PREDICATE);
+   Value *zero = bld.getSSA();
+   Instruction *rsq;
+
+   bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
+   if (i->dType == TYPE_F64)
+      zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
+   bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0),
zero);
+   bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P,
pred);
+   rsq = bld.mkOp1(OP_RSQ, i->dType,
+                   bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
+   rsq->setPredicate(CC_NOT_P, pred);
    i->op = OP_MUL;
    i->setSrc(1, rsq->getDef(0));
+   i->setPredicate(CC_NOT_P, pred);
+
 
    return true;
 }
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 09/11] nvc0/ir: add support for new TGSI double opcodes (v2)

v2: drop DDIV

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  | 196 +++++++++++++++++++++
 1 file changed, 196 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 9ee927f..028a17e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -441,6 +441,27 @@ nv50_ir::DataType Instruction::inferSrcType() const
    case TGSI_OPCODE_IBFE:
    case TGSI_OPCODE_IMSB:
       return nv50_ir::TYPE_S32;
+   case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_DABS:
+   case TGSI_OPCODE_DNEG:
+   case TGSI_OPCODE_DADD:
+   case TGSI_OPCODE_DMUL:
+   case TGSI_OPCODE_DMAX:
+   case TGSI_OPCODE_DMIN:
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSGE:
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSNE:
+   case TGSI_OPCODE_DRCP:
+   case TGSI_OPCODE_DSQRT:
+   case TGSI_OPCODE_DMAD:
+   case TGSI_OPCODE_DFRAC:
+   case TGSI_OPCODE_DRSQ:
+   case TGSI_OPCODE_DTRUNC:
+   case TGSI_OPCODE_DCEIL:
+   case TGSI_OPCODE_DFLR:
+   case TGSI_OPCODE_DROUND:
+      return nv50_ir::TYPE_F64;
    default:
       return nv50_ir::TYPE_F32;
    }
@@ -455,10 +476,17 @@ nv50_ir::DataType Instruction::inferDstType() const
    case TGSI_OPCODE_FSGE:
    case TGSI_OPCODE_FSLT:
    case TGSI_OPCODE_FSNE:
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSGE:
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSNE:
       return nv50_ir::TYPE_U32;
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_U2F:
+   case TGSI_OPCODE_D2F:
       return nv50_ir::TYPE_F32;
+   case TGSI_OPCODE_F2D:
+      return nv50_ir::TYPE_F64;
    default:
       return inferSrcType();
    }
@@ -473,6 +501,7 @@ nv50_ir::CondCode Instruction::getSetCond() const
    case TGSI_OPCODE_ISLT:
    case TGSI_OPCODE_USLT:
    case TGSI_OPCODE_FSLT:
+   case TGSI_OPCODE_DSLT:
       return CC_LT;
    case TGSI_OPCODE_SLE:
       return CC_LE;
@@ -480,15 +509,18 @@ nv50_ir::CondCode Instruction::getSetCond() const
    case TGSI_OPCODE_ISGE:
    case TGSI_OPCODE_USGE:
    case TGSI_OPCODE_FSGE:
+   case TGSI_OPCODE_DSGE:
       return CC_GE;
    case TGSI_OPCODE_SGT:
       return CC_GT;
    case TGSI_OPCODE_SEQ:
    case TGSI_OPCODE_USEQ:
    case TGSI_OPCODE_FSEQ:
+   case TGSI_OPCODE_DSEQ:
       return CC_EQ;
    case TGSI_OPCODE_SNE:
    case TGSI_OPCODE_FSNE:
+   case TGSI_OPCODE_DSNE:
       return CC_NEU;
    case TGSI_OPCODE_USNE:
       return CC_NE;
@@ -601,6 +633,25 @@ static nv50_ir::operation translateOpcode(uint opcode)
    NV50_IR_OPCODE_CASE(USLT, SET);
    NV50_IR_OPCODE_CASE(USNE, SET);
 
+   NV50_IR_OPCODE_CASE(DABS, ABS);
+   NV50_IR_OPCODE_CASE(DNEG, NEG);
+   NV50_IR_OPCODE_CASE(DADD, ADD);
+   NV50_IR_OPCODE_CASE(DMUL, MUL);
+   NV50_IR_OPCODE_CASE(DMAX, MAX);
+   NV50_IR_OPCODE_CASE(DMIN, MIN);
+   NV50_IR_OPCODE_CASE(DSLT, SET);
+   NV50_IR_OPCODE_CASE(DSGE, SET);
+   NV50_IR_OPCODE_CASE(DSEQ, SET);
+   NV50_IR_OPCODE_CASE(DSNE, SET);
+   NV50_IR_OPCODE_CASE(DRCP, RCP);
+   NV50_IR_OPCODE_CASE(DSQRT, SQRT);
+   NV50_IR_OPCODE_CASE(DMAD, MAD);
+   NV50_IR_OPCODE_CASE(DRSQ, RSQ);
+   NV50_IR_OPCODE_CASE(DTRUNC, TRUNC);
+   NV50_IR_OPCODE_CASE(DCEIL, CEIL);
+   NV50_IR_OPCODE_CASE(DFLR, FLOOR);
+   NV50_IR_OPCODE_CASE(DROUND, CVT);
+
    NV50_IR_OPCODE_CASE(IMUL_HI, MUL);
    NV50_IR_OPCODE_CASE(UMUL_HI, MUL);
 
@@ -2880,6 +2931,151 @@ Converter::handleInstruction(const struct
tgsi_full_instruction *insn)
    case TGSI_OPCODE_INTERP_OFFSET:
       handleINTERP(dst0);
       break;
+   case TGSI_OPCODE_D2F: {
+      int pos = 0;
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         Value *dreg = getSSA(8);
+         src0 = fetchSrc(0, pos);
+         src1 = fetchSrc(0, pos + 1);
+         mkOp2(OP_MERGE, TYPE_U64, dreg, src0, src1);
+         mkCvt(OP_CVT, dstTy, dst0[c], srcTy, dreg);
+         pos += 2;
+      }
+      break;
+   }
+   case TGSI_OPCODE_F2D:
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         Value *dreg = getSSA(8);
+         mkCvt(OP_CVT, dstTy, dreg, srcTy, fetchSrc(0, c / 2));
+         mkSplit(&dst0[c], 4, dreg);
+         c++;
+      }
+      break;
+   case TGSI_OPCODE_DABS:
+   case TGSI_OPCODE_DNEG:
+   case TGSI_OPCODE_DRCP:
+   case TGSI_OPCODE_DSQRT:
+   case TGSI_OPCODE_DRSQ:
+   case TGSI_OPCODE_DTRUNC:
+   case TGSI_OPCODE_DCEIL:
+   case TGSI_OPCODE_DFLR:
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         src0 = getSSA(8);
+         Value *dst = getSSA(8), *tmp[2];
+         tmp[0] = fetchSrc(0, c);
+         tmp[1] = fetchSrc(0, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+         mkOp1(op, dstTy, dst, src0);
+         mkSplit(&dst0[c], 4, dst);
+         c++;
+      }
+      break;
+   case TGSI_OPCODE_DFRAC:
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         src0 = getSSA(8);
+         Value *dst = getSSA(8), *tmp[2];
+         tmp[0] = fetchSrc(0, c);
+         tmp[1] = fetchSrc(0, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+         mkOp1(OP_FLOOR, TYPE_F64, dst, src0);
+         mkOp2(OP_SUB, TYPE_F64, dst, src0, dst);
+         mkSplit(&dst0[c], 4, dst);
+         c++;
+      }
+      break;
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSGE:
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSNE: {
+      int pos = 0;
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         Value *tmp[2];
+
+         src0 = getSSA(8);
+         src1 = getSSA(8);
+         tmp[0] = fetchSrc(0, pos);
+         tmp[1] = fetchSrc(0, pos + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+         tmp[0] = fetchSrc(1, pos);
+         tmp[1] = fetchSrc(1, pos + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]);
+         mkCmp(op, tgsi.getSetCond(), dstTy, dst0[c], srcTy, src0, src1);
+         pos += 2;
+      }
+      break;
+   }
+   case TGSI_OPCODE_DADD:
+   case TGSI_OPCODE_DMUL:
+   case TGSI_OPCODE_DMAX:
+   case TGSI_OPCODE_DMIN:
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         src0 = getSSA(8);
+         src1 = getSSA(8);
+         Value *dst = getSSA(8), *tmp[2];
+         tmp[0] = fetchSrc(0, c);
+         tmp[1] = fetchSrc(0, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+         tmp[0] = fetchSrc(1, c);
+         tmp[1] = fetchSrc(1, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]);
+         mkOp2(op, dstTy, dst, src0, src1);
+         mkSplit(&dst0[c], 4, dst);
+         c++;
+      }
+      break;
+   case TGSI_OPCODE_DMAD:
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         src0 = getSSA(8);
+         src1 = getSSA(8);
+         src2 = getSSA(8);
+         Value *dst = getSSA(8), *tmp[2];
+         tmp[0] = fetchSrc(0, c);
+         tmp[1] = fetchSrc(0, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+         tmp[0] = fetchSrc(1, c);
+         tmp[1] = fetchSrc(1, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src1, tmp[0], tmp[1]);
+         tmp[0] = fetchSrc(2, c);
+         tmp[1] = fetchSrc(2, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src2, tmp[0], tmp[1]);
+         mkOp3(op, dstTy, dst, src0, src1, src2);
+         mkSplit(&dst0[c], 4, dst);
+         c++;
+      }
+      break;
+   case TGSI_OPCODE_DROUND:
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         src0 = getSSA(8);
+         Value *dst = getSSA(8), *tmp[2];
+         tmp[0] = fetchSrc(0, c);
+         tmp[1] = fetchSrc(0, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+         mkCvt(OP_CVT, TYPE_F64, dst, TYPE_F64, src0)
+         ->rnd = ROUND_NI;
+         mkSplit(&dst0[c], 4, dst);
+         c++;
+      }
+      break;
+   case TGSI_OPCODE_DSSG:
+      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+         src0 = getSSA(8);
+         Value *dst = getSSA(8), *dstF32 = getSSA(), *tmp[2];
+         tmp[0] = fetchSrc(0, c);
+         tmp[1] = fetchSrc(0, c + 1);
+         mkOp2(OP_MERGE, TYPE_U64, src0, tmp[0], tmp[1]);
+
+         val0 = getScratch();
+         val1 = getScratch();
+         // The zero is wrong here since it's only 32-bit, but it works out
in
+         // the end since it gets replaced with $r63.
+         mkCmp(OP_SET, CC_GT, TYPE_F32, val0, TYPE_F64, src0, zero);
+         mkCmp(OP_SET, CC_LT, TYPE_F32, val1, TYPE_F64, src0, zero);
+         mkOp2(OP_SUB, TYPE_F32, dstF32, val0, val1);
+         mkCvt(OP_CVT, TYPE_F64, dst, TYPE_F32, dstF32);
+         mkSplit(&dst0[c], 4, dst);
+         c++;
+      }
+      break;
    default:
       ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode());
       assert(0);
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 10/11] nvc0/ir: remove merge/split pairs to allow normal propagation to occur

Because the TGSI interface creates merges for each instruction source
and then splits them back out, there are a lot of unnecessary
merge/split pairs which do essentially nothing. The various modifier/etc
propagation doesn't know how to walk though those, so just remove them
when they're unnecessary.

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 62d2ef7..6a4ea4e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -118,6 +118,35 @@ CopyPropagation::visit(BasicBlock *bb)
 
 // ============================================================================
+class MergeSplits : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+};
+
+// For SPLIT / MERGE pairs that operate on the same registers, replace the
+// post-merge def with the SPLIT's source.
+bool
+MergeSplits::visit(BasicBlock *bb)
+{
+   Instruction *i, *next, *si;
+
+   for (i = bb->getEntry(); i; i = next) {
+      next = i->next;
+      if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
+         continue;
+      si = i->getSrc(0)->getInsn();
+      if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
+         continue;
+      i->def(0).replace(si->getSrc(0), false);
+      delete_Instruction(prog, i);
+   }
+
+   return true;
+}
+
+//
============================================================================+
 class LoadPropagation : public Pass
 {
 private:
@@ -2662,6 +2691,7 @@ Program::optimizeSSA(int level)
 {
    RUN_PASS(1, DeadCodeElim, buryAll);
    RUN_PASS(1, CopyPropagation, run);
+   RUN_PASS(1, MergeSplits, run);
    RUN_PASS(2, GlobalCSE, run);
    RUN_PASS(1, LocalCSE, run);
    RUN_PASS(2, AlgebraicOpt, run);
-- 
2.0.5

Ilia Mirkin

2015-Feb-20 01:02 UTC

head link

[Nouveau] [PATCH 11/11] nvc0: enable double support

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 8546ac8..686d892 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -291,9 +291,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen,
unsigned shader,
    case PIPE_SHADER_CAP_INTEGERS:
       return 1;
    case PIPE_SHADER_CAP_DOUBLES:
-      return 0;
+      return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
-      return 0;
+      return 1;
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       return 0;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
-- 
2.0.5

Apparently Analagous Threads

Search for more maybe matching threads

Nouveau - Feb 2015 - [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax

[Nouveau] [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax

[Nouveau] [PATCH 02/11] gk110/ir: add emission of dadd/dmul/dmad opcodes

[Nouveau] [PATCH 03/11] gm107/ir: fix DMUL opcode encoding

[Nouveau] [PATCH 04/11] gm107/ir: fix DSET boolean float flag

[Nouveau] [PATCH 05/11] gm107/ir: fix F2F flipped stype/dtype flags

[Nouveau] [PATCH 06/11] nvc0/ir: fix lowering of RSQ/RCP/SQRT/MOD to work with F64

[Nouveau] [PATCH 07/11] nvc0/ir: no instruction can load a double immediate

[Nouveau] [PATCH 08/11] nvc0/ir: handle zero and negative sqrt arguments

[Nouveau] [PATCH 09/11] nvc0/ir: add support for new TGSI double opcodes (v2)

[Nouveau] [PATCH 10/11] nvc0/ir: remove merge/split pairs to allow normal propagation to occur

[Nouveau] [PATCH 11/11] nvc0: enable double support

Apparently Analagous Threads