V2: clarify code, commit msgs, add comments. Drop code to was supposed to make register assignment prefer SDST == SRC2 (patch 2) for now, because it didn't quite do what I intended.
Roy Spliet
2015-Jan-13  20:35 UTC
[Nouveau] [PATCH 1/3] nv50/ir: Add support for MAD short+IMM notation
Add emission rules for negative and saturate flags for MAD 4-byte opcodes,
and get rid of constraints. Short MAD has a very specific SDST == SSRC2
requirement, and since MAD IMM is short notation + 4-byte immediate, don't
have the compiler create MAD IMM instructions yet.
V2: Document MAD as supported short form
Signed-off-by: Roy Spliet <rspliet at eclipso.eu>
---
 .../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp      | 18 ++++++++++++------
 .../drivers/nouveau/codegen/nv50_ir_target_nv50.cpp    |  4 ++--
 2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 2077388..b1e7409 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -939,9 +939,20 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
 
    code[0] = 0xe0000000;
 
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
+      emitForm_IMM(i);
+      code[0] |= neg_mul << 15;
+      code[0] |= neg_add << 22;
+      if (i->saturate)
+         code[0] |= 1 << 8;
+   } else
    if (i->encSize == 4) {
       emitForm_MUL(i);
-      assert(!neg_mul && !neg_add);
+      code[0] |= neg_mul << 15;
+      code[0] |= neg_add << 22;
+      if (i->saturate)
+         code[0] |= 1 << 8;
    } else {
       code[1]  = neg_mul << 26;
       code[1] |= neg_add << 27;
@@ -1931,11 +1942,6 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i)
const
 
    // check constraints on short MAD
    if (info.srcNr >= 2 && i->srcExists(2)) {
-      if (i->saturate || i->src(2).mod)
-         return 8;
-      if ((i->src(0).mod ^ i->src(1).mod) ||
-          (i->src(0).mod | i->src(1).mod).abs())
-         return 8;
       if (!i->defExists(0) ||
           i->def(0).rep()->reg.data.id !=
i->src(2).rep()->reg.data.id)
          return 8;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 48f996b..178a167 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -117,8 +117,8 @@ void TargetNV50::initOpInfo()
    };
    static const uint32_t shortForm[(OP_LAST + 31) / 32]     {
-      // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF
-      0x00010e40, 0x00000040, 0x00000498, 0x00000000
+      // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF
+      0x00014e40, 0x00000040, 0x00000498, 0x00000000
    };
    static const operation noDestList[]     {
-- 
2.1.0
Add a specific optimisation pass for NV50 to check whether SRC0 or SRC1 is
a MOV dst, IMM. If so: fold the IMM in and try to drop the MOV. Must be
done post-RA because it requires that SDST == SSRC2.
V2: improve readability and add comments to clarify decisions
Signed-off-by: Roy Spliet <rspliet at eclipso.eu>
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 60 ++++++++++++++++++++++
 1 file changed, 60 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 21d20ca..723c255 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2259,6 +2259,63 @@ FlatteningPass::tryPredicateConditional(BasicBlock *bb)
 
 // ============================================================================
+// Fold Immediate into MAD; must be done after register allocation due to
+// constraint SDST == SSRC2
+// TODO:
+// Does NVC0+ have other situations where this pass makes sense?
+class NV50PostRaConstantFolding : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+};
+
+bool
+NV50PostRaConstantFolding::visit(BasicBlock *bb)
+{
+   Value *vtmp;
+   Instruction *def;
+
+   for (Instruction *i = bb->getFirst(); i; i = i->next) {
+      switch (i->op) {
+      case OP_MAD:
+         if(i->def(0).getFile() != FILE_GPR ||
+            i->src(0).getFile() != FILE_GPR ||
+            i->src(1).getFile() != FILE_GPR ||
+            i->src(2).getFile() != FILE_GPR ||
+            i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
+            break;
+
+         for (int s = 0; s < 2; s++) {
+            def = i->getSrc(1)->getInsn();
+            if (def->op == OP_MOV && def->src(0).getFile() ==
FILE_IMMEDIATE) {
+               vtmp = i->getSrc(1);
+               i->setSrc(1, def->getSrc(0));
+
+               /* There's no post-RA dead code elimination, so do it here
+                * XXX: if we add more code-removing post-RA passes, we might
+                *      want to create a post-RA dead-code elim pass */
+               if (vtmp->refCount() == 0)
+                  delete_Instruction(bb->getProgram(), def);
+
+               break;
+            }
+
+            /* Swap inputs, IMM must be SRC1 */
+            vtmp = i->getSrc(0);
+            i->setSrc(0, i->getSrc(1));
+            i->setSrc(1, vtmp);
+         }
+         break;
+      default:
+         break;
+      }
+   }
+
+   return true;
+}
+
+//
============================================================================+
 // Common subexpression elimination. Stupid O^2 implementation.
 class LocalCSE : public Pass
 {
@@ -2629,6 +2686,9 @@ bool
 Program::optimizePostRA(int level)
 {
    RUN_PASS(2, FlatteningPass, run);
+   if (getTarget()->getChipset() < 0xc0)
+      RUN_PASS(2, NV50PostRaConstantFolding, run);
+
    return true;
 }
 
-- 
2.1.0
On Tue, Jan 13, 2015 at 3:35 PM, Roy Spliet <rspliet at eclipso.eu> wrote:> Add a specific optimisation pass for NV50 to check whether SRC0 or SRC1 is > a MOV dst, IMM. If so: fold the IMM in and try to drop the MOV. Must be > done post-RA because it requires that SDST == SSRC2. > > V2: improve readability and add comments to clarify decisions > > Signed-off-by: Roy Spliet <rspliet at eclipso.eu> > --- > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 60 ++++++++++++++++++++++ > 1 file changed, 60 insertions(+) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > index 21d20ca..723c255 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > @@ -2259,6 +2259,63 @@ FlatteningPass::tryPredicateConditional(BasicBlock *bb) > > // ============================================================================> > +// Fold Immediate into MAD; must be done after register allocation due to > +// constraint SDST == SSRC2 > +// TODO: > +// Does NVC0+ have other situations where this pass makes sense? > +class NV50PostRaConstantFolding : public Pass > +{ > +private: > + virtual bool visit(BasicBlock *); > +}; > + > +bool > +NV50PostRaConstantFolding::visit(BasicBlock *bb) > +{ > + Value *vtmp; > + Instruction *def; > + > + for (Instruction *i = bb->getFirst(); i; i = i->next) { > + switch (i->op) { > + case OP_MAD: > + if(i->def(0).getFile() != FILE_GPR ||if (> + i->src(0).getFile() != FILE_GPR || > + i->src(1).getFile() != FILE_GPR || > + i->src(2).getFile() != FILE_GPR || > + i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)|| neither src0 nor src1's def is an immediate load. There's a getImmediate() that may do the trick. Once you know that, you can pre-emptively do the swap (there's a ->swapSources() ) and avoid the loop.> + break; > + > + for (int s = 0; s < 2; s++) { > + def = i->getSrc(1)->getInsn(); > + if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { > + vtmp = i->getSrc(1); > + i->setSrc(1, def->getSrc(0)); > + > + /* There's no post-RA dead code elimination, so do it here > + * XXX: if we add more code-removing post-RA passes, we might > + * want to create a post-RA dead-code elim pass */ > + if (vtmp->refCount() == 0) > + delete_Instruction(bb->getProgram(), def); > + > + break; > + } > + > + /* Swap inputs, IMM must be SRC1 */ > + vtmp = i->getSrc(0); > + i->setSrc(0, i->getSrc(1)); > + i->setSrc(1, vtmp); > + } > + break; > + default: > + break; > + } > + } > + > + return true; > +} > + > +// ============================================================================> + > // Common subexpression elimination. Stupid O^2 implementation. > class LocalCSE : public Pass > { > @@ -2629,6 +2686,9 @@ bool > Program::optimizePostRA(int level) > { > RUN_PASS(2, FlatteningPass, run); > + if (getTarget()->getChipset() < 0xc0) > + RUN_PASS(2, NV50PostRaConstantFolding, run); > + > return true; > } > > -- > 2.1.0 > > > > _______________________________________________ > Nouveau mailing list > Nouveau at lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/nouveau