Roy Spliet
2015-Jan-23 21:46 UTC
[Nouveau] [PATCH 1/2] nv50/ir: Add support for MAD short+IMM notation
Add emission rules for negative and saturate flags for MAD 4-byte opcodes, and get rid of constraints. Short MAD has a very specific SDST == SSRC2 requirement, and since MAD IMM is short notation + 4-byte immediate, don't have the compiler create MAD IMM instructions yet. V2: Document MAD as supported short form Signed-off-by: Roy Spliet <rspliet at eclipso.eu> --- .../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 18 ++++++++++++------ .../drivers/nouveau/codegen/nv50_ir_target_nv50.cpp | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 2077388..b1e7409 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -939,9 +939,20 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) code[0] = 0xe0000000; + if (i->src(1).getFile() == FILE_IMMEDIATE) { + code[1] = 0; + emitForm_IMM(i); + code[0] |= neg_mul << 15; + code[0] |= neg_add << 22; + if (i->saturate) + code[0] |= 1 << 8; + } else if (i->encSize == 4) { emitForm_MUL(i); - assert(!neg_mul && !neg_add); + code[0] |= neg_mul << 15; + code[0] |= neg_add << 22; + if (i->saturate) + code[0] |= 1 << 8; } else { code[1] = neg_mul << 26; code[1] |= neg_add << 27; @@ -1931,11 +1942,6 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const // check constraints on short MAD if (info.srcNr >= 2 && i->srcExists(2)) { - if (i->saturate || i->src(2).mod) - return 8; - if ((i->src(0).mod ^ i->src(1).mod) || - (i->src(0).mod | i->src(1).mod).abs()) - return 8; if (!i->defExists(0) || i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id) return 8; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 48f996b..178a167 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -117,8 +117,8 @@ void TargetNV50::initOpInfo() }; static const uint32_t shortForm[(OP_LAST + 31) / 32] { - // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF - 0x00010e40, 0x00000040, 0x00000498, 0x00000000 + // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF + 0x00014e40, 0x00000040, 0x00000498, 0x00000000 }; static const operation noDestList[] { -- 2.1.0
Add a specific optimisation pass for NV50 to check whether SRC0 or SRC1 is a MOV dst, IMM. If so: fold the IMM in and try to drop the MOV. Must be done post-RA because it requires that SDST == SSRC2. V2: improve readability and add comments to clarify decisions V3: Remove redundant code... compiler already attempts to put the IMM in SSRC1 Signed-off-by: Roy Spliet <rspliet at eclipso.eu> --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 53 ++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 21d20ca..62d2ef7 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -2259,6 +2259,56 @@ FlatteningPass::tryPredicateConditional(BasicBlock *bb) // ============================================================================ +// Fold Immediate into MAD; must be done after register allocation due to +// constraint SDST == SSRC2 +// TODO: +// Does NVC0+ have other situations where this pass makes sense? +class NV50PostRaConstantFolding : public Pass +{ +private: + virtual bool visit(BasicBlock *); +}; + +bool +NV50PostRaConstantFolding::visit(BasicBlock *bb) +{ + Value *vtmp; + Instruction *def; + + for (Instruction *i = bb->getFirst(); i; i = i->next) { + switch (i->op) { + case OP_MAD: + if (i->def(0).getFile() != FILE_GPR || + i->src(0).getFile() != FILE_GPR || + i->src(1).getFile() != FILE_GPR || + i->src(2).getFile() != FILE_GPR || + i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id) + break; + + def = i->getSrc(1)->getInsn(); + if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { + vtmp = i->getSrc(1); + i->setSrc(1, def->getSrc(0)); + + /* There's no post-RA dead code elimination, so do it here + * XXX: if we add more code-removing post-RA passes, we might + * want to create a post-RA dead-code elim pass */ + if (vtmp->refCount() == 0) + delete_Instruction(bb->getProgram(), def); + + break; + } + break; + default: + break; + } + } + + return true; +} + +// ============================================================================+ // Common subexpression elimination. Stupid O^2 implementation. class LocalCSE : public Pass { @@ -2629,6 +2679,9 @@ bool Program::optimizePostRA(int level) { RUN_PASS(2, FlatteningPass, run); + if (getTarget()->getChipset() < 0xc0) + RUN_PASS(2, NV50PostRaConstantFolding, run); + return true; } -- 2.1.0
Ilia Mirkin
2015-Jan-23 22:19 UTC
[Nouveau] [PATCH 1/2] nv50/ir: Add support for MAD short+IMM notation
On Fri, Jan 23, 2015 at 1:46 PM, Roy Spliet <rspliet at eclipso.eu> wrote:> Add emission rules for negative and saturate flags for MAD 4-byte opcodes, > and get rid of constraints. Short MAD has a very specific SDST == SSRC2 > requirement, and since MAD IMM is short notation + 4-byte immediate, don't > have the compiler create MAD IMM instructions yet. > > V2: Document MAD as supported short form > > Signed-off-by: Roy Spliet <rspliet at eclipso.eu> > --- > .../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 18 ++++++++++++------ > .../drivers/nouveau/codegen/nv50_ir_target_nv50.cpp | 4 ++-- > 2 files changed, 14 insertions(+), 8 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp > index 2077388..b1e7409 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp > @@ -939,9 +939,20 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) > > code[0] = 0xe0000000; > > + if (i->src(1).getFile() == FILE_IMMEDIATE) { > + code[1] = 0; > + emitForm_IMM(i); > + code[0] |= neg_mul << 15; > + code[0] |= neg_add << 22; > + if (i->saturate) > + code[0] |= 1 << 8; > + } else > if (i->encSize == 4) { > emitForm_MUL(i); > - assert(!neg_mul && !neg_add); > + code[0] |= neg_mul << 15; > + code[0] |= neg_add << 22; > + if (i->saturate) > + code[0] |= 1 << 8; > } else { > code[1] = neg_mul << 26; > code[1] |= neg_add << 27; > @@ -1931,11 +1942,6 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const > > // check constraints on short MAD > if (info.srcNr >= 2 && i->srcExists(2)) { > - if (i->saturate || i->src(2).mod) > - return 8; > - if ((i->src(0).mod ^ i->src(1).mod) || > - (i->src(0).mod | i->src(1).mod).abs()) > - return 8;Please confirm that you tested shaders with these specific instructions being generated, i.e. short form mad + sat and mad + neg/abs. Separately, please split this (and related bits) from the mad + imm change, which is separate. When there's a regression because of this, I want the bisected change to say "enable sat/neg/abs for mad short form", not "add mad imm but don't enable it" :)> if (!i->defExists(0) || > i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id) > return 8; > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp > index 48f996b..178a167 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp > @@ -117,8 +117,8 @@ void TargetNV50::initOpInfo() > }; > static const uint32_t shortForm[(OP_LAST + 31) / 32] > { > - // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF > - 0x00010e40, 0x00000040, 0x00000498, 0x00000000 > + // MOV,ADD,SUB,MUL,MAD,SAD,L/PINTERP,RCP,TEX,TXF > + 0x00014e40, 0x00000040, 0x00000498, 0x00000000 > }; > static const operation noDestList[] > { > -- > 2.1.0 > > > > _______________________________________________ > Nouveau mailing list > Nouveau at lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/nouveau
On Fri, Jan 23, 2015 at 1:46 PM, Roy Spliet <rspliet at eclipso.eu> wrote:> Add a specific optimisation pass for NV50 to check whether SRC0 or SRC1 is > a MOV dst, IMM. If so: fold the IMM in and try to drop the MOV. Must be > done post-RA because it requires that SDST == SSRC2. > > V2: improve readability and add comments to clarify decisions > V3: Remove redundant code... compiler already attempts to put the IMM in > SSRC1 > > Signed-off-by: Roy Spliet <rspliet at eclipso.eu>Reviewed-by: Ilia Mirkin <imirkin at alum.mit.edu>> --- > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 53 ++++++++++++++++++++++ > 1 file changed, 53 insertions(+) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > index 21d20ca..62d2ef7 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > @@ -2259,6 +2259,56 @@ FlatteningPass::tryPredicateConditional(BasicBlock *bb) > > // ============================================================================> > +// Fold Immediate into MAD; must be done after register allocation due to > +// constraint SDST == SSRC2 > +// TODO: > +// Does NVC0+ have other situations where this pass makes sense? > +class NV50PostRaConstantFolding : public Pass > +{ > +private: > + virtual bool visit(BasicBlock *); > +}; > + > +bool > +NV50PostRaConstantFolding::visit(BasicBlock *bb) > +{ > + Value *vtmp; > + Instruction *def; > + > + for (Instruction *i = bb->getFirst(); i; i = i->next) { > + switch (i->op) { > + case OP_MAD: > + if (i->def(0).getFile() != FILE_GPR || > + i->src(0).getFile() != FILE_GPR || > + i->src(1).getFile() != FILE_GPR || > + i->src(2).getFile() != FILE_GPR || > + i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id) > + break; > + > + def = i->getSrc(1)->getInsn(); > + if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { > + vtmp = i->getSrc(1); > + i->setSrc(1, def->getSrc(0)); > + > + /* There's no post-RA dead code elimination, so do it here > + * XXX: if we add more code-removing post-RA passes, we might > + * want to create a post-RA dead-code elim pass */ > + if (vtmp->refCount() == 0) > + delete_Instruction(bb->getProgram(), def); > + > + break; > + } > + break; > + default: > + break; > + } > + } > + > + return true; > +} > + > +// ============================================================================> + > // Common subexpression elimination. Stupid O^2 implementation. > class LocalCSE : public Pass > { > @@ -2629,6 +2679,9 @@ bool > Program::optimizePostRA(int level) > { > RUN_PASS(2, FlatteningPass, run); > + if (getTarget()->getChipset() < 0xc0) > + RUN_PASS(2, NV50PostRaConstantFolding, run); > + > return true; > } > > -- > 2.1.0 > > > > _______________________________________________ > Nouveau mailing list > Nouveau at lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/nouveau