Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 0/5] nvc0/ir: add support for MAD/FMA PostRALoadPropagation
This series was previously titled "nv50/ir: PostRaConstantFolding improvements". Nothing substantial changed since the last version, only minor things. Karol Herbst (5): nv50/ir: restructure and rename postraconstantfolding pass nv50/ir: implement mad post ra folding for nvc0+ gk110/ir: add LIMM form of mad gm107/ir: add LIMM form of mad nv50/ir: also do PostRaLoadPropagation for FMA .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 50 ++++--- .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 34 +++-- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 162 +++++++++++++-------- src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp | 2 +- 4 files changed, 164 insertions(+), 84 deletions(-) -- 2.12.0
Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 1/5] nv50/ir: restructure and rename postraconstantfolding pass
we might want to add more folding passes here, so make it a bit more generic v2: leave the comment and reword commit message v4: rename it to PostRaLoadPropagation Signed-off-by: Karol Herbst <karolherbst at gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com> --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 +++++++++++---------- 1 file changed, 63 insertions(+), 58 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index d358abc5bd..af5a8c7ffd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -3189,10 +3189,12 @@ FlatteningPass::tryPredicateConditional(BasicBlock *bb) // constraint SDST == SSRC2 // TODO: // Does NVC0+ have other situations where this pass makes sense? -class NV50PostRaConstantFolding : public Pass +class PostRaLoadPropagation : public Pass { private: - virtual bool visit(BasicBlock *); + virtual bool visit(Instruction *); + + void handleMAD(Instruction *); }; static bool @@ -3204,69 +3206,72 @@ post_ra_dead(Instruction *i) return true; } -bool -NV50PostRaConstantFolding::visit(BasicBlock *bb) +// Fold Immediate into MAD; must be done after register allocation due to +// constraint SDST == SSRC2 +void +PostRaLoadPropagation::handleMAD(Instruction *i) { - Value *vtmp; - Instruction *def; - - for (Instruction *i = bb->getFirst(); i; i = i->next) { - switch (i->op) { - case OP_MAD: - if (i->def(0).getFile() != FILE_GPR || - i->src(0).getFile() != FILE_GPR || - i->src(1).getFile() != FILE_GPR || - i->src(2).getFile() != FILE_GPR || - i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id) - break; - - if (i->getDef(0)->reg.data.id >= 64 || - i->getSrc(0)->reg.data.id >= 64) - break; + if (i->def(0).getFile() != FILE_GPR || + i->src(0).getFile() != FILE_GPR || + i->src(1).getFile() != FILE_GPR || + i->src(2).getFile() != FILE_GPR || + 
i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id) + return; - if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0) - break; + if (i->getDef(0)->reg.data.id >= 64 || + i->getSrc(0)->reg.data.id >= 64) + return; - if (i->getPredicate()) - break; + if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0) + return; - def = i->getSrc(1)->getInsn(); - if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4) - def = def->getSrc(0)->getInsn(); - if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { - vtmp = i->getSrc(1); - if (isFloatType(i->sType)) { - i->setSrc(1, def->getSrc(0)); - } else { - ImmediateValue val; - bool ret = def->src(0).getImmediate(val); - assert(ret); - if (i->getSrc(1)->reg.data.id & 1) - val.reg.data.u32 >>= 16; - val.reg.data.u32 &= 0xffff; - i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32)); - } + if (i->getPredicate()) + return; - /* There's no post-RA dead code elimination, so do it here - * XXX: if we add more code-removing post-RA passes, we might - * want to create a post-RA dead-code elim pass */ - if (post_ra_dead(vtmp->getInsn())) { - Value *src = vtmp->getInsn()->getSrc(0); - // Careful -- splits will have already been removed from the - // functions. Don't double-delete. 
- if (vtmp->getInsn()->bb) - delete_Instruction(prog, vtmp->getInsn()); - if (src->getInsn() && post_ra_dead(src->getInsn())) - delete_Instruction(prog, src->getInsn()); - } + Value *vtmp; + Instruction *def = i->getSrc(1)->getInsn(); + + if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4) + def = def->getSrc(0)->getInsn(); + if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { + vtmp = i->getSrc(1); + if (isFloatType(i->sType)) { + i->setSrc(1, def->getSrc(0)); + } else { + ImmediateValue val; + bool ret = def->src(0).getImmediate(val); + assert(ret); + if (i->getSrc(1)->reg.data.id & 1) + val.reg.data.u32 >>= 16; + val.reg.data.u32 &= 0xffff; + i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32)); + } - break; - } - break; - default: - break; + /* There's no post-RA dead code elimination, so do it here + * XXX: if we add more code-removing post-RA passes, we might + * want to create a post-RA dead-code elim pass */ + if (post_ra_dead(vtmp->getInsn())) { + Value *src = vtmp->getInsn()->getSrc(0); + // Careful -- splits will have already been removed from the + // functions. Don't double-delete. + if (vtmp->getInsn()->bb) + delete_Instruction(prog, vtmp->getInsn()); + if (src->getInsn() && post_ra_dead(src->getInsn())) + delete_Instruction(prog, src->getInsn()); } } +} + +bool +PostRaLoadPropagation::visit(Instruction *i) +{ + switch (i->op) { + case OP_MAD: + handleMAD(i); + break; + default: + break; + } return true; } @@ -3693,7 +3698,7 @@ Program::optimizePostRA(int level) { RUN_PASS(2, FlatteningPass, run); if (getTarget()->getChipset() < 0xc0) - RUN_PASS(2, NV50PostRaConstantFolding, run); + RUN_PASS(2, PostRaLoadPropagation, run); return true; } -- 2.12.0
Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 2/5] nv50/ir: implement mad post ra folding for nvc0+
changes for GpuTest /test=pixmark_piano /benchmark /no_scorebox /msaa=0 /benchmark_duration_ms=60000 /width=1024 /height=640: score: 1026 -> 1045 changes for shader-db: total instructions in shared programs : 3943335 -> 3934925 (-0.21%) total gprs used in shared programs : 481563 -> 481563 (0.00%) total local used in shared programs : 27469 -> 27469 (0.00%) total bytes used in shared programs : 36139384 -> 36061888 (-0.21%) local gpr inst bytes helped 0 0 3587 3587 hurt 0 0 0 0 v2: removed TODO reordered to show changes without RA modification removed stale debugging print() call v3: remove predicate checks enable only for gf100 ISA Signed-off-by: Karol Herbst <karolherbst at gmail.com> --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 51 ++++++++++++++++++++-- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index af5a8c7ffd..5424322b24 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -3194,7 +3194,8 @@ class PostRaLoadPropagation : public Pass private: virtual bool visit(Instruction *); - void handleMAD(Instruction *); + void handleMADforNV50(Instruction *); + void handleMADforNVC0(Instruction *); }; static bool @@ -3209,7 +3210,7 @@ post_ra_dead(Instruction *i) // Fold Immediate into MAD; must be done after register allocation due to // constraint SDST == SSRC2 void -PostRaLoadPropagation::handleMAD(Instruction *i) +PostRaLoadPropagation::handleMADforNV50(Instruction *i) { if (i->def(0).getFile() != FILE_GPR || i->src(0).getFile() != FILE_GPR || @@ -3262,12 +3263,54 @@ PostRaLoadPropagation::handleMAD(Instruction *i) } } +void +PostRaLoadPropagation::handleMADforNVC0(Instruction *i) +{ + if (i->def(0).getFile() != FILE_GPR || + i->src(0).getFile() != FILE_GPR || + i->src(1).getFile() != FILE_GPR || + i->src(2).getFile() != FILE_GPR || + 
i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id) + return; + + // TODO: gm107 can also do this for S32, maybe other chipsets as well + if (i->dType != TYPE_F32) + return; + + if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG)) + return; + + ImmediateValue val; + int s; + + if (i->src(0).getImmediate(val)) + s = 1; + else if (i->src(1).getImmediate(val)) + s = 0; + else + return; + + if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG)) + return; + + if (s == 1) + i->swapSources(0, 1); + + Instruction *imm = i->getSrc(1)->getInsn(); + i->setSrc(1, imm->getSrc(0)); + if (post_ra_dead(imm)) + delete_Instruction(prog, imm); +} + bool PostRaLoadPropagation::visit(Instruction *i) { switch (i->op) { case OP_MAD: - handleMAD(i); + if (prog->getTarget()->getChipset() < 0xc0) + handleMADforNV50(i); + else + handleMADforNVC0(i); break; default: break; @@ -3697,7 +3740,7 @@ bool Program::optimizePostRA(int level) { RUN_PASS(2, FlatteningPass, run); - if (getTarget()->getChipset() < 0xc0) + if (getTarget()->getChipset() < NVISA_GK20A_CHIPSET) RUN_PASS(2, PostRaLoadPropagation, run); return true; -- 2.12.0
Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 3/5] gk110/ir: add LIMM form of mad
v2: renamed commit reordered modifiers add assert(dst == src2) v3: removed wrong neg mod emission Signed-off-by: Karol Herbst <karolherbst at gmail.com> --- .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 50 ++++++++++++++-------- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 2 +- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index 4210321ae1..1121ae0912 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -47,7 +47,7 @@ private: private: void emitForm_21(const Instruction *, uint32_t opc2, uint32_t opc1); void emitForm_C(const Instruction *, uint32_t opc, uint8_t ctg); - void emitForm_L(const Instruction *, uint32_t opc, uint8_t ctg, Modifier); + void emitForm_L(const Instruction *, uint32_t opc, uint8_t ctg, Modifier, int sCount = 3); void emitPredicate(const Instruction *); @@ -365,7 +365,7 @@ CodeEmitterGK110::setImmediate32(const Instruction *i, const int s, void CodeEmitterGK110::emitForm_L(const Instruction *i, uint32_t opc, uint8_t ctg, - Modifier mod) + Modifier mod, int sCount) { code[0] = ctg; code[1] = opc << 20; @@ -374,7 +374,7 @@ CodeEmitterGK110::emitForm_L(const Instruction *i, uint32_t opc, uint8_t ctg, defId(i->def(0), 2); - for (int s = 0; s < 3 && i->srcExists(s); ++s) { + for (int s = 0; s < sCount && i->srcExists(s); ++s) { switch (i->src(s).getFile()) { case FILE_GPR: srcId(i->src(s), s ? 
42 : 10); @@ -487,25 +487,41 @@ CodeEmitterGK110::emitNOP(const Instruction *i) void CodeEmitterGK110::emitFMAD(const Instruction *i) { - assert(!isLIMM(i->src(1), TYPE_F32)); + bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg(); - emitForm_21(i, 0x0c0, 0x940); + if (isLIMM(i->src(1), TYPE_F32)) { + assert(i->getDef(0)->reg.data.id == i->getSrc(2)->reg.data.id); - NEG_(34, 2); - SAT_(35); - RND_(36, F); - FTZ_(38); - DNZ_(39); + // last source is dst, so force 2 sources + emitForm_L(i, 0x600, 0x0, 0, 2); - bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg(); + if (i->flagsDef >= 0) + code[1] |= 1 << 23; - if (code[0] & 0x1) { - if (neg1) - code[1] ^= 1 << 27; - } else - if (neg1) { - code[1] |= 1 << 19; + SAT_(3a); + NEG_(3c, 2); + + if (neg1) { + code[1] |= 1 << 27; + } + } else { + emitForm_21(i, 0x0c0, 0x940); + + NEG_(34, 2); + SAT_(35); + RND_(36, F); + + if (code[0] & 0x1) { + if (neg1) + code[1] ^= 1 << 27; + } else + if (neg1) { + code[1] |= 1 << 19; + } } + + FTZ_(38); + DNZ_(39); } void diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 5424322b24..59caca8146 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -3740,7 +3740,7 @@ bool Program::optimizePostRA(int level) { RUN_PASS(2, FlatteningPass, run); - if (getTarget()->getChipset() < NVISA_GK20A_CHIPSET) + if (getTarget()->getChipset() < NVISA_GM107_CHIPSET) RUN_PASS(2, PostRaLoadPropagation, run); return true; -- 2.12.0
Karol Herbst
2017-Mar-26 19:46 UTC
[Nouveau] [PATCH v5 4/5] gm107/ir: add LIMM form of mad
v2: renamed commit reordered modifiers add assert(dst == src2) v3: reordered modifiers again v5: no rounding bit for limms Signed-off-by: Karol Herbst <karolherbst at gmail.com> --- .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 34 ++++++++++++++++------ .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 3 +- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 6de3f396e3..6903132efa 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -1311,7 +1311,7 @@ CodeEmitterGM107::emitFMUL() void CodeEmitterGM107::emitFFMA() { - /*XXX: ffma32i exists, but not using it as third src overlaps dst */ + bool isLongIMMD = false; switch(insn->src(2).getFile()) { case FILE_GPR: switch (insn->src(1).getFile()) { @@ -1324,14 +1324,22 @@ CodeEmitterGM107::emitFFMA() emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1)); break; case FILE_IMMEDIATE: - emitInsn(0x32800000); - emitIMMD(0x14, 19, insn->src(1)); + if (longIMMD(insn->getSrc(1))) { + assert(insn->getDef(0)->reg.data.id == insn->getSrc(2)->reg.data.id); + isLongIMMD = true; + emitInsn(0x0c000000); + emitIMMD(0x14, 32, insn->src(1)); + } else { + emitInsn(0x32800000); + emitIMMD(0x14, 19, insn->src(1)); + } break; default: assert(!"bad src1 file"); break; } - emitGPR (0x27, insn->src(2)); + if (!isLongIMMD) + emitGPR (0x27, insn->src(2)); break; case FILE_MEMORY_CONST: emitInsn(0x51800000); @@ -1342,11 +1350,19 @@ CodeEmitterGM107::emitFFMA() assert(!"bad src2 file"); break; } - emitRND (0x33); - emitSAT (0x32); - emitNEG (0x31, insn->src(2)); - emitNEG2(0x30, insn->src(0), insn->src(1)); - emitCC (0x2f); + + if (isLongIMMD) { + emitNEG (0x39, insn->src(2)); + emitNEG2(0x38, insn->src(0), insn->src(1)); + emitSAT (0x37); + emitCC (0x34); + } else { + emitRND (0x33); + emitSAT (0x32); + emitNEG (0x31, 
insn->src(2)); + emitNEG2(0x30, insn->src(0), insn->src(1)); + emitCC (0x2f); + } emitFMZ(0x35, 2); emitGPR(0x08, insn->src(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 59caca8146..3786838a35 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -3740,8 +3740,7 @@ bool Program::optimizePostRA(int level) { RUN_PASS(2, FlatteningPass, run); - if (getTarget()->getChipset() < NVISA_GM107_CHIPSET) - RUN_PASS(2, PostRaLoadPropagation, run); + RUN_PASS(2, PostRaLoadPropagation, run); return true; } -- 2.12.0
Karol Herbst
2017-Mar-26 19:46 UTC
[Nouveau] [PATCH v5 5/5] nv50/ir: also do PostRaLoadPropagation for FMA
Helps Feral-ported games, due to their use of fma() shader-db changes: total instructions in shared programs : 3934925 -> 3934327 (-0.02%) total gprs used in shared programs : 481563 -> 481563 (0.00%) total local used in shared programs : 27469 -> 27469 (0.00%) total bytes used in shared programs : 36061888 -> 36056504 (-0.01%) local gpr inst bytes helped 0 0 228 228 hurt 0 0 0 0 Signed-off-by: Karol Herbst <karolherbst at gmail.com> --- src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 1 + src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 3786838a35..98e3ccb4e6 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -3306,6 +3306,7 @@ bool PostRaLoadPropagation::visit(Instruction *i) { switch (i->op) { + case OP_FMA: case OP_MAD: if (prog->getTarget()->getChipset() < 0xc0) handleMADforNV50(i); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index d36c8531a2..193628cfbd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -1471,7 +1471,7 @@ GCRA::allocateRegisters(ArrayList& insns) if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL && prog->getTarget()->getChipset() < 0xc0) { Instruction *insn = lval->getInsn(); - if (insn->op == OP_MAD || insn->op == OP_SAD) + if (insn->op == OP_MAD || insn->op == OP_FMA || insn->op == OP_SAD) // Short encoding only possible if they're all GPRs, no need to // affect them otherwise. if (insn->flagsDef < 0 && -- 2.12.0
Reasonably Related Threads
- nv50/ir: Implement short notation for MAD V2
- [PATCH 1/2] nv50/ir: Add support for MAD short+IMM notation
- [PATCH 1/3] nv50/ir: Add support for MAD short+IMM notation
- [PATCH 1/3] nv50/ir: Add support for MAD 4-byte opcode
- [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax