Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 0/5] nvc0/ir: add support for MAD/FMA PostRALoadPropagation
This series was previously titled "nv50/ir: PostRaConstantFolding improvements". Nothing really changed from the last version, just minor things. Karol Herbst (5): nv50/ir: restructure and rename postraconstantfolding pass nv50/ir: implement mad post ra folding for nvc0+ gk110/ir: add LIMM form of mad gm107/ir: add LIMM form of mad nv50/ir: also do PostRaLoadPropagation for FMA .../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 50 ++++--- .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 34 +++-- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 162 +++++++++++++-------- src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp | 2 +- 4 files changed, 164 insertions(+), 84 deletions(-) -- 2.12.0
Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 1/5] nv50/ir: restructure and rename postraconstantfolding pass
We might want to add more folding passes here, so make the pass a bit more generic.
v2: leave the comment and reword commit message
v4: rename it to PostRaLoadPropagation
Signed-off-by: Karol Herbst <karolherbst at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 +++++++++++----------
1 file changed, 63 insertions(+), 58 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index d358abc5bd..af5a8c7ffd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -3189,10 +3189,12 @@ FlatteningPass::tryPredicateConditional(BasicBlock *bb)
// constraint SDST == SSRC2
// TODO:
// Does NVC0+ have other situations where this pass makes sense?
-class NV50PostRaConstantFolding : public Pass
+class PostRaLoadPropagation : public Pass
{
private:
- virtual bool visit(BasicBlock *);
+ virtual bool visit(Instruction *);
+
+ void handleMAD(Instruction *);
};
static bool
@@ -3204,69 +3206,72 @@ post_ra_dead(Instruction *i)
return true;
}
-bool
-NV50PostRaConstantFolding::visit(BasicBlock *bb)
+// Fold Immediate into MAD; must be done after register allocation due to
+// constraint SDST == SSRC2
+void
+PostRaLoadPropagation::handleMAD(Instruction *i)
{
- Value *vtmp;
- Instruction *def;
-
- for (Instruction *i = bb->getFirst(); i; i = i->next) {
- switch (i->op) {
- case OP_MAD:
- if (i->def(0).getFile() != FILE_GPR ||
- i->src(0).getFile() != FILE_GPR ||
- i->src(1).getFile() != FILE_GPR ||
- i->src(2).getFile() != FILE_GPR ||
- i->getDef(0)->reg.data.id !=
i->getSrc(2)->reg.data.id)
- break;
-
- if (i->getDef(0)->reg.data.id >= 64 ||
- i->getSrc(0)->reg.data.id >= 64)
- break;
+ if (i->def(0).getFile() != FILE_GPR ||
+ i->src(0).getFile() != FILE_GPR ||
+ i->src(1).getFile() != FILE_GPR ||
+ i->src(2).getFile() != FILE_GPR ||
+ i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
+ return;
- if (i->flagsSrc >= 0 &&
i->getSrc(i->flagsSrc)->reg.data.id != 0)
- break;
+ if (i->getDef(0)->reg.data.id >= 64 ||
+ i->getSrc(0)->reg.data.id >= 64)
+ return;
- if (i->getPredicate())
- break;
+ if (i->flagsSrc >= 0 &&
i->getSrc(i->flagsSrc)->reg.data.id != 0)
+ return;
- def = i->getSrc(1)->getInsn();
- if (def && def->op == OP_SPLIT &&
typeSizeof(def->sType) == 4)
- def = def->getSrc(0)->getInsn();
- if (def && def->op == OP_MOV &&
def->src(0).getFile() == FILE_IMMEDIATE) {
- vtmp = i->getSrc(1);
- if (isFloatType(i->sType)) {
- i->setSrc(1, def->getSrc(0));
- } else {
- ImmediateValue val;
- bool ret = def->src(0).getImmediate(val);
- assert(ret);
- if (i->getSrc(1)->reg.data.id & 1)
- val.reg.data.u32 >>= 16;
- val.reg.data.u32 &= 0xffff;
- i->setSrc(1, new_ImmediateValue(bb->getProgram(),
val.reg.data.u32));
- }
+ if (i->getPredicate())
+ return;
- /* There's no post-RA dead code elimination, so do it here
- * XXX: if we add more code-removing post-RA passes, we might
- * want to create a post-RA dead-code elim pass */
- if (post_ra_dead(vtmp->getInsn())) {
- Value *src = vtmp->getInsn()->getSrc(0);
- // Careful -- splits will have already been removed from the
- // functions. Don't double-delete.
- if (vtmp->getInsn()->bb)
- delete_Instruction(prog, vtmp->getInsn());
- if (src->getInsn() &&
post_ra_dead(src->getInsn()))
- delete_Instruction(prog, src->getInsn());
- }
+ Value *vtmp;
+ Instruction *def = i->getSrc(1)->getInsn();
+
+ if (def && def->op == OP_SPLIT &&
typeSizeof(def->sType) == 4)
+ def = def->getSrc(0)->getInsn();
+ if (def && def->op == OP_MOV && def->src(0).getFile()
== FILE_IMMEDIATE) {
+ vtmp = i->getSrc(1);
+ if (isFloatType(i->sType)) {
+ i->setSrc(1, def->getSrc(0));
+ } else {
+ ImmediateValue val;
+ bool ret = def->src(0).getImmediate(val);
+ assert(ret);
+ if (i->getSrc(1)->reg.data.id & 1)
+ val.reg.data.u32 >>= 16;
+ val.reg.data.u32 &= 0xffff;
+ i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
+ }
- break;
- }
- break;
- default:
- break;
+ /* There's no post-RA dead code elimination, so do it here
+ * XXX: if we add more code-removing post-RA passes, we might
+ * want to create a post-RA dead-code elim pass */
+ if (post_ra_dead(vtmp->getInsn())) {
+ Value *src = vtmp->getInsn()->getSrc(0);
+ // Careful -- splits will have already been removed from the
+ // functions. Don't double-delete.
+ if (vtmp->getInsn()->bb)
+ delete_Instruction(prog, vtmp->getInsn());
+ if (src->getInsn() && post_ra_dead(src->getInsn()))
+ delete_Instruction(prog, src->getInsn());
}
}
+}
+
+bool
+PostRaLoadPropagation::visit(Instruction *i)
+{
+ switch (i->op) {
+ case OP_MAD:
+ handleMAD(i);
+ break;
+ default:
+ break;
+ }
return true;
}
@@ -3693,7 +3698,7 @@ Program::optimizePostRA(int level)
{
RUN_PASS(2, FlatteningPass, run);
if (getTarget()->getChipset() < 0xc0)
- RUN_PASS(2, NV50PostRaConstantFolding, run);
+ RUN_PASS(2, PostRaLoadPropagation, run);
return true;
}
--
2.12.0
Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 2/5] nv50/ir: implement mad post ra folding for nvc0+
changes for GpuTest /test=pixmark_piano /benchmark /no_scorebox /msaa=0
/benchmark_duration_ms=60000 /width=1024 /height=640:
score: 1026 -> 1045
changes for shader-db:
total instructions in shared programs : 3943335 -> 3934925 (-0.21%)
total gprs used in shared programs : 481563 -> 481563 (0.00%)
total local used in shared programs : 27469 -> 27469 (0.00%)
total bytes used in shared programs : 36139384 -> 36061888 (-0.21%)
local gpr inst bytes
helped 0 0 3587 3587
hurt 0 0 0 0
v2: removed TODO
reordered to show changes without RA modification
removed stale debugging print() call
v3: remove predicate checks
enable only for gf100 ISA
Signed-off-by: Karol Herbst <karolherbst at gmail.com>
---
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 51 ++++++++++++++++++++--
1 file changed, 47 insertions(+), 4 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index af5a8c7ffd..5424322b24 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -3194,7 +3194,8 @@ class PostRaLoadPropagation : public Pass
private:
virtual bool visit(Instruction *);
- void handleMAD(Instruction *);
+ void handleMADforNV50(Instruction *);
+ void handleMADforNVC0(Instruction *);
};
static bool
@@ -3209,7 +3210,7 @@ post_ra_dead(Instruction *i)
// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
void
-PostRaLoadPropagation::handleMAD(Instruction *i)
+PostRaLoadPropagation::handleMADforNV50(Instruction *i)
{
if (i->def(0).getFile() != FILE_GPR ||
i->src(0).getFile() != FILE_GPR ||
@@ -3262,12 +3263,54 @@ PostRaLoadPropagation::handleMAD(Instruction *i)
}
}
+void
+PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
+{
+ if (i->def(0).getFile() != FILE_GPR ||
+ i->src(0).getFile() != FILE_GPR ||
+ i->src(1).getFile() != FILE_GPR ||
+ i->src(2).getFile() != FILE_GPR ||
+ i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
+ return;
+
+ // TODO: gm107 can also do this for S32, maybe other chipsets as well
+ if (i->dType != TYPE_F32)
+ return;
+
+ if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) !=
Modifier(NV50_IR_MOD_NEG))
+ return;
+
+ ImmediateValue val;
+ int s;
+
+ if (i->src(0).getImmediate(val))
+ s = 1;
+ else if (i->src(1).getImmediate(val))
+ s = 0;
+ else
+ return;
+
+ if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) !=
Modifier(NV50_IR_MOD_NEG))
+ return;
+
+ if (s == 1)
+ i->swapSources(0, 1);
+
+ Instruction *imm = i->getSrc(1)->getInsn();
+ i->setSrc(1, imm->getSrc(0));
+ if (post_ra_dead(imm))
+ delete_Instruction(prog, imm);
+}
+
bool
PostRaLoadPropagation::visit(Instruction *i)
{
switch (i->op) {
case OP_MAD:
- handleMAD(i);
+ if (prog->getTarget()->getChipset() < 0xc0)
+ handleMADforNV50(i);
+ else
+ handleMADforNVC0(i);
break;
default:
break;
@@ -3697,7 +3740,7 @@ bool
Program::optimizePostRA(int level)
{
RUN_PASS(2, FlatteningPass, run);
- if (getTarget()->getChipset() < 0xc0)
+ if (getTarget()->getChipset() < NVISA_GK20A_CHIPSET)
RUN_PASS(2, PostRaLoadPropagation, run);
return true;
--
2.12.0
Karol Herbst
2017-Mar-26 19:45 UTC
[Nouveau] [PATCH v5 3/5] gk110/ir: add LIMM form of mad
v2: renamed commit
reordered modifiers
add assert(dst == src2)
v3: removed wrong neg mod emission
Signed-off-by: Karol Herbst <karolherbst at gmail.com>
---
.../drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 50 ++++++++++++++--------
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 2 +-
2 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 4210321ae1..1121ae0912 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -47,7 +47,7 @@ private:
private:
void emitForm_21(const Instruction *, uint32_t opc2, uint32_t opc1);
void emitForm_C(const Instruction *, uint32_t opc, uint8_t ctg);
- void emitForm_L(const Instruction *, uint32_t opc, uint8_t ctg, Modifier);
+ void emitForm_L(const Instruction *, uint32_t opc, uint8_t ctg, Modifier,
int sCount = 3);
void emitPredicate(const Instruction *);
@@ -365,7 +365,7 @@ CodeEmitterGK110::setImmediate32(const Instruction *i, const
int s,
void
CodeEmitterGK110::emitForm_L(const Instruction *i, uint32_t opc, uint8_t ctg,
- Modifier mod)
+ Modifier mod, int sCount)
{
code[0] = ctg;
code[1] = opc << 20;
@@ -374,7 +374,7 @@ CodeEmitterGK110::emitForm_L(const Instruction *i, uint32_t
opc, uint8_t ctg,
defId(i->def(0), 2);
- for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+ for (int s = 0; s < sCount && i->srcExists(s); ++s) {
switch (i->src(s).getFile()) {
case FILE_GPR:
srcId(i->src(s), s ? 42 : 10);
@@ -487,25 +487,41 @@ CodeEmitterGK110::emitNOP(const Instruction *i)
void
CodeEmitterGK110::emitFMAD(const Instruction *i)
{
- assert(!isLIMM(i->src(1), TYPE_F32));
+ bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
- emitForm_21(i, 0x0c0, 0x940);
+ if (isLIMM(i->src(1), TYPE_F32)) {
+ assert(i->getDef(0)->reg.data.id ==
i->getSrc(2)->reg.data.id);
- NEG_(34, 2);
- SAT_(35);
- RND_(36, F);
- FTZ_(38);
- DNZ_(39);
+ // last source is dst, so force 2 sources
+ emitForm_L(i, 0x600, 0x0, 0, 2);
- bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+ if (i->flagsDef >= 0)
+ code[1] |= 1 << 23;
- if (code[0] & 0x1) {
- if (neg1)
- code[1] ^= 1 << 27;
- } else
- if (neg1) {
- code[1] |= 1 << 19;
+ SAT_(3a);
+ NEG_(3c, 2);
+
+ if (neg1) {
+ code[1] |= 1 << 27;
+ }
+ } else {
+ emitForm_21(i, 0x0c0, 0x940);
+
+ NEG_(34, 2);
+ SAT_(35);
+ RND_(36, F);
+
+ if (code[0] & 0x1) {
+ if (neg1)
+ code[1] ^= 1 << 27;
+ } else
+ if (neg1) {
+ code[1] |= 1 << 19;
+ }
}
+
+ FTZ_(38);
+ DNZ_(39);
}
void
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 5424322b24..59caca8146 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -3740,7 +3740,7 @@ bool
Program::optimizePostRA(int level)
{
RUN_PASS(2, FlatteningPass, run);
- if (getTarget()->getChipset() < NVISA_GK20A_CHIPSET)
+ if (getTarget()->getChipset() < NVISA_GM107_CHIPSET)
RUN_PASS(2, PostRaLoadPropagation, run);
return true;
--
2.12.0
Karol Herbst
2017-Mar-26 19:46 UTC
[Nouveau] [PATCH v5 4/5] gm107/ir: add LIMM form of mad
v2: renamed commit
reordered modifiers
add assert(dst == src2)
v3: reordered modifiers again
v5: no rounding bit for limms
Signed-off-by: Karol Herbst <karolherbst at gmail.com>
---
.../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 34 ++++++++++++++++------
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 3 +-
2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 6de3f396e3..6903132efa 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1311,7 +1311,7 @@ CodeEmitterGM107::emitFMUL()
void
CodeEmitterGM107::emitFFMA()
{
- /*XXX: ffma32i exists, but not using it as third src overlaps dst */
+ bool isLongIMMD = false;
switch(insn->src(2).getFile()) {
case FILE_GPR:
switch (insn->src(1).getFile()) {
@@ -1324,14 +1324,22 @@ CodeEmitterGM107::emitFFMA()
emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1));
break;
case FILE_IMMEDIATE:
- emitInsn(0x32800000);
- emitIMMD(0x14, 19, insn->src(1));
+ if (longIMMD(insn->getSrc(1))) {
+ assert(insn->getDef(0)->reg.data.id ==
insn->getSrc(2)->reg.data.id);
+ isLongIMMD = true;
+ emitInsn(0x0c000000);
+ emitIMMD(0x14, 32, insn->src(1));
+ } else {
+ emitInsn(0x32800000);
+ emitIMMD(0x14, 19, insn->src(1));
+ }
break;
default:
assert(!"bad src1 file");
break;
}
- emitGPR (0x27, insn->src(2));
+ if (!isLongIMMD)
+ emitGPR (0x27, insn->src(2));
break;
case FILE_MEMORY_CONST:
emitInsn(0x51800000);
@@ -1342,11 +1350,19 @@ CodeEmitterGM107::emitFFMA()
assert(!"bad src2 file");
break;
}
- emitRND (0x33);
- emitSAT (0x32);
- emitNEG (0x31, insn->src(2));
- emitNEG2(0x30, insn->src(0), insn->src(1));
- emitCC (0x2f);
+
+ if (isLongIMMD) {
+ emitNEG (0x39, insn->src(2));
+ emitNEG2(0x38, insn->src(0), insn->src(1));
+ emitSAT (0x37);
+ emitCC (0x34);
+ } else {
+ emitRND (0x33);
+ emitSAT (0x32);
+ emitNEG (0x31, insn->src(2));
+ emitNEG2(0x30, insn->src(0), insn->src(1));
+ emitCC (0x2f);
+ }
emitFMZ(0x35, 2);
emitGPR(0x08, insn->src(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 59caca8146..3786838a35 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -3740,8 +3740,7 @@ bool
Program::optimizePostRA(int level)
{
RUN_PASS(2, FlatteningPass, run);
- if (getTarget()->getChipset() < NVISA_GM107_CHIPSET)
- RUN_PASS(2, PostRaLoadPropagation, run);
+ RUN_PASS(2, PostRaLoadPropagation, run);
return true;
}
--
2.12.0
Karol Herbst
2017-Mar-26 19:46 UTC
[Nouveau] [PATCH v5 5/5] nv50/ir: also do PostRaLoadPropagation for FMA
Helps Feral-ported games, due to their use of fma()
shader-db changes:
total instructions in shared programs : 3934925 -> 3934327 (-0.02%)
total gprs used in shared programs : 481563 -> 481563 (0.00%)
total local used in shared programs : 27469 -> 27469 (0.00%)
total bytes used in shared programs : 36061888 -> 36056504 (-0.01%)
local gpr inst bytes
helped 0 0 228 228
hurt 0 0 0 0
Signed-off-by: Karol Herbst <karolherbst at gmail.com>
---
src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 1 +
src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 3786838a35..98e3ccb4e6 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -3306,6 +3306,7 @@ bool
PostRaLoadPropagation::visit(Instruction *i)
{
switch (i->op) {
+ case OP_FMA:
case OP_MAD:
if (prog->getTarget()->getChipset() < 0xc0)
handleMADforNV50(i);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index d36c8531a2..193628cfbd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1471,7 +1471,7 @@ GCRA::allocateRegisters(ArrayList& insns)
if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL
&&
prog->getTarget()->getChipset() < 0xc0) {
Instruction *insn = lval->getInsn();
- if (insn->op == OP_MAD || insn->op == OP_SAD)
+ if (insn->op == OP_MAD || insn->op == OP_FMA || insn->op
== OP_SAD)
// Short encoding only possible if they're all GPRs, no need
to
// affect them otherwise.
if (insn->flagsDef < 0 &&
--
2.12.0
Maybe Matching Threads
- nv50/ir: Implement short notation for MAD V2
- [PATCH 1/2] nv50/ir: Add support for MAD short+IMM notation
- [PATCH 1/3] nv50/ir: Add support for MAD short+IMM notation
- [PATCH 1/3] nv50/ir: Add support for MAD 4-byte opcode
- [PATCH 01/11] nvc0/ir: add emission of dadd/dmul/dmad opcodes, fix minmax