Ilia Mirkin
2016-Jan-14 02:27 UTC
[Nouveau] [PATCH] nv50/ir: only use FILE_LOCAL_MEMORY for temp arrays that use indirection
Previously we were treating any indirect temp array usage to mean that everything should end up in lmem. The MemoryOpt pass would clean a lot of that up later, but in the meanwhile we would lose a lot of opportunity for optimization. This helps a lot of Metro 2033 Redux and a handful of KSP shaders: total instructions in shared programs : 6288373 -> 6261517 (-0.43%) total gprs used in shared programs : 944051 -> 945131 (0.11%) total local used in shared programs : 54116 -> 54116 (0.00%) total bytes used in shared programs : 50306984 -> 50092136 (-0.43%) A typical case is for register usage to double and for instructions to halve. A future commit can also optimize local memory usage size to be reduced with better packing. Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 65 +++++++++++++++++----- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 7e3b093..507749d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -96,6 +96,13 @@ public: return tgsi_util_get_src_register_swizzle(&reg, chan); } + int getArrayId() const + { + if (isIndirect(0)) + return fsr->Indirect.ArrayID; + return 0; + } + nv50_ir::Modifier getMod(int chan) const; SrcRegister getIndirect(int dim) const @@ -155,6 +162,13 @@ public: return SrcRegister(fdr->Indirect); } + int getArrayId() const + { + if (isIndirect(0)) + return fdr->Indirect.ArrayID; + return 0; + } + private: const struct tgsi_dst_register reg; const struct tgsi_full_dst_register *fdr; @@ -826,7 +840,8 @@ public: // these registers are per-subroutine, cannot be used for parameter passing std::set<Location> locals; - bool mainTempsInLMem; + std::set<int> indirectTempArrays; + std::vector<int> tempArrayId; int clipVertexOutput; @@ -859,8 +874,6 @@ 
Source::Source(struct nv50_ir_prog_info *prog) : info(prog) if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) tgsi_dump(tokens, 0); - - mainTempsInLMem = false; } Source::~Source() @@ -890,6 +903,7 @@ bool Source::scanSource() textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1); resources.resize(scan.file_max[TGSI_FILE_IMAGE] + 1); + tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1); info->immd.bufSize = 0; @@ -935,7 +949,8 @@ bool Source::scanSource() } tgsi_parse_free(&parse); - if (mainTempsInLMem) + // TODO: Compute based on relevant array sizes + if (indirectTempArrays.size()) info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16; if (info->io.genUserClip > 0) { @@ -1046,6 +1061,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) unsigned sn = TGSI_SEMANTIC_GENERIC; unsigned si = 0; const unsigned first = decl->Range.First, last = decl->Range.Last; + const int arrayId = decl->Array.ArrayID; if (decl->Declaration.Semantic) { sn = decl->Semantic.Name; @@ -1189,8 +1205,11 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) for (i = first; i <= last; ++i) textureViews[i].target = decl->SamplerView.Resource; break; - case TGSI_FILE_NULL: case TGSI_FILE_TEMPORARY: + for (i = first; i <= last; ++i) + tempArrayId[i] = arrayId; + break; + case TGSI_FILE_NULL: case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: case TGSI_FILE_IMMEDIATE: @@ -1241,7 +1260,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) } else if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { if (insn.getDst(0).isIndirect(0)) - mainTempsInLMem = true; + indirectTempArrays.insert(insn.getDst(0).getArrayId()); } else if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) { info->io.globalAccess |= 0x2; @@ -1252,7 +1271,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) Instruction::SrcRegister src = insn.getSrc(s); if (src.getFile() == TGSI_FILE_TEMPORARY) { if (src.isIndirect(0)) - 
mainTempsInLMem = true; + indirectTempArrays.insert(src.getArrayId()); } else if (src.getFile() == TGSI_FILE_BUFFER) { info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ? @@ -1434,6 +1453,7 @@ private: DataType srcTy; DataArray tData; // TGSI_FILE_TEMPORARY + DataArray lData; // TGSI_FILE_TEMPORARY, for indirect arrays DataArray aData; // TGSI_FILE_ADDRESS DataArray pData; // TGSI_FILE_PREDICATE DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers) @@ -1637,7 +1657,7 @@ Converter::getArrayForFile(unsigned file, int idx) { switch (file) { case TGSI_FILE_TEMPORARY: - return &tData; + return idx == 0 ? &tData : &lData; case TGSI_FILE_PREDICATE: return &pData; case TGSI_FILE_ADDRESS: @@ -1662,7 +1682,7 @@ Converter::shiftAddress(Value *index) Value * Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) { - const int idx2d = src.is2D() ? src.getIndex(1) : 0; + int idx2d = src.is2D() ? src.getIndex(1) : 0; const int idx = src.getIndex(0); const int swz = src.getSwizzle(c); Instruction *ld; @@ -1704,6 +1724,14 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); ld->perPatch = info->sv[idx].patch; return ld->getDef(0); + case TGSI_FILE_TEMPORARY: { + int arrayid = src.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + idx2d = (code->indirectTempArrays.find(arrayid) != + code->indirectTempArrays.end()); + } + /* fallthrough */ default: return getArrayForFile(src.getFile(), idx2d)->load( sub.cur->values, idx, swz, shiftAddress(ptr)); @@ -1716,7 +1744,7 @@ Converter::acquireDst(int d, int c) const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); const unsigned f = dst.getFile(); const int idx = dst.getIndex(0); - const int idx2d = dst.is2D() ? dst.getIndex(1) : 0; + int idx2d = dst.is2D() ? 
dst.getIndex(1) : 0; if (dst.isMasked(c) || f == TGSI_FILE_BUFFER || f == TGSI_FILE_IMAGE) return NULL; @@ -1726,6 +1754,10 @@ Converter::acquireDst(int d, int c) (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT)) return getScratch(); + if (f == TGSI_FILE_TEMPORARY) + idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) != + code->indirectTempArrays.end(); + return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c); } @@ -1758,7 +1790,7 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, { const unsigned f = dst.getFile(); const int idx = dst.getIndex(0); - const int idx2d = dst.is2D() ? dst.getIndex(1) : 0; + int idx2d = dst.is2D() ? dst.getIndex(1) : 0; if (f == TGSI_FILE_SYSTEM_VALUE) { assert(!ptr); @@ -1781,6 +1813,10 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, f == TGSI_FILE_PREDICATE || f == TGSI_FILE_ADDRESS || f == TGSI_FILE_OUTPUT) { + if (f == TGSI_FILE_TEMPORARY) + idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) != + code->indirectTempArrays.end(); + getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val); } else { assert(!"invalid dst file"); @@ -3417,18 +3453,17 @@ Converter::exportOutputs() Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir), code(code), tgsi(NULL), - tData(this), aData(this), pData(this), oData(this) + tData(this), lData(this), aData(this), pData(this), oData(this) { info = code->info; - const DataFile tFile = code->mainTempsInLMem ? 
FILE_MEMORY_LOCAL : FILE_GPR; - const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY); const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE); const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS); const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT); - tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0); + tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, FILE_GPR, 0); + lData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, FILE_MEMORY_LOCAL, 0); pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0); aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_GPR, 0); oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0); -- 2.4.10
Possibly Parallel Threads
- [PATCH] nv50/ir: rebase indirect temp arrays to 0, so that we use less lmem space
- [PATCH mesa 2/6] nouveau: codegen: Slightly refactor Source::scanInstruction() dst handling
- Very slow disk I/O
- slightly off-topic, RAID program for on-board SAS 2308-4i ?
- Very slow disk I/O