Ilia Mirkin
2016-Jan-14 06:23 UTC
[Nouveau] [PATCH] nv50/ir: rebase indirect temp arrays to 0, so that we use less lmem space
Reduces local memory usage in a lot of Metro 2033 Redux and a few KSP shaders: total local used in shared programs : 54116 -> 30372 (-43.88%) Probably modest advantage to execution, but it's an important prerequisite to dropping some of the TGSI optimizations done by the state tracker. Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- Seems like there ought to be a simpler way of doing this... oh well. .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 64 +++++++++++++++++----- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 0e1c332..2085978 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -841,6 +841,11 @@ public: std::set<Location> locals; std::set<int> indirectTempArrays; + struct TempBase { + int oldBase, newBase; + }; + std::map<int, TempBase> indirectTempBases; + std::map<int, std::pair<int, int> > tempArrayInfo; std::vector<int> tempArrayId; int clipVertexOutput; @@ -949,9 +954,19 @@ bool Source::scanSource() } tgsi_parse_free(&parse); - // TODO: Compute based on relevant array sizes - if (indirectTempArrays.size()) - info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16; + if (indirectTempArrays.size()) { + int tempBase = 0; + for (std::set<int>::const_iterator it = indirectTempArrays.begin(); + it != indirectTempArrays.end(); ++it) { + std::pair<int, int>& info = tempArrayInfo[*it]; + TempBase base; + base.oldBase = info.first; + base.newBase = tempBase; + indirectTempBases.insert(std::make_pair(*it, base)); + tempBase += info.second; + } + info->bin.tlsSpace += tempBase * 16; + } if (info->io.genUserClip > 0) { info->io.clipDistances = info->io.genUserClip; @@ -1208,6 +1223,9 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) case TGSI_FILE_TEMPORARY: for (i = first; i <= last; ++i) 
tempArrayId[i] = arrayId; + if (arrayId) + tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair( + first, last - first + 1))); break; case TGSI_FILE_NULL: case TGSI_FILE_ADDRESS: @@ -1374,6 +1392,7 @@ private: void storeDst(const tgsi::Instruction::DstRegister dst, int c, Value *val, Value *ptr); + void adjustTempIndex(int arrayId, int &idx, int &idx2d) const; Value *applySrcMod(Value *, int s, int c); Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr); @@ -1679,11 +1698,23 @@ Converter::shiftAddress(Value *index) return mkOp2v(OP_SHL, TYPE_U32, getSSA(4, FILE_ADDRESS), index, mkImm(4)); } +void +Converter::adjustTempIndex(int arrayId, int &idx, int &idx2d) const +{ + std::map<int, tgsi::Source::TempBase>::const_iterator it = + code->indirectTempBases.find(arrayId); + if (it == code->indirectTempBases.end()) + return; + + idx2d = 1; + idx += it->second.newBase - it->second.oldBase; +} + Value * Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) { int idx2d = src.is2D() ? src.getIndex(1) : 0; - const int idx = src.getIndex(0); + int idx = src.getIndex(0); const int swz = src.getSwizzle(c); Instruction *ld; @@ -1728,8 +1759,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) int arrayid = src.getArrayId(); if (!arrayid) arrayid = code->tempArrayId[idx]; - idx2d = (code->indirectTempArrays.find(arrayid) != - code->indirectTempArrays.end()); + adjustTempIndex(arrayid, idx, idx2d); } /* fallthrough */ default: @@ -1743,7 +1773,7 @@ Converter::acquireDst(int d, int c) { const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); const unsigned f = dst.getFile(); - const int idx = dst.getIndex(0); + int idx = dst.getIndex(0); int idx2d = dst.is2D() ? 
dst.getIndex(1) : 0; if (dst.isMasked(c) || f == TGSI_FILE_BUFFER || f == TGSI_FILE_IMAGE) @@ -1754,9 +1784,12 @@ Converter::acquireDst(int d, int c) (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT)) return getScratch(); - if (f == TGSI_FILE_TEMPORARY) - idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) != - code->indirectTempArrays.end(); + if (f == TGSI_FILE_TEMPORARY) { + int arrayid = dst.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c); } @@ -1789,7 +1822,7 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, Value *val, Value *ptr) { const unsigned f = dst.getFile(); - const int idx = dst.getIndex(0); + int idx = dst.getIndex(0); int idx2d = dst.is2D() ? dst.getIndex(1) : 0; if (f == TGSI_FILE_SYSTEM_VALUE) { @@ -1813,9 +1846,12 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, f == TGSI_FILE_PREDICATE || f == TGSI_FILE_ADDRESS || f == TGSI_FILE_OUTPUT) { - if (f == TGSI_FILE_TEMPORARY) - idx2d = code->indirectTempArrays.find(code->tempArrayId[idx]) != - code->indirectTempArrays.end(); + if (f == TGSI_FILE_TEMPORARY) { + int arrayid = dst.getArrayId(); + if (!arrayid) + arrayid = code->tempArrayId[idx]; + adjustTempIndex(arrayid, idx, idx2d); + } getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val); } else { -- 2.4.10
Possibly Parallel Threads
- [PATCH] nv50/ir: only use FILE_LOCAL_MEMORY for temp arrays that use indirection
- Very slow disk I/O
- slightly off-topic, RAID program for on-board SAS 2308-4i ?
- Very slow disk I/O
- [PATCH mesa 2/6] nouveau: codegen: Slightly refactor Source::scanInstruction() dst handling