Ilia Mirkin
2017-Dec-20 04:41 UTC
[Nouveau] [PATCH] gm107/ir: use lane 0 for manual textureGrad handling
This is parallel to the pre-SM50 change which does this. Adjusts the shuffles / quadops to make the values correct relative to lane 0, and then splat the results to all lanes for the final move into the target register. Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> --- Entirely untested beyond compilation. Should check bin/tex-miplevel-selection textureGrad Cube bin/tex-miplevel-selection textureGrad CubeShadow bin/tex-miplevel-selection textureGrad CubeArray KHR-GL45.texture_cube_map_array.sampling to see if they start passing with this change. .../nouveau/codegen/nv50_ir_lowering_gm107.cpp | 56 ++++++++++++++-------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index 6b9edd48645..a2427526a81 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -95,18 +95,15 @@ GM107LegalizeSSA::visit(Instruction *i) bool GM107LoweringPass::handleManualTXD(TexInstruction *i) { - static const uint8_t qOps[4][2] - { - { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 - { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 - { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 - { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 - }; + // See NVC0LoweringPass::handleManualTXD for rationale. This function + // implements the same logic, but using SM50-friendly primitives. 
+ static const uint8_t qOps[2] + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }; Value *def[4][4]; - Value *crd[3]; + Value *crd[3], *arr, *shadow; Value *tmp; Instruction *tex, *add; - Value *zero = bld.loadImm(bld.getSSA(), 0); + Value *quad = bld.mkImm(SHFL_BOUND_QUAD); int l, c; const int dim = i->tex.target.getDim() + i->tex.target.isCube(); const int array = i->tex.target.isArray(); @@ -115,35 +112,40 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) for (c = 0; c < dim; ++c) crd[c] = bld.getScratch(); + arr = bld.getScratch(); + shadow = bld.getScratch(); tmp = bld.getScratch(); for (l = 0; l < 4; ++l) { Value *src[3], *val; - // mov coordinates from lane l to all lanes + Value *lane = bld.mkImm(l); bld.mkOp(OP_QUADON, TYPE_NONE, NULL); + // Make sure lane 0 has the appropriate array/depth compare values + if (l != 0) { + if (array) + bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad); + if (i->tex.target.isShadow()) + bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim), lane, quad); + } + + // mov coordinates from lane l to all lanes for (c = 0; c < dim; ++c) { - bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), - bld.mkImm(l), bld.mkImm(SHFL_BOUND_QUAD)); - add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], crd[c], zero); - add->subOp = 0x00; - add->lanes = 1; /* abused for .ndv */ + bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad); } // add dPdx from lane l to lanes dx for (c = 0; c < dim; ++c) { - bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), bld.mkImm(l), - bld.mkImm(SHFL_BOUND_QUAD)); + bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad); add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); - add->subOp = qOps[l][0]; + add->subOp = qOps[0]; add->lanes = 1; /* abused for .ndv */ } // add dPdy from lane l to lanes dy for (c = 0; c < dim; ++c) { - bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), bld.mkImm(l), - bld.mkImm(SHFL_BOUND_QUAD)); + bld.mkOp3(OP_SHFL, 
TYPE_F32, tmp, i->dPdy[c].get(), lane, quad); add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); - add->subOp = qOps[l][1]; + add->subOp = qOps[1]; add->lanes = 1; /* abused for .ndv */ } @@ -164,8 +166,20 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) // texture bld.insert(tex = cloneForward(func, i)); + if (l != 0) { + if (array) + tex->setSrc(0, arr); + if (i->tex.target.isShadow()) + tex->setSrc(array + dim, shadow); + } for (c = 0; c < dim; ++c) tex->setSrc(c + array, src[c]); + // broadcast results from lane 0 to all lanes + if (l != 0) { + Value *lane = bld.mkImm(l); + for (c = 0; i->defExists(c); ++c) + bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), lane, quad); + } bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); // save results -- 2.13.6
Ilia Mirkin
2017-Dec-20 14:44 UTC
[Nouveau] [PATCH] gm107/ir: use lane 0 for manual textureGrad handling
On Tue, Dec 19, 2017 at 11:41 PM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:> This is parallel to the pre-SM50 change which does this. Adjusts the > shuffles / quadops to make the values correct relative to lane 0, and > then splat the results to all lanes for the final move into the target > register. > > Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> > --- > > Entirely untested beyond compilation. Should check > > bin/tex-miplevel-selection textureGrad Cube > bin/tex-miplevel-selection textureGrad CubeShadow > bin/tex-miplevel-selection textureGrad CubeArray > KHR-GL45.texture_cube_map_array.sampling > > to see if they start passing with this change. > > .../nouveau/codegen/nv50_ir_lowering_gm107.cpp | 56 ++++++++++++++-------- > 1 file changed, 35 insertions(+), 21 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp > index 6b9edd48645..a2427526a81 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp > @@ -95,18 +95,15 @@ GM107LegalizeSSA::visit(Instruction *i) > bool > GM107LoweringPass::handleManualTXD(TexInstruction *i) > { > - static const uint8_t qOps[4][2] > - { > - { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 > - { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 > - { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 > - { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 > - }; > + // See NVC0LoweringPass::handleManualTXD for rationale. This function > + // implements the same logic, but using SM50-friendly primitives. 
> + static const uint8_t qOps[2] > + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }; > Value *def[4][4]; > - Value *crd[3]; > + Value *crd[3], *arr, *shadow; > Value *tmp; > Instruction *tex, *add; > - Value *zero = bld.loadImm(bld.getSSA(), 0); > + Value *quad = bld.mkImm(SHFL_BOUND_QUAD); > int l, c; > const int dim = i->tex.target.getDim() + i->tex.target.isCube(); > const int array = i->tex.target.isArray(); > @@ -115,35 +112,40 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) > > for (c = 0; c < dim; ++c) > crd[c] = bld.getScratch(); > + arr = bld.getScratch(); > + shadow = bld.getScratch(); > tmp = bld.getScratch(); > > for (l = 0; l < 4; ++l) { > Value *src[3], *val; > - // mov coordinates from lane l to all lanes > + Value *lane = bld.mkImm(l); > bld.mkOp(OP_QUADON, TYPE_NONE, NULL); > + // Make sure lane 0 has the appropriate array/depth compare values > + if (l != 0) { > + if (array) > + bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad); > + if (i->tex.target.isShadow()) > + bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim), lane, quad);In the great argument switcheroo between each SM version, the shadow compare is actually after the indirect handle (which in turn is after array + dim). 
So this should become array + dim + indirect (and similarly below). > + } > + > + // mov coordinates from lane l to all lanes > for (c = 0; c < dim; ++c) { > - bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), > - bld.mkImm(l), bld.mkImm(SHFL_BOUND_QUAD)); > - add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], crd[c], zero); > - add->subOp = 0x00; > - add->lanes = 1; /* abused for .ndv */ > + bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad); > } > > // add dPdx from lane l to lanes dx > for (c = 0; c < dim; ++c) { > - bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), bld.mkImm(l), > - bld.mkImm(SHFL_BOUND_QUAD)); > + bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad); > add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); > - add->subOp = qOps[l][0]; > + add->subOp = qOps[0]; > add->lanes = 1; /* abused for .ndv */ > } > > // add dPdy from lane l to lanes dy > for (c = 0; c < dim; ++c) { > - bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), bld.mkImm(l), > - bld.mkImm(SHFL_BOUND_QUAD)); > + bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad); > add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); > - add->subOp = qOps[l][1]; > + add->subOp = qOps[1]; > add->lanes = 1; /* abused for .ndv */ > } > > @@ -164,8 +166,20 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) > > // texture > bld.insert(tex = cloneForward(func, i)); > + if (l != 0) { > + if (array) > + tex->setSrc(0, arr); > + if (i->tex.target.isShadow()) > + tex->setSrc(array + dim, shadow); > + } > for (c = 0; c < dim; ++c) > tex->setSrc(c + array, src[c]); > + // broadcast results from lane 0 to all lanes > + if (l != 0) { > + Value *lane = bld.mkImm(l); This should of course be bld.mkImm(0), not l, since we're broadcasting from lane *0* to all lanes.
These are all fixed up in https://github.com/imirkin/mesa/commit/618b99d86396417e31551dc464ab2ca5d038151f > + for (c = 0; i->defExists(c); ++c) > + bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), lane, quad); > + } > bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); > > // save results > -- > 2.13.6 >
Karol Herbst
2017-Dec-21 08:50 UTC
[Nouveau] [PATCH] gm107/ir: use lane 0 for manual textureGrad handling
On Wed, Dec 20, 2017 at 3:44 PM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:> On Tue, Dec 19, 2017 at 11:41 PM, Ilia Mirkin <imirkin at alum.mit.edu> wrote: >> This is parallel to the pre-SM50 change which does this. Adjusts the >> shuffles / quadops to make the values correct relative to lane 0, and >> then splat the results to all lanes for the final move into the target >> register. >> >> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu> >> --- >> >> Entirely untested beyond compilation. Should check >> >> bin/tex-miplevel-selection textureGrad Cube >> bin/tex-miplevel-selection textureGrad CubeShadow >> bin/tex-miplevel-selection textureGrad CubeArray >> KHR-GL45.texture_cube_map_array.sampling >> >> to see if they start passing with this change. >> >> .../nouveau/codegen/nv50_ir_lowering_gm107.cpp | 56 ++++++++++++++-------- >> 1 file changed, 35 insertions(+), 21 deletions(-) >> >> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp >> index 6b9edd48645..a2427526a81 100644 >> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp >> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp >> @@ -95,18 +95,15 @@ GM107LegalizeSSA::visit(Instruction *i) >> bool >> GM107LoweringPass::handleManualTXD(TexInstruction *i) >> { >> - static const uint8_t qOps[4][2] >> - { >> - { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 >> - { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 >> - { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 >> - { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 >> - }; >> + // See NVC0LoweringPass::handleManualTXD for rationale. This function >> + // implements the same logic, but using SM50-friendly primitives. 
>> + static const uint8_t qOps[2] >> + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }; >> Value *def[4][4]; >> - Value *crd[3]; >> + Value *crd[3], *arr, *shadow; >> Value *tmp; >> Instruction *tex, *add; >> - Value *zero = bld.loadImm(bld.getSSA(), 0); >> + Value *quad = bld.mkImm(SHFL_BOUND_QUAD); >> int l, c; >> const int dim = i->tex.target.getDim() + i->tex.target.isCube(); >> const int array = i->tex.target.isArray(); >> @@ -115,35 +112,40 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) >> >> for (c = 0; c < dim; ++c) >> crd[c] = bld.getScratch(); >> + arr = bld.getScratch(); >> + shadow = bld.getScratch(); >> tmp = bld.getScratch(); >> >> for (l = 0; l < 4; ++l) { >> Value *src[3], *val; >> - // mov coordinates from lane l to all lanes >> + Value *lane = bld.mkImm(l); >> bld.mkOp(OP_QUADON, TYPE_NONE, NULL); >> + // Make sure lane 0 has the appropriate array/depth compare values >> + if (l != 0) { >> + if (array) >> + bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad); >> + if (i->tex.target.isShadow()) >> + bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim), lane, quad); > > In the great argument switcheroo between each SM version, the shadow > compare is actually after the indirect handle (which in turn is after > array + dim). So this should become array + dim + indirect (and > similarly below). 
> >> + } >> + >> + // mov coordinates from lane l to all lanes >> for (c = 0; c < dim; ++c) { >> - bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), >> - bld.mkImm(l), bld.mkImm(SHFL_BOUND_QUAD)); >> - add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], crd[c], zero); >> - add->subOp = 0x00; >> - add->lanes = 1; /* abused for .ndv */ >> + bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad); >> } >> >> // add dPdx from lane l to lanes dx >> for (c = 0; c < dim; ++c) { >> - bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), bld.mkImm(l), >> - bld.mkImm(SHFL_BOUND_QUAD)); >> + bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad); >> add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); >> - add->subOp = qOps[l][0]; >> + add->subOp = qOps[0]; >> add->lanes = 1; /* abused for .ndv */ >> } >> >> // add dPdy from lane l to lanes dy >> for (c = 0; c < dim; ++c) { >> - bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), bld.mkImm(l), >> - bld.mkImm(SHFL_BOUND_QUAD)); >> + bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad); >> add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); >> - add->subOp = qOps[l][1]; >> + add->subOp = qOps[1]; >> add->lanes = 1; /* abused for .ndv */ >> } >> >> @@ -164,8 +166,20 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) >> >> // texture >> bld.insert(tex = cloneForward(func, i)); >> + if (l != 0) { >> + if (array) >> + tex->setSrc(0, arr); >> + if (i->tex.target.isShadow()) >> + tex->setSrc(array + dim, shadow); >> + } >> for (c = 0; c < dim; ++c) >> tex->setSrc(c + array, src[c]); >> + // broadcast results from lane 0 to all lanes >> + if (l != 0) { >> + Value *lane = bld.mkImm(l); > > This should of course be bld.mkImm(0), not l, since we're broadcasting > from lane *0* to all lanes. 
> > These are all fixed up in > https://github.com/imirkin/mesa/commit/618b99d86396417e31551dc464ab2ca5d038151f > I did a piglit run ('./piglit run -x glx -x egl -x streaming-texture-leak -x max-texture-size tests/gpu.py') on a GP107 and the three tests pass now as you said. In addition to that, the CTS one and 'spec@arb_shader_texture_lod@execution@tex-miplevel-selection *gradarb cube' pass as well. So this is Tested-By: Karol Herbst <kherbst at redhat.com> >> + for (c = 0; i->defExists(c); ++c) >> + bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), lane, quad); >> + } >> bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); >> >> // save results >> -- >> 2.13.6 >> > _______________________________________________ > Nouveau mailing list > Nouveau at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/nouveau
Possibly Parallel Threads
- [PATCH] gm107/ir: use lane 0 for manual textureGrad handling
- [PATCH 1/2] nvc0/ir: use manual TXD when offsets are involved
- [LLVMdev] Proposal: Adding aligned instruction bundle support to MC
- [LLVMdev] Proposal: Adding aligned instruction bundle support to MC
- [LLVMdev] Proposal: Adding aligned instruction bundle support to MC