Ilia Mirkin
2017-Dec-20 04:41 UTC
[Nouveau] [PATCH] gm107/ir: use lane 0 for manual textureGrad handling
This is parallel to the pre-SM50 change which does this. It adjusts the
shuffles / quadops to make the values correct relative to lane 0, and
then splats the results to all lanes for the final move into the target
register.
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
---
Entirely untested beyond compilation. Should check
bin/tex-miplevel-selection textureGrad Cube
bin/tex-miplevel-selection textureGrad CubeShadow
bin/tex-miplevel-selection textureGrad CubeArray
KHR-GL45.texture_cube_map_array.sampling
to see if they start passing with this change.
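
For reference, here is a rough standalone simulation of the per-quad
arithmetic this lowering sets up, i.e. what the SHFL + QUADOP sequence in
the patch computes within one 2x2 quad. This is a hypothetical sketch in
plain C++, not the nv50_ir builder API; the helper name and the lane layout
assumption (lane = y*2 + x, inferred from the QUADOP bit patterns) are
illustrative only.

// Hypothetical standalone illustration: for a source lane l, arrange the
// 2x2 quad's coordinates so that lane 0 carries lane l's coordinate and the
// intra-quad deltas equal lane l's explicit gradients.
#include <cstdio>

// Assumed quad layout: lane = y*2 + x, so lanes 1/3 are +x and lanes 2/3 are +y.
static void build_quad_coords(const float crd[4], const float dPdx[4],
                              const float dPdy[4], int l, float out[4])
{
   const float c = crd[l], dx = dPdx[l], dy = dPdy[l]; // OP_SHFL from lane l
   for (int lane = 0; lane < 4; ++lane) {
      const int xoff = lane & 1;  // QUADOP(MOV2, ADD, MOV2, ADD): +x lanes add dPdx
      const int yoff = lane >> 1; // QUADOP(MOV2, MOV2, ADD, ADD): +y lanes add dPdy
      out[lane] = c + xoff * dx + yoff * dy;
   }
}

int main()
{
   const float crd[4]  = { 0.25f, 0.50f, 0.75f, 1.00f };
   const float dPdx[4] = { 0.10f, 0.11f, 0.12f, 0.13f };
   const float dPdy[4] = { 0.20f, 0.21f, 0.22f, 0.23f };
   float quad[4];

   build_quad_coords(crd, dPdx, dPdy, /*l=*/2, quad);
   // Lane 0 now holds lane 2's coordinate, and the quad's finite differences
   // are exactly dPdx[2] / dPdy[2], so the TEX's implicit LOD matches what
   // textureGrad asked for; the result is then read back from lane 0.
   for (int lane = 0; lane < 4; ++lane)
      printf("lane %d: %.2f\n", lane, quad[lane]);
   return 0;
}
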
.../nouveau/codegen/nv50_ir_lowering_gm107.cpp | 56 ++++++++++++++--------
1 file changed, 35 insertions(+), 21 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 6b9edd48645..a2427526a81 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -95,18 +95,15 @@ GM107LegalizeSSA::visit(Instruction *i)
 bool
 GM107LoweringPass::handleManualTXD(TexInstruction *i)
 {
-   static const uint8_t qOps[4][2] =
-   {
-      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
-      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
-      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
-      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
-   };
+   // See NVC0LoweringPass::handleManualTXD for rationale. This function
+   // implements the same logic, but using SM50-friendly primitives.
+   static const uint8_t qOps[2] =
+      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };
    Value *def[4][4];
-   Value *crd[3];
+   Value *crd[3], *arr, *shadow;
    Value *tmp;
    Instruction *tex, *add;
-   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   Value *quad = bld.mkImm(SHFL_BOUND_QUAD);
    int l, c;
    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    const int array = i->tex.target.isArray();
@@ -115,35 +112,40 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)

    for (c = 0; c < dim; ++c)
       crd[c] = bld.getScratch();
+   arr = bld.getScratch();
+   shadow = bld.getScratch();
    tmp = bld.getScratch();

    for (l = 0; l < 4; ++l) {
       Value *src[3], *val;
-      // mov coordinates from lane l to all lanes
+      Value *lane = bld.mkImm(l);
       bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+      // Make sure lane 0 has the appropriate array/depth compare values
+      if (l != 0) {
+         if (array)
+            bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);
+         if (i->tex.target.isShadow())
+            bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim), lane, quad);
+      }
+
+      // mov coordinates from lane l to all lanes
       for (c = 0; c < dim; ++c) {
-         bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array),
-                   bld.mkImm(l), bld.mkImm(SHFL_BOUND_QUAD));
-         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], crd[c], zero);
-         add->subOp = 0x00;
-         add->lanes = 1; /* abused for .ndv */
+         bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad);
       }

       // add dPdx from lane l to lanes dx
       for (c = 0; c < dim; ++c) {
-         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), bld.mkImm(l),
-                   bld.mkImm(SHFL_BOUND_QUAD));
+         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad);
          add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
-         add->subOp = qOps[l][0];
+         add->subOp = qOps[0];
          add->lanes = 1; /* abused for .ndv */
       }

       // add dPdy from lane l to lanes dy
       for (c = 0; c < dim; ++c) {
-         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), bld.mkImm(l),
-                   bld.mkImm(SHFL_BOUND_QUAD));
+         bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad);
          add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
-         add->subOp = qOps[l][1];
+         add->subOp = qOps[1];
          add->lanes = 1; /* abused for .ndv */
       }

@@ -164,8 +166,20 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)

       // texture
       bld.insert(tex = cloneForward(func, i));
+      if (l != 0) {
+         if (array)
+            tex->setSrc(0, arr);
+         if (i->tex.target.isShadow())
+            tex->setSrc(array + dim, shadow);
+      }
       for (c = 0; c < dim; ++c)
          tex->setSrc(c + array, src[c]);
+      // broadcast results from lane 0 to all lanes
+      if (l != 0) {
+         Value *lane = bld.mkImm(l);
+         for (c = 0; i->defExists(c); ++c)
+            bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), lane, quad);
+      }
       bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

       // save results
--
2.13.6
Ilia Mirkin
2017-Dec-20 14:44 UTC
[Nouveau] [PATCH] gm107/ir: use lane 0 for manual textureGrad handling
On Tue, Dec 19, 2017 at 11:41 PM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:
> [...]
> +      // Make sure lane 0 has the appropriate array/depth compare values
> +      if (l != 0) {
> +         if (array)
> +            bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad);
> +         if (i->tex.target.isShadow())
> +            bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim), lane, quad);

In the great argument switcheroo between each SM version, the shadow
compare is actually after the indirect handle (which in turn is after
array + dim). So this should become array + dim + indirect (and
similarly below).

> [...]
> +      // broadcast results from lane 0 to all lanes
> +      if (l != 0) {
> +         Value *lane = bld.mkImm(l);

This should of course be bld.mkImm(0), not l, since we're broadcasting
from lane *0* to all lanes.

These are all fixed up in
https://github.com/imirkin/mesa/commit/618b99d86396417e31551dc464ab2ca5d038151f
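
For clarity, a small sketch of the source ordering described above. This is
an assumption based on the review comment, with hypothetical parameter and
function names; the authoritative version is the linked fixup commit.

// Illustrative only: where the depth-compare reference sits in the GM107 TEX
// source list per the ordering described above -- array layer(s), then
// coordinates, then indirect handle(s), then the shadow reference.
static int shadowSrcIndex(int array, int dim, int indirect)
{
   // The compare value follows the indirect handle, so it is at
   // array + dim + indirect, not array + dim.
   return array + dim + indirect;
}
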
Karol Herbst
2017-Dec-21 08:50 UTC
[Nouveau] [PATCH] gm107/ir: use lane 0 for manual textureGrad handling
On Wed, Dec 20, 2017 at 3:44 PM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:
> [...]
> This should of course be bld.mkImm(0), not l, since we're broadcasting
> from lane *0* to all lanes.
>
> These are all fixed up in
> https://github.com/imirkin/mesa/commit/618b99d86396417e31551dc464ab2ca5d038151f

I did a piglit run ('./piglit run -x glx -x egl -x streaming-texture-leak
-x max-texture-size tests/gpu.py') on a GP107 and the three
bin/tex-miplevel-selection tests pass now as you said. In addition to that
the CTS one and 'spec@arb_shader_texture_lod@execution@tex-miplevel-selection
*gradarb cube' pass as well. So this is

Tested-By: Karol Herbst <kherbst at redhat.com>