Don't use extra TEMPs unnecessarily in some cases. --- src/gallium/drivers/nv50/nv50_program.c | 120 +++++++++++++++--------------- 1 files changed, 60 insertions(+), 60 deletions(-) diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index d7ab28a..5594560 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -1294,18 +1294,20 @@ static boolean nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) { const struct tgsi_full_instruction *inst = &tok->FullInstruction; - struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp; - unsigned mask, sat, unit; + struct nv50_reg *rdst[4], *dst[4], *src[3][4]; + struct nv50_reg **pp_rtmp, *rtmp = NULL, *temp = NULL; + unsigned mask, sat, unit = 0; boolean assimilate = FALSE; - int i, c; + int i, c, nr_dst = 0; mask = inst->FullDstRegisters[0].DstRegister.WriteMask; sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; for (c = 0; c < 4; c++) { - if (mask & (1 << c)) + if (mask & (1 << c)) { dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); - else + ++nr_dst; + } else dst[c] = NULL; rdst[c] = NULL; src[0][c] = NULL; @@ -1313,8 +1315,13 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) src[2][c] = NULL; } + pp_rtmp = &dst[ffs(mask) - 1]; + if (*pp_rtmp && (*pp_rtmp)->type != P_TEMP && (nr_dst > 1 || sat)) + pp_rtmp = &temp; + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; + const struct tgsi_full_src_register *fs = + &inst->FullSrcRegisters[i]; if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) unit = fs->SrcRegister.Index; @@ -1327,10 +1334,15 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) if (sat) { for (c = 0; c < 4; c++) { rdst[c] = dst[c]; - dst[c] = temp_temp(pc); + if (dst[c] && dst[c]->type != P_TEMP) + dst[c] = temp_temp(pc); } - } else - if (direct2dest_op(inst)) { + } + + if 
(direct2dest_op(inst) && (*pp_rtmp)) { + /* We really don't lose the real dst as we do not + * get here if sat overwrites dst with temp. + */ for (c = 0; c < 4; c++) { if (!dst[c] || dst[c]->type != P_TEMP) continue; @@ -1341,7 +1353,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) dst[c] == src[2][i]) break; } - if (i == 4) + if (i == 4 || !dst[i]) continue; assimilate = TRUE; @@ -1367,48 +1379,32 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) break; case TGSI_OPCODE_COS: temp = temp_temp(pc); + rtmp = *pp_rtmp; emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, 5, temp, temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_flop(pc, 5, rtmp, temp); break; case TGSI_OPCODE_DP3: temp = temp_temp(pc); + rtmp = *pp_rtmp; emit_mul(pc, temp, src[0][0], src[1][0]); emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_mad(pc, rtmp, src[0][2], src[1][2], temp); break; case TGSI_OPCODE_DP4: temp = temp_temp(pc); + rtmp = *pp_rtmp; emit_mul(pc, temp, src[0][0], src[1][0]); emit_mad(pc, temp, src[0][1], src[1][1], temp); emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_mad(pc, temp, src[0][3], src[1][3], temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_mad(pc, rtmp, src[0][3], src[1][3], temp); break; case TGSI_OPCODE_DPH: temp = temp_temp(pc); + rtmp = *pp_rtmp; emit_mul(pc, temp, src[0][0], src[1][0]); emit_mad(pc, temp, src[0][1], src[1][1], temp); emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_add(pc, temp, src[1][3], temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_add(pc, rtmp, src[1][3], temp); break; case TGSI_OPCODE_DST: { @@ -1426,13 +1422,9 @@ 
nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) break; case TGSI_OPCODE_EX2: temp = temp_temp(pc); + rtmp = *pp_rtmp; emit_preex2(pc, temp, src[0][0]); - emit_flop(pc, 6, temp, temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_flop(pc, 6, rtmp, temp); break; case TGSI_OPCODE_FLR: for (c = 0; c < 4; c++) { @@ -1461,13 +1453,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) emit_lit(pc, &dst[0], mask, &src[0][0]); break; case TGSI_OPCODE_LG2: - temp = temp_temp(pc); - emit_flop(pc, 3, temp, src[0][0]); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + rtmp = *pp_rtmp; + if (!rtmp) + rtmp = temp_temp(pc); + emit_flop(pc, 3, rtmp, src[0][0]); break; case TGSI_OPCODE_LRP: temp = temp_temp(pc); @@ -1523,18 +1512,16 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) } break; case TGSI_OPCODE_RCP: - for (c = 3; c >= 0; c--) { - if (!(mask & (1 << c))) - continue; - emit_flop(pc, 0, dst[c], src[0][0]); - } + rtmp = *pp_rtmp; + if (!rtmp) + rtmp = temp_temp(pc); + emit_flop(pc, 0, rtmp, src[0][0]); break; case TGSI_OPCODE_RSQ: - for (c = 3; c >= 0; c--) { - if (!(mask & (1 << c))) - continue; - emit_flop(pc, 2, dst[c], src[0][0]); - } + rtmp = *pp_rtmp; + if (!rtmp) + rtmp = temp_temp(pc); + emit_flop(pc, 2, rtmp, src[0][0]); break; case TGSI_OPCODE_SCS: temp = temp_temp(pc); @@ -1557,6 +1544,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) break; case TGSI_OPCODE_SIN: temp = temp_temp(pc); + rtmp = *pp_rtmp; emit_precossin(pc, temp, src[0][0]); emit_flop(pc, 4, temp, temp); for (c = 0; c < 4; c++) { @@ -1611,14 +1599,26 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) return FALSE; } + if (rtmp) { + if (sat) + dst[0] = dst[1] = dst[2] = dst[3] = rtmp; + else { + for (c = 0; c < 4; c++) { + if (mask & (1 << c)) + 
emit_mov(pc, dst[c], rtmp); + } + } + } + if (sat) { for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT, - CVT_F32_F32); + emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT, 0xc4); } - } else if (assimilate) { + } + + if (assimilate) { for (c = 0; c < 4; c++) if (rdst[c]) assimilate_temp(pc, rdst[c], dst[c]); -- 1.6.0.6 --------------090503050107050804030002 Content-Type: text/plain; name="0012-nv50-initial-support-for-IF-ELSE-ENDIF-insns.patch" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename*0="0012-nv50-initial-support-for-IF-ELSE-ENDIF-insns.patch"