hameeza ahmed via llvm-dev
2018-Apr-10 19:14 UTC
[llvm-dev] 64 bit mask in x86vshuffle instruction
Please tell me whether the following implementation is correct..... My target supports 64 bit mask means immediate(0-2^63) I have implemented it but i dont know whether its correct or not. Please see the changes below that i have made in x86isellowering.cpp static SDValue lower2048BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Check for being able to broadcast a single element. if (SDValue Broadcast lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v32f64: return lowerV32F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64f32: return lowerV64F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i64: return lowerV32I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i32: return lowerV64I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid P x86 vector type!"); } } static SDValue lowerV64I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i32 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v64i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; SmallVector<int, 16> RepeatedMask; bool Is128BitLaneRepeatedShuffle is128BitLaneRepeatedShuffleMask(MVT::v64i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { // assert(RepeatedMask.size() == 16 && "Unexpected repeated mask size!"); if (V2.isUndef()) { return DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32, V1, getV16X86ShuffleImm64ForMask(/*Repeated*/Mask, DL, DAG)); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V lowerVectorShuffleWithUNPCK(DL, MVT::v64i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v64i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. // If some CPU is harmed by the domain switch, we can fix it in a later pass. // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v64i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; return lowerVectorShuffleWithPERMV(DL, MVT::v64i32, Mask, V1, V2, DAG); } static SDValue lowerV32I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i64 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v8 shuffle!"); if (SDValue Shuf128 lowerV16X128VectorShuffle(DL, MVT::v32i64, Mask, V1, V2, DAG)) return Shuf128; if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four // 128-bit lanes. SmallVector<int, 8> Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i64, Mask, Repeated128Mask)) { SmallVector<int, 64> PSHUFDMask; scaleShuffleMask(8, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v32i64, DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32, DAG.getBitcast(MVT::v64i32, V1), getV16X86ShuffleImm64ForMask(PSHUFDMask, DL, DAG))); } SmallVector<int, 16> Repeated256Mask; if (is256BitLaneRepeatedShuffleMask(MVT::v32i64, Mask, Repeated256Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v32i64, V1, getV16X86ShuffleImm64ForMask(Repeated256Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v32i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v32i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Unpck lowerVectorShuffleWithUNPCK(DL, MVT::v32i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v32i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; return lowerVectorShuffleWithPERMV(DL, MVT::v32i64, Mask, V1, V2, DAG); } static SDValue getV64X86ShuffleImm64ForMask(ArrayRef<int> Mask, SDLoc DL, SelectionDAG &DAG) { return DAG.getConstant(getV64X86ShuffleImm(Mask), DL, MVT::i64); } static uint64_t getV16X86ShuffleImm(ArrayRef<int> Mask) { // assert(Mask.size() == 16 && "Only 16-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 16 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 16 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 16 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 16 && "Out of bound mask element!"); assert(Mask[4] >= -1 && Mask[4] < 16 && "Out of bound mask element!"); assert(Mask[5] >= -1 && Mask[5] < 16 && "Out of bound mask element!"); assert(Mask[6] >= -1 && Mask[6] < 16 && "Out of bound mask element!"); assert(Mask[7] >= -1 && Mask[7] < 16 && "Out of bound mask element!"); assert(Mask[8] >= -1 && Mask[8] < 16 && "Out of bound mask element!"); assert(Mask[9] >= -1 && Mask[9] < 16 && "Out of bound mask element!"); assert(Mask[10] >= -1 && Mask[10] < 16 && "Out of bound mask element!"); assert(Mask[11] >= -1 && Mask[11] < 16 && "Out of bound mask element!"); assert(Mask[12] >= -1 && Mask[12] < 16 && "Out of bound mask element!"); assert(Mask[13] >= -1 && Mask[13] < 16 && "Out of bound mask element!"); assert(Mask[14] >= -1 && Mask[14] < 16 && "Out of bound mask element!"); assert(Mask[15] >= -1 && Mask[15] < 16 && "Out of bound mask element!"); uint64_t Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; Imm |= (Mask[4] < 0 ? 4 : Mask[4]) << 8; Imm |= (Mask[5] < 0 ? 5 : Mask[5]) << 10; Imm |= (Mask[6] < 0 ? 6 : Mask[6]) << 12; Imm |= (Mask[7] < 0 ? 7 : Mask[7]) << 14; Imm |= (Mask[8] < 0 ? 8 : Mask[8]) << 16; Imm |= (Mask[9] < 0 ? 9 : Mask[9]) << 18; Imm |= (Mask[10] < 0 ? 10 : Mask[10]) << 20; Imm |= (Mask[11] < 0 ? 11 : Mask[11]) << 22; Imm |= (Mask[12] < 0 ? 12 : Mask[12]) << 24; Imm |= (Mask[13] < 0 ? 13 : Mask[13]) << 26; Imm |= (Mask[14] < 0 ? 14 : Mask[14]) << 28; Imm |= (Mask[15] < 0 ? 15 : Mask[15]) << 30; return Imm; } static SDValue lowerV16X128VectorShuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // To handle 256 bit vector requires VLX and most probably // function lowerV2X128VectorShuffle() is better solution. assert(VT.is2048BitVector() && "Unexpected vector size for 2048bit shuffle."); SmallVector<int, 16> WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // Check for patterns which can be matched with a single insert of a 256-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 4, 5, 6, 7,0, 1, 2, 3, 4, 5, 6, 7}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,16,17,18,19})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 16); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } assert(WidenedMask.size() == 16); // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 16; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; // Make sure all V1 subvectors are in place. if (WidenedMask[i] < 16) { if (WidenedMask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and its the lowest 128-bits. if (V2Index >= 0 || WidenedMask[i] != 16) { IsInsert = false; break; } V2Index = i; } } if (IsInsert && V2Index >= 0) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, DAG.getIntPtrConstant(0, DL)); return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } // Try to lower to to vshuf64x2/vshuf32x4. SDValue Ops[8] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; unsigned PermMask = 0; // Insure elements came from the same Op. for (int i = 0; i < 16; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; SDValue Op = WidenedMask[i] >= 16 ? V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; else if (Ops[OpIndex] != Op) return SDValue(); // Convert the 128-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. PermMask |= (WidenedMask[i] % 16) << (i * 2); } return DAG.getNode(X86ISD::SHUF128_P64, DL, VT, Ops[0], Ops[1], DAG.getConstant(PermMask, DL, MVT::i64)); } Please help...i m really sorry for asking but i m stuck here..The code runs w/o error at the o/p it gives following assembly; P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_1, 236 # encoding: [] P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_1, 244 # encoding: [] P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_0, 14 # encoding: [] P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_0, 1 # encoding: [] P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] P_256B_PSHUFFLE_DWORD R_0_R2048b_0, R_0_R2048b_1, 236 # encoding: [] P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] P_256B_PSHUFFLE_DWORD R_0_R2048b_0, R_0_R2048b_1, 229 # encoding: [] Here the imm are small (0-255) looks like 8 bits.... What to do? Please help.... -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180411/3b9374ed/attachment.html>
hameeza ahmed via llvm-dev
2018-Apr-11 19:24 UTC
[llvm-dev] 64 bit mask in x86vshuffle instruction
I m still unable to solve vshuffle issue. Can you tell me how to split v64i32 shuffle into 4 v16i32 shuffles? On Wed, Apr 11, 2018 at 12:14 AM, hameeza ahmed <hahmed2305 at gmail.com> wrote:> Please tell me whether the following implementation is correct..... > My target supports 64 bit mask means immediate(0-2^63) > > I have implemented it but i dont know whether its correct or not. Please > see the changes below that i have made in x86isellowering.cpp > > static SDValue lower2048BitVectorShuffle(const SDLoc &DL, ArrayRef<int> > Mask, > MVT VT, SDValue V1, SDValue V2, > const SmallBitVector &Zeroable, > const X86Subtarget &Subtarget, > SelectionDAG &DAG) { > > > // If we have a single input to the zero element, insert that into V1 if > we > // can do so cheaply. > int NumElts = Mask.size(); > int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >> NumElts; }); > > if (NumV2Elements == 1 && Mask[0] >= NumElts) > if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( > DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) > return Insertion; > > // Check for being able to broadcast a single element. > if (SDValue Broadcast > lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, > DAG)) > return Broadcast; > > // Dispatch to each element type for lowering. If we don't have support > for > // specific element type shuffles at 512 bits, immediately split them and > // lower them. Each lowering routine of a given type is allowed to > assume that > // the requisite ISA extensions for that element type are available. > switch (VT.SimpleTy) { > case MVT::v32f64: > return lowerV32F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, > DAG); > case MVT::v64f32: > return lowerV64F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, > DAG); > case MVT::v32i64: > return lowerV32I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, > DAG); > case MVT::v64i32: > return lowerV64I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, > DAG); > > default: > llvm_unreachable("Not a valid P x86 vector type!"); > } > } > > static SDValue lowerV64I32VectorShuffle(const SDLoc &DL, ArrayRef<int> > Mask, > const SmallBitVector &Zeroable, > SDValue V1, SDValue V2, > const X86Subtarget &Subtarget, > SelectionDAG &DAG) { > > assert(V1.getSimpleValueType() == MVT::v64i32 && "Bad operand type!"); > assert(V2.getSimpleValueType() == MVT::v64i32 && "Bad operand type!"); > assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); > > if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( > DL, MVT::v64i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) > return ZExt; > > > SmallVector<int, 16> RepeatedMask; > > bool Is128BitLaneRepeatedShuffle > is128BitLaneRepeatedShuffleMask(MVT::v64i32, Mask, RepeatedMask); > > if (Is128BitLaneRepeatedShuffle) { > // assert(RepeatedMask.size() == 16 && "Unexpected repeated mask > size!"); > > > if (V2.isUndef()) > { > return DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32, V1, > getV16X86ShuffleImm64ForMask(/*Repeated*/Mask, > DL, DAG)); > > > } > > // Use dedicated unpack instructions for masks that match their > pattern. > if (SDValue V > lowerVectorShuffleWithUNPCK(DL, MVT::v64i32, Mask, V1, V2, > DAG)) > return V; > } > > > > // Try to use shift instructions. > if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i32, V1, V2, > Mask, > Zeroable, Subtarget, DAG)) > return Shift; > > // Try to use VALIGN. > if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v64i32, V1, V2, > Mask, Subtarget, DAG)) > return Rotate; > > // Assume that a single SHUFPS is faster than using a permv shuffle. > // If some CPU is harmed by the domain switch, we can fix it in a later > pass. > > // If we have AVX512F support, we can use VEXPAND. > if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v64i32, Zeroable, > Mask, > V1, V2, DAG, Subtarget)) > return V; > > return lowerVectorShuffleWithPERMV(DL, MVT::v64i32, Mask, V1, V2, DAG); > > } > > static SDValue lowerV32I64VectorShuffle(const SDLoc &DL, ArrayRef<int> > Mask, > const SmallBitVector &Zeroable, > SDValue V1, SDValue V2, > const X86Subtarget &Subtarget, > SelectionDAG &DAG) { > assert(V1.getSimpleValueType() == MVT::v32i64 && "Bad operand type!"); > assert(V2.getSimpleValueType() == MVT::v32i64 && "Bad operand type!"); > assert(Mask.size() == 32 && "Unexpected mask size for v8 shuffle!"); > > if (SDValue Shuf128 > lowerV16X128VectorShuffle(DL, MVT::v32i64, Mask, V1, V2, DAG)) > return Shuf128; > > if (V2.isUndef()) { > // When the shuffle is mirrored between the 128-bit lanes of the unit, > we > // can use lower latency instructions that will operate on all four > // 128-bit lanes. > SmallVector<int, 8> Repeated128Mask; > if (is128BitLaneRepeatedShuffleMask(MVT::v32i64, Mask, > Repeated128Mask)) { > SmallVector<int, 64> PSHUFDMask; > scaleShuffleMask(8, Repeated128Mask, PSHUFDMask); > return DAG.getBitcast( > MVT::v32i64, > DAG.getNode(X86ISD::PSHUFD_P64, DL, MVT::v64i32, > DAG.getBitcast(MVT::v64i32, V1), > getV16X86ShuffleImm64ForMask(PSHUFDMask, DL, DAG))); > } > > SmallVector<int, 16> Repeated256Mask; > if (is256BitLaneRepeatedShuffleMask(MVT::v32i64, Mask, > Repeated256Mask)) > return DAG.getNode(X86ISD::VPERMI, DL, MVT::v32i64, V1, > getV16X86ShuffleImm64ForMask(Repeated256Mask, > DL, DAG)); > } > > // Try to use shift instructions. > if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i64, V1, V2, > Mask, > Zeroable, Subtarget, DAG)) > return Shift; > > // Try to use VALIGN. > if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v32i64, V1, V2, > Mask, Subtarget, DAG)) > return Rotate; > > // Try to use PALIGNR. > if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v32i64, > V1, V2, > Mask, Subtarget, > DAG)) > return Rotate; > > if (SDValue Unpck > lowerVectorShuffleWithUNPCK(DL, MVT::v32i64, Mask, V1, V2, DAG)) > return Unpck; > // If we have AVX512F support, we can use VEXPAND. > if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v32i64, Zeroable, > Mask, V1, > V2, DAG, Subtarget)) > return V; > > return lowerVectorShuffleWithPERMV(DL, MVT::v32i64, Mask, V1, V2, DAG); > } > > static SDValue getV64X86ShuffleImm64ForMask(ArrayRef<int> Mask, SDLoc DL, > SelectionDAG &DAG) { > return DAG.getConstant(getV64X86ShuffleImm(Mask), DL, MVT::i64); > } > > > static uint64_t getV16X86ShuffleImm(ArrayRef<int> Mask) { > // assert(Mask.size() == 16 && "Only 16-lane shuffle masks"); > assert(Mask[0] >= -1 && Mask[0] < 16 && "Out of bound mask element!"); > assert(Mask[1] >= -1 && Mask[1] < 16 && "Out of bound mask element!"); > assert(Mask[2] >= -1 && Mask[2] < 16 && "Out of bound mask element!"); > assert(Mask[3] >= -1 && Mask[3] < 16 && "Out of bound mask element!"); > assert(Mask[4] >= -1 && Mask[4] < 16 && "Out of bound mask element!"); > assert(Mask[5] >= -1 && Mask[5] < 16 && "Out of bound mask element!"); > assert(Mask[6] >= -1 && Mask[6] < 16 && "Out of bound mask element!"); > assert(Mask[7] >= -1 && Mask[7] < 16 && "Out of bound mask element!"); > assert(Mask[8] >= -1 && Mask[8] < 16 && "Out of bound mask element!"); > assert(Mask[9] >= -1 && Mask[9] < 16 && "Out of bound mask element!"); > assert(Mask[10] >= -1 && Mask[10] < 16 && "Out of bound mask element!"); > assert(Mask[11] >= -1 && Mask[11] < 16 && "Out of bound mask element!"); > assert(Mask[12] >= -1 && Mask[12] < 16 && "Out of bound mask element!"); > assert(Mask[13] >= -1 && Mask[13] < 16 && "Out of bound mask element!"); > assert(Mask[14] >= -1 && Mask[14] < 16 && "Out of bound mask element!"); > assert(Mask[15] >= -1 && Mask[15] < 16 && "Out of bound mask element!"); > > uint64_t Imm = 0; > Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; > Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; > Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; > Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; > Imm |= (Mask[4] < 0 ? 4 : Mask[4]) << 8; > Imm |= (Mask[5] < 0 ? 5 : Mask[5]) << 10; > Imm |= (Mask[6] < 0 ? 6 : Mask[6]) << 12; > Imm |= (Mask[7] < 0 ? 7 : Mask[7]) << 14; > Imm |= (Mask[8] < 0 ? 8 : Mask[8]) << 16; > Imm |= (Mask[9] < 0 ? 9 : Mask[9]) << 18; > Imm |= (Mask[10] < 0 ? 10 : Mask[10]) << 20; > Imm |= (Mask[11] < 0 ? 11 : Mask[11]) << 22; > Imm |= (Mask[12] < 0 ? 12 : Mask[12]) << 24; > Imm |= (Mask[13] < 0 ? 13 : Mask[13]) << 26; > Imm |= (Mask[14] < 0 ? 14 : Mask[14]) << 28; > Imm |= (Mask[15] < 0 ? 15 : Mask[15]) << 30; > > return Imm; > } > > > static SDValue lowerV16X128VectorShuffle(const SDLoc &DL, MVT VT, > ArrayRef<int> Mask, SDValue V1, > SDValue V2, SelectionDAG &DAG) { > assert(VT.getScalarSizeInBits() == 64 && > "Unexpected element type size for 128bit shuffle."); > > // To handle 256 bit vector requires VLX and most probably > // function lowerV2X128VectorShuffle() is better solution. > assert(VT.is2048BitVector() && "Unexpected vector size for 2048bit > shuffle."); > > SmallVector<int, 16> WidenedMask; > if (!canWidenShuffleElements(Mask, WidenedMask)) > return SDValue(); > > // Check for patterns which can be matched with a single insert of a > 256-bit > // subvector. > bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, > {0, 1, 2, 3, 4, 5, 6, 7,0, 1, 2, > 3, 4, 5, 6, 7}); > if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, > {0, 1, 2, 3, 4, 5, 6, 7, 12, 13, > 14, 15,16,17,18,19})) > > { > MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 16); > SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, > DAG.getIntPtrConstant(0, DL)); > SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, > OnlyUsesV1 ? V1 : V2, > DAG.getIntPtrConstant(0, DL)); > return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); > } > > assert(WidenedMask.size() == 16); > > // See if this is an insertion of the lower 128-bits of V2 into V1. > bool IsInsert = true; > int V2Index = -1; > for (int i = 0; i < 16; ++i) { > assert(WidenedMask[i] >= -1); > if (WidenedMask[i] < 0) > continue; > > // Make sure all V1 subvectors are in place. > if (WidenedMask[i] < 16) { > if (WidenedMask[i] != i) { > IsInsert = false; > break; > } > } else { > // Make sure we only have a single V2 index and its the lowest > 128-bits. > if (V2Index >= 0 || WidenedMask[i] != 16) { > IsInsert = false; > break; > } > V2Index = i; > } > } > if (IsInsert && V2Index >= 0) { > MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); > SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, > DAG.getIntPtrConstant(0, DL)); > return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); > } > > // Try to lower to to vshuf64x2/vshuf32x4. > SDValue Ops[8] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), > DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), DAG.getUNDEF(VT), > DAG.getUNDEF(VT)}; > unsigned PermMask = 0; > // Insure elements came from the same Op. > for (int i = 0; i < 16; ++i) { > assert(WidenedMask[i] >= -1); > if (WidenedMask[i] < 0) > continue; > > SDValue Op = WidenedMask[i] >= 16 ? V2 : V1; > unsigned OpIndex = i / 2; > if (Ops[OpIndex].isUndef()) > Ops[OpIndex] = Op; > else if (Ops[OpIndex] != Op) > return SDValue(); > > // Convert the 128-bit shuffle mask selection values into 128-bit > selection > // bits defined by a vshuf64x2 instruction's immediate control byte. > PermMask |= (WidenedMask[i] % 16) << (i * 2); > } > > return DAG.getNode(X86ISD::SHUF128_P64, DL, VT, Ops[0], Ops[1], > DAG.getConstant(PermMask, DL, MVT::i64)); > } > > > > Please help...i m really sorry for asking but i m stuck here..The code > runs w/o error at the o/p it gives following assembly; > > P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_1, > 236 # encoding: [] > P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # > encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] > P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_1, > 244 # encoding: [] > P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # > encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] > P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_0, 14 > # encoding: [] > P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # > encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] > P_256B_VSHUF64x2_QWORD R_0_R2048b_0, R_0_R2048b_1, R_0_R2048b_0, 1 > # encoding: [] > P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # > encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] > P_256B_PSHUFFLE_DWORD R_0_R2048b_0, R_0_R2048b_1, 236 # encoding: [] > P_256B_VADD_DWORD R_0_R2048b_1, R_0_R2048b_1, R_0_R2048b_0 # > encoding: [0x61,0x02,0x46,0x00,0x00,0x20,0x00,0x04,0x00,0x00,0x00] > P_256B_PSHUFFLE_DWORD R_0_R2048b_0, R_0_R2048b_1, 229 # encoding: [] > > Here the imm are small (0-255) looks like 8 bits.... > What to do? Please help.... > > > > >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20180412/20d8db1a/attachment-0001.html>
Maybe Matching Threads
- Error in generating Object Code for implemented assembly vector instructions
- Using new types v32f32, v32f64 in llvm backend not possible
- Using new types v32f32, v32f64 in llvm backend not possible
- Using new types v32f32, v32f64 in llvm backend not possible
- TableGen - Help to implement a form of gather/scatter operations for Mips MSA