1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #ifndef _HWCONFORMITY_H_
10 #define _HWCONFORMITY_H_
11 
12 #include "G4_IR.hpp"
13 #include "BuildIR.h"
14 #include "FlowGraph.h"
15 #include "Common_ISA_util.h"
16 #include "RegAlloc.h"
17 
18 #include <map>
19 
20 namespace vISA
21 {
22     class HWConformity
23     {
24         IR_Builder& builder;
25         G4_Kernel& kernel;
26         vISA::Mem_Manager& mem;
27 
28         // this must be set before calling the individual fix functions
29         G4_BB* curBB = nullptr;
30 
31         // This is added for data layout optimization.
32         // Currently it only targets packed-byte pattern.
33         // Can be extended later for other patterns.
34         enum AccessPattern {
35             ACCESS_PATTERN_UNDEF = 0,
36             ACCESS_PATTERN_PACKED_BYTE = 1,
37             ACCESS_PATTERN_INVALID = 2
38         };
39         std::map<G4_Declare*, AccessPattern> dclAccessPattern;
40 
getAccessPattern(G4_Declare * dcl)41         AccessPattern getAccessPattern(G4_Declare* dcl)
42         {
43             auto iter = dclAccessPattern.find(dcl);
44             if (iter == dclAccessPattern.end())
45             {
46                 return ACCESS_PATTERN_UNDEF;
47             }
48             return (*iter).second;
49         }
setAccessPattern(G4_Declare * dcl,AccessPattern ap)50         void setAccessPattern(G4_Declare* dcl, AccessPattern ap)
51         {
52             dclAccessPattern[dcl] = ap;
53         }
54         bool markPackedByteReference(G4_Kernel& kernel, G4_Operand* opnd, G4_INST* inst);
55         G4_Operand* fixPackedByteReference(IR_Builder& builder, G4_Operand* opnd);
56 
57         // helper functions
58         bool hasBadRegion(G4_INST *inst);
59         bool canSplitInst(G4_INST *inst, G4_INST *use_op);
60         bool splitInstListForByteDst(INST_LIST_ITER it, G4_BB *bb, uint16_t extypesize);
61 
62         G4_DstRegRegion* insertMovAfter(INST_LIST_ITER& it, G4_DstRegRegion* dst, G4_Type type, G4_BB *bb, G4_SubReg_Align dstAlign = Any);
63         G4_Operand* insertMovBefore(INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB *bb,
64             G4_SubReg_Align tmpAlign = Any);
65         G4_Operand* insertMovBefore(INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB* bb,
66             uint16_t stride, G4_SubReg_Align tmpAlign = Any);
67 
68         // replace src <srcNum> for inst <*it> with a temp variable of type <type>
69         // This is used to satisfy various HW restrictions on src type/alignment/region/modifier/etc.
70         void replaceSrc(INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB* bb, G4_SubReg_Align tmpAlign = Any)
71         {
72             G4_INST* inst = *it;
73             inst->setSrc(insertMovBefore(it, srcNum, type, bb, tmpAlign), srcNum);
74         }
75         // replace dst for inst <*it> with a temp variable of type <type>
76         // the original dst is now the dst of a new move instruction from the temp variable.
77         // This is used to satisfy various HW restrictions on dst type/alignment/etc.
78         void replaceDst(INST_LIST_ITER it, G4_Type type, G4_SubReg_Align dstAlign = Any)
79         {
80             G4_INST* inst = *it;
81             inst->setDest(insertMovAfter(it, inst->getDst(), type, curBB, dstAlign));
82         }
83 
84         G4_SrcRegRegion* insertCopyBefore(INST_LIST_ITER it, uint32_t srcNum, G4_SubReg_Align tmpAlign, G4_BB *bb);
85         G4_SrcRegRegion* insertCopyAtBBEntry(G4_BB* bb, G4_ExecSize newExecSize, G4_Operand* src);
86         void broadcast(G4_BB* bb, INST_LIST_ITER it, int srcPos, G4_SubReg_Align subAlign);
87 
88         G4_INST *splitInstWithByteDst(G4_INST *expand_op);
89         G4_SubReg_Align getDclAlignment(int opndBytes, G4_INST *inst, bool isScalar);
90 
91         // HW conformity check functions
92         void fixPackedSource(INST_LIST_ITER it, G4_BB *bb);
93         bool fixMathInst(INST_LIST_ITER it, G4_BB *bb);
94         bool fixMULInst(INST_LIST_ITER &it, G4_BB *bb);
95         void fixMULHInst(INST_LIST_ITER &i, G4_BB *bb);
96         void fixOpnds(INST_LIST_ITER it, G4_BB *bb, G4_Type& exType);
97         bool fixLine(INST_LIST_ITER it, G4_BB *bb);
98         bool fixOpndType(INST_LIST_ITER it, G4_BB *bb);
99         void fixPackedHFConversions(INST_LIST_ITER it, G4_BB* bb);
100 
101         bool fixIndirectOpnd(INST_LIST_ITER i, G4_BB *bb);
102         void fix3SrcInst(INST_LIST_ITER i, G4_BB* bb);
103         void fixAlign13SrcInst(INST_LIST_ITER i, G4_BB* bb);
104         void fixCompareInst(INST_LIST_ITER i, G4_BB *bb, G4_Type exType, int dst_elsize);
105         bool fixDstAlignment(INST_LIST_ITER i, G4_BB *bb, G4_Type extype, unsigned int dst_elsize);
106         void fixPredicateIndirectInst(INST_LIST_ITER i, G4_BB* bb);
107         bool fixDstAlignmentWithVectorImm(INST_LIST_ITER i, G4_BB *bb);
108         bool fixAcc(INST_LIST_ITER i, G4_BB* bb);
109         void fixDstHstride(INST_LIST_ITER i, int extypesize);
110         void fixMADInst(G4_BB* bb);
111         void fixSrcRegion(G4_INST *inst);
112         void conformBB(G4_BB* bb);
113         void fixSADA2Inst(G4_BB* bb);
114         void fixMixedHFInst(G4_BB* bb);
115         void fixSendInst(G4_BB* bb);
116         void fixsrc1src2Overlap(G4_BB* bb);
117         void fixOverlapInst(G4_BB* bb);
118         bool canSplitByteDst(G4_opcode op);
119         bool fixInstOpndTypeAlign(INST_LIST_ITER i, G4_BB* bb);
120         void fixOpndTypeAlign(G4_BB* bb);
121         void fixInstExecSize(G4_BB* bb);
122         bool reduceExecSize(INST_LIST_ITER iter, G4_BB* bb);
123         bool reduceExecSizeForMath(INST_LIST_ITER iter, G4_BB* bb);
124         bool checkSrcDstOverlap(INST_LIST_ITER iter, G4_BB* bb, bool compOpt);
125         void splitInstruction(INST_LIST_ITER iter, G4_BB* bb, bool compOpt, uint8_t numInFirstMov, bool rule4_11, bool allowSrcCrossGRF);
126         void splitSIMD32Inst(INST_LIST_ITER iter, G4_BB* bb);
127         bool evenlySplitInst(INST_LIST_ITER iter, G4_BB* bb, bool checkOverlap = true);
128         void moveSrcToGRF(INST_LIST_ITER it, uint32_t srcNum, uint16_t numGRF, G4_BB *bb);
129         void saveDst(INST_LIST_ITER& it, uint8_t stride, G4_BB *bb);
130         void restoreDst(INST_LIST_ITER& it, G4_DstRegRegion *origDst, G4_BB *bb);
131         void insertMovAfter(INST_LIST_ITER& it, uint16_t stride, G4_BB* bb);
132         void removeBadSrc(INST_LIST_ITER& it, G4_BB *bb, bool crossGRFDst, bool oneGRFSrc[3], bool badTwoGRFSrc[3]);
133         uint8_t checkMinExecSize(G4_opcode op);
134         void convertMAD2MulAdd(INST_LIST_ITER iter, G4_BB *bb);
135         void maintainDU4TempMov(G4_INST *inst, G4_INST *movInst);
136         void fixImm64(INST_LIST_ITER i, G4_BB* bb);
137         bool checkSrcCrossGRF(INST_LIST_ITER &i, G4_BB* bb);
138         G4_INST* checkSrcDefInst(G4_INST *inst, G4_INST *def_inst, uint32_t srcNum);
139         bool emulate64bMov(INST_LIST_ITER iter, G4_BB* bb);
140         bool fix64bInst(INST_LIST_ITER i, G4_BB* bb);
141         bool fixPlaneInst(INST_LIST_ITER i, G4_BB* bb);
142         void expandPlaneInst(INST_LIST_ITER i, G4_BB* bb);
143         bool fixAddcSubb(G4_BB* bb);
144         void fixDataLayout();
145         void fixBFMixedMode();
146         bool fixMov(INST_LIST_ITER i, G4_BB* bb);
147         bool fixRotate(INST_LIST_ITER i, G4_BB* bb);
148         bool fixIntToHFMove(G4_BB* bb);
149 
150         bool isFloatOr64b(G4_INST *inst);
151         uint16_t getSrcStride(G4_SrcRegRegion* src);
152         void change64bStride2CopyToUD(INST_LIST_ITER it, G4_BB* bb);
153         bool fixBFMove(INST_LIST_ITER i, G4_BB* bb);
154         void fixUnalignedRegions(INST_LIST_ITER it, G4_BB* bb);
155         bool fixFcvt(INST_LIST_ITER i, G4_BB* bb);
156         void fixByteXBarRestriction(INST_LIST_ITER it, G4_BB* bb);
157         void fixDPAS(INST_LIST_ITER it, G4_BB* bb);
158         bool fixSrnd(INST_LIST_ITER i, G4_BB* bb);
159         void fixShiftInsts(INST_LIST_ITER i, G4_BB* bb);
160 
161         void helperGenerateTempDst(
162             G4_BB *bb,
163             INST_LIST_ITER instIter,
164             G4_INST *inst,
165             uint8_t hStride,
166             G4_Type tempDstType,
167             G4_SubReg_Align subAlign = Any);
168 
169         bool isGoodAlign16Src(G4_INST* inst, int srcPos);
170         bool isGoodAlign1TernarySrc(G4_INST* inst, int srcPos, bool canBeImm);
171         bool isGoodAlign1TernaryDst(G4_INST* inst) const;
172         void copyDwords(G4_Declare* dst,
173             int dstOffset,
174             G4_Declare* src,
175             int srcOffset,
176             int numDwords,
177             G4_BB* bb,
178             INST_LIST_ITER iter);
179 
180         void copyDwordsIndirect(G4_Declare* dst,
181             G4_SrcRegRegion* src,
182             int numDwords,
183             G4_BB* bb,
184             INST_LIST_ITER iter);
185 
186         void copyRegs(G4_Declare* dst,
187             int dstOffset,
188             G4_Declare* src,
189             int srcOffset,
190             int numRegs,
191             G4_BB* bb,
192             INST_LIST_ITER iter);
193 
194         bool isFpMadPreferred(G4_BB *bb, INST_LIST_ITER iter);
195         bool generateFPMad(G4_BB* bb, INST_LIST_ITER iter);
196         bool generateAlign1Mad(G4_BB* bb, INST_LIST_ITER iter);
197         bool hasSameSubregOffset(G4_INST* inst) const;
198         bool hasSameSubregOffset(G4_INST* inst, uint32_t& byteOffset) const;
199 
200         void fixImmAndARFSrc(INST_LIST_ITER it, G4_BB *bb);
201         void generateMacl(INST_LIST_ITER it, G4_BB *bb);
202         void doGenerateMacl(INST_LIST_ITER it, G4_BB *bb);
203 
204         void fixSelCsel(INST_LIST_ITER it, G4_BB *bb);
205 
206         void avoidDstSrcOverlap(PointsToAnalysis& p);
207         void avoidInstDstSrcOverlap(INST_LIST_ITER it, G4_BB* bb, PointsToAnalysis& p);
208 
209         void replaceHFBFwithFloat(INST_LIST_ITER it, G4_BB* bb);
210 
new(size_t sz,vISA::Mem_Manager & m)211         void* operator new(size_t sz, vISA::Mem_Manager& m) { return m.alloc(sz); }
212 
213         bool checkSrcMod(INST_LIST_ITER it, G4_BB* bb, int srcPos);
214 
215         void fixSrc2(INST_LIST_ITER it, G4_BB* bb, bool swapSrc0and2);
216 
217         void fixVxHFloat64b(INST_LIST_ITER it, G4_BB* bb);
218 
219         void fixPredCtrl(INST_LIST_ITER it, G4_BB* bb);
220 
221         // Calla src register must be grf aligned (sub-reg offset must be 0)
222         void fixCalla(INST_LIST_ITER it, G4_BB* bb);
223 
224         // If alignment and region of all operands of any instruction are conformed
225         // by a dedicated function, return true.
226         // This is used to skip generic conformity functions, such as fixOpndTypeAlign().
hasDedicateAlignRegionConformity(INST_LIST_ITER it)227         bool hasDedicateAlignRegionConformity(INST_LIST_ITER it) const
228         {
229             return hasDedicateAlignRegionConformity(*it);
230         }
231         bool hasDedicateAlignRegionConformity(const G4_INST *I) const;
232 
233         void fixSrc1Region(INST_LIST_ITER it, G4_BB* bb);
234 
235         INST_LIST_ITER fixMadwInst(INST_LIST_ITER i, G4_BB* bb);
236 
237         void fixFloatARFDst(INST_LIST_ITER it, G4_BB* bb);
238 
239     public:
HWConformity(IR_Builder & b,G4_Kernel & k,vISA::Mem_Manager & m)240         HWConformity(IR_Builder& b, G4_Kernel &k, vISA::Mem_Manager& m) :
241             builder(b), kernel(k), mem(m)
242         {
243         }
244         void chkHWConformity();
245         static void tryEliminateMadSrcModifier(IR_Builder &builder, G4_INST *inst);
246         void localizeForAcc(G4_BB* bb);
247         void splitDWMULInst(INST_LIST_ITER& start, INST_LIST_ITER& end, G4_BB* bb);
248         void fixMulSrc1(INST_LIST_ITER i, G4_BB* bb);
249     };
250 }
251 //single entry point for HW conformity checks
252 extern void HWConformityChk(vISA::IR_Builder& builder, vISA::G4_Kernel& kernel, vISA::Mem_Manager& mem);
253 
254 #endif /* _HWCONFORMITY_H_ */
255