1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2019-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "LatencyTable.h"
10 #include "LocalScheduler_G4IR.h"
11 #include "../G4_IR.hpp"
12 
13 using namespace vISA;
14 
getLatency(G4_INST * Inst) const15 uint16_t LatencyTable::getLatency(G4_INST* Inst) const
16 {
17     auto GEN = getPlatformGeneration(m_builder->getPlatform());
18     if (GEN >= PlatformGen::XE)
19         return getLatencyG12(Inst);
20 
21     return getLatencyLegacy(Inst);
22 }
23 
getDPAS8x8Latency() const24 uint16_t LatencyTable::getDPAS8x8Latency() const
25 {
26     switch(m_builder->getPlatform())
27     {
28         case XeHP_SDV:
29         case GENX_PVC:
30             return uint16_t(LatenciesXe::DPAS + 7); //28
31         case GENX_PVCXT:
32             return uint16_t(LatenciesXe::DPAS + 1 + 7); //29
33         case GENX_DG2:
34             return 32;
35         default: //Not suppport platform
36            return 46;
37     }
38 }
39 
40 // This calculates the node's pipeline occupancy (node delay)
getOccupancy(G4_INST * Inst) const41 uint16_t LatencyTable::getOccupancy(G4_INST* Inst) const
42 {
43     auto GEN = getPlatformGeneration(m_builder->getPlatform());
44     if (GEN >= PlatformGen::XE)
45         return getOccupancyG12(Inst);
46 
47     return getOccupancyLegacy(Inst);
48 }
49 
50 
getLatencyLegacy(G4_INST * Inst) const51 uint16_t LatencyTable::getLatencyLegacy(G4_INST* Inst) const
52 {
53     if (Inst->isSend())
54     {
55         G4_SendDesc* MsgDesc = Inst->getMsgDesc();
56         return LegacyFFLatency[SFIDtoInt(MsgDesc->getSFID())];
57     } else if (Inst->isMath()) {
58         if (Inst->asMathInst()->getMathCtrl() == MATH_FDIV ||
59             Inst->asMathInst()->getMathCtrl() == MATH_POW)
60             return LegacyLatencies::EDGE_LATENCY_MATH_TYPE2;
61         return LegacyLatencies::EDGE_LATENCY_MATH;
62     }
63     return LegacyLatencies::IVB_PIPELINE_LENGTH;
64 }
65 
getOccupancyLegacy(G4_INST * Inst) const66 uint16_t LatencyTable::getOccupancyLegacy(G4_INST* Inst) const
67 {
68     int divisor = 8;
69     int InstLatency = LegacyLatencies::UNCOMPR_LATENCY;
70     if (Inst->isFastHFInstruction()) {
71         divisor = 16;
72     }
73 
74     // Number of n-wide passes in FPU0 or FPU1 (EM).
75     // "n" is:
76     //      16 for BDW+ HalfFloatDoublePerf instructions,
77     //      8 for other instructions.
78     int passes = std::max(1, Inst->getExecSize() / divisor);
79 
80     // InstLatency is:
81     //      4 for EM/FPU1 POW and FDIV instrutions ( HSW; for BDW+ it is 2 times higher ),
82     //      2 for other EM/FPU1 instructions ( HSW; for BDW+ it is 2 times higher ),
83     //      2 for other instructions.
84     // Update DagNode latency for math.
85     G4_opcode opCode = Inst->opcode();
86     switch (opCode) {
87     case G4_math: {
88         // Use EdgeLatencyMathType2 for FDIV, FPOW functions.
89         if (Inst->asMathInst()->getMathCtrl() == MATH_FDIV ||
90             Inst->asMathInst()->getMathCtrl() == MATH_POW) {
91             InstLatency = 4;
92         } else {
93             // Used EdgeLatencyMath for other functions.
94             InstLatency = 2;
95         }
96 
97         // BDW+ platforms have lower math TPT and longer latency (all math functions).
98         InstLatency *= 2;
99         break;
100     }
101     case G4_bfe:
102     case G4_bfi1:
103     case G4_bfi2:
104     case G4_bfrev:
105     case G4_cbit:
106     case G4_dp2:
107     case G4_dp3:
108     case G4_dp4:
109     case G4_dph:
110     case G4_fbh:
111     case G4_fbl:
112     case G4_lrp:
113     case G4_mac:
114     case G4_mach:
115     case G4_pln:
116         InstLatency *= 2;
117         break;
118     case G4_label:
119         // Labels need special care. They should have a latency of 1.
120         // But their execSize is 255, which sets passes=31.
121         passes = 1;
122         InstLatency = 1;
123         break;
124     default:
125         break;
126     }
127 
128     return uint16_t(passes * InstLatency);
129 }
130 
getLatencyG12(const G4_INST * Inst) const131 uint16_t LatencyTable::getLatencyG12(const G4_INST* Inst) const
132 {
133     int Sz = Inst->getExecSize();
134     int Scale = (Sz <= 8) ? 0 : (Sz == 16) ? 1 : 3;
135     auto Dst = Inst->getDst();
136 
137     if (Inst->isSend()) {
138         G4_SendDesc* MsgDesc = Inst->getMsgDesc();
139         if (MsgDesc->isLSC())
140         {
141             if (MsgDesc->isFence())
142             {
143                 return MsgDesc->isTyped() ?
144                     LatenciesXe::LSC_TYPED_FENCE : LatenciesXe::LSC_UNTYPED_FENCE;
145             }
146             else
147             {
148                 bool isCachedInL1 = MsgDesc->getCachingL1() == Caching::CA ||
149                     (MsgDesc->getCachingL1() != Caching::UC && m_builder->getOption(vISA_assumeL1Hit));
150                 if (MsgDesc->isLSC() && MsgDesc->isTyped())
151                 {
152                     return isCachedInL1 ? LatenciesXe::LSC_TYPED_L1 : LatenciesXe::LSC_TYPED_L3;
153                 }
154                 else
155                 {
156                     return isCachedInL1 ? LatenciesXe::LSC_UNTYPED_L1 : LatenciesXe::LSC_UNTYPED_L3;
157                 }
158             }
159         }
160         if (MsgDesc->isSLM())
161             return Inst->asSendInst()->isFence() ? LatenciesXe::SLM_FENCE : LatenciesXe::SLM;
162         if (MsgDesc->isSampler())
163             return LatenciesXe::SAMPLER_L3;
164         if (MsgDesc->isHDC())
165             return LatenciesXe::DP_L3;
166         if (MsgDesc->isBarrier())
167             return LatenciesXe::BARRIER;
168          return LatenciesXe::SEND_OTHERS;
169     }
170     if (Inst->isMath())
171     {
172         return uint16_t(LatenciesXe::MATH + LatenciesXe::DELTA_MATH * Scale);
173     }
174     if (Inst->isFlowControl())
175     {
176         return LatenciesXe::BRANCH;
177     }
178     if (Inst->isDpas()) {
179 
180         if (m_builder->getPlatform() ==  GENX_PVC)
181         {
182             G4_InstDpas *dpas = Inst->asDpasInst();
183             return uint16_t(LatenciesXe::DPAS + dpas->getRepeatCount() - 1);
184         }
185 
186         if (m_builder->getPlatform() ==  GENX_PVCXT)
187         {
188             G4_InstDpas *dpas = Inst->asDpasInst();
189             return uint16_t(LatenciesXe::DPAS + 1 + dpas->getRepeatCount() - 1); //22 ~29
190         }
191 
192         if (m_builder->getPlatform() ==  GENX_DG2)
193         {
194             G4_InstDpas *dpas = Inst->asDpasInst();
195             switch(dpas->getRepeatCount())
196             {
197             case 1:
198                 return 21;
199             case 2:
200                 return 22;
201             case 8:
202                 return 32;
203             default:
204                 return 32;
205             }
206         }
207         G4_InstDpas* dpas = Inst->asDpasInst();
208         return uint16_t(LatenciesXe::DPAS + dpas->getRepeatCount() - 1);
209     }
210     if (Inst->writesFlag() || (Dst && Dst->isA0()))
211     {
212         return LatenciesXe::ARF;
213     }
214     if (Inst->isArithmetic()) {
215         if (Dst->isAccReg())
216             return uint16_t(LatenciesXe::FPU_ACC + LatenciesXe::DELTA * Scale);
217         return uint16_t(LatenciesXe::FPU + LatenciesXe::DELTA * Scale);
218     }
219 
220     // By default, use the FPU pipeline latency.
221     return uint16_t(LatenciesXe::FPU);
222 }
223 
getOccupancyG12(G4_INST * Inst) const224 uint16_t LatencyTable::getOccupancyG12(G4_INST* Inst) const
225 {
226     enum OccupancyXe {
227         G12_OC_MATH = 4,
228         G12_OC_Others = 1
229     };
230 
231     int Sz = Inst->getExecSize();
232     int Scale = (Sz <= 8) ? 1 : (Sz == 16) ? 2 : 4;
233     if (Inst->isMath())
234         return uint16_t(G12_OC_MATH * Scale);
235     if (Inst->isFastHFInstruction())
236         Scale = (Sz <= 16) ? 1 : 2;
237     else if (G4_DstRegRegion* Dst = Inst->getDst()) {
238         if (Dst->getTypeSize() == 8)
239             Scale = (Sz <= 4) ? 1 : 2;
240     }
241     return uint16_t(G12_OC_Others * Scale);
242 }
243