1 /*========================== begin_copyright_notice ============================ 2 3 Copyright (C) 2019-2021 Intel Corporation 4 5 SPDX-License-Identifier: MIT 6 7 ============================= end_copyright_notice ===========================*/ 8 9 #include "LatencyTable.h" 10 #include "LocalScheduler_G4IR.h" 11 #include "../G4_IR.hpp" 12 13 using namespace vISA; 14 getLatency(G4_INST * Inst) const15uint16_t LatencyTable::getLatency(G4_INST* Inst) const 16 { 17 auto GEN = getPlatformGeneration(m_builder->getPlatform()); 18 if (GEN >= PlatformGen::XE) 19 return getLatencyG12(Inst); 20 21 return getLatencyLegacy(Inst); 22 } 23 getDPAS8x8Latency() const24uint16_t LatencyTable::getDPAS8x8Latency() const 25 { 26 switch(m_builder->getPlatform()) 27 { 28 case XeHP_SDV: 29 case GENX_PVC: 30 return uint16_t(LatenciesXe::DPAS + 7); //28 31 case GENX_PVCXT: 32 return uint16_t(LatenciesXe::DPAS + 1 + 7); //29 33 case GENX_DG2: 34 return 32; 35 default: //Not suppport platform 36 return 46; 37 } 38 } 39 40 // This calculates the node's pipeline occupancy (node delay) getOccupancy(G4_INST * Inst) const41uint16_t LatencyTable::getOccupancy(G4_INST* Inst) const 42 { 43 auto GEN = getPlatformGeneration(m_builder->getPlatform()); 44 if (GEN >= PlatformGen::XE) 45 return getOccupancyG12(Inst); 46 47 return getOccupancyLegacy(Inst); 48 } 49 50 getLatencyLegacy(G4_INST * Inst) const51uint16_t LatencyTable::getLatencyLegacy(G4_INST* Inst) const 52 { 53 if (Inst->isSend()) 54 { 55 G4_SendDesc* MsgDesc = Inst->getMsgDesc(); 56 return LegacyFFLatency[SFIDtoInt(MsgDesc->getSFID())]; 57 } else if (Inst->isMath()) { 58 if (Inst->asMathInst()->getMathCtrl() == MATH_FDIV || 59 Inst->asMathInst()->getMathCtrl() == MATH_POW) 60 return LegacyLatencies::EDGE_LATENCY_MATH_TYPE2; 61 return LegacyLatencies::EDGE_LATENCY_MATH; 62 } 63 return LegacyLatencies::IVB_PIPELINE_LENGTH; 64 } 65 getOccupancyLegacy(G4_INST * Inst) const66uint16_t LatencyTable::getOccupancyLegacy(G4_INST* Inst) const 67 { 68 int divisor = 8; 69 int InstLatency = LegacyLatencies::UNCOMPR_LATENCY; 70 if (Inst->isFastHFInstruction()) { 71 divisor = 16; 72 } 73 74 // Number of n-wide passes in FPU0 or FPU1 (EM). 75 // "n" is: 76 // 16 for BDW+ HalfFloatDoublePerf instructions, 77 // 8 for other instructions. 78 int passes = std::max(1, Inst->getExecSize() / divisor); 79 80 // InstLatency is: 81 // 4 for EM/FPU1 POW and FDIV instrutions ( HSW; for BDW+ it is 2 times higher ), 82 // 2 for other EM/FPU1 instructions ( HSW; for BDW+ it is 2 times higher ), 83 // 2 for other instructions. 84 // Update DagNode latency for math. 85 G4_opcode opCode = Inst->opcode(); 86 switch (opCode) { 87 case G4_math: { 88 // Use EdgeLatencyMathType2 for FDIV, FPOW functions. 89 if (Inst->asMathInst()->getMathCtrl() == MATH_FDIV || 90 Inst->asMathInst()->getMathCtrl() == MATH_POW) { 91 InstLatency = 4; 92 } else { 93 // Used EdgeLatencyMath for other functions. 94 InstLatency = 2; 95 } 96 97 // BDW+ platforms have lower math TPT and longer latency (all math functions). 98 InstLatency *= 2; 99 break; 100 } 101 case G4_bfe: 102 case G4_bfi1: 103 case G4_bfi2: 104 case G4_bfrev: 105 case G4_cbit: 106 case G4_dp2: 107 case G4_dp3: 108 case G4_dp4: 109 case G4_dph: 110 case G4_fbh: 111 case G4_fbl: 112 case G4_lrp: 113 case G4_mac: 114 case G4_mach: 115 case G4_pln: 116 InstLatency *= 2; 117 break; 118 case G4_label: 119 // Labels need special care. They should have a latency of 1. 120 // But their execSize is 255, which sets passes=31. 121 passes = 1; 122 InstLatency = 1; 123 break; 124 default: 125 break; 126 } 127 128 return uint16_t(passes * InstLatency); 129 } 130 getLatencyG12(const G4_INST * Inst) const131uint16_t LatencyTable::getLatencyG12(const G4_INST* Inst) const 132 { 133 int Sz = Inst->getExecSize(); 134 int Scale = (Sz <= 8) ? 0 : (Sz == 16) ? 1 : 3; 135 auto Dst = Inst->getDst(); 136 137 if (Inst->isSend()) { 138 G4_SendDesc* MsgDesc = Inst->getMsgDesc(); 139 if (MsgDesc->isLSC()) 140 { 141 if (MsgDesc->isFence()) 142 { 143 return MsgDesc->isTyped() ? 144 LatenciesXe::LSC_TYPED_FENCE : LatenciesXe::LSC_UNTYPED_FENCE; 145 } 146 else 147 { 148 bool isCachedInL1 = MsgDesc->getCachingL1() == Caching::CA || 149 (MsgDesc->getCachingL1() != Caching::UC && m_builder->getOption(vISA_assumeL1Hit)); 150 if (MsgDesc->isLSC() && MsgDesc->isTyped()) 151 { 152 return isCachedInL1 ? LatenciesXe::LSC_TYPED_L1 : LatenciesXe::LSC_TYPED_L3; 153 } 154 else 155 { 156 return isCachedInL1 ? LatenciesXe::LSC_UNTYPED_L1 : LatenciesXe::LSC_UNTYPED_L3; 157 } 158 } 159 } 160 if (MsgDesc->isSLM()) 161 return Inst->asSendInst()->isFence() ? LatenciesXe::SLM_FENCE : LatenciesXe::SLM; 162 if (MsgDesc->isSampler()) 163 return LatenciesXe::SAMPLER_L3; 164 if (MsgDesc->isHDC()) 165 return LatenciesXe::DP_L3; 166 if (MsgDesc->isBarrier()) 167 return LatenciesXe::BARRIER; 168 return LatenciesXe::SEND_OTHERS; 169 } 170 if (Inst->isMath()) 171 { 172 return uint16_t(LatenciesXe::MATH + LatenciesXe::DELTA_MATH * Scale); 173 } 174 if (Inst->isFlowControl()) 175 { 176 return LatenciesXe::BRANCH; 177 } 178 if (Inst->isDpas()) { 179 180 if (m_builder->getPlatform() == GENX_PVC) 181 { 182 G4_InstDpas *dpas = Inst->asDpasInst(); 183 return uint16_t(LatenciesXe::DPAS + dpas->getRepeatCount() - 1); 184 } 185 186 if (m_builder->getPlatform() == GENX_PVCXT) 187 { 188 G4_InstDpas *dpas = Inst->asDpasInst(); 189 return uint16_t(LatenciesXe::DPAS + 1 + dpas->getRepeatCount() - 1); //22 ~29 190 } 191 192 if (m_builder->getPlatform() == GENX_DG2) 193 { 194 G4_InstDpas *dpas = Inst->asDpasInst(); 195 switch(dpas->getRepeatCount()) 196 { 197 case 1: 198 return 21; 199 case 2: 200 return 22; 201 case 8: 202 return 32; 203 default: 204 return 32; 205 } 206 } 207 G4_InstDpas* dpas = Inst->asDpasInst(); 208 return uint16_t(LatenciesXe::DPAS + dpas->getRepeatCount() - 1); 209 } 210 if (Inst->writesFlag() || (Dst && Dst->isA0())) 211 { 212 return LatenciesXe::ARF; 213 } 214 if (Inst->isArithmetic()) { 215 if (Dst->isAccReg()) 216 return uint16_t(LatenciesXe::FPU_ACC + LatenciesXe::DELTA * Scale); 217 return uint16_t(LatenciesXe::FPU + LatenciesXe::DELTA * Scale); 218 } 219 220 // By default, use the FPU pipeline latency. 221 return uint16_t(LatenciesXe::FPU); 222 } 223 getOccupancyG12(G4_INST * Inst) const224uint16_t LatencyTable::getOccupancyG12(G4_INST* Inst) const 225 { 226 enum OccupancyXe { 227 G12_OC_MATH = 4, 228 G12_OC_Others = 1 229 }; 230 231 int Sz = Inst->getExecSize(); 232 int Scale = (Sz <= 8) ? 1 : (Sz == 16) ? 2 : 4; 233 if (Inst->isMath()) 234 return uint16_t(G12_OC_MATH * Scale); 235 if (Inst->isFastHFInstruction()) 236 Scale = (Sz <= 16) ? 1 : 2; 237 else if (G4_DstRegRegion* Dst = Inst->getDst()) { 238 if (Dst->getTypeSize() == 8) 239 Scale = (Sz <= 4) ? 1 : 2; 240 } 241 return uint16_t(G12_OC_Others * Scale); 242 } 243