1 /*========================== begin_copyright_notice ============================ 2 3 Copyright (C) 2017-2021 Intel Corporation 4 5 SPDX-License-Identifier: MIT 6 7 ============================= end_copyright_notice ===========================*/ 8 9 #include "Compiler/CISACodeGen/CISABuilder.hpp" 10 #include "Compiler/CISACodeGen/ShaderCodeGen.hpp" 11 #include "Compiler/CISACodeGen/PixelShaderCodeGen.hpp" 12 #include "Compiler/CISACodeGen/ComputeShaderCodeGen.hpp" 13 #include "common/allocator.h" 14 #include "common/Types.hpp" 15 #include "common/Stats.hpp" 16 #include "common/MemStats.h" 17 #include "common/debug/Dump.hpp" 18 #include "common/igc_regkeys.hpp" 19 #include "common/secure_mem.h" 20 #include "common/secure_string.h" 21 #include "common/shaderOverride.hpp" 22 #include "common/CompilerStatsUtils.hpp" 23 #include "inc/common/sku_wa.h" 24 #include <llvm/Support/Path.h> 25 #include <llvm/ADT/Statistic.h> 26 #include <iStdLib/utility.h> 27 #include <iostream> 28 #include <iomanip> 29 #include <sstream> 30 #include <string> 31 #include <fstream> 32 #include "Probe/Assertion.h" 33 34 #if !defined(_WIN32) 35 # define _strdup strdup 36 #endif 37 38 /*********************************************************************************** 39 This file defines the CEncoder class which is used to generate CISA instructions 40 ************************************************************************************/ 41 42 // macro to check the result of VISA API calls 43 #define V(x) do { int result = (x); IGC_ASSERT_MESSAGE((0 == result), "call to VISA API failed"); } while(0) 44 45 static const unsigned int g_cScratchSpaceMsglimit = (128 * 1024); 46 using namespace llvm; 47 48 #define DEBUG_TYPE "cisa-builder" 49 50 STATISTIC(SimdSize8, "Number of shader(s) with SIMD8"); 51 STATISTIC(SimdSize16, "Number of shader(s) with SIMD16"); 52 STATISTIC(SimdSize32, "Number of shader(s) with SIMD32"); 53 54 namespace IGC 55 { visaExecSize(SIMDMode width)56 inline VISA_Exec_Size visaExecSize(SIMDMode width) 57 { 58 switch (width) 59 { 60 case SIMDMode::SIMD1: return EXEC_SIZE_1; 61 case SIMDMode::SIMD2: return EXEC_SIZE_2; 62 case SIMDMode::SIMD4: return EXEC_SIZE_4; 63 case SIMDMode::SIMD8: return EXEC_SIZE_8; 64 case SIMDMode::SIMD16: return EXEC_SIZE_16; 65 case SIMDMode::SIMD32: return EXEC_SIZE_32; 66 case SIMDMode::UNKNOWN: 67 default: IGC_ASSERT_MESSAGE(0, "unreachable"); break; 68 } 69 return EXEC_SIZE_ILLEGAL; 70 } 71 convertAtomicOpEnumToVisa(AtomicOp op)72 VISAAtomicOps convertAtomicOpEnumToVisa(AtomicOp op) 73 { 74 switch (op) 75 { 76 case EATOMIC_AND: 77 case EATOMIC_AND64: 78 return ATOMIC_AND; 79 case EATOMIC_DEC: 80 case EATOMIC_DEC64: 81 return ATOMIC_DEC; 82 case EATOMIC_IADD: 83 case EATOMIC_IADD64: 84 return ATOMIC_ADD; 85 case EATOMIC_IMAX: 86 case EATOMIC_IMAX64: 87 return ATOMIC_IMAX; 88 case EATOMIC_IMIN: 89 case EATOMIC_IMIN64: 90 return ATOMIC_IMIN; 91 case EATOMIC_INC: 92 case EATOMIC_INC64: 93 return ATOMIC_INC; 94 case EATOMIC_MAX: 95 case EATOMIC_MAX64: 96 return ATOMIC_MAX; 97 case EATOMIC_MIN: 98 case EATOMIC_MIN64: 99 return ATOMIC_MIN; 100 case EATOMIC_OR: 101 case EATOMIC_OR64: 102 return ATOMIC_OR; 103 case EATOMIC_SUB: 104 case EATOMIC_SUB64: 105 return ATOMIC_SUB; 106 case EATOMIC_UMAX: 107 case EATOMIC_UMAX64: 108 return ATOMIC_MAX; 109 case EATOMIC_UMIN: 110 case EATOMIC_UMIN64: 111 return ATOMIC_MIN; 112 case EATOMIC_XOR: 113 case EATOMIC_XOR64: 114 return ATOMIC_XOR; 115 case EATOMIC_XCHG: 116 case EATOMIC_XCHG64: 117 return ATOMIC_XCHG; 118 case EATOMIC_CMPXCHG: 119 case EATOMIC_CMPXCHG64: 120 return ATOMIC_CMPXCHG; 121 case EATOMIC_PREDEC: 122 case EATOMIC_PREDEC64: 123 return ATOMIC_PREDEC; 124 case EATOMIC_FMAX: 125 return ATOMIC_FMAX; 126 case EATOMIC_FMIN: 127 return ATOMIC_FMIN; 128 case EATOMIC_FCMPWR: 129 return ATOMIC_FCMPWR; 130 case EATOMIC_FADD: 131 case EATOMIC_FADD64: 132 return ATOMIC_FADD; 133 case EATOMIC_FSUB: 134 return ATOMIC_FSUB; 135 default: 136 IGC_ASSERT_MESSAGE(0, "Atomic Op not implemented"); 137 return ATOMIC_AND; 138 } 139 } 140 visaElementSize(unsigned int m_elt_size)141 inline GATHER_SCATTER_ELEMENT_SIZE visaElementSize(unsigned int m_elt_size) 142 { 143 GATHER_SCATTER_ELEMENT_SIZE elementSize = GATHER_SCATTER_BYTE_UNDEF; 144 if (m_elt_size == 1) 145 { 146 elementSize = GATHER_SCATTER_BYTE; 147 } 148 else if (m_elt_size == 2) 149 { 150 elementSize = GATHER_SCATTER_WORD; 151 } 152 else if (m_elt_size == 4) 153 { 154 elementSize = GATHER_SCATTER_DWORD; 155 } 156 else 157 { 158 IGC_ASSERT_MESSAGE(0, "unreachable"); 159 } 160 return elementSize; 161 } 162 163 static inline VISA_SVM_Block_Type visaBlockType(unsigned elemSize)164 visaBlockType(unsigned elemSize) { 165 switch (elemSize) { 166 case 8: return SVM_BLOCK_TYPE_BYTE; 167 case 32: return SVM_BLOCK_TYPE_DWORD; 168 case 64: return SVM_BLOCK_TYPE_QWORD; 169 } 170 171 IGC_ASSERT_MESSAGE(0, "Unknown block/element size. Expect 8-/32-/64-bit only!"); 172 return static_cast<VISA_SVM_Block_Type>(~0U); 173 } 174 175 static inline VISA_SVM_Block_Num visaBlockNum(unsigned numElems)176 visaBlockNum(unsigned numElems) { 177 switch (numElems) { 178 case 1: return SVM_BLOCK_NUM_1; 179 case 2: return SVM_BLOCK_NUM_2; 180 case 4: return SVM_BLOCK_NUM_4; 181 case 8: return SVM_BLOCK_NUM_8; 182 } 183 184 IGC_ASSERT_MESSAGE(0, "Unknown number of blocks/elements. Expect 1, 2, 4, or 8 only!"); 185 return static_cast<VISA_SVM_Block_Num>(~0U); 186 } 187 visaNumLanes(VISA_Exec_Size execSize)188 constexpr unsigned visaNumLanes(VISA_Exec_Size execSize) 189 { 190 unsigned lanes = 0; 191 switch (execSize) 192 { 193 case EXEC_SIZE_1: lanes = 1; break; 194 case EXEC_SIZE_2: lanes = 2; break; 195 case EXEC_SIZE_4: lanes = 4; break; 196 case EXEC_SIZE_8: lanes = 8; break; 197 case EXEC_SIZE_16: lanes = 16; break; 198 case EXEC_SIZE_32: lanes = 32; break; 199 default: IGC_ASSERT(0); break; 200 } 201 return lanes; 202 } 203 204 // Take certain attributes of either src or dst instruction operand and return the size 205 // of the associated grf region, accessed during instruction's execution, in bytes. 206 // If aligned==true, the size includes length of data block starting at the beginning of grf 207 // and ending at the subReg; this is useful to check if the region crosses 2 grf boundary. 208 // If special region attribute is not set, the regioning is <1; 1, 0> for src and <1> for dst. 209 // Note that the assertions may hit in certain cases, which should be handled separately, 210 // like uniform vars with operand with special region set. GrfRegionSize(VISA_Exec_Size execSize,unsigned elementSize,const SModifier & mod,bool isSource,bool aligned=true)211 constexpr unsigned GrfRegionSize(VISA_Exec_Size execSize, unsigned elementSize, 212 const SModifier& mod, bool isSource, bool aligned = true) 213 { 214 constexpr unsigned grfSize = 32; // in bytes 215 // If subReg is big enough to cross grf boundary, adjust it. 216 const unsigned base = (mod.subReg * elementSize) % grfSize; 217 unsigned lastInRegion = aligned ? base : 0; 218 if (isSource) 219 { 220 // Formula based on algorithm provided in the spec (see Region Parameters) 221 const unsigned vstride = mod.specialRegion ? mod.region[0] : 1; 222 const unsigned width = mod.specialRegion ? mod.region[1] : 1; 223 const unsigned hstride = mod.specialRegion ? mod.region[2] : 0; 224 if (0 == width) 225 { 226 return unsigned(-1); 227 } 228 const unsigned height = visaNumLanes(execSize) / width; 229 if (0 == height) 230 { 231 return unsigned(-1); 232 } 233 lastInRegion += (height - 1) * vstride * elementSize + 234 (width - 1) * hstride * elementSize; 235 } 236 else 237 { 238 const unsigned hstride = mod.specialRegion ? mod.region[2] : 1; 239 lastInRegion += (visaNumLanes(execSize) - 1) * hstride * elementSize; 240 } 241 return lastInRegion + elementSize; 242 }; 243 // Compile-time ULTs for GrfRegionSize() 244 static_assert(GrfRegionSize(EXEC_SIZE_16, 4, SModifier{}, false) == 64 && 245 GrfRegionSize(EXEC_SIZE_16, 4, SModifier{ 16, {}, {0,0,2}, {}, {}, true }, false) == 124 && 246 GrfRegionSize(EXEC_SIZE_16, 4, SModifier{ 15, {}, {0,0,2}, {}, {}, true }, false) == 124 + 7 * 4 && 247 GrfRegionSize(EXEC_SIZE_8, 8, SModifier{ 1, {}, {0,0,2}, {}, {}, true }, false) == 128, 248 "GrfRegionSize compile-time test failed - dst."); 249 static_assert(GrfRegionSize(EXEC_SIZE_16, 4, SModifier{}, true) == 64 && 250 GrfRegionSize(EXEC_SIZE_16, 4, SModifier{ {}, {}, {4,4,0}, {}, {}, true }, true) == 52 && 251 GrfRegionSize(EXEC_SIZE_8, 8, SModifier{ 8, {}, {2,1,0}, {}, {}, true }, true) == 120 && 252 GrfRegionSize(EXEC_SIZE_8, 8, SModifier{ 10, {}, {2,1,0}, {}, {}, true }, true) == 120 + 2 * 8, 253 "GrfRegionSize compile-time test failed - src."); 254 255 // split a SIMD16 variable into two SIMD8 while satisfying vISA's raw operand alignment 256 // return a tuple representing the vISA raw operand (var + offset) after split splitRawOperand(CVariable * var,bool isFirstHalf,VISA_EMask_Ctrl execMask)257 std::tuple<CVariable*, uint32_t> CEncoder::splitRawOperand(CVariable* var, bool isFirstHalf, 258 VISA_EMask_Ctrl execMask) 259 { 260 261 if (!var || var->IsUniform() || isFirstHalf) 262 { 263 // simply return the original variable 264 return std::make_tuple(var, 0); 265 } 266 267 uint32_t offset = 8 * var->GetElemSize(); 268 if ((offset % getGRFSize()) == 0) 269 { 270 return std::make_tuple(var, offset); 271 } 272 else 273 { 274 // create a copy to make the CVariable aligned 275 auto tmpVar = m_program->GetNewVariable( 276 8, var->GetType(), CVariable::getAlignment(getGRFSize()), 277 CName::NONE); 278 SModifier mod; 279 mod.init(); 280 auto dstOpnd = GetDestinationOperand(tmpVar, mod); 281 mod.subReg = 8; 282 auto srcOpnd = GetSourceOperand(var, mod); 283 284 V(vKernel->AppendVISADataMovementInst( 285 ISA_MOV, nullptr, false, 286 SplitEMask(EXEC_SIZE_16, EXEC_SIZE_8, 1, execMask), 287 EXEC_SIZE_8, dstOpnd, srcOpnd)); 288 289 return std::make_tuple(tmpVar, 0); 290 } 291 } 292 GetRawOpndSplitOffset(VISA_Exec_Size fromExecSize,VISA_Exec_Size toExecSize,unsigned thePart,CVariable * var) const293 unsigned CEncoder::GetRawOpndSplitOffset( 294 VISA_Exec_Size fromExecSize, 295 VISA_Exec_Size toExecSize, 296 unsigned thePart, CVariable* var) const 297 { 298 if (!var || var->IsUniform()) 299 return 0; 300 301 IGC_ASSERT_MESSAGE(fromExecSize == EXEC_SIZE_16, "Only support splitting from exec-size 16 to exec-size 8"); 302 IGC_ASSERT_MESSAGE(toExecSize == EXEC_SIZE_8, "Only support splitting from exec-size 16 to exec-size 8"); 303 IGC_ASSERT_MESSAGE((thePart == 0) || (thePart == 1), "Splitting from exec-size-16 to exec-size-8 only breaks into 2 parts"); 304 305 unsigned elemSize = var->GetElemSize(); 306 307 switch (elemSize) 308 { 309 case 4: 310 return thePart * getGRFSize() * 1; 311 case 8: 312 return thePart * getGRFSize() * 2; 313 } 314 315 IGC_ASSERT_MESSAGE(0, "Unknown data type to split!"); 316 return ~0U; 317 } 318 size() const319 size_t URBChannelMask::size() const 320 { 321 return m_bitmask == 0 ? 0 : iSTD::bsr(m_bitmask) + 1; 322 } 323 asVISAMask() const324 unsigned int URBChannelMask::asVISAMask() const 325 { 326 // if all bits in the mask are set we need to return 0xFF which means 'no channel mask' 327 // if all bits are set -> adding one creates a power of two, so x and x+1 has no common bits. 328 if (((m_bitmask + 1) & m_bitmask) == 0) 329 { 330 return (uint32_t)-1; 331 } 332 else 333 { 334 return (uint16_t)m_bitmask; 335 } 336 } 337 Init()338 void CEncoder::Init() 339 { 340 m_encoderState.m_srcOperand[0].init(); 341 m_encoderState.m_srcOperand[1].init(); 342 m_encoderState.m_srcOperand[2].init(); 343 m_encoderState.m_srcOperand[3].init(); 344 m_encoderState.m_dstOperand.init(); 345 m_encoderState.m_flag.init(); 346 m_encoderState.m_mask = EMASK_Q1; 347 m_encoderState.m_noMask = false; 348 m_encoderState.m_simdSize = m_program->m_SIMDSize; 349 m_encoderState.m_uniformSIMDSize = SIMDMode::SIMD1; 350 351 if (m_nestLevelForcedNoMaskRegion > 0) { 352 m_encoderState.m_noMask = true; 353 } 354 } 355 CEncoder()356 CEncoder::CEncoder() 357 { 358 m_program = nullptr; 359 vbuilder = nullptr; 360 vAsmTextBuilder = nullptr; 361 } 362 ~CEncoder()363 CEncoder::~CEncoder() 364 { 365 } 366 getGRFSize() const367 uint32_t CEncoder::getGRFSize() const { return m_program->getGRFSize(); } 368 369 GetShaderName()370 std::string CEncoder::GetShaderName() { 371 return IGC::Debug::GetDumpNameObj(m_program, "").str(); 372 } 373 SetProgram(CShader * program)374 void CEncoder::SetProgram(CShader* program) 375 { 376 m_program = program; 377 Init(); 378 } 379 SubroutineCall(CVariable * flag,llvm::Function * F)380 void CEncoder::SubroutineCall(CVariable* flag, llvm::Function* F) 381 { 382 VISA_LabelOpnd* visaLabel = GetFuncLabel(F); 383 m_encoderState.m_flag.var = flag; 384 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 385 // control flow instructions cannot be broken down into lower SIMD 386 VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 387 VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize); 388 if (F->hasFnAttribute("KMPLOCK")) 389 { 390 emask = vISA_EMASK_M1_NM; 391 execSize = EXEC_SIZE_1; 392 } 393 V(vKernel->AppendVISACFCallInst(predOpnd, emask, execSize, visaLabel)); 394 } 395 StackCall(CVariable * flag,llvm::Function * F,unsigned char argSize,unsigned char retSize)396 void CEncoder::StackCall(CVariable* flag, llvm::Function* F, unsigned char argSize, unsigned char retSize) 397 { 398 399 m_encoderState.m_flag.var = flag; 400 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 401 // control flow instructions cannot be broken down into lower SIMD 402 VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 403 VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize); 404 V(vKernel->AppendVISACFFunctionCallInst(predOpnd, emask, execSize, F->getName().data(), argSize, retSize)); 405 } 406 IndirectStackCall(CVariable * flag,CVariable * funcPtr,unsigned char argSize,unsigned char retSize)407 void CEncoder::IndirectStackCall(CVariable* flag, CVariable* funcPtr, unsigned char argSize, unsigned char retSize) 408 { 409 m_encoderState.m_flag.var = flag; 410 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 411 // control flow instructions cannot be broken down into lower SIMD 412 VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 413 VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize); 414 VISA_VectorOpnd* funcAddrOpnd = GetSourceOperandNoModifier(funcPtr); 415 V(vKernel->AppendVISACFIndirectFuncCallInst(predOpnd, emask, execSize, funcAddrOpnd, argSize, retSize)); 416 } 417 SubroutineRet(CVariable * flag,llvm::Function * F)418 void CEncoder::SubroutineRet(CVariable* flag, llvm::Function* F) 419 { 420 m_encoderState.m_flag.var = flag; 421 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 422 // control flow instructions cannot be broken down into lower SIMD 423 VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 424 VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize); 425 if (F->hasFnAttribute("KMPLOCK")) 426 { 427 emask = vISA_EMASK_M1_NM; 428 execSize = EXEC_SIZE_1; 429 } 430 V(vKernel->AppendVISACFRetInst(predOpnd, emask, execSize)); 431 } 432 StackRet(CVariable * flag)433 void CEncoder::StackRet(CVariable* flag) 434 { 435 m_encoderState.m_flag.var = flag; 436 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 437 // control flow instructions cannot be broken down into lower SIMD 438 VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 439 VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize); 440 V(vKernel->AppendVISACFFunctionRetInst(predOpnd, emask, execSize)); 441 } 442 Jump(CVariable * flag,uint label)443 void CEncoder::Jump(CVariable* flag, uint label) 444 { 445 VISA_LabelOpnd* visaLabel = GetLabel(label); 446 m_encoderState.m_flag.var = flag; 447 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 448 // control flow instructions cannot be broken down into lower SIMD 449 VISA_EMask_Ctrl emask = m_encoderState.m_noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 450 VISA_Exec_Size execSize = visaExecSize(m_program->m_dispatchSize); 451 452 // visa and igc agreement. 453 // goto (1) is used to tell visa the goto is uniform. 454 // Goto(1) is generated if 455 // 1. jump is unconditional, or 456 // 2. jump is uniform (thread uniform or above) and no EU fusion, or 457 // 3. jump is either workgroup or global uniform under EU fusion 458 // (it is temporarily under key control for ease of debugging) 459 if (flag == nullptr || 460 (!m_program->m_Platform->hasFusedEU() && flag->IsUniform()) || 461 (IGC_IS_FLAG_ENABLED(EnableWorkGroupUniformGoto) && 462 m_program->m_Platform->hasFusedEU() && flag->IsWorkGroupOrGlobalUniform())) 463 { 464 execSize = EXEC_SIZE_1; 465 } 466 V(vKernel->AppendVISACFGotoInst(predOpnd, emask, execSize, visaLabel)); 467 } 468 Label(uint label)469 void CEncoder::Label(uint label) 470 { 471 VISA_LabelOpnd* visaLabel = GetLabel(label); 472 V(vKernel->AppendVISACFLabelInst(visaLabel)); 473 } 474 GetNewLabelID(const CName & name)475 uint CEncoder::GetNewLabelID(const CName &name) 476 { 477 uint id = labelMap.size(); 478 labelMap.push_back(nullptr); 479 labelNameMap.push_back( 480 CreateVisaLabelName(llvm::StringRef(name.getCString()))); 481 return id; 482 } 483 DwordAtomicRaw(AtomicOp atomic_op,const ResourceDescriptor & resource,CVariable * dst,CVariable * elem_offset,CVariable * src0,CVariable * src1,bool is16Bit)484 void CEncoder::DwordAtomicRaw( 485 AtomicOp atomic_op, 486 const ResourceDescriptor& resource, 487 CVariable* dst, 488 CVariable* elem_offset, 489 CVariable* src0, 490 CVariable* src1, 491 bool is16Bit) 492 { 493 494 // Fix types for dword atomics 495 VISA_Type type = ISA_TYPE_UD; 496 if (atomic_op == EATOMIC_IMAX || atomic_op == EATOMIC_IMIN) 497 { 498 type = ISA_TYPE_D; 499 } 500 else if (atomic_op == EATOMIC_FMAX || 501 atomic_op == EATOMIC_FMIN || 502 atomic_op == EATOMIC_FADD || 503 atomic_op == EATOMIC_FSUB || 504 atomic_op == EATOMIC_FCMPWR) 505 { 506 type = ISA_TYPE_F; 507 } 508 if (src0 && src0->GetType() != type) 509 src0 = m_program->BitCast(src0, type); 510 if (src1 && src1->GetType() != type) 511 src1 = m_program->BitCast(src1, type); 512 if (dst && dst->GetType() != type) 513 dst = m_program->BitCast(dst, type); 514 if (elem_offset->GetType() != ISA_TYPE_UD) 515 elem_offset = m_program->BitCast(elem_offset, ISA_TYPE_UD); 516 517 IGC_ASSERT_MESSAGE(nullptr == m_encoderState.m_flag.var, "not supported predicate"); 518 519 VISA_StateOpndHandle* pSurfStateOpndHandle = GetVISASurfaceOpnd(resource); 520 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 521 VISA_RawOpnd* pDst = GetRawDestination(dst); 522 VISA_RawOpnd* pElemOffset = GetRawSource(elem_offset); 523 VISA_RawOpnd* pSrc0 = GetRawSource(src0); 524 VISA_RawOpnd* pSrc1 = GetRawSource(src1); 525 526 /* 527 So the problem is this - the message was added for SNB, and at the time it was implemented as 528 CMPXCHG : new = (old==src1) ? src0 : old 529 530 In IVB this becomes untyped atomic, and it's implemented as 531 AOP_CMPWR (src0 == old_dst) ? src1 : old_dst old_dst 532 533 Note that the source is swapped. Since we define CMPXCHG as the former in vISA, internally we 534 perform a swap for it. So I guess for now you'll need to swap the two source to follow the vISA 535 semantics. We may want to add a new vISA message to fix this issue. 536 */ 537 if (atomic_op == EATOMIC_CMPXCHG) { 538 std::swap(pSrc0, pSrc1); 539 } 540 541 V(vKernel->AppendVISASurfAccessDwordAtomicInst( 542 predOpnd, 543 convertAtomicOpEnumToVisa(atomic_op), 544 is16Bit, 545 ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask), 546 visaExecSize(m_encoderState.m_simdSize), 547 pSurfStateOpndHandle, 548 pElemOffset, 549 pSrc0, 550 pSrc1, 551 pDst)); 552 if (ESURFACE_STATELESS == resource.m_surfaceType) 553 { 554 this->m_program->IncStatelessWritesCount(); 555 } 556 } 557 Cmp(e_predicate p,CVariable * dst,CVariable * src0,CVariable * src1)558 void CEncoder::Cmp(e_predicate p, CVariable* dst, CVariable* src0, CVariable* src1) 559 { 560 VISA_Cond_Mod subOp = ConvertCondModToVisaType(p); 561 562 bool flagDst = 0; 563 if (dst->GetType() == ISA_TYPE_BOOL) 564 { 565 flagDst = true; 566 } 567 568 VISA_VectorOpnd* opnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]); 569 VISA_VectorOpnd* opnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 570 571 if (flagDst) 572 { 573 V(vKernel->AppendVISAComparisonInst( 574 subOp, 575 GetAluEMask(dst), 576 GetAluExecSize(dst), 577 dst->visaPredVariable, 578 opnd0, 579 opnd1)); 580 } 581 else 582 { 583 V(vKernel->AppendVISAComparisonInst( 584 subOp, 585 GetAluEMask(dst), 586 GetAluExecSize(dst), 587 GetDestinationOperand(dst, m_encoderState.m_dstOperand), 588 opnd0, 589 opnd1)); 590 } 591 } 592 Select(CVariable * flag,CVariable * dst,CVariable * src0,CVariable * src1)593 void CEncoder::Select(CVariable* flag, CVariable* dst, CVariable* src0, CVariable* src1) 594 { 595 m_encoderState.m_flag.var = flag; 596 597 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand); 598 VISA_VectorOpnd* src0Opnd = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]); 599 VISA_VectorOpnd* src1Opnd = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 600 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 601 602 V(vKernel->AppendVISADataMovementInst( 603 ISA_SEL, 604 predOpnd, 605 IsSat(), 606 GetAluEMask(dst), 607 GetAluExecSize(dst), 608 dstOpnd, 609 src0Opnd, 610 src1Opnd)); 611 } 612 PredAdd(CVariable * flag,CVariable * dst,CVariable * src0,CVariable * src1)613 void CEncoder::PredAdd(CVariable* flag, CVariable* dst, CVariable* src0, CVariable* src1) 614 { 615 m_encoderState.m_flag.var = flag; 616 617 Arithmetic(ISA_ADD, dst, src0, src1); 618 } 619 SetDstSubVar(uint subVar)620 void CEncoder::SetDstSubVar(uint subVar) 621 { 622 m_encoderState.m_dstOperand.subVar = int_cast<uint8_t>(subVar); 623 } 624 SetDstSubReg(uint subReg)625 void CEncoder::SetDstSubReg(uint subReg) 626 { 627 m_encoderState.m_dstOperand.subReg = int_cast<uint16_t>(subReg); 628 } 629 SetSrcSubVar(uint srcNum,uint subVar)630 void CEncoder::SetSrcSubVar(uint srcNum, uint subVar) 631 { 632 IGC_ASSERT(srcNum < 4); 633 m_encoderState.m_srcOperand[srcNum].subVar = int_cast<uint8_t>(subVar); 634 } 635 SetSrcSubReg(uint srcNum,uint subReg)636 void CEncoder::SetSrcSubReg(uint srcNum, uint subReg) 637 { 638 IGC_ASSERT(srcNum < 4); 639 m_encoderState.m_srcOperand[srcNum].subReg = int_cast<uint16_t>(subReg); 640 } 641 SetDstModifier(e_modifier mod)642 void CEncoder::SetDstModifier(e_modifier mod) 643 { 644 IGC_ASSERT((mod == EMOD_SAT) || (mod == EMOD_NONE)); 645 m_encoderState.m_dstOperand.mod = mod; 646 } 647 SetSrcModifier(uint srcNum,e_modifier mod)648 void CEncoder::SetSrcModifier(uint srcNum, e_modifier mod) 649 { 650 IGC_ASSERT(mod != EMOD_SAT); 651 IGC_ASSERT(srcNum < 3); 652 m_encoderState.m_srcOperand[srcNum].mod = mod; 653 } 654 SetPredicate(CVariable * flag)655 void CEncoder::SetPredicate(CVariable* flag) 656 { 657 IGC_ASSERT((nullptr == flag) || (flag->GetVarType() == EVARTYPE_PREDICATE)); 658 m_encoderState.m_flag.var = flag; 659 } 660 SetInversePredicate(bool inv)661 void CEncoder::SetInversePredicate(bool inv) 662 { 663 m_encoderState.m_flag.invertFlag = inv; 664 } 665 SetPredicateMode(e_predMode mode)666 void CEncoder::SetPredicateMode(e_predMode mode) 667 { 668 m_encoderState.m_flag.mode = mode; 669 } 670 SetDstModifier(const DstModifier & modifier)671 void CEncoder::SetDstModifier(const DstModifier& modifier) 672 { 673 if (modifier.sat) 674 { 675 SetDstModifier(EMOD_SAT); 676 } 677 if (modifier.flag) 678 { 679 SetPredicate(m_program->GetSymbol(modifier.flag->value)); 680 SetInversePredicate(modifier.invertFlag); 681 } 682 } 683 SetSrcRegion(uint srcNum,uint vStride,uint width,uint hStride,e_instance instance)684 void CEncoder::SetSrcRegion(uint srcNum, uint vStride, uint width, uint hStride, e_instance instance) 685 { 686 m_encoderState.m_srcOperand[srcNum].region[0] = int_cast<uint8_t>(vStride); 687 m_encoderState.m_srcOperand[srcNum].region[1] = int_cast<uint8_t>(width); 688 m_encoderState.m_srcOperand[srcNum].region[2] = int_cast<uint8_t>(hStride); 689 m_encoderState.m_srcOperand[srcNum].instance = instance; 690 m_encoderState.m_srcOperand[srcNum].specialRegion = true; 691 } 692 SetDstRegion(uint hStride)693 void CEncoder::SetDstRegion(uint hStride) 694 { 695 m_encoderState.m_dstOperand.region[2] = int_cast<uint8_t>(hStride); 696 m_encoderState.m_dstOperand.specialRegion = (hStride != 1); 697 } 698 GetSignBit(VISA_Type type)699 uint64_t GetSignBit(VISA_Type type) 700 { 701 switch (type) 702 { 703 case ISA_TYPE_Q: 704 case ISA_TYPE_DF: 705 return 63; 706 case ISA_TYPE_D: 707 case ISA_TYPE_F: 708 return 31; 709 case ISA_TYPE_W: 710 case ISA_TYPE_HF: 711 case ISA_TYPE_BF: 712 return 15; 713 case ISA_TYPE_B: 714 return 7; 715 default: 716 IGC_ASSERT_MESSAGE(0, "type doesn't support modifier"); 717 break; 718 } 719 return 63; 720 } 721 IsFloat(VISA_Type type)722 bool IsFloat(VISA_Type type) 723 { 724 return type == ISA_TYPE_DF || type == ISA_TYPE_F || type == ISA_TYPE_HF || type == ISA_TYPE_BF; 725 } 726 CalculateImmediateValue(CVariable * var,e_modifier mod)727 uint64_t CalculateImmediateValue(CVariable* var, e_modifier mod) 728 { 729 IGC_ASSERT(nullptr != var); 730 uint64_t immediate = var->GetImmediateValue(); 731 IGC_ASSERT((mod == EMOD_ABS) || (mod == EMOD_NEG) || (mod == EMOD_NEGABS) || (mod == EMOD_NONE)); 732 // handle modifiers for immediates. 733 // Change the sign bit for floats and do logic operations for integers 734 if (IsFloat(var->GetType())) 735 { 736 if (mod == EMOD_ABS) 737 { 738 immediate &= ~((uint64_t)(1) << GetSignBit(var->GetType())); 739 } 740 else if (mod == EMOD_NEG) 741 { 742 immediate ^= (uint64_t)(1) << GetSignBit(var->GetType()); 743 } 744 else if (mod == EMOD_NEGABS) 745 { 746 immediate |= ((uint64_t)(1) << GetSignBit(var->GetType())); 747 } 748 } 749 else 750 { 751 if (mod == EMOD_ABS || mod == EMOD_NEGABS) 752 { 753 uint64_t mask = (immediate >> GetSignBit(var->GetType()))& (uint64_t)0x01; 754 immediate = (immediate + mask) ^ mask; 755 } 756 if (mod == EMOD_NEG || mod == EMOD_NEGABS) 757 { 758 immediate = ~immediate + 1; 759 } 760 } 761 return immediate; 762 } 763 GetSourceOperandNoModifier(CVariable * var)764 VISA_VectorOpnd* CEncoder::GetSourceOperandNoModifier(CVariable* var) 765 { 766 SModifier nullMod; 767 nullMod.init(); 768 return GetSourceOperand(var, nullMod); 769 } 770 GetSourceOperand(CVariable * var,const SModifier & mod)771 VISA_VectorOpnd* CEncoder::GetSourceOperand(CVariable* var, const SModifier& mod) 772 { 773 if (var == nullptr) 774 { 775 return nullptr; 776 } 777 VISA_VectorOpnd* operand = nullptr; 778 if (var->IsImmediate()) 779 { 780 uint64_t immediate = CalculateImmediateValue(var, mod.mod); 781 V(vKernel->CreateVISAImmediate(operand, &immediate, var->GetType())); 782 } 783 else 784 { 785 if (var->GetVarType() == EVARTYPE_GENERAL) 786 { 787 unsigned short vStride = 1; 788 unsigned short width = 1; 789 unsigned short hStride = 0; 790 791 if (mod.specialRegion) 792 { 793 vStride = int_cast<unsigned short>(mod.region[0]); 794 width = int_cast<unsigned short>(mod.region[1]); 795 hStride = int_cast<unsigned short>(mod.region[2]); 796 } 797 else if (var->IsUniform()) 798 { 799 //Scalar regioning 800 vStride = 0; 801 width = 1; 802 hStride = 0; 803 } 804 unsigned char rowOffset = 0; 805 unsigned char colOffset = 0; 806 GetRowAndColOffset(var, mod.subVar, mod.subReg, rowOffset, colOffset); 807 V(vKernel->CreateVISASrcOperand( 808 operand, 809 GetVISAVariable(var, mod.instance), 810 ConvertModifierToVisaType(mod.mod), 811 vStride, 812 width, 813 hStride, 814 rowOffset, 815 colOffset)); 816 } 817 else if (var->GetVarType() == EVARTYPE_ADDRESS) 818 { 819 if (var->IsUniform()) 820 { 821 // uniform addressing uses 1x1 indirect addressing mode 822 unsigned short vStride = 8; 823 unsigned short width = 8; 824 unsigned short hStride = 1; 825 826 //if vector is also uniform 827 if (var->IsVectorUniform()) 828 { 829 vStride = 0; 830 width = 1; 831 hStride = 0; 832 } 833 unsigned short immOffset = (unsigned short) 834 mod.subReg * GetCISADataTypeSize(var->GetType()); 835 V(vKernel->CreateVISAIndirectSrcOperand( 836 operand, 837 var->visaAddrVariable, 838 MODIFIER_NONE, 839 0, 840 immOffset, 841 vStride, 842 width, 843 hStride, 844 var->GetType())); 845 } 846 else 847 { 848 // non-uniform addressing uses VxH indirect addressing mode 849 // NB: this requires that all subregisters of a0 are properly 850 // set up, including per-lane subreg offsets. 851 V(vKernel->CreateVISAIndirectOperandVxH( 852 operand, 853 var->visaAddrVariable, 854 mod.subReg, 855 0, 856 var->GetType())); 857 } 858 } 859 } 860 return operand; 861 } 862 GetDestinationOperand(CVariable * var,const SModifier & mod)863 VISA_VectorOpnd* CEncoder::GetDestinationOperand(CVariable* var, const SModifier& mod) 864 { 865 VISA_VectorOpnd* operand = NULL; 866 //Create Dst operand 867 if (var->GetVarType() == EVARTYPE_GENERAL) 868 { 869 unsigned short hStride = 1; 870 unsigned char rowOffset = 0; 871 unsigned char colOffset = 0; 872 GetRowAndColOffset(var, mod.subVar, mod.subReg, rowOffset, colOffset); 873 if (mod.specialRegion) 874 { 875 hStride = (unsigned short)mod.region[2]; 876 } 877 878 V(vKernel->CreateVISADstOperand( 879 operand, 880 GetVISAVariable(var), 881 hStride, 882 rowOffset, 883 colOffset)); 884 } 885 else if (var->GetVarType() == EVARTYPE_ADDRESS) 886 { 887 const unsigned short hStride = 1; 888 unsigned char addrOffset = int_cast<unsigned char>(mod.subReg); 889 unsigned short immOffset = 0; 890 if (var->IsUniform()) 891 { 892 // We are using 1x1 destination region, we must use a0.0. 893 // Use subReg to compute immOffset. 894 immOffset = (unsigned short) 895 mod.subReg * GetCISADataTypeSize(var->GetType()); 896 addrOffset = 0; 897 } 898 V(vKernel->CreateVISAIndirectDstOperand( 899 operand, 900 var->visaAddrVariable, 901 addrOffset, 902 immOffset, 903 hStride, 904 var->GetType())); 905 } 906 return operand; 907 } 908 GetFlagOperand(const SFlag & flag)909 VISA_PredOpnd* CEncoder::GetFlagOperand(const SFlag& flag) 910 { 911 if (flag.var == nullptr) 912 { 913 return nullptr; 914 } 915 VISA_PredOpnd* operand = nullptr; 916 VISA_PREDICATE_STATE predState = (flag.invertFlag) 917 ? PredState_INVERSE : PredState_NO_INVERSE; 918 VISA_PREDICATE_CONTROL predCtrl = PRED_CTRL_NON; 919 920 switch (flag.mode) 921 { 922 case EPRED_ALL: predCtrl = PRED_CTRL_ALL; break; 923 case EPRED_ANY: predCtrl = PRED_CTRL_ANY; break; 924 default: break; 925 } 926 927 V(vKernel->CreateVISAPredicateOperand( 928 operand, 929 flag.var->visaPredVariable, 930 predState, 931 predCtrl)); 932 return operand; 933 } 934 GetAluExecSize(CVariable * dst) const935 VISA_Exec_Size CEncoder::GetAluExecSize(CVariable* dst) const 936 { 937 SIMDMode simdSize = m_encoderState.m_simdSize; 938 939 if (dst && dst->GetVarType() == EVARTYPE_ADDRESS) 940 { 941 if (dst->IsVectorUniform() && dst->IsUniform()) 942 { 943 simdSize = m_encoderState.m_uniformSIMDSize; 944 } 945 } 946 else if (dst && dst->IsUniform()) 947 { 948 if (dst->GetVarType() == EVARTYPE_PREDICATE) 949 { 950 if (dst->GetNumberElement() == 1) 951 { 952 simdSize = m_encoderState.m_uniformSIMDSize; 953 } 954 } 955 else 956 { 957 simdSize = m_encoderState.m_uniformSIMDSize; 958 } 959 } 960 961 return visaExecSize(simdSize); 962 } 963 GetAluEMask(CVariable * dst)964 VISA_EMask_Ctrl CEncoder::GetAluEMask(CVariable* dst) 965 { 966 e_mask mask = m_encoderState.m_mask; 967 bool noMask = m_encoderState.m_noMask; 968 if (dst) 969 { 970 if (m_encoderState.m_SubSpanDestination) 971 { 972 noMask = true; 973 } 974 else 975 { 976 if (dst->GetVarType() == EVARTYPE_ADDRESS) 977 { 978 if (dst->IsVectorUniform() && dst->IsUniform()) 979 { 980 noMask = true; 981 } 982 } 983 else if (dst->IsUniform()) 984 { 985 noMask = true; 986 } 987 } 988 } 989 990 return ConvertMaskToVisaType(mask, noMask); 991 } 992 IsSat()993 bool CEncoder::IsSat() 994 { 995 return (m_encoderState.m_dstOperand.mod == EMOD_SAT) ? true : false; 996 } 997 MinMax(CISA_MIN_MAX_SUB_OPCODE subopcode,CVariable * dst,CVariable * src0,CVariable * src1)998 void CEncoder::MinMax(CISA_MIN_MAX_SUB_OPCODE subopcode, CVariable* dst, CVariable* src0, CVariable* src1) 999 { 1000 IGC_ASSERT_MESSAGE(nullptr == m_encoderState.m_flag.var, "min/max doesn't support predication"); 1001 1002 VISA_VectorOpnd* opnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]); 1003 VISA_VectorOpnd* opnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 1004 VISA_VectorOpnd* dstopnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand); 1005 1006 V(vKernel->AppendVISAMinMaxInst( 1007 subopcode, 1008 IsSat(), 1009 GetAluEMask(dst), 1010 GetAluExecSize(dst), 1011 dstopnd, 1012 opnd0, 1013 opnd1)); 1014 } 1015 1016 // NeedSplitting - Check whether a variable needs splitting due to the 1017 // violation of the hardware rule of no more than 2 GRFs should be accessed. 1018 // So far, only the following cases are covered 1019 // - SIMD16 1020 // note that SIMD32 is supported differently. 1021 // - data types of 4+ bytes or 32+ bits 1022 // - for source, we only handle limited regions. 1023 // 1024 // numParts - return the total parts to be split, e.g. if the region spans 4 1025 // GRFs, it needs splitting into 2 parts at least. NeedSplitting(CVariable * var,const SModifier & mod,unsigned & numParts,bool isSource) const1026 bool CEncoder::NeedSplitting(CVariable* var, const SModifier& mod, 1027 unsigned& numParts, bool isSource) const 1028 { 1029 // If nothing is specified, don't split. 1030 if (!var) 1031 { 1032 return false; 1033 } 1034 1035 // Only handle SIMD16 now! We assume all data movements in SIMD8 will honor 1036 // the region rules. 1037 VISA_Exec_Size simdSize = GetAluExecSize(var); 1038 const unsigned elemSize = var->GetElemSize(); 1039 1040 switch (simdSize) 1041 { 1042 case EXEC_SIZE_16: 1043 break; 1044 default: 1045 { 1046 // Checks for some rare cases that are not handled by the splitter, but should be detected and reported. 1047 // Example: mov (8|M0) r4.0<1>:q r31.0<2;1,0>:q 1048 unsigned maxBlockSize = getGRFSize() * 2; // size of 2 GRFs in bytes 1049 // For uniform variables (which implies simdSize==1) the emitter may set regions with width>1. 1050 // As it may happen in various places, we detect it here. 1051 IGC_ASSERT(var->IsUniform() || (GrfRegionSize(simdSize, elemSize, mod, isSource) <= maxBlockSize)); 1052 return false; 1053 } 1054 } 1055 1056 // Only general variables need splitting so far. 1057 if (var->GetVarType() != EVARTYPE_GENERAL) 1058 { 1059 return false; 1060 } 1061 1062 // Only varying variable need splitting so far. 1063 // NOTE: uniform variable is assumed to take less than 2 GRF+. 1064 if (var->IsUniform()) 1065 { 1066 return false; 1067 } 1068 1069 // We assume there is no 2 GRF crossing when element size is smaller than 1070 // 4 bytes (or 32 bits), e.g. 16-bit WORD. 1071 if (elemSize < 4) 1072 { 1073 return false; 1074 } 1075 1076 // If the data type has more than 4 bytes, i.e. 32 bits, it already crosses 1077 // 2+ GRFs by itself. There's no need to check further. 1078 if (elemSize > 4) 1079 { 1080 IGC_ASSERT_MESSAGE(8 == elemSize, "Only QWORD is supported so far"); 1081 IGC_ASSERT_MESSAGE(isSource || !mod.specialRegion, 1082 "It's expected that there's no special region associated with QWORD type destination!"); 1083 if (isSource && mod.specialRegion) 1084 { 1085 if (mod.region[1] == 1 && mod.region[0] == 0) 1086 { 1087 // src region is <0;1,x>, can't cross 2 GRF. No need to split. 1088 return false; 1089 } 1090 IGC_ASSERT_MESSAGE(0, "Unhandled special source region on QWORD type!"); 1091 } 1092 1093 numParts = std::max(numParts, 2U); 1094 return true; 1095 1096 } 1097 1098 1099 // For 32-bit data types, without special region, they won't cross 2+ GRFs. 1100 if (!mod.specialRegion) 1101 { 1102 return false; 1103 } 1104 1105 // Check regioning. 1106 if (isSource) 1107 { 1108 // FIXME: Need better support for region with non-1 width. 1109 if (mod.region[1] != 1) 1110 { 1111 return false; 1112 } 1113 1114 if (mod.region[0] < 2) 1115 { 1116 return false; 1117 } 1118 1119 // For src with width set to 1, region with > 1 vstride needs 1120 // splitting. 1121 numParts = std::max(numParts, unsigned(mod.region[0])); 1122 return true; 1123 } 1124 1125 if (mod.region[2] < 2) 1126 { 1127 return false; 1128 } 1129 1130 // For dst, region with > 1 hstride needs splitting. 1131 numParts = std::max(numParts, unsigned(mod.region[2])); 1132 return true; 1133 } 1134 1135 // SplitVariable - Split the variable to prevent accessing 2+ GRFs. SplitVariable(VISA_Exec_Size fromExecSize,VISA_Exec_Size toExecSize,unsigned thePart,CVariable * var,const SModifier & mod,bool isSource) const1136 SModifier CEncoder::SplitVariable(VISA_Exec_Size fromExecSize, 1137 VISA_Exec_Size toExecSize, 1138 unsigned thePart, 1139 CVariable* var, const SModifier& mod, 1140 bool isSource) const { 1141 // Splitting uniform or source scalar variables is unnecessary! 1142 bool isAddrVar = var && var->GetVarType() == EVARTYPE_ADDRESS; 1143 if (!var || (var->IsUniform() && (!isAddrVar || var->IsVectorUniform())) || 1144 (isSource && mod.specialRegion && 1145 mod.region[1] == 1 && mod.region[0] == 0 && mod.region[2] == 0)) 1146 return mod; 1147 1148 IGC_ASSERT_MESSAGE(((fromExecSize == EXEC_SIZE_16) && (toExecSize == EXEC_SIZE_8)) || ((fromExecSize == EXEC_SIZE_32) && (toExecSize == EXEC_SIZE_16)), 1149 "Only support splitting from exec-size 16 to exec-size 8, or 32 to 16!"); 1150 IGC_ASSERT_MESSAGE((thePart == 0) || (thePart == 1), 1151 "Splitting from exec-size-16 to exec-size-8 only breaks into 2 parts!"); 1152 1153 // Copy the original modifier first. 1154 SModifier newMod = mod; 1155 unsigned elemSize = var->GetElemSize(); 1156 1157 if (isAddrVar) 1158 { 1159 // Note that for address var, subReg has two meanings: 1160 // 1. if var is uniform (so using 1x1 addressing mode), 1161 // subReg * (size of var's type) is a0.0's immOffset; 1162 // 2. otherwise (using VxH addressing mode), 1163 // subReg is indeed an sub register number of a0. 1164 newMod.subReg += thePart * visaNumLanes(toExecSize); 1165 return newMod; 1166 } 1167 1168 if (!mod.specialRegion) { 1169 // Without special regioning, split the given variable based on type. 1170 switch (elemSize) { 1171 case 1: 1172 case 2: 1173 newMod.subReg += thePart * 8; // 8, i.e. half elements 1174 break; 1175 case 4: 1176 newMod.subVar += thePart * 1; // 1 GRF 1177 break; 1178 case 8: 1179 newMod.subVar += thePart * 2; // 2 GRFs 1180 break; 1181 default: 1182 IGC_ASSERT_MESSAGE(0, "Unknown data type to split!"); 1183 break; 1184 } 1185 return newMod; 1186 } 1187 1188 unsigned theStride = 0; 1189 if (isSource) { 1190 IGC_ASSERT_MESSAGE((mod.region[1] == 1), 1191 "Don't know how to split region with non-1 width!"); 1192 theStride = mod.region[0]; 1193 } 1194 else { 1195 theStride = mod.region[2]; 1196 } 1197 1198 switch (elemSize) { 1199 case 1: 1200 case 2: 1201 newMod.subReg += thePart * 8 * theStride; // 8, i.e. half elements 1202 break; 1203 case 4: 1204 newMod.subVar += thePart * 1 * theStride; // 1 GRF 1205 break; 1206 case 8: 1207 newMod.subVar += thePart * 2 * theStride; // 2 GRFs 1208 break; 1209 default: 1210 IGC_ASSERT_MESSAGE(0, "Unknown data type to split!"); 1211 break; 1212 } 1213 1214 return newMod; 1215 } 1216 SplitExecSize(VISA_Exec_Size fromExecSize,unsigned numParts) const1217 VISA_Exec_Size CEncoder::SplitExecSize(VISA_Exec_Size fromExecSize, unsigned numParts) const 1218 { 1219 IGC_ASSERT_MESSAGE(2 == numParts, "Only know splitting SIMD16 into SIMD8!"); 1220 1221 switch (fromExecSize) { 1222 default: 1223 break; 1224 case EXEC_SIZE_32: 1225 return EXEC_SIZE_16; 1226 case EXEC_SIZE_16: 1227 return EXEC_SIZE_8; 1228 } 1229 IGC_ASSERT_MESSAGE(0, "Unknown execution size to be split!"); 1230 return static_cast<VISA_Exec_Size>(~0); 1231 } 1232 1233 VISA_EMask_Ctrl SplitEMask(VISA_Exec_Size fromExecSize,VISA_Exec_Size toExecSize,unsigned thePart,VISA_EMask_Ctrl execMask) const1234 CEncoder::SplitEMask(VISA_Exec_Size fromExecSize, 1235 VISA_Exec_Size toExecSize, 1236 unsigned thePart, VISA_EMask_Ctrl execMask) const { 1237 IGC_ASSERT_MESSAGE(((fromExecSize == EXEC_SIZE_16) && (toExecSize == EXEC_SIZE_8)) || ((fromExecSize == EXEC_SIZE_32) && (toExecSize == EXEC_SIZE_16)), 1238 "Only support splitting from exec-size 16 to exec-size 8, or from 32 to 16!"); 1239 IGC_ASSERT_MESSAGE((thePart == 0) || (thePart == 1), 1240 "Splitting from exec-size-16 to exec-size-8 only breaks into 2 parts!"); 1241 1242 // FIXME: Better to generate a table! 1243 1244 switch (fromExecSize) { 1245 default: 1246 break; 1247 case EXEC_SIZE_32: 1248 switch (toExecSize) { 1249 default: 1250 break; 1251 case EXEC_SIZE_16: 1252 switch (execMask) { 1253 default: 1254 break; 1255 case vISA_EMASK_M1: return thePart ? vISA_EMASK_M5 : vISA_EMASK_M1; 1256 case vISA_EMASK_M1_NM: return thePart ? vISA_EMASK_M5_NM : vISA_EMASK_M1_NM; 1257 case vISA_EMASK_M3: return thePart ? vISA_EMASK_M7 : vISA_EMASK_M3; 1258 case vISA_EMASK_M3_NM: return thePart ? vISA_EMASK_M7_NM : vISA_EMASK_M3_NM; 1259 case vISA_EMASK_M5: return thePart ? vISA_EMASK_M1 : vISA_EMASK_M5; 1260 case vISA_EMASK_M5_NM: return thePart ? vISA_EMASK_M1_NM : vISA_EMASK_M5_NM; 1261 case vISA_EMASK_M7: return thePart ? vISA_EMASK_M3 : vISA_EMASK_M7; 1262 case vISA_EMASK_M7_NM: return thePart ? vISA_EMASK_M3_NM : vISA_EMASK_M7_NM; 1263 } 1264 break; 1265 } 1266 break; 1267 1268 case EXEC_SIZE_16: 1269 switch (toExecSize) { 1270 default: 1271 break; 1272 case EXEC_SIZE_8: 1273 switch (execMask) { 1274 default: 1275 break; 1276 case vISA_EMASK_M1: return thePart ? vISA_EMASK_M3 : vISA_EMASK_M1; 1277 case vISA_EMASK_M1_NM: return thePart ? vISA_EMASK_M3_NM : vISA_EMASK_M1_NM; 1278 case vISA_EMASK_M5: return thePart ? vISA_EMASK_M7 : vISA_EMASK_M5; 1279 case vISA_EMASK_M5_NM: return thePart ? vISA_EMASK_M7_NM : vISA_EMASK_M5_NM; 1280 } 1281 break; 1282 } 1283 break; 1284 } 1285 IGC_ASSERT_MESSAGE(0, "Unknown execution mask to be split into low part!"); 1286 return static_cast<VISA_EMask_Ctrl>(~0); 1287 } 1288 1289 // Splitting SIMD16 Message Data Payload (MDP at offset = MDPOfst) for A64 1290 // scatter/untyped write messages to two SIMD8 MDPs (V0 and V1). SplitPayloadToLowerSIMD(CVariable * MDP,uint32_t MDPOfst,uint32_t NumBlks,CVariable * V0,CVariable * V1,uint32_t fromSize)1291 void CEncoder::SplitPayloadToLowerSIMD(CVariable* MDP, uint32_t MDPOfst, uint32_t NumBlks, CVariable* V0, CVariable* V1, uint32_t fromSize) 1292 { 1293 IGC_ASSERT(nullptr != MDP); 1294 IGC_ASSERT(nullptr != V0); 1295 IGC_ASSERT(nullptr != V1); 1296 1297 VISA_GenVar* GV = GetVISAVariable(MDP); 1298 VISA_GenVar* v0GV = GetVISAVariable(V0); 1299 VISA_GenVar* v1GV = GetVISAVariable(V1); 1300 VISA_VectorOpnd* movDst0 = nullptr; 1301 VISA_VectorOpnd* movDst1 = nullptr; 1302 VISA_VectorOpnd* srcOpnd = nullptr; 1303 const uint32_t toSize = fromSize / 2; 1304 const VISA_Exec_Size fromESize = visaExecSize(lanesToSIMDMode(fromSize)); 1305 const VISA_Exec_Size toESize = visaExecSize(lanesToSIMDMode(toSize)); 1306 const uint32_t eltBytes = MDP->GetElemSize(); 1307 1308 IGC_ASSERT_MESSAGE(V0->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP."); 1309 IGC_ASSERT_MESSAGE(V1->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP."); 1310 1311 // Number of elements per GRF 1312 1313 if (eltBytes > 0) 1314 { 1315 uint32_t GRFElts = getGRFSize() / eltBytes; 1316 1317 if (GRFElts > 0) 1318 { 1319 VISA_EMask_Ctrl execNM = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask); 1320 uint32_t MDPStart = MDPOfst / eltBytes; 1321 for (uint32_t i = 0; i < NumBlks; ++i) 1322 { 1323 uint32_t dstOfst = i * toSize; 1324 uint32_t srcOfst = i * fromSize + MDPStart; 1325 V(vKernel->CreateVISADstOperand(movDst0, v0GV, 1, dstOfst / GRFElts, dstOfst % GRFElts)); 1326 V(vKernel->CreateVISADstOperand(movDst1, v1GV, 1, dstOfst / GRFElts, dstOfst % GRFElts)); 1327 1328 V(vKernel->CreateVISASrcOperand(srcOpnd, GV, MODIFIER_NONE, 1329 1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts)); 1330 1331 V(vKernel->AppendVISADataMovementInst( 1332 ISA_MOV, nullptr, false, 1333 SplitEMask(fromESize, toESize, 0, execNM), 1334 toESize, movDst0, srcOpnd)); 1335 1336 srcOfst += toSize; 1337 V(vKernel->CreateVISASrcOperand(srcOpnd, GV, MODIFIER_NONE, 1338 1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts)); 1339 1340 V(vKernel->AppendVISADataMovementInst( 1341 ISA_MOV, nullptr, false, 1342 SplitEMask(fromESize, toESize, 1, execNM), 1343 toESize, movDst1, srcOpnd)); 1344 } 1345 } 1346 } 1347 } 1348 1349 // Merge two SIMD8 MDP (V0 and V1) into a single SIMD16 MDP (MDP at offset = MDPOfst) MergePayloadToHigherSIMD(CVariable * V0,CVariable * V1,uint32_t NumBlks,CVariable * MDP,uint32_t MDPOfst,uint32_t toSize)1350 void CEncoder::MergePayloadToHigherSIMD(CVariable* V0, CVariable* V1, uint32_t NumBlks, CVariable* MDP, uint32_t MDPOfst, uint32_t toSize) 1351 { 1352 VISA_GenVar* GV = GetVISAVariable(MDP); 1353 VISA_GenVar* v0GV = GetVISAVariable(V0); 1354 VISA_GenVar* v1GV = GetVISAVariable(V1); 1355 VISA_VectorOpnd* movDst = nullptr; 1356 VISA_VectorOpnd* movSrc0 = nullptr; 1357 VISA_VectorOpnd* movSrc1 = nullptr; 1358 const uint32_t fromSize = toSize / 2; 1359 const VISA_Exec_Size fromESize = visaExecSize(lanesToSIMDMode(toSize)); 1360 const VISA_Exec_Size toESize = visaExecSize(lanesToSIMDMode(fromSize)); 1361 const uint32_t eltBytes = MDP->GetElemSize(); 1362 IGC_ASSERT_MESSAGE(V0->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP!"); 1363 IGC_ASSERT_MESSAGE(V1->GetElemSize() == eltBytes, "Element size should be the same among SIMD16 MDP and SIMD8 MDP!"); 1364 1365 if (eltBytes > 0) 1366 { 1367 // Number of elements per GRF 1368 const uint32_t GRFElts = getGRFSize() / eltBytes; 1369 1370 if (GRFElts > 0) 1371 { 1372 VISA_EMask_Ctrl execNM = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask); 1373 uint32_t MDPStart = MDPOfst / eltBytes; 1374 for (uint32_t i = 0; i < NumBlks; ++i) 1375 { 1376 uint32_t dstOfst = i * toSize + MDPStart; 1377 uint32_t srcOfst = i * fromSize; 1378 V(vKernel->CreateVISADstOperand(movDst, GV, 1, dstOfst / GRFElts, dstOfst % GRFElts)); 1379 V(vKernel->CreateVISASrcOperand(movSrc0, v0GV, MODIFIER_NONE, 1380 1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts)); 1381 V(vKernel->CreateVISASrcOperand(movSrc1, v1GV, MODIFIER_NONE, 1382 1, 1, 0, srcOfst / GRFElts, srcOfst % GRFElts)); 1383 1384 V(vKernel->AppendVISADataMovementInst( 1385 ISA_MOV, nullptr, false, 1386 SplitEMask(fromESize, toESize, 0, execNM), 1387 toESize, movDst, movSrc0)); 1388 1389 dstOfst += fromSize; 1390 V(vKernel->CreateVISADstOperand(movDst, GV, 1, dstOfst / GRFElts, dstOfst % GRFElts)); 1391 V(vKernel->AppendVISADataMovementInst( 1392 ISA_MOV, nullptr, false, 1393 SplitEMask(fromESize, toESize, 1, execNM), 1394 toESize, movDst, movSrc1)); 1395 } 1396 } 1397 } 1398 } 1399 1400 static SModifier EmulateVariable(CVariable * Var,SModifier Mod,bool IsHiPart,bool IsSource)1401 EmulateVariable(CVariable* Var, SModifier Mod, bool IsHiPart, bool IsSource) { 1402 if (Mod.specialRegion) { 1403 if (IsSource) { 1404 Mod.region[0] *= 2; 1405 Mod.region[2] *= 2; 1406 } 1407 else 1408 Mod.region[2] *= 2; 1409 } 1410 else { 1411 if (IsSource) { 1412 if (!Var->IsUniform()) { 1413 Mod.region[0] = 2; 1414 Mod.region[1] = 1; 1415 Mod.region[2] = 0; 1416 Mod.specialRegion = true; 1417 } 1418 } 1419 else { 1420 Mod.region[2] = 2; 1421 Mod.specialRegion = true; 1422 } 1423 } 1424 Mod.subReg *= 2; 1425 if (IsHiPart) 1426 Mod.subReg += 1; 1427 return Mod; 1428 } 1429 DataMov(ISA_Opcode opcode,CVariable * dst,CVariable * src)1430 void CEncoder::DataMov(ISA_Opcode opcode, CVariable* dst, CVariable* src) 1431 { 1432 if (opcode == ISA_SETP) 1433 { 1434 IGC_ASSERT(nullptr != dst); 1435 IGC_ASSERT(dst->GetVarType() == EVARTYPE_PREDICATE); 1436 V(vKernel->AppendVISASetP( 1437 GetAluEMask(dst), 1438 IsSecondHalf() ? GetAluExecSize(dst) : visaExecSize(m_program->m_dispatchSize), 1439 dst->visaPredVariable, 1440 GetSourceOperand(src, m_encoderState.m_srcOperand[0]))); 1441 } 1442 else if (opcode == ISA_MOV && src->GetVarType() == EVARTYPE_PREDICATE) 1443 { 1444 V(vKernel->AppendVISAPredicateMove( 1445 GetDestinationOperand(dst, m_encoderState.m_dstOperand), 1446 src->visaPredVariable)); 1447 } 1448 else 1449 { 1450 VISA_Type dstT = dst->GetType(); 1451 VISA_Type srcT = src->GetType(); 1452 bool Is64BitDst = (dstT == ISA_TYPE_Q || dstT == ISA_TYPE_UQ); 1453 bool Is64BitSrc = (srcT == ISA_TYPE_Q || srcT == ISA_TYPE_UQ); 1454 bool Need64BitEmu = 1455 m_program->GetContext()->platform.hasNoInt64Inst() && 1456 (Is64BitDst || Is64BitSrc); 1457 1458 // If DP is not supported, need to split mov as well. 1459 if (IGC_IS_FLAG_ENABLED(ForceDPEmulation) || 1460 m_program->GetContext()->platform.hasNoFP64Inst()) 1461 { 1462 if (dstT == ISA_TYPE_DF && srcT == ISA_TYPE_DF) 1463 { 1464 Need64BitEmu = true; 1465 Is64BitDst = true; 1466 Is64BitSrc = true; 1467 } 1468 else 1469 { 1470 IGC_ASSERT_MESSAGE(dstT != ISA_TYPE_DF, "double type is not expected here"); 1471 IGC_ASSERT_MESSAGE(srcT != ISA_TYPE_DF, "double type is not expected here"); 1472 } 1473 } 1474 if (dst->GetVarType() != EVARTYPE_GENERAL || src->GetVarType() != EVARTYPE_GENERAL) 1475 { 1476 // code can't handle indirect operands, let vISA do it 1477 // ToDo: disable int64b copy emu entirely? 1478 Need64BitEmu = false; 1479 } 1480 1481 CVariable* dstAlias = nullptr; 1482 CVariable* srcAlias = nullptr; 1483 VISA_VectorOpnd* srcImmLo = nullptr; 1484 VISA_VectorOpnd* srcImmHi = nullptr; 1485 if (Need64BitEmu) { 1486 if (Is64BitDst) 1487 dstAlias = m_program->GetNewAlias(dst, ISA_TYPE_UD, 0, 0); 1488 else 1489 dstAlias = dst; 1490 if (src->IsImmediate()) { 1491 uint64_t Imm = src->GetImmediateValue(); 1492 unsigned ImmLo = Imm & 0xFFFFFFFFULL; 1493 unsigned ImmHi = Imm >> 32; 1494 V(vKernel->CreateVISAImmediate(srcImmLo, &ImmLo, ISA_TYPE_UD)); 1495 V(vKernel->CreateVISAImmediate(srcImmHi, &ImmHi, ISA_TYPE_UD)); 1496 } 1497 else { 1498 if (Is64BitSrc) 1499 srcAlias = m_program->GetNewAlias(src, ISA_TYPE_UD, 0, 0); 1500 else 1501 srcAlias = src; 1502 } 1503 } 1504 1505 if (Need64BitEmu) 1506 { 1507 if (Is64BitSrc && Is64BitDst) 1508 { 1509 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 1510 if (!predOpnd && !IsSat() && dst->IsUniform() && src->IsUniform() && !src->IsImmediate() && m_encoderState.m_uniformSIMDSize == SIMDMode::SIMD1) 1511 { 1512 // special handling for uniform 64b copy by generating SIMD2 move instead of 2xSIMD1 1513 // technically we need to check for src modifier and whether dst/src are indirect operand as well, 1514 // but it doesn't look like the original code below is doing it anyway.. 1515 SModifier dstAsUDMod = m_encoderState.m_dstOperand; 1516 dstAsUDMod.subReg *= 2; 1517 SModifier srcAsUDMod = m_encoderState.m_srcOperand[0]; 1518 srcAsUDMod.region[0] = 1; 1519 srcAsUDMod.region[1] = 1; 1520 srcAsUDMod.region[2] = 0; 1521 srcAsUDMod.specialRegion = true; 1522 srcAsUDMod.subReg *= 2; 1523 auto dstOpnd = GetDestinationOperand(dstAlias, dstAsUDMod); 1524 auto SIMDSize = lanesToSIMDMode(numLanes(m_encoderState.m_uniformSIMDSize) * 2); 1525 auto srcOpnd = GetSourceOperand(srcAlias, srcAsUDMod); 1526 V(vKernel->AppendVISADataMovementInst(opcode, nullptr, false, vISA_EMASK_M1_NM, visaExecSize(SIMDSize), 1527 dstOpnd, srcOpnd)); 1528 } 1529 else 1530 { 1531 // Generate data movement on Lo part. 1532 SModifier LoDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, false, false); 1533 SModifier LoSrcMod = EmulateVariable(src, m_encoderState.m_srcOperand[0], false, true); 1534 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dstAlias, LoDstMod); 1535 VISA_VectorOpnd* srcOpnd = srcImmLo ? srcImmLo : GetSourceOperand(srcAlias, LoSrcMod); 1536 1537 V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(), 1538 GetAluEMask(dst), 1539 GetAluExecSize(dst), 1540 dstOpnd, srcOpnd)); 1541 // Generate data movement on Hi part. 1542 SModifier HiDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, true, false); 1543 SModifier HiSrcMod = EmulateVariable(src, m_encoderState.m_srcOperand[0], true, true); 1544 dstOpnd = GetDestinationOperand(dstAlias, HiDstMod); 1545 srcOpnd = srcImmHi ? srcImmHi : GetSourceOperand(srcAlias, HiSrcMod); 1546 predOpnd = GetFlagOperand(m_encoderState.m_flag); 1547 V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(), 1548 GetAluEMask(dst), 1549 GetAluExecSize(dst), 1550 dstOpnd, srcOpnd)); 1551 } 1552 } 1553 else if (Is64BitSrc) 1554 { 1555 IGC_ASSERT_MESSAGE(!Is64BitDst, "Expect non 64-bit dst!"); 1556 // Generate data movement on Lo part only. 1557 SModifier LoSrcMod = EmulateVariable(src, m_encoderState.m_srcOperand[0], false, true); 1558 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dstAlias, m_encoderState.m_dstOperand); 1559 VISA_VectorOpnd* srcOpnd = srcImmLo ? srcImmLo : GetSourceOperand(srcAlias, LoSrcMod); 1560 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 1561 V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(), 1562 GetAluEMask(dst), 1563 GetAluExecSize(dst), 1564 dstOpnd, srcOpnd)); 1565 } 1566 else 1567 { 1568 IGC_ASSERT_MESSAGE(Is64BitDst, "Expect 64-bit dst!"); 1569 IGC_ASSERT_MESSAGE(!Is64BitSrc, "Expect non 64-bit src"); 1570 1571 // Generate data movement on Lo part. 1572 SModifier LoDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, false, false); 1573 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dstAlias, LoDstMod); 1574 VISA_VectorOpnd* srcOpnd = srcImmLo ? srcImmLo : GetSourceOperand(srcAlias, m_encoderState.m_srcOperand[0]); 1575 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 1576 V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(), 1577 GetAluEMask(dst), 1578 GetAluExecSize(dst), 1579 dstOpnd, srcOpnd)); 1580 // Generate data movement on Hi part. 1581 unsigned ImmHi = 0U; 1582 V(vKernel->CreateVISAImmediate(srcImmHi, &ImmHi, ISA_TYPE_UD)); 1583 SModifier HiDstMod = EmulateVariable(dst, m_encoderState.m_dstOperand, true, false); 1584 dstOpnd = GetDestinationOperand(dstAlias, HiDstMod); 1585 srcOpnd = srcImmHi; 1586 predOpnd = GetFlagOperand(m_encoderState.m_flag); 1587 V(vKernel->AppendVISADataMovementInst(opcode, predOpnd, IsSat(), 1588 GetAluEMask(dst), 1589 GetAluExecSize(dst), 1590 dstOpnd, srcOpnd)); 1591 } 1592 } 1593 else 1594 { 1595 VISA_VectorOpnd* srcOpnd = GetSourceOperand(src, m_encoderState.m_srcOperand[0]); 1596 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand); 1597 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 1598 V(vKernel->AppendVISADataMovementInst( 1599 opcode, 1600 predOpnd, 1601 IsSat(), 1602 GetAluEMask(dst), 1603 GetAluExecSize(dst), 1604 dstOpnd, 1605 srcOpnd)); 1606 } 1607 } 1608 } 1609 LogicOp(ISA_Opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2,CVariable * src3)1610 void CEncoder::LogicOp( 1611 ISA_Opcode opcode, 1612 CVariable* dst, 1613 CVariable* src0, 1614 CVariable* src1, 1615 CVariable* src2, 1616 CVariable* src3) 1617 { 1618 if (dst->GetVarType() == EVARTYPE_PREDICATE || 1619 src0->GetVarType() == EVARTYPE_PREDICATE || 1620 (src1 != nullptr && src1->GetVarType() == EVARTYPE_PREDICATE)) 1621 { 1622 VISA_PredVar* src1Dcl = NULL; 1623 if (src1 != NULL) 1624 src1Dcl = src1->visaPredVariable; 1625 1626 // Try to use NOT instruction for predicate, we won't have phi on 1627 // predicate since Legalization pass convert i1 phi to i32. 1628 if (opcode == ISA_NOT) 1629 SetNoMask(); 1630 1631 V(vKernel->AppendVISALogicOrShiftInst( 1632 opcode, 1633 GetAluEMask(dst), 1634 GetAluExecSize(dst), 1635 dst->visaPredVariable, 1636 src0->visaPredVariable, 1637 src1Dcl)); 1638 } 1639 else 1640 { 1641 VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]); 1642 VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 1643 VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(src2, m_encoderState.m_srcOperand[2]); 1644 VISA_VectorOpnd* srcOpnd3 = GetSourceOperand(src3, m_encoderState.m_srcOperand[3]); 1645 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand); 1646 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 1647 1648 V(vKernel->AppendVISALogicOrShiftInst( 1649 opcode, 1650 predOpnd, 1651 IsSat(), 1652 GetAluEMask(dst), 1653 GetAluExecSize(dst), 1654 dstOpnd, 1655 srcOpnd0, 1656 srcOpnd1, 1657 srcOpnd2, 1658 srcOpnd3)); 1659 } 1660 } 1661 Arithmetic(ISA_Opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2)1662 void CEncoder::Arithmetic(ISA_Opcode opcode, CVariable* dst, CVariable* src0, CVariable* src1, CVariable* src2) 1663 { 1664 VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]); 1665 VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 1666 VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(src2, m_encoderState.m_srcOperand[2]); 1667 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand); 1668 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 1669 V(vKernel->AppendVISAArithmeticInst( 1670 opcode, 1671 predOpnd, 1672 IsSat(), 1673 GetAluEMask(dst), 1674 GetAluExecSize(dst), 1675 dstOpnd, 1676 srcOpnd0, 1677 srcOpnd1, 1678 srcOpnd2)); 1679 } 1680 Bfn(uint8_t booleanFuncCtrl,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2)1681 void CEncoder::Bfn(uint8_t booleanFuncCtrl, CVariable* dst, CVariable* src0, CVariable* src1, CVariable* src2) 1682 { 1683 VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]); 1684 VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 1685 VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(src2, m_encoderState.m_srcOperand[2]); 1686 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand); 1687 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 1688 1689 V(vKernel->AppendVISABfnInst( 1690 booleanFuncCtrl, 1691 predOpnd, 1692 IsSat(), 1693 GetAluEMask(dst), 1694 GetAluExecSize(dst), 1695 dstOpnd, 1696 srcOpnd0, 1697 srcOpnd1, 1698 srcOpnd2)); 1699 } 1700 1701 // We allow H1 to be nullptr for the common case of adding 64-bit variable with 32-bit imm AddPair(CVariable * Lo,CVariable * Hi,CVariable * L0,CVariable * H0,CVariable * L1,CVariable * H1)1702 void CEncoder::AddPair(CVariable* Lo, CVariable* Hi, CVariable* L0, CVariable* H0, CVariable* L1, CVariable* H1) { 1703 IGC_ASSERT_MESSAGE(m_encoderState.m_dstOperand.mod == EMOD_NONE, "addPair doesn't support saturate"); 1704 1705 if (Hi == nullptr) { 1706 // When Hi part is ignored, reduce 64-bit subtraction into 32-bit. 1707 GenericAlu(EOPCODE_ADD, Lo, L0, L1); 1708 return; 1709 } 1710 1711 if (Lo == nullptr) { 1712 // We cannot reduce the strength if only Lo is ignored. 1713 Lo = m_program->GetNewVariable( 1714 Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(), Hi->getName()); 1715 } 1716 1717 // Use `UD` only. 1718 if (Lo->GetType() != ISA_TYPE_UD && Lo->GetType() != ISA_TYPE_UV) Lo = m_program->BitCast(Lo, ISA_TYPE_UD); 1719 if (Hi->GetType() != ISA_TYPE_UD && Hi->GetType() != ISA_TYPE_UV) Hi = m_program->BitCast(Hi, ISA_TYPE_UD); 1720 if (L0->GetType() != ISA_TYPE_UD && L0->GetType() != ISA_TYPE_UV) L0 = m_program->BitCast(L0, ISA_TYPE_UD); 1721 if (H0->GetType() != ISA_TYPE_UD && H0->GetType() != ISA_TYPE_UV) H0 = m_program->BitCast(H0, ISA_TYPE_UD); 1722 if (L1->GetType() != ISA_TYPE_UD && L1->GetType() != ISA_TYPE_UV) L1 = m_program->BitCast(L1, ISA_TYPE_UD); 1723 if (H1 && H1->GetType() != ISA_TYPE_UD && H1->GetType() != ISA_TYPE_UV) H1 = m_program->BitCast(H1, ISA_TYPE_UD); 1724 1725 VISA_Exec_Size ExecSize = GetAluExecSize(Lo); 1726 IGC_ASSERT((ExecSize == EXEC_SIZE_32) || (ExecSize == EXEC_SIZE_16) || (ExecSize == EXEC_SIZE_8) || (ExecSize == EXEC_SIZE_4) || (ExecSize == EXEC_SIZE_2) || (ExecSize == EXEC_SIZE_1)); 1727 1728 if (needsSplitting(ExecSize)) 1729 { 1730 // Have to split it because `acc0` has only 8 elements for 32-bit 1731 // integer types. 1732 unsigned NumParts = 2; 1733 VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo); 1734 VISA_Exec_Size FromExecSize = GetAluExecSize(Lo); 1735 VISA_Exec_Size ToExecSize = SplitExecSize(FromExecSize, NumParts); 1736 1737 VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag); 1738 for (unsigned ThePart = 0; ThePart != NumParts; ++ThePart) { 1739 SModifier NewDstMod = SplitVariable(FromExecSize, ToExecSize, ThePart, Lo, m_encoderState.m_dstOperand); 1740 SModifier NewS0LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L0, m_encoderState.m_srcOperand[0], true); 1741 SModifier NewS0HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H0, m_encoderState.m_srcOperand[1], true); 1742 SModifier NewS1LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L1, m_encoderState.m_srcOperand[2], true); 1743 1744 VISA_VectorOpnd* S0L = GetSourceOperand(L0, NewS0LMod); 1745 VISA_VectorOpnd* S0H = GetSourceOperand(H0, NewS0HMod); 1746 VISA_VectorOpnd* S1L = GetSourceOperand(L1, NewS1LMod); 1747 VISA_VectorOpnd* L = GetDestinationOperand(Lo, NewDstMod); 1748 VISA_VectorOpnd* H = GetDestinationOperand(Hi, NewDstMod); 1749 VISA_VectorOpnd* HIn = GetSourceOperand(Hi, NewDstMod); 1750 1751 unsigned NumElems = m_program->m_Platform->getAccChNumUD(); 1752 CVariable* Carry = m_program->GetNewVariable( 1753 (uint16_t)NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(), 1754 CName(Lo->getName(), "Carry")); 1755 VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand); 1756 VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, m_encoderState.m_dstOperand); 1757 1758 VISA_EMask_Ctrl EMask = SplitEMask(FromExecSize, ToExecSize, ThePart, ExecMask); 1759 V(vKernel->AppendVISATwoDstArithmeticInst( 1760 ISA_ADDC, Pred, EMask, ToExecSize, 1761 L, AccOut, S0L, S1L)); 1762 1763 if (H1 && !(H1->IsImmediate() && H1->GetImmediateValue() == 0)) 1764 { 1765 SModifier NewS1HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H1, m_encoderState.m_srcOperand[3], true); 1766 VISA_VectorOpnd* S1H = GetSourceOperand(H1, NewS1HMod); 1767 if (m_program->m_Platform->supportAdd3Instruction()) 1768 { 1769 H = GetDestinationOperand(Hi, NewDstMod); 1770 V(vKernel->AppendVISAArithmeticInst( 1771 ISA_ADD3, Pred, false, EMask, ToExecSize, 1772 H, AccIn, S0H, S1H)); 1773 } 1774 else 1775 { 1776 V(vKernel->AppendVISAArithmeticInst( 1777 ISA_ADD, Pred, false, EMask, ToExecSize, 1778 H, S0H, S1H)); 1779 H = GetDestinationOperand(Hi, NewDstMod); 1780 V(vKernel->AppendVISAArithmeticInst( 1781 ISA_ADD, Pred, false, EMask, ToExecSize, 1782 H, AccIn, HIn)); 1783 } 1784 } 1785 else 1786 { 1787 V(vKernel->AppendVISAArithmeticInst( 1788 ISA_ADD, Pred, false, EMask, ToExecSize, 1789 H, AccIn, S0H)); 1790 } 1791 } 1792 } 1793 else { 1794 VISA_VectorOpnd* S0L = GetSourceOperand(L0, m_encoderState.m_srcOperand[0]); 1795 VISA_VectorOpnd* S0H = GetSourceOperand(H0, m_encoderState.m_srcOperand[1]); 1796 VISA_VectorOpnd* S1L = GetSourceOperand(L1, m_encoderState.m_srcOperand[2]); 1797 VISA_VectorOpnd* L = GetDestinationOperand(Lo, m_encoderState.m_dstOperand); 1798 VISA_VectorOpnd* H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand); 1799 VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag); 1800 1801 unsigned short NumElems = (ExecSize == EXEC_SIZE_1) ? 1 : 1802 (ExecSize == EXEC_SIZE_2) ? 2 : 1803 (ExecSize == EXEC_SIZE_4) ? 4 : m_program->m_Platform->getAccChNumUD(); 1804 CVariable* Carry = m_program->GetNewVariable( 1805 NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(), CName(Lo->getName(), "Carry")); 1806 VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand); 1807 1808 SModifier MidMod = m_encoderState.m_dstOperand; 1809 if (Lo->IsUniform() && NumElems != 1) { 1810 MidMod.region[0] = 1; 1811 MidMod.region[1] = 1; 1812 MidMod.region[2] = 0; 1813 MidMod.specialRegion = true; 1814 } 1815 VISA_VectorOpnd* HIn = GetSourceOperand(Hi, MidMod); 1816 VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, MidMod); 1817 1818 VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo); 1819 V(vKernel->AppendVISATwoDstArithmeticInst( 1820 ISA_ADDC, Pred, ExecMask, ExecSize, 1821 L, AccOut, S0L, S1L)); 1822 1823 if (H1 && !(H1->IsImmediate() && H1->GetImmediateValue() == 0)) 1824 { 1825 VISA_VectorOpnd* S1H = GetSourceOperand(H1, m_encoderState.m_srcOperand[3]); 1826 if (m_program->m_Platform->supportAdd3Instruction()) 1827 { 1828 H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand); 1829 V(vKernel->AppendVISAArithmeticInst( 1830 ISA_ADD3, Pred, false, ExecMask, ExecSize, 1831 H, AccIn, S0H, S1H)); 1832 } 1833 else 1834 { 1835 V(vKernel->AppendVISAArithmeticInst( 1836 ISA_ADD, Pred, false, ExecMask, ExecSize, 1837 H, S0H, S1H)); 1838 H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand); 1839 V(vKernel->AppendVISAArithmeticInst( 1840 ISA_ADD, Pred, false, ExecMask, ExecSize, 1841 H, AccIn, HIn)); 1842 } 1843 } 1844 else 1845 { 1846 V(vKernel->AppendVISAArithmeticInst( 1847 ISA_ADD, Pred, false, ExecMask, ExecSize, 1848 H, AccIn, S0H)); 1849 } 1850 } 1851 } 1852 SubPair(CVariable * Lo,CVariable * Hi,CVariable * L0,CVariable * H0,CVariable * L1,CVariable * H1)1853 void CEncoder::SubPair(CVariable* Lo, CVariable* Hi, CVariable* L0, CVariable* H0, CVariable* L1, CVariable* H1) { 1854 IGC_ASSERT_MESSAGE(m_encoderState.m_dstOperand.mod == EMOD_NONE, "subPair doesn't support saturate"); 1855 1856 IGC_ASSERT(Lo || Hi); // At least one is used 1857 if (Hi == nullptr) { 1858 // When Hi part is ignored, reduce 64-bit subtraction into 32-bit. 1859 SetSrcModifier(1, EMOD_NEG); 1860 GenericAlu(EOPCODE_ADD, Lo, L0, L1); 1861 return; 1862 } 1863 1864 if (Lo == nullptr) { 1865 // We cannot reduce the strength if only Lo is ignored. 1866 Lo = m_program->GetNewVariable( 1867 Hi->GetNumberElement(), Hi->GetType(), Hi->GetAlign(), Hi->IsUniform(), CName(Hi->getName(), "Carry")); 1868 } 1869 1870 VISA_Exec_Size ExecSize = GetAluExecSize(Lo); 1871 IGC_ASSERT((ExecSize == EXEC_SIZE_32) || (ExecSize == EXEC_SIZE_16) || (ExecSize == EXEC_SIZE_8) || (ExecSize == EXEC_SIZE_1)); 1872 1873 // Use `UD` only. 1874 if (Lo->GetType() != ISA_TYPE_UD && Lo->GetType() != ISA_TYPE_UV) Lo = m_program->BitCast(Lo, ISA_TYPE_UD); 1875 if (Hi->GetType() != ISA_TYPE_UD && Hi->GetType() != ISA_TYPE_UV) Hi = m_program->BitCast(Hi, ISA_TYPE_UD); 1876 if (L0->GetType() != ISA_TYPE_UD && L0->GetType() != ISA_TYPE_UV) L0 = m_program->BitCast(L0, ISA_TYPE_UD); 1877 if (H0->GetType() != ISA_TYPE_UD && H0->GetType() != ISA_TYPE_UV) H0 = m_program->BitCast(H0, ISA_TYPE_UD); 1878 if (L1->GetType() != ISA_TYPE_UD && L1->GetType() != ISA_TYPE_UV) L1 = m_program->BitCast(L1, ISA_TYPE_UD); 1879 if (H1->GetType() != ISA_TYPE_UD && H1->GetType() != ISA_TYPE_UV) H1 = m_program->BitCast(H1, ISA_TYPE_UD); 1880 1881 if (needsSplitting(ExecSize)) 1882 { 1883 // Have to split it because `acc0` has only 8 elements for 32-bit 1884 // integer types. 1885 unsigned NumParts = 2; 1886 VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo); 1887 VISA_Exec_Size FromExecSize = GetAluExecSize(Lo); 1888 VISA_Exec_Size ToExecSize = SplitExecSize(FromExecSize, NumParts); 1889 1890 // Negative `S1H` 1891 SModifier S1HMod = m_encoderState.m_srcOperand[1]; 1892 IGC_ASSERT(S1HMod.mod == EMOD_NONE); 1893 S1HMod.mod = EMOD_NEG; 1894 VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag); 1895 for (unsigned ThePart = 0; ThePart != NumParts; ++ThePart) { 1896 SModifier NewDstMod = SplitVariable(FromExecSize, ToExecSize, ThePart, Lo, m_encoderState.m_dstOperand); 1897 SModifier NewS0LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L0, m_encoderState.m_srcOperand[0], true); 1898 SModifier NewS0HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H0, m_encoderState.m_srcOperand[1], true); 1899 SModifier NewS1LMod = SplitVariable(FromExecSize, ToExecSize, ThePart, L1, m_encoderState.m_srcOperand[2], true); 1900 SModifier NewS1HMod = SplitVariable(FromExecSize, ToExecSize, ThePart, H1, S1HMod, true); 1901 VISA_VectorOpnd* S0L = GetSourceOperand(L0, NewS0LMod); 1902 VISA_VectorOpnd* S0H = GetSourceOperand(H0, NewS0HMod); 1903 VISA_VectorOpnd* S1L = GetSourceOperand(L1, NewS1LMod); 1904 VISA_VectorOpnd* S1H = GetSourceOperand(H1, NewS1HMod); 1905 VISA_VectorOpnd* L = GetDestinationOperand(Lo, NewDstMod); 1906 VISA_VectorOpnd* H = GetDestinationOperand(Hi, NewDstMod); 1907 VISA_VectorOpnd* HIn = GetSourceOperand(Hi, NewDstMod); 1908 1909 unsigned short NumElems = m_program->m_Platform->getAccChNumUD(); 1910 CVariable* Carry = 1911 m_program->GetNewVariable(NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(), CName(Lo->getName(), "Carry")); 1912 VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand); 1913 // Negative `Acc0` 1914 SModifier AccMod = m_encoderState.m_dstOperand; 1915 IGC_ASSERT(AccMod.mod == EMOD_NONE); 1916 AccMod.mod = EMOD_NEG; 1917 VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, AccMod); 1918 1919 VISA_EMask_Ctrl EMask = SplitEMask(FromExecSize, ToExecSize, ThePart, ExecMask); 1920 V(vKernel->AppendVISATwoDstArithmeticInst( 1921 ISA_SUBB, Pred, EMask, ToExecSize, 1922 L, AccOut, S0L, S1L)); 1923 if (m_program->m_Platform->supportAdd3Instruction()) 1924 { 1925 H = GetDestinationOperand(Hi, NewDstMod); 1926 V(vKernel->AppendVISAArithmeticInst( 1927 ISA_ADD3, Pred, false, EMask, ToExecSize, 1928 H, AccIn, S0H, S1H)); 1929 } 1930 else 1931 { 1932 V(vKernel->AppendVISAArithmeticInst( 1933 ISA_ADD, Pred, false, EMask, ToExecSize, 1934 H, S0H, S1H)); 1935 H = GetDestinationOperand(Hi, NewDstMod); 1936 V(vKernel->AppendVISAArithmeticInst( 1937 ISA_ADD, Pred, false, EMask, ToExecSize, 1938 H, AccIn, HIn)); 1939 } 1940 } 1941 } 1942 else { 1943 VISA_VectorOpnd* S0L = GetSourceOperand(L0, m_encoderState.m_srcOperand[0]); 1944 VISA_VectorOpnd* S0H = GetSourceOperand(H0, m_encoderState.m_srcOperand[1]); 1945 VISA_VectorOpnd* S1L = GetSourceOperand(L1, m_encoderState.m_srcOperand[2]); 1946 // Negative `S0H` 1947 SModifier S1HMod = m_encoderState.m_srcOperand[1]; 1948 IGC_ASSERT(S1HMod.mod == EMOD_NONE); 1949 S1HMod.mod = EMOD_NEG; 1950 VISA_VectorOpnd* S1H = GetSourceOperand(H1, S1HMod); 1951 VISA_VectorOpnd* L = GetDestinationOperand(Lo, m_encoderState.m_dstOperand); 1952 VISA_VectorOpnd* H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand); 1953 VISA_PredOpnd* Pred = GetFlagOperand(m_encoderState.m_flag); 1954 1955 unsigned short NumElems = (ExecSize == 1) ? 1 : m_program->m_Platform->getAccChNumUD(); 1956 CVariable* Carry = m_program->GetNewVariable( 1957 NumElems, Lo->GetType(), Lo->GetAlign(), Lo->IsUniform(), CName(Lo->getName(), "Carry")); 1958 VISA_VectorOpnd* AccOut = GetDestinationOperand(Carry, m_encoderState.m_dstOperand); 1959 1960 SModifier MidMod = m_encoderState.m_dstOperand; 1961 if (Lo->IsUniform() && NumElems != 1) { 1962 MidMod.region[0] = 1; 1963 MidMod.region[1] = 1; 1964 MidMod.region[2] = 0; 1965 MidMod.specialRegion = true; 1966 } 1967 VISA_VectorOpnd* HIn = GetSourceOperand(Hi, MidMod); 1968 // Negative `Acc0` 1969 SModifier AccMod = MidMod; 1970 IGC_ASSERT(AccMod.mod == EMOD_NONE); 1971 AccMod.mod = EMOD_NEG; 1972 VISA_VectorOpnd* AccIn = GetSourceOperand(Carry, AccMod); 1973 1974 VISA_EMask_Ctrl ExecMask = GetAluEMask(Lo); 1975 V(vKernel->AppendVISATwoDstArithmeticInst( 1976 ISA_SUBB, Pred, ExecMask, ExecSize, 1977 L, AccOut, S0L, S1L)); 1978 if (m_program->m_Platform->supportAdd3Instruction()) 1979 { 1980 H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand); 1981 V(vKernel->AppendVISAArithmeticInst( 1982 ISA_ADD3, Pred, false, ExecMask, ExecSize, 1983 H, AccIn, S0H, S1H)); 1984 } 1985 else 1986 { 1987 V(vKernel->AppendVISAArithmeticInst( 1988 ISA_ADD, Pred, false, ExecMask, ExecSize, 1989 H, S0H, S1H)); 1990 H = GetDestinationOperand(Hi, m_encoderState.m_dstOperand); 1991 V(vKernel->AppendVISAArithmeticInst( 1992 ISA_ADD, Pred, false, ExecMask, ExecSize, 1993 H, AccIn, HIn)); 1994 } 1995 } 1996 } 1997 CarryBorrowArith(ISA_Opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1)1998 void CEncoder::CarryBorrowArith(ISA_Opcode opcode, CVariable* dst, CVariable* src0, CVariable* src1) 1999 { 2000 VISA_VectorOpnd* srcOpnd0 = GetSourceOperand(src0, m_encoderState.m_srcOperand[0]); 2001 VISA_VectorOpnd* srcOpnd1 = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 2002 VISA_VectorOpnd* dstOpnd = GetDestinationOperand(dst, m_encoderState.m_dstOperand); 2003 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2004 SModifier carryOperand = m_encoderState.m_dstOperand; 2005 VISA_Exec_Size execSize = GetAluExecSize(dst); 2006 2007 switch (execSize) 2008 { 2009 case EXEC_SIZE_1: 2010 carryOperand.subReg += 1; 2011 break; 2012 case EXEC_SIZE_8: 2013 carryOperand.subVar += 1; 2014 break; 2015 case EXEC_SIZE_16: 2016 carryOperand.subVar += 2; 2017 break; 2018 default: 2019 IGC_ASSERT_MESSAGE(0, "Unknown execution size on carry-borrow-arith!"); 2020 break; 2021 } 2022 VISA_VectorOpnd* carryBorrowOpnd = GetDestinationOperand(dst, carryOperand); 2023 IGC_ASSERT_MESSAGE(m_encoderState.m_dstOperand.mod == EMOD_NONE, "addc/subb doesn't support saturate"); 2024 2025 V(vKernel->AppendVISATwoDstArithmeticInst( 2026 opcode, 2027 predOpnd, 2028 GetAluEMask(dst), 2029 GetAluExecSize(dst), 2030 dstOpnd, 2031 carryBorrowOpnd, 2032 srcOpnd0, 2033 srcOpnd1)); 2034 } 2035 URBWrite(CVariable * src,const int payloadElementOffset,CVariable * offset,CVariable * urbHandle,CVariable * mask)2036 void CEncoder::URBWrite( 2037 CVariable* src, 2038 const int payloadElementOffset, 2039 CVariable* offset, 2040 CVariable* urbHandle, 2041 CVariable* mask) 2042 { 2043 2044 IGC_ASSERT(nullptr != offset); 2045 2046 VISA_EMask_Ctrl emask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask); 2047 VISA_Exec_Size execSize = visaExecSize(m_encoderState.m_simdSize); 2048 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2049 VISA_RawOpnd* handle = GetRawSource(urbHandle); 2050 // Two possible cases: offset may be constant (immediate) or runtime value. 2051 unsigned short immOffset = 0; 2052 VISA_RawOpnd* perSlotOffset = nullptr; 2053 if (offset->IsImmediate()) 2054 { 2055 immOffset = int_cast<unsigned short>(offset->GetImmediateValue()); 2056 V(vKernel->CreateVISANullRawOperand(perSlotOffset, false)); 2057 } 2058 else 2059 { 2060 perSlotOffset = GetRawSource(offset); 2061 } 2062 2063 // Three possible cases: 2064 // 1. Channel Mask is immediate value with 0xFF, so not needed to send 2065 // 2. Channel Mask is immediate value other than 0xFF, so needed to send, but as immediate value 2066 // 3. Channel Mask is not immediate value, so needed to send, but as not immediate value 2067 VISA_RawOpnd* channelMask = nullptr; 2068 unsigned char payloadSize = 0; 2069 if (!mask->IsImmediate()) 2070 { 2071 channelMask = GetRawSource(mask); 2072 // All 4 elements will be send - we don't know which are masked out. 2073 payloadSize = 4; 2074 } 2075 else 2076 { 2077 unsigned int immChannelMask = int_cast<unsigned int>(mask->GetImmediateValue()); 2078 URBChannelMask immMask(immChannelMask); 2079 if (immMask.isAllSet()) 2080 { 2081 V(vKernel->CreateVISANullRawOperand(channelMask, false)); 2082 } 2083 else 2084 { 2085 CVariable* tmpDst = m_program->GetNewVariable(8, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE); 2086 VISA_VectorOpnd* movDst = nullptr; 2087 V(vKernel->CreateVISADstOperand(movDst, GetVISAVariable(tmpDst), 1, 0, 0)); 2088 2089 VISA_VectorOpnd* immSrc = nullptr; 2090 V(vKernel->CreateVISAImmediate(immSrc, &immChannelMask, ISA_TYPE_UW)); 2091 2092 V(vKernel->AppendVISADataMovementInst( 2093 ISA_MOV, nullptr, false, emask, 2094 EXEC_SIZE_8, movDst, immSrc)); 2095 V(vKernel->CreateVISARawOperand(channelMask, GetVISAVariable(tmpDst), 0)); 2096 } 2097 2098 payloadSize = int_cast<unsigned char>(immMask.size()); 2099 } 2100 2101 VISA_RawOpnd* vertexData = GetRawSource(src, payloadElementOffset); 2102 2103 V(vKernel->AppendVISA3dURBWrite( 2104 predOpnd, 2105 emask, 2106 execSize, 2107 payloadSize, 2108 channelMask, 2109 immOffset, 2110 handle, 2111 perSlotOffset, 2112 vertexData)); 2113 } 2114 2115 GetRawSource(CVariable * var,uint offset)2116 VISA_RawOpnd* CEncoder::GetRawSource(CVariable* var, uint offset) 2117 { 2118 VISA_RawOpnd* srcOpnd = nullptr; 2119 if (var) 2120 { 2121 if (var->IsImmediate()) 2122 { 2123 VISA_VectorOpnd* vecOpnd = nullptr; 2124 uint immediate = int_cast<uint>(var->GetImmediateValue()); 2125 V(vKernel->CreateVISAImmediate(vecOpnd, &immediate, ISA_TYPE_UD)); 2126 srcOpnd = (VISA_RawOpnd*)vecOpnd; 2127 } 2128 else 2129 { 2130 V(vKernel->CreateVISARawOperand( 2131 srcOpnd, 2132 GetVISAVariable(var), 2133 int_cast<unsigned short>(offset + var->GetAliasOffset()))); 2134 } 2135 } 2136 else 2137 { 2138 V(vKernel->CreateVISANullRawOperand(srcOpnd, false)); 2139 } 2140 return srcOpnd; 2141 } 2142 GetRawDestination(CVariable * var,unsigned offset)2143 VISA_RawOpnd* CEncoder::GetRawDestination(CVariable* var, unsigned offset) 2144 { 2145 VISA_RawOpnd* dstOpnd = nullptr; 2146 if (var) 2147 { 2148 V(vKernel->CreateVISARawOperand( 2149 dstOpnd, GetVISAVariable(var), 2150 m_encoderState.m_dstOperand.subVar * getGRFSize() + offset + var->GetAliasOffset())); 2151 } 2152 else 2153 { 2154 V(vKernel->CreateVISANullRawOperand(dstOpnd, true)); 2155 } 2156 return dstOpnd; 2157 } 2158 Send(CVariable * dst,CVariable * src,uint exDesc,CVariable * messDescriptor,bool isSendc)2159 void CEncoder::Send(CVariable* dst, CVariable* src, uint exDesc, CVariable* messDescriptor, bool isSendc) 2160 { 2161 if (dst && dst->IsUniform()) 2162 { 2163 m_encoderState.m_simdSize = m_encoderState.m_uniformSIMDSize; 2164 } 2165 unsigned char sendc = isSendc ? 1 : 0; 2166 unsigned char srcSize = src->GetSize() / getGRFSize(); 2167 unsigned char dstSize = dst ? dst->GetSize() / getGRFSize() : 0; 2168 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2169 VISA_RawOpnd* srcOpnd0 = GetRawSource(src); 2170 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 2171 VISA_VectorOpnd* desc = GetUniformSource(messDescriptor); 2172 2173 V(vKernel->AppendVISAMiscRawSend( 2174 predOpnd, 2175 GetAluEMask(dst), 2176 visaExecSize(m_encoderState.m_simdSize), 2177 sendc, 2178 exDesc, 2179 srcSize, 2180 dstSize, 2181 desc, 2182 srcOpnd0, 2183 dstOpnd)); 2184 } 2185 Send(CVariable * dst,CVariable * src,uint ffid,CVariable * exDesc,CVariable * messDescriptor,bool isSendc)2186 void CEncoder::Send(CVariable* dst, CVariable* src, uint ffid, CVariable* exDesc, CVariable* messDescriptor, bool isSendc) 2187 { 2188 Sends(dst, src, nullptr, ffid, exDesc, messDescriptor, isSendc); 2189 } 2190 Sends(CVariable * dst,CVariable * src0,CVariable * src1,uint ffid,CVariable * exDesc,CVariable * messDescriptor,bool isSendc,bool hasEOT)2191 void CEncoder::Sends(CVariable* dst, CVariable* src0, CVariable* src1, uint ffid, CVariable* exDesc, CVariable* messDescriptor, bool isSendc, bool hasEOT) 2192 { 2193 if (exDesc->IsImmediate() && src1 == nullptr) 2194 { 2195 Send(dst, src0, (uint)exDesc->GetImmediateValue(), messDescriptor, isSendc); 2196 return; 2197 } 2198 if (dst && dst->IsUniform()) 2199 { 2200 m_encoderState.m_simdSize = m_encoderState.m_uniformSIMDSize; 2201 } 2202 unsigned char sendc = isSendc ? 1 : 0; 2203 unsigned char src0Size = src0->GetSize() / getGRFSize(); 2204 unsigned char src1Size = src1 ? src1->GetSize() / getGRFSize() : 0; 2205 unsigned char dstSize = dst ? dst->GetSize() / getGRFSize() : 0; 2206 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2207 VISA_RawOpnd* srcOpnd0 = GetRawSource(src0); 2208 VISA_RawOpnd* srcOpnd1 = GetRawSource(src1); 2209 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 2210 VISA_VectorOpnd* exMessDesc = GetUniformSource(exDesc); 2211 VISA_VectorOpnd* desc = GetUniformSource(messDescriptor); 2212 2213 V(vKernel->AppendVISAMiscRawSends( 2214 predOpnd, 2215 GetAluEMask(dst), 2216 visaExecSize(m_encoderState.m_simdSize), 2217 sendc, 2218 ffid, 2219 exMessDesc, 2220 src0Size, 2221 src1Size, // right now only one source 2222 dstSize, 2223 desc, 2224 srcOpnd0, 2225 srcOpnd1, 2226 dstOpnd, 2227 hasEOT)); 2228 } 2229 GetBTIOperand(uint bindingTableIndex)2230 VISA_StateOpndHandle* CEncoder::GetBTIOperand(uint bindingTableIndex) 2231 { 2232 IGC::e_predefSurface predDefSurface = ESURFACE_NORMAL; 2233 if (bindingTableIndex == 255) 2234 predDefSurface = ESURFACE_STATELESS; 2235 else if (bindingTableIndex == 254) 2236 predDefSurface = ESURFACE_SLM; 2237 CVariable tempImm(bindingTableIndex, ISA_TYPE_UD); 2238 return GetVISASurfaceOpnd(predDefSurface, &tempImm); 2239 } 2240 RenderTargetWrite(CVariable * var[],bool isUndefined[],bool lastRenderTarget,bool isNullRT,bool perSample,bool coarseMode,bool headerMaskFromCe0,CVariable * bindingTableIndex,CVariable * RTIndex,CVariable * source0Alpha,CVariable * oMask,CVariable * depth,CVariable * stencil,CVariable * CPSCounter,CVariable * sampleIndex,CVariable * r1Reg)2241 void CEncoder::RenderTargetWrite(CVariable* var[], 2242 bool isUndefined[], 2243 bool lastRenderTarget, 2244 bool isNullRT, 2245 bool perSample, 2246 bool coarseMode, 2247 bool headerMaskFromCe0, 2248 CVariable* bindingTableIndex, 2249 CVariable* RTIndex, 2250 CVariable* source0Alpha, 2251 CVariable* oMask, 2252 CVariable* depth, 2253 CVariable* stencil, 2254 CVariable* CPSCounter, 2255 CVariable* sampleIndex, 2256 CVariable* r1Reg) 2257 { 2258 VISA_EMask_Ctrl emask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask); 2259 VISA_Exec_Size execSize = visaExecSize(m_encoderState.m_simdSize); 2260 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2261 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex); 2262 2263 vISA_RT_CONTROLS cntrls; 2264 uint8_t numMsgSpecificOpnds = 0; 2265 VISA_RawOpnd* srcOpnd[8] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; 2266 2267 cntrls.isPerSample = perSample; 2268 cntrls.isCoarseMode = coarseMode; 2269 cntrls.isHeaderMaskfromCe0 = headerMaskFromCe0; 2270 IGC_ASSERT(!((predOpnd != nullptr) && cntrls.isHeaderMaskfromCe0)); 2271 2272 if (source0Alpha) 2273 { 2274 cntrls.s0aPresent = true; 2275 srcOpnd[numMsgSpecificOpnds++] = GetRawSource(source0Alpha); 2276 } 2277 else 2278 cntrls.s0aPresent = false; 2279 2280 if (oMask) 2281 { 2282 cntrls.oMPresent = true; 2283 srcOpnd[numMsgSpecificOpnds++] = GetRawSource(oMask); 2284 } 2285 else 2286 cntrls.oMPresent = false; 2287 2288 for (int i = 0; i < 4; i++) 2289 { 2290 if (isUndefined[i]) 2291 { 2292 V(vKernel->CreateVISANullRawOperand(srcOpnd[numMsgSpecificOpnds++], false)); 2293 } 2294 else 2295 { 2296 srcOpnd[numMsgSpecificOpnds++] = GetRawSource(var[i]); 2297 } 2298 } 2299 2300 if (depth) 2301 { 2302 cntrls.zPresent = true; 2303 srcOpnd[numMsgSpecificOpnds++] = GetRawSource(depth); 2304 } 2305 else 2306 cntrls.zPresent = false; 2307 2308 if (stencil) 2309 { 2310 cntrls.isStencil = true; 2311 srcOpnd[numMsgSpecificOpnds++] = GetRawSource(stencil); 2312 } 2313 else 2314 cntrls.isStencil = false; 2315 2316 cntrls.isSampleIndex = false; 2317 VISA_VectorOpnd* sampleIndexOpnd = NULL; 2318 if (sampleIndex) 2319 { 2320 sampleIndexOpnd = GetSourceOperandNoModifier(sampleIndex); 2321 cntrls.isSampleIndex = true; 2322 } 2323 VISA_VectorOpnd* cpsCounterOpnd = GetSourceOperandNoModifier(CPSCounter); 2324 2325 VISA_VectorOpnd* RTIndexOpnd = nullptr; 2326 cntrls.RTIndexPresent = false; 2327 // if RTIndex is 0, then no need to prepare the header for send 2328 if (!RTIndex->IsImmediate() || RTIndex->GetImmediateValue() != 0) 2329 { 2330 RTIndexOpnd = GetSourceOperandNoModifier(RTIndex); 2331 cntrls.RTIndexPresent = true; 2332 } 2333 2334 //controls last render target select bit 2335 cntrls.isLastWrite = lastRenderTarget; 2336 2337 // controls NULL render target enbale bit 2338 cntrls.isNullRT = isNullRT; 2339 2340 //r1Reg should always be populated 2341 //vISA will decide whether to use it or not. 2342 VISA_RawOpnd* r1RegOpnd = GetRawSource(r1Reg); 2343 2344 2345 if (CPSCounter) 2346 { 2347 V(vKernel->AppendVISA3dRTWriteCPS( 2348 predOpnd, 2349 emask, 2350 execSize, 2351 RTIndexOpnd, 2352 cntrls, 2353 surfOpnd, 2354 r1RegOpnd, 2355 sampleIndexOpnd, 2356 cpsCounterOpnd, 2357 numMsgSpecificOpnds, 2358 srcOpnd)); 2359 } 2360 else 2361 { 2362 V(vKernel->AppendVISA3dRTWrite( 2363 predOpnd, 2364 emask, 2365 execSize, 2366 RTIndexOpnd, 2367 cntrls, 2368 surfOpnd, 2369 r1RegOpnd, 2370 sampleIndexOpnd, 2371 numMsgSpecificOpnds, 2372 srcOpnd)); 2373 } 2374 } 2375 GetSamplerOperand(const SamplerDescriptor & sampler,bool & isIdxLT16)2376 VISA_StateOpndHandle* CEncoder::GetSamplerOperand( 2377 const SamplerDescriptor& sampler, 2378 bool& isIdxLT16) 2379 { 2380 //Sampler index 2381 VISA_VectorOpnd* dstOpnd = nullptr; 2382 VISA_SamplerVar* samplerVar = nullptr; 2383 2384 if (sampler.m_samplerType == ESAMPLER_NORMAL) 2385 { 2386 samplerVar = samplervar; 2387 2388 if (sampler.m_sampler->IsImmediate()) 2389 { 2390 uint immediate = int_cast<uint>(sampler.m_sampler->GetImmediateValue()); 2391 if (immediate < 16) 2392 { 2393 isIdxLT16 = true; 2394 } 2395 else 2396 { 2397 isIdxLT16 = false; 2398 } 2399 } 2400 else 2401 { 2402 // for dynamic index, avoid generate additional code for APIs only supporting 16 samplers 2403 if (m_program->GetContext()->m_DriverInfo.SupportMoreThan16Samplers()) 2404 { 2405 isIdxLT16 = false; 2406 } 2407 else 2408 { 2409 isIdxLT16 = true; 2410 } 2411 } 2412 } 2413 else 2414 { 2415 V(vKernel->GetBindlessSampler(samplerVar)); 2416 isIdxLT16 = true; 2417 } 2418 2419 V(vKernel->CreateVISAStateOperand(dstOpnd, samplerVar, 0, true)); 2420 2421 IGC_ASSERT(nullptr != sampler.m_sampler); 2422 IGC_ASSERT(sampler.m_sampler->IsUniform()); 2423 VISA_VectorOpnd* sourecOpnd = GetUniformSource(sampler.m_sampler); 2424 2425 //Add the mov special instruction for sampler 2426 V(vKernel->AppendVISADataMovementInst( 2427 ISA_MOVS, 2428 nullptr, 2429 false, 2430 vISA_EMASK_M1_NM, 2431 EXEC_SIZE_1, 2432 dstOpnd, 2433 sourecOpnd, 2434 nullptr)); 2435 2436 VISA_StateOpndHandle* samplerOpnd = nullptr; 2437 V(vKernel->CreateVISAStateOperandHandle(samplerOpnd, samplerVar)); 2438 return samplerOpnd; 2439 } 2440 GetSamplerOperand(CVariable * samplerIndex)2441 VISA_StateOpndHandle* CEncoder::GetSamplerOperand(CVariable* samplerIndex) 2442 { 2443 SamplerDescriptor sampler; 2444 bool isIdxLT16; 2445 sampler.m_sampler = samplerIndex; 2446 return GetSamplerOperand(sampler, isIdxLT16); 2447 } 2448 Sample(EOPCODE subOpcode,uint writeMask,CVariable * offset,const ResourceDescriptor & resource,const SamplerDescriptor & sampler,uint numSources,CVariable * dst,SmallVector<CVariable *,4> & payload,bool zeroLOD,bool cpsEnable,bool feedbackEnable,bool nonUniformState)2449 void CEncoder::Sample( 2450 EOPCODE subOpcode, 2451 uint writeMask, 2452 CVariable* offset, 2453 const ResourceDescriptor& resource, 2454 const SamplerDescriptor& sampler, 2455 uint numSources, 2456 CVariable* dst, 2457 SmallVector<CVariable*, 4>& payload, 2458 bool zeroLOD, 2459 bool cpsEnable, 2460 bool feedbackEnable, 2461 bool nonUniformState) 2462 { 2463 2464 if (!m_program->m_Platform->hasSamplerSupport()) 2465 return; 2466 2467 int numMsgSpecificOpnds = numSources; 2468 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2469 bool isIdxLT16; 2470 VISA_StateOpndHandle* samplerOpnd = GetSamplerOperand(sampler, isIdxLT16); 2471 VISA_StateOpndHandle* btiOpnd = GetVISASurfaceOpnd(resource); 2472 VISA_RawOpnd* dstVar = GetRawDestination(dst); 2473 VISA_RawOpnd* opndArray[11]; 2474 for (int i = 0; i < numMsgSpecificOpnds; i++) 2475 { 2476 opndArray[i] = GetRawSource(payload[i]); 2477 } 2478 2479 VISA_VectorOpnd* aoffimmi = GetSourceOperandNoModifier(offset); 2480 // Use bit 15 of aoffimmi to tell VISA the sample index could be greater 2481 // than 15. In this case, we need to use msg header, and setup M0.3 2482 // to point to next 16 sampler state. 2483 if (!isIdxLT16) 2484 { 2485 uint16_t aoffimmiVal = (uint16_t)offset->GetImmediateValue() | BIT(15); 2486 V(vKernel->CreateVISAImmediate(aoffimmi, &aoffimmiVal, ISA_TYPE_UW)); 2487 } 2488 2489 { 2490 int status = vKernel->AppendVISA3dSampler( 2491 ConvertSubOpcode(subOpcode, zeroLOD), 2492 feedbackEnable, // pixel null mask 2493 cpsEnable, 2494 !nonUniformState, 2495 predOpnd, 2496 GetAluEMask(dst), 2497 visaExecSize(m_encoderState.m_simdSize), 2498 ConvertChannelMaskToVisaType(writeMask), 2499 aoffimmi, 2500 samplerOpnd, 2501 btiOpnd, 2502 dstVar, 2503 numSources, 2504 opndArray); 2505 2506 V(status); 2507 } 2508 } 2509 Load(EOPCODE subOpcode,uint writeMask,CVariable * offset,const ResourceDescriptor & resource,uint numSources,CVariable * dst,SmallVector<CVariable *,4> & payload,bool zeroLOD,bool feedbackEnable)2510 void CEncoder::Load( 2511 EOPCODE subOpcode, 2512 uint writeMask, 2513 CVariable* offset, 2514 const ResourceDescriptor& resource, 2515 uint numSources, 2516 CVariable* dst, 2517 SmallVector<CVariable*, 4>& payload, 2518 bool zeroLOD, 2519 bool feedbackEnable) 2520 { 2521 2522 if (!m_program->m_Platform->hasSamplerSupport()) 2523 return; 2524 2525 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2526 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource); 2527 VISA_RawOpnd* dstVar = GetRawDestination(dst); 2528 2529 VISA_RawOpnd* opndArray[11]; 2530 for (unsigned int i = 0; i < numSources; i++) 2531 { 2532 opndArray[i] = GetRawSource(payload[i]); 2533 } 2534 2535 VISA_VectorOpnd* aoffimmi = GetSourceOperandNoModifier(offset); 2536 2537 { 2538 int status = vKernel->AppendVISA3dLoad( 2539 ConvertSubOpcode(subOpcode, zeroLOD), 2540 feedbackEnable, // pixel null mask 2541 predOpnd, 2542 GetAluEMask(dst), 2543 GetAluExecSize(dst), 2544 ConvertChannelMaskToVisaType(writeMask), 2545 aoffimmi, 2546 surfOpnd, 2547 dstVar, 2548 numSources, 2549 opndArray); 2550 2551 V(status); 2552 } 2553 } 2554 Info(EOPCODE subOpcode,uint writeMask,const ResourceDescriptor & resource,CVariable * lod,CVariable * dst)2555 void CEncoder::Info(EOPCODE subOpcode, uint writeMask, const ResourceDescriptor& resource, CVariable* lod, CVariable* dst) 2556 { 2557 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource); 2558 VISA_RawOpnd* dstVar = GetRawDestination(dst); 2559 VISA_RawOpnd* lodVar = GetRawSource(lod); 2560 2561 V(vKernel->AppendVISA3dInfo( 2562 ConvertSubOpcode(subOpcode, false), 2563 GetAluEMask(dst), 2564 GetAluExecSize(dst), 2565 ConvertChannelMaskToVisaType(writeMask), 2566 surfOpnd, 2567 lodVar, 2568 dstVar)); 2569 } 2570 Gather4Inst(EOPCODE subOpcode,CVariable * offset,const ResourceDescriptor & resource,const SamplerDescriptor & sampler,uint numSources,CVariable * dst,SmallVector<CVariable *,4> & payload,uint channel,bool feedbackEnable)2571 void CEncoder::Gather4Inst( 2572 EOPCODE subOpcode, 2573 CVariable* offset, 2574 const ResourceDescriptor& resource, 2575 const SamplerDescriptor& sampler, 2576 uint numSources, 2577 CVariable* dst, 2578 SmallVector<CVariable*, 4>& payload, 2579 uint channel, 2580 bool feedbackEnable) 2581 { 2582 2583 if (!m_program->m_Platform->hasSamplerSupport()) 2584 return; 2585 2586 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2587 bool isIdxLT16; 2588 VISA_StateOpndHandle* samplerOpnd = GetSamplerOperand(sampler, isIdxLT16); 2589 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource); 2590 VISA_RawOpnd* dstVar = GetRawDestination(dst); 2591 VISA_RawOpnd* opndArray[11]; 2592 for (unsigned int i = 0; i < numSources; i++) 2593 { 2594 opndArray[i] = GetRawSource(payload[i]); 2595 } 2596 2597 VISA_VectorOpnd* aoffimmi = GetSourceOperandNoModifier(offset); 2598 if (!isIdxLT16) 2599 { 2600 uint16_t aoffimmiVal = (uint16_t)offset->GetImmediateValue() | BIT(15); 2601 V(vKernel->CreateVISAImmediate(aoffimmi, &aoffimmiVal, ISA_TYPE_UW)); 2602 } 2603 2604 { 2605 int status = vKernel->AppendVISA3dGather4( 2606 ConvertSubOpcode(subOpcode, false), 2607 feedbackEnable, // pixel null mask 2608 predOpnd, 2609 GetAluEMask(dst), 2610 visaExecSize(m_encoderState.m_simdSize), 2611 ConvertSingleSourceChannel(channel), 2612 aoffimmi, 2613 samplerOpnd, 2614 surfOpnd, 2615 dstVar, 2616 numSources, 2617 opndArray); 2618 2619 V(status); 2620 } 2621 } 2622 AddrAdd(CVariable * dst,CVariable * src0,CVariable * src1)2623 void CEncoder::AddrAdd(CVariable* dst, CVariable* src0, CVariable* src1) 2624 { 2625 // On ICL+ platforms address register must be initialized if it is used 2626 // in VxH indirect addressing to avoid out-of-bounds access on inactive 2627 // lanes. VISA initializes address register at the beginning of the 2628 // shader which is sufficient for shaders that use address register only 2629 // for indirect addressing but is not sufficient if shader also uses 2630 // address register in send descriptors. The latter case is handled by 2631 // the initialization below. 2632 // see VISA Optimizer::resetA0() 2633 const bool mayUseA0InSendDesc = 2634 m_program->GetContext()->m_instrTypes.mayHaveIndirectResources; 2635 const bool needsA0Reset = 2636 m_program->m_Platform->NeedResetA0forVxHA0(); 2637 2638 if (((mayUseA0InSendDesc && needsA0Reset) || 2639 IGC_IS_FLAG_ENABLED(InitializeAddressRegistersBeforeUse)) && 2640 !dst->IsUniform() && 2641 !m_encoderState.m_noMask) 2642 { 2643 m_encoderState.m_noMask = true; 2644 VISA_VectorOpnd* srcOpnd = nullptr; 2645 VISA_VectorOpnd* dstOpnd = nullptr; 2646 const DWORD zero = 0; 2647 V(vKernel->CreateVISAImmediate(srcOpnd, &zero, ISA_TYPE_UW)); 2648 V(vKernel->CreateVISAAddressDstOperand(dstOpnd, dst->visaAddrVariable, 0)); 2649 V(vKernel->AppendVISADataMovementInst( 2650 ISA_MOV, 2651 nullptr, 2652 false, 2653 GetAluEMask(dst), 2654 visaExecSize(m_encoderState.m_simdSize), 2655 dstOpnd, 2656 srcOpnd)); 2657 m_encoderState.m_noMask = false; 2658 } 2659 2660 if (dst->IsUniform()) 2661 { 2662 m_encoderState.m_simdSize = SIMDMode::SIMD1; 2663 m_encoderState.m_noMask = true; 2664 } 2665 VISA_VectorOpnd* pSrc1Opnd = GetSourceOperand(src1, m_encoderState.m_srcOperand[1]); 2666 VISA_VectorOpnd* pSrc0Addr = nullptr; 2667 V(vKernel->CreateVISAAddressOfOperand(pSrc0Addr, GetVISAVariable(src0), src0->GetAliasOffset())); 2668 VISA_VectorOpnd* pVectorOpnd = nullptr; 2669 V(vKernel->CreateVISAAddressDstOperand(pVectorOpnd, dst->visaAddrVariable, 0)); 2670 2671 V(vKernel->AppendVISAAddrAddInst( 2672 GetAluEMask(dst), 2673 visaExecSize(m_encoderState.m_simdSize), 2674 pVectorOpnd, 2675 pSrc0Addr, 2676 pSrc1Opnd)); 2677 } 2678 Barrier(e_barrierKind BarrierKind)2679 void CEncoder::Barrier(e_barrierKind BarrierKind) 2680 { 2681 if (BarrierKind == EBARRIER_SIGNAL) { 2682 // signal only 2683 V(vKernel->AppendVISASplitBarrierInst(true)); 2684 return; 2685 } 2686 if (BarrierKind == EBARRIER_WAIT) { 2687 // wait only 2688 V(vKernel->AppendVISASplitBarrierInst(false)); 2689 return; 2690 } 2691 V(vKernel->AppendVISASyncInst(ISA_BARRIER)); 2692 } 2693 Fence(bool CommitEnable,bool L3_Flush_RW_Data,bool L3_Flush_Constant_Data,bool L3_Flush_Texture_Data,bool L3_Flush_Instructions,bool Global_Mem_Fence,bool L1_Flush_Constant_Data,bool SWFence)2694 void CEncoder::Fence(bool CommitEnable, 2695 bool L3_Flush_RW_Data, 2696 bool L3_Flush_Constant_Data, 2697 bool L3_Flush_Texture_Data, 2698 bool L3_Flush_Instructions, 2699 bool Global_Mem_Fence, 2700 bool L1_Flush_Constant_Data, 2701 bool SWFence) // if true no ISA is emitted and the instruction is a pure code barrier 2702 { 2703 // Only a single bit set here is a valid configuration 2704 IGC_ASSERT((L3_Flush_Instructions + L3_Flush_Texture_Data + L3_Flush_Constant_Data + L3_Flush_RW_Data) <= 1); 2705 2706 uint fenceFlags = (L3_Flush_Instructions << 1) | 2707 (L3_Flush_Texture_Data << 2) | 2708 (L3_Flush_Constant_Data << 3) | 2709 (L3_Flush_RW_Data << 4) | 2710 ((!Global_Mem_Fence) << 5) | // bit 5: 1 -- local, 0 -- global 2711 (L1_Flush_Constant_Data << 6) | 2712 (SWFence << 7) | 2713 (CommitEnable << 0); 2714 2715 V(vKernel->AppendVISASyncInst(ISA_FENCE, int_cast<unsigned char>(fenceFlags))); 2716 } 2717 FlushSamplerCache()2718 void CEncoder::FlushSamplerCache() 2719 { 2720 V(vKernel->AppendVISASyncInst(ISA_SAMPLR_CACHE_FLUSH)); 2721 } 2722 EOT()2723 void CEncoder::EOT() 2724 { 2725 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 2726 V(vKernel->AppendVISACFRetInst(predOpnd, vISA_EMASK_M1, EXEC_SIZE_1)); 2727 } 2728 2729 // Init Control register for denorm modes, rounding modes, etc. initCR(VISAKernel * vKernel)2730 void CEncoder::initCR(VISAKernel* vKernel) 2731 { 2732 // Those bits must be zero'ed on entry to kernel/shader. 2733 // (If not, this function needs to be changed accordingly.) 2734 VISA_VectorOpnd* src0_Opnd = nullptr; 2735 VISA_VectorOpnd* src1_Opnd = nullptr; 2736 VISA_VectorOpnd* dst_Opnd = nullptr; 2737 VISA_GenVar* cr0_var = nullptr; 2738 uint imm_data = 0; 2739 2740 CodeGenContext* pCtx = m_program->GetContext(); 2741 if (pCtx->m_floatDenormMode16 == FLOAT_DENORM_RETAIN) 2742 imm_data |= 0x400; 2743 if (pCtx->m_floatDenormMode32 == FLOAT_DENORM_RETAIN) 2744 imm_data |= 0x80; 2745 if (pCtx->m_floatDenormMode64 == FLOAT_DENORM_RETAIN) 2746 imm_data |= 0x40; 2747 2748 uint RM_bits = 0; 2749 ERoundingMode RM_FPCvtInt = static_cast<ERoundingMode>(pCtx->getModuleMetaData()->compOpt.FloatCvtIntRoundingMode); 2750 ERoundingMode RM_FP = static_cast<ERoundingMode>(pCtx->getModuleMetaData()->compOpt.FloatRoundingMode); 2751 if (RM_FPCvtInt == ERoundingMode::ROUND_TO_ZERO) { 2752 // No need to set FPCvtInt, just need to set FP RM. 2753 RM_bits = getEncoderRoundingMode_FP(RM_FP); 2754 } 2755 else if (RM_FPCvtInt == RM_FP) { 2756 // Setting FPCvtInt will set both FPCvtInt and FP 2757 RM_bits = getEncoderRoundingMode_FPCvtInt(RM_FPCvtInt); 2758 } 2759 else { 2760 IGC_ASSERT_MESSAGE(0, "Unsupport combination of default rounding mode (FP and FPCvtInt)!"); 2761 } 2762 imm_data |= RM_bits; 2763 2764 // If we are in the default mode no need to set the CR 2765 if (imm_data != 0) 2766 { 2767 V(vKernel->GetPredefinedVar(cr0_var, PREDEFINED_CR0)); 2768 V(vKernel->CreateVISASrcOperand(src0_Opnd, cr0_var, MODIFIER_NONE, 0, 1, 0, 0, 0)); 2769 V(vKernel->CreateVISAImmediate(src1_Opnd, &imm_data, ISA_TYPE_UD)); 2770 V(vKernel->CreateVISADstOperand(dst_Opnd, cr0_var, 1, 0, 0)); 2771 V(vKernel->AppendVISAArithmeticInst( 2772 ISA_OR, 2773 nullptr, 2774 false, 2775 vISA_EMASK_M1_NM, 2776 EXEC_SIZE_1, 2777 dst_Opnd, 2778 src0_Opnd, 2779 src1_Opnd)); 2780 } 2781 } 2782 SetVectorMask(bool VMask)2783 void CEncoder::SetVectorMask(bool VMask) 2784 { 2785 VISA_VectorOpnd* src0_Opnd = nullptr; 2786 VISA_VectorOpnd* src1_Opnd = nullptr; 2787 VISA_VectorOpnd* dst_Opnd = nullptr; 2788 VISA_GenVar* cr0_var = nullptr; 2789 uint bitmaskImm = 1 << 3; 2790 if (!VMask) 2791 { 2792 bitmaskImm = ~bitmaskImm; 2793 } 2794 V(vKernel->GetPredefinedVar(cr0_var, PREDEFINED_CR0)); 2795 V(vKernel->CreateVISASrcOperand(src0_Opnd, cr0_var, MODIFIER_NONE, 0, 1, 0, 0, 0)); 2796 V(vKernel->CreateVISAImmediate(src1_Opnd, &bitmaskImm, ISA_TYPE_UD)); 2797 V(vKernel->CreateVISADstOperand(dst_Opnd, cr0_var, 1, 0, 0)); 2798 V(vKernel->AppendVISAArithmeticInst( 2799 VMask ? ISA_OR : ISA_AND, 2800 nullptr, 2801 false, 2802 vISA_EMASK_M1_NM, 2803 EXEC_SIZE_1, 2804 dst_Opnd, 2805 src0_Opnd, 2806 src1_Opnd)); 2807 } 2808 SetRoundingMode_FP(ERoundingMode actualRM,ERoundingMode newRM)2809 void CEncoder::SetRoundingMode_FP(ERoundingMode actualRM, ERoundingMode newRM) 2810 { 2811 IGC_ASSERT_MESSAGE(newRM != ERoundingMode::ROUND_TO_ANY, "Invalid rounding mode"); 2812 if (actualRM != newRM) 2813 { 2814 RMEncoding actualRM_en = getEncoderRoundingMode_FP(actualRM); 2815 RMEncoding newRM_en = getEncoderRoundingMode_FP(newRM); 2816 SetRoundingMode(actualRM_en, newRM_en); 2817 } 2818 } 2819 SetRoundingMode_FPCvtInt(ERoundingMode actualRM,ERoundingMode newRM)2820 void CEncoder::SetRoundingMode_FPCvtInt(ERoundingMode actualRM, ERoundingMode newRM) 2821 { 2822 IGC_ASSERT_MESSAGE(newRM != ERoundingMode::ROUND_TO_ANY, "Invalid rounding mode"); 2823 if (actualRM != newRM) 2824 { 2825 RMEncoding actualRM_en = getEncoderRoundingMode_FPCvtInt(actualRM); 2826 RMEncoding newRM_en = getEncoderRoundingMode_FPCvtInt(newRM); 2827 SetRoundingMode(actualRM_en, newRM_en); 2828 } 2829 } 2830 2831 // Set rounding mode based on given encoding. SetRoundingMode(RMEncoding actualRM,RMEncoding newRM)2832 void CEncoder::SetRoundingMode(RMEncoding actualRM, RMEncoding newRM) 2833 { 2834 IGC_ASSERT_MESSAGE((actualRM != newRM), "Only setting RM if the new RM is different from the current RM!"); 2835 2836 VISA_VectorOpnd* src0_Opnd = nullptr; 2837 VISA_VectorOpnd* src1_Opnd = nullptr; 2838 VISA_VectorOpnd* dst_Opnd = nullptr; 2839 VISA_GenVar* cr0_var = nullptr; 2840 uint roundingMode = actualRM ^ newRM; 2841 IGC_ASSERT(nullptr != vKernel); 2842 V(vKernel->GetPredefinedVar(cr0_var, PREDEFINED_CR0)); 2843 V(vKernel->CreateVISASrcOperand(src0_Opnd, cr0_var, MODIFIER_NONE, 0, 1, 0, 0, 0)); 2844 V(vKernel->CreateVISAImmediate(src1_Opnd, &roundingMode, ISA_TYPE_UD)); 2845 V(vKernel->CreateVISADstOperand(dst_Opnd, cr0_var, 1, 0, 0)); 2846 V(vKernel->AppendVISAArithmeticInst( 2847 ISA_XOR, 2848 nullptr, 2849 false, 2850 vISA_EMASK_M1_NM, 2851 EXEC_SIZE_1, 2852 dst_Opnd, 2853 src0_Opnd, 2854 src1_Opnd)); 2855 } 2856 getEncoderRoundingMode_FP(ERoundingMode FP_RM)2857 CEncoder::RMEncoding CEncoder::getEncoderRoundingMode_FP(ERoundingMode FP_RM) 2858 { 2859 switch (FP_RM) { 2860 default: 2861 break; 2862 case ROUND_TO_POSITIVE: 2863 return RMEncoding::RoundToPositive; 2864 case ROUND_TO_NEGATIVE: 2865 return RMEncoding::RoundToNegative; 2866 case ROUND_TO_ZERO: 2867 return RMEncoding::RoundToZero; 2868 } 2869 return RMEncoding::RoundToNearestEven; 2870 } 2871 getEncoderRoundingMode_FPCvtInt(ERoundingMode FCvtI_RM)2872 CEncoder::RMEncoding CEncoder::getEncoderRoundingMode_FPCvtInt(ERoundingMode FCvtI_RM) 2873 { 2874 switch (FCvtI_RM) { 2875 default: 2876 break; 2877 case ROUND_TO_NEAREST_EVEN: 2878 return RMEncoding::RoundToNearestEven_int; 2879 case ROUND_TO_POSITIVE: 2880 return RMEncoding::RoundToPositive_int; 2881 case ROUND_TO_NEGATIVE: 2882 return RMEncoding::RoundToNegative_int; 2883 } 2884 return RMEncoding::RoundToZero_int; 2885 } 2886 GetLabel(uint label)2887 VISA_LabelOpnd* CEncoder::GetLabel(uint label) 2888 { 2889 VISA_LabelOpnd* visaLabel = labelMap[label]; 2890 if (visaLabel == nullptr) 2891 { 2892 // all blocks should have labels; but new blocks inserted during 2893 // encoding might not 2894 VISA_Label_Kind kind = LABEL_BLOCK; 2895 2896 std::stringstream lbl; 2897 if (labelNameMap[label].empty()) { 2898 lbl << CreateShortLabel(labelCounter++); 2899 } else { 2900 lbl << labelNameMap[label].getVisaCString(); 2901 } 2902 V(vKernel->CreateVISALabelVar(visaLabel, lbl.str().c_str(), kind)); 2903 labelMap[label] = visaLabel; 2904 } 2905 return visaLabel; 2906 } 2907 GetStackFunction(llvm::Function * F)2908 VISAFunction* CEncoder::GetStackFunction(llvm::Function* F) 2909 { 2910 auto Iter = stackFuncMap.find(F); 2911 if (Iter != stackFuncMap.end()) 2912 { 2913 return Iter->second; 2914 } 2915 VISAFunction* visaFunc = nullptr; 2916 V(vbuilder->AddFunction(visaFunc, F->getName().data())); 2917 stackFuncMap[F] = visaFunc; 2918 return visaFunc; 2919 } 2920 GetFuncLabel(llvm::Function * F)2921 VISA_LabelOpnd* CEncoder::GetFuncLabel(llvm::Function* F) 2922 { 2923 auto Iter = funcLabelMap.find(F); 2924 if (Iter != funcLabelMap.end()) 2925 { 2926 return Iter->second; 2927 } 2928 2929 // Create a new function label. 2930 VISA_LabelOpnd* visaLabel = nullptr; 2931 V(vKernel->CreateVISALabelVar(visaLabel, F->getName().data(), LABEL_SUBROUTINE)); 2932 funcLabelMap[F] = visaLabel; 2933 2934 return visaLabel; 2935 } 2936 Push()2937 void CEncoder::Push() 2938 { 2939 Init(); 2940 } 2941 GetUniformSource(CVariable * var)2942 VISA_VectorOpnd* CEncoder::GetUniformSource(CVariable* var) 2943 { 2944 VISA_VectorOpnd* srcOperand = nullptr; 2945 if (var == nullptr) 2946 { 2947 return nullptr; 2948 } 2949 if (var->IsImmediate()) 2950 { 2951 // TODO: need support for 64 bits immediate 2952 uint immediate = int_cast<uint>(var->GetImmediateValue()); 2953 V(vKernel->CreateVISAImmediate(srcOperand, &immediate, ISA_TYPE_UD)); 2954 } 2955 else 2956 { 2957 unsigned char rowOffset = 0; 2958 unsigned char colOffset = 0; 2959 GetRowAndColOffset(var, 0, 0, rowOffset, colOffset); 2960 V(vKernel->CreateVISASrcOperand(srcOperand, GetVISAVariable(var), MODIFIER_NONE, 0, 1, 0, rowOffset, colOffset)); 2961 } 2962 return srcOperand; 2963 } 2964 GetVISAPlatform(const CPlatform * platform)2965 TARGET_PLATFORM GetVISAPlatform(const CPlatform* platform) 2966 { 2967 switch (platform->GetPlatformFamily()) 2968 { 2969 case IGFX_GEN8_CORE: 2970 if (platform->getPlatformInfo().eProductFamily == IGFX_CHERRYVIEW) 2971 { 2972 return GENX_CHV; 2973 } 2974 else 2975 { 2976 return GENX_BDW; 2977 } 2978 // fall-through 2979 case IGFX_GEN9_CORE: 2980 case IGFX_GENNEXT_CORE: 2981 if (platform->getPlatformInfo().eProductFamily == IGFX_BROXTON || 2982 platform->getPlatformInfo().eProductFamily == IGFX_GEMINILAKE) 2983 { 2984 return GENX_BXT; 2985 } 2986 else 2987 { 2988 return GENX_SKL; 2989 } 2990 // fall-through 2991 case IGFX_GEN11_CORE: 2992 return GENX_ICLLP; 2993 case IGFX_GEN12_CORE: 2994 case IGFX_XE_HP_CORE: 2995 case IGFX_GEN12LP_CORE: 2996 if ( platform->getPlatformInfo().eProductFamily == IGFX_TIGERLAKE_LP 2997 || platform->getPlatformInfo().eProductFamily == IGFX_DG1 2998 || platform->getPlatformInfo().eProductFamily == IGFX_ROCKETLAKE 2999 || platform->getPlatformInfo().eProductFamily == IGFX_ALDERLAKE_S 3000 || platform->getPlatformInfo().eProductFamily == IGFX_ALDERLAKE_P 3001 ) 3002 { 3003 return GENX_TGLLP; 3004 } 3005 else if (platform->getPlatformInfo().eProductFamily == IGFX_XE_HP_SDV) 3006 { 3007 return XeHP_SDV; 3008 } 3009 // fall-through 3010 default: 3011 IGC_ASSERT_MESSAGE(0, "unsupported platform"); 3012 break; 3013 } 3014 return GENX_SKL; 3015 } 3016 OWLoad(CVariable * dst,const ResourceDescriptor & resource,CVariable * src0,bool owordAligned,uint bytesToBeRead,uint dstOffset)3017 void CEncoder::OWLoad(CVariable* dst, const ResourceDescriptor& resource, CVariable* src0, bool owordAligned, uint bytesToBeRead, uint dstOffset) 3018 { 3019 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(resource); 3020 VISA_VectorOpnd* offset = GetUniformSource(src0); 3021 VISA_RawOpnd* dstVar = GetRawDestination(dst, dstOffset); 3022 uint size = (bytesToBeRead / SIZE_OWORD); 3023 3024 V(vKernel->AppendVISASurfAccessOwordLoadStoreInst( 3025 owordAligned ? ISA_OWORD_LD : ISA_OWORD_LD_UNALIGNED, 3026 vISA_EMASK_M1_NM, // OWord load is always nomask 3027 surfOpnd, 3028 ConvertSizeToVisaType(size), 3029 offset, 3030 dstVar)); 3031 } 3032 OWStore(CVariable * data,e_predefSurface surfaceType,CVariable * bufId,CVariable * src0,uint bytesToBeRead,uint srcOffset)3033 void CEncoder::OWStore(CVariable* data, e_predefSurface surfaceType, CVariable* bufId, CVariable* src0, uint bytesToBeRead, uint srcOffset) 3034 { 3035 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(surfaceType, bufId); 3036 VISA_VectorOpnd* offset = GetUniformSource(src0); 3037 VISA_RawOpnd* dataVar = GetRawSource(data, srcOffset); 3038 uint size = (bytesToBeRead / SIZE_OWORD); 3039 3040 V(vKernel->AppendVISASurfAccessOwordLoadStoreInst( 3041 ISA_OWORD_ST, 3042 vISA_EMASK_M1_NM, 3043 surfOpnd, 3044 ConvertSizeToVisaType(size), 3045 offset, 3046 dataVar)); 3047 if (ESURFACE_STATELESS == surfaceType) 3048 { 3049 this->m_program->IncStatelessWritesCount(); 3050 } 3051 } 3052 OWStoreA64(CVariable * data,CVariable * src0,uint bytesToBeRead,uint srcOffset)3053 void CEncoder::OWStoreA64(CVariable* data, CVariable* src0, uint bytesToBeRead, uint srcOffset) 3054 { 3055 VISA_VectorOpnd* offset = GetUniformSource(src0); 3056 VISA_RawOpnd* dataVar = GetRawDestination(data, srcOffset); 3057 uint size = (bytesToBeRead / SIZE_OWORD); 3058 3059 V(vKernel->AppendVISASvmBlockStoreInst( 3060 ConvertSizeToVisaType(size), 3061 true, // always unaligned for now 3062 offset, 3063 dataVar)); 3064 } 3065 OWLoadA64(CVariable * dst,CVariable * src0,uint bytesToBeRead,uint dstOffset)3066 void CEncoder::OWLoadA64(CVariable* dst, CVariable* src0, uint bytesToBeRead, uint dstOffset) 3067 { 3068 VISA_VectorOpnd* offset = GetUniformSource(src0); 3069 VISA_RawOpnd* dstVar = GetRawDestination(dst, dstOffset); 3070 uint size = (bytesToBeRead / SIZE_OWORD); 3071 3072 V(vKernel->AppendVISASvmBlockLoadInst( 3073 ConvertSizeToVisaType(size), 3074 true, // always unaligned for now 3075 offset, 3076 dstVar)); 3077 } 3078 MediaBlockMessage(ISA_Opcode subOpcode,CVariable * dst,e_predefSurface surfaceType,CVariable * bufId,CVariable * xOffset,CVariable * yOffset,uint modifier,unsigned char blockWidth,unsigned char blockHeight,uint plane)3079 void CEncoder::MediaBlockMessage( 3080 ISA_Opcode subOpcode, 3081 CVariable* dst, 3082 e_predefSurface surfaceType, 3083 CVariable* bufId, 3084 CVariable* xOffset, 3085 CVariable* yOffset, 3086 uint modifier, 3087 unsigned char blockWidth, 3088 unsigned char blockHeight, 3089 uint plane) 3090 { 3091 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(surfaceType, bufId); 3092 VISA_VectorOpnd* xVar = GetUniformSource(xOffset); 3093 VISA_VectorOpnd* yVar = GetUniformSource(yOffset); 3094 VISA_RawOpnd* tempVar = nullptr; 3095 if (subOpcode == ISA_MEDIA_LD) 3096 { 3097 tempVar = GetRawDestination(dst); 3098 } 3099 else if (subOpcode == ISA_MEDIA_ST) 3100 { 3101 tempVar = GetRawSource(dst); 3102 } 3103 3104 MEDIA_LD_mod modi = (MEDIA_LD_mod)modifier; 3105 CISA_PLANE_ID planeVar = (CISA_PLANE_ID)plane; 3106 3107 V(vKernel->AppendVISASurfAccessMediaLoadStoreInst( 3108 subOpcode, 3109 modi, 3110 surfOpnd, 3111 blockWidth, 3112 blockHeight, 3113 xVar, 3114 yVar, 3115 tempVar, 3116 planeVar)); 3117 } 3118 TypedReadWrite(ISA_Opcode opcode,const ResourceDescriptor & resource,CVariable * pU,CVariable * pV,CVariable * pR,CVariable * pLOD,CVariable * pSrcDst,uint writeMask)3119 void CEncoder::TypedReadWrite( 3120 ISA_Opcode opcode, 3121 const ResourceDescriptor& resource, 3122 CVariable* pU, 3123 CVariable* pV, 3124 CVariable* pR, 3125 CVariable* pLOD, 3126 CVariable* pSrcDst, 3127 uint writeMask) 3128 { 3129 // only SIMD 8 reads & writes are supported. 3130 VISAChannelMask channelMask = CHANNEL_MASK_RGBA;//for typed write leaving this as before 3131 if (writeMask != 0) 3132 { 3133 channelMask = ConvertChannelMaskToVisaType(writeMask); 3134 } 3135 VISA_StateOpndHandle* pSurfStateOpndHandle = GetVISASurfaceOpnd(resource); 3136 3137 // TODO unify the way we calculate offset for raw sources, maybe we shouldn't use offset at all 3138 VISA_RawOpnd* pUOffset = GetRawSource(pU, m_encoderState.m_srcOperand[0].subVar * getGRFSize()); 3139 VISA_RawOpnd* pVOffset = GetRawSource(pV, m_encoderState.m_srcOperand[1].subVar * getGRFSize()); 3140 VISA_RawOpnd* pROffset = GetRawSource(pR, m_encoderState.m_srcOperand[2].subVar * getGRFSize()); 3141 VISA_RawOpnd* pLODOffset = GetRawSource(pLOD, m_encoderState.m_srcOperand[3].subVar * getGRFSize()); 3142 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 3143 IGC_ASSERT(0 == m_encoderState.m_dstOperand.subVar); 3144 3145 VISA_RawOpnd* pDstVar = nullptr; 3146 VISA_EMask_Ctrl mask; 3147 if (opcode == ISA_SCATTER4_TYPED) 3148 { 3149 pDstVar = GetRawSource(pSrcDst, 0); 3150 mask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask); 3151 } 3152 else 3153 { 3154 pDstVar = GetRawDestination(pSrcDst); 3155 mask = GetAluEMask(pSrcDst); 3156 } 3157 3158 V(vKernel->AppendVISASurfAccessGather4Scatter4TypedInst( 3159 opcode, 3160 predOpnd, 3161 channelMask, 3162 mask, 3163 visaExecSize(m_encoderState.m_simdSize), 3164 pSurfStateOpndHandle, 3165 pUOffset, 3166 pVOffset, 3167 pROffset, 3168 pLODOffset, 3169 pDstVar)); 3170 } 3171 ScatterGather(ISA_Opcode opcode,CVariable * srcdst,CVariable * bufId,CVariable * offset,CVariable * gOffset,e_predefSurface surface,int elementSize)3172 void CEncoder::ScatterGather(ISA_Opcode opcode, CVariable* srcdst, CVariable* bufId, CVariable* offset, CVariable* gOffset, e_predefSurface surface, int elementSize) 3173 { 3174 VISA_VectorOpnd* globalOffsetOpnd = nullptr; 3175 VISA_StateOpndHandle* surfOpnd = GetVISASurfaceOpnd(surface, bufId); 3176 if (gOffset) 3177 { 3178 globalOffsetOpnd = GetUniformSource(gOffset); 3179 } 3180 else 3181 { 3182 int value = 0; 3183 V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &value, ISA_TYPE_UD)); 3184 } 3185 VISA_RawOpnd* elementOffset = GetRawSource(offset); 3186 3187 VISA_RawOpnd* dstVar = NULL; 3188 3189 VISA_EMask_Ctrl mask; 3190 if (opcode == ISA_GATHER) 3191 { 3192 dstVar = GetRawDestination(srcdst); 3193 mask = GetAluEMask(srcdst); 3194 } 3195 else 3196 { 3197 dstVar = GetRawSource(srcdst); 3198 mask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask); 3199 } 3200 3201 V(vKernel->AppendVISASurfAccessGatherScatterInst( 3202 opcode, 3203 mask, 3204 visaElementSize(elementSize), 3205 visaExecSize(m_encoderState.m_simdSize), 3206 surfOpnd, 3207 globalOffsetOpnd, 3208 elementOffset, 3209 dstVar)); 3210 if (ISA_SCATTER == opcode && ESURFACE_STATELESS == surface) 3211 { 3212 this->m_program->IncStatelessWritesCount(); 3213 } 3214 } 3215 GenericAlu(e_opcode opcode,CVariable * dst,CVariable * src0,CVariable * src1,CVariable * src2)3216 void CEncoder::GenericAlu(e_opcode opcode, CVariable* dst, CVariable* src0, CVariable* src1, CVariable* src2) 3217 { 3218 ISA_Opcode visaOpcode = ConvertOpcode[opcode]; 3219 switch (visaOpcode) 3220 { 3221 case ISA_MOV: 3222 case ISA_MOVS: 3223 case ISA_SETP: 3224 DataMov(visaOpcode, dst, src0); 3225 break; 3226 case ISA_FMINMAX: 3227 MinMax(opcode == EOPCODE_MIN ? CISA_DM_FMIN : CISA_DM_FMAX, dst, src0, src1); 3228 break; 3229 case ISA_AND: 3230 case ISA_ASR: 3231 case ISA_CBIT: 3232 case ISA_FBH: 3233 case ISA_FBL: 3234 case ISA_NOT: 3235 case ISA_OR: 3236 case ISA_SHL: 3237 case ISA_SHR: 3238 case ISA_ROL: 3239 case ISA_ROR: 3240 case ISA_XOR: 3241 LogicOp(visaOpcode, dst, src0, src1, src2); 3242 break; 3243 default: 3244 Arithmetic(visaOpcode, dst, src0, src1, src2); 3245 break; 3246 } 3247 } 3248 GetVISASurfaceOpnd(const ResourceDescriptor & resource)3249 VISA_StateOpndHandle* CEncoder::GetVISASurfaceOpnd(const ResourceDescriptor& resource) 3250 { 3251 return GetVISASurfaceOpnd(resource.m_surfaceType, resource.m_resource); 3252 } 3253 GetVISASurfaceOpnd(e_predefSurface surfaceType,CVariable * bti)3254 VISA_StateOpndHandle* CEncoder::GetVISASurfaceOpnd(e_predefSurface surfaceType, CVariable* bti) 3255 { 3256 VISA_StateOpndHandle* surfOpnd = nullptr; 3257 if (surfaceType == ESURFACE_NORMAL || surfaceType == ESURFACE_BINDLESS || surfaceType == ESURFACE_SSHBINDLESS) 3258 { 3259 VISA_SurfaceVar* surfacevar = nullptr; 3260 if (surfaceType == ESURFACE_BINDLESS) 3261 { 3262 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_T252)); 3263 } 3264 else 3265 { 3266 surfacevar = dummySurface; 3267 } 3268 VISA_VectorOpnd* sourecOpnd = GetUniformSource(bti); 3269 VISA_VectorOpnd* dstOpnd = nullptr; 3270 V(vKernel->CreateVISAStateOperand(dstOpnd, surfacevar, 0, true)); 3271 3272 //Add the mov special instruction 3273 V(vKernel->AppendVISADataMovementInst( 3274 ISA_MOVS, 3275 nullptr, 3276 false, 3277 vISA_EMASK_M1_NM, 3278 EXEC_SIZE_1, 3279 dstOpnd, 3280 sourecOpnd, 3281 nullptr)); 3282 3283 V(vKernel->CreateVISAStateOperandHandle(surfOpnd, surfacevar)); 3284 } 3285 else 3286 { 3287 VISA_SurfaceVar* surfacevar = NULL; 3288 switch (surfaceType) 3289 { 3290 case ESURFACE_SLM: 3291 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_SLM)); 3292 break; 3293 case ESURFACE_STATELESS: 3294 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_T255)); 3295 break; 3296 case ESURFACE_SCRATCH: 3297 // NOTE: For scratch surface, we need to shr the surface state offset coming in R0.5 by 4. 3298 // This shr operation is generated by vISA in HDC path 3299 V(vKernel->GetPredefinedSurface(surfacevar, PREDEFINED_SURFACE_SCRATCH)); 3300 break; 3301 default: 3302 IGC_ASSERT_MESSAGE(0, "Invalid surface"); 3303 break; 3304 } 3305 V(vKernel->CreateVISAStateOperandHandle(surfOpnd, surfacevar)); 3306 } 3307 return surfOpnd; 3308 } 3309 ConvertMaskToVisaType(e_mask mask,bool noMask)3310 VISA_EMask_Ctrl CEncoder::ConvertMaskToVisaType(e_mask mask, bool noMask) 3311 { 3312 VISA_EMask_Ctrl emaskRet = vISA_EMASK_M1_NM; 3313 switch (mask) 3314 { 3315 case EMASK_Q1: 3316 if (m_encoderState.m_secondHalf) 3317 { 3318 emaskRet = noMask ? vISA_EMASK_M5_NM : vISA_EMASK_M5; 3319 } 3320 else 3321 { 3322 emaskRet = noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 3323 } 3324 break; 3325 case EMASK_Q2: 3326 if (m_encoderState.m_secondHalf) 3327 { 3328 emaskRet = noMask ? vISA_EMASK_M7_NM : vISA_EMASK_M7; 3329 } 3330 else 3331 { 3332 emaskRet = noMask ? vISA_EMASK_M3_NM : vISA_EMASK_M3; 3333 } 3334 break; 3335 case EMASK_Q3: 3336 emaskRet = noMask ? vISA_EMASK_M5_NM : vISA_EMASK_M5; 3337 break; 3338 case EMASK_Q4: 3339 emaskRet = noMask ? vISA_EMASK_M7_NM : vISA_EMASK_M7; 3340 break; 3341 case EMASK_H1: 3342 emaskRet = noMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; 3343 break; 3344 case EMASK_H2: 3345 emaskRet = noMask ? vISA_EMASK_M5_NM : vISA_EMASK_M5; 3346 break; 3347 default: 3348 IGC_ASSERT_MESSAGE(0, "unreachable"); 3349 emaskRet = vISA_EMASK_M1_NM; 3350 } 3351 3352 if (!m_encoderState.m_secondNibble) 3353 return emaskRet; 3354 3355 switch (emaskRet) { 3356 case vISA_EMASK_M1: return vISA_EMASK_M2; 3357 case vISA_EMASK_M1_NM: return vISA_EMASK_M2_NM; 3358 case vISA_EMASK_M3: return vISA_EMASK_M4; 3359 case vISA_EMASK_M3_NM: return vISA_EMASK_M4_NM; 3360 case vISA_EMASK_M5: return vISA_EMASK_M6; 3361 case vISA_EMASK_M5_NM: return vISA_EMASK_M6_NM; 3362 case vISA_EMASK_M7: return vISA_EMASK_M8; 3363 case vISA_EMASK_M7_NM: return vISA_EMASK_M8_NM; 3364 default: 3365 IGC_ASSERT_MESSAGE(0, "unreachable"); 3366 return vISA_EMASK_M1_NM; 3367 } 3368 return vISA_EMASK_M1_NM; 3369 } 3370 ConvertModifierToVisaType(e_modifier modifier)3371 VISA_Modifier ConvertModifierToVisaType(e_modifier modifier) 3372 { 3373 switch (modifier) 3374 { 3375 case EMOD_NONE: 3376 return MODIFIER_NONE; 3377 case EMOD_SAT: 3378 return MODIFIER_SAT; 3379 case EMOD_ABS: 3380 return MODIFIER_ABS; 3381 case EMOD_NEG: 3382 return MODIFIER_NEG; 3383 case EMOD_NEGABS: 3384 return MODIFIER_NEG_ABS; 3385 case EMOD_NOT: 3386 return MODIFIER_NOT; 3387 default: 3388 IGC_ASSERT_MESSAGE(0, "unreachable"); 3389 return MODIFIER_NONE; 3390 } 3391 } 3392 ConvertCondModToVisaType(e_predicate condMod)3393 VISA_Cond_Mod ConvertCondModToVisaType(e_predicate condMod) 3394 { 3395 switch (condMod) 3396 { 3397 case EPREDICATE_EQ: 3398 return ISA_CMP_E; 3399 case EPREDICATE_NE: 3400 return ISA_CMP_NE; 3401 case EPREDICATE_GT: 3402 return ISA_CMP_G; 3403 case EPREDICATE_GE: 3404 return ISA_CMP_GE; 3405 case EPREDICATE_LT: 3406 return ISA_CMP_L; 3407 case EPREDICATE_LE: 3408 return ISA_CMP_LE; 3409 default: 3410 IGC_ASSERT_MESSAGE(0, "unreachable"); 3411 return ISA_CMP_UNDEF; 3412 } 3413 } 3414 ConvertSizeToVisaType(uint size)3415 VISA_Oword_Num ConvertSizeToVisaType(uint size) 3416 { 3417 switch (size) 3418 { 3419 case 1: 3420 return OWORD_NUM_1; 3421 case 2: 3422 return OWORD_NUM_2; 3423 case 4: 3424 return OWORD_NUM_4; 3425 case 8: 3426 return OWORD_NUM_8; 3427 case 16: 3428 return OWORD_NUM_16; 3429 default: 3430 IGC_ASSERT_MESSAGE(0, "unreachable"); 3431 return OWORD_NUM_ILLEGAL; 3432 } 3433 } 3434 ConvertChannelMaskToVisaType(uint mask)3435 VISAChannelMask ConvertChannelMaskToVisaType(uint mask) 3436 { 3437 switch (mask & 0xf) 3438 { 3439 case 1: return CHANNEL_MASK_R; 3440 case 2: return CHANNEL_MASK_G; 3441 case 3: return CHANNEL_MASK_RG; 3442 case 4: return CHANNEL_MASK_B; 3443 case 5: return CHANNEL_MASK_RB; 3444 case 6: return CHANNEL_MASK_GB; 3445 case 7: return CHANNEL_MASK_RGB; 3446 case 8: return CHANNEL_MASK_A; 3447 case 9: return CHANNEL_MASK_RA; 3448 case 0xa: return CHANNEL_MASK_GA; 3449 case 0xb: return CHANNEL_MASK_RGA; 3450 case 0xc: return CHANNEL_MASK_BA; 3451 case 0xd: return CHANNEL_MASK_RBA; 3452 case 0xe: return CHANNEL_MASK_GBA; 3453 case 0xf: return CHANNEL_MASK_RGBA; 3454 default: 3455 { 3456 IGC_ASSERT_MESSAGE(0, "Wrong mask"); 3457 return CHANNEL_MASK_NOMASK; 3458 } 3459 } 3460 } 3461 ConvertSubOpcode(EOPCODE subOpcode,bool zeroLOD)3462 VISASampler3DSubOpCode CEncoder::ConvertSubOpcode(EOPCODE subOpcode, bool zeroLOD) 3463 { 3464 switch (subOpcode) 3465 { 3466 case llvm_sampleptr: 3467 return VISA_3D_SAMPLE; 3468 case llvm_sample_bptr: 3469 return VISA_3D_SAMPLE_B; 3470 case llvm_sample_cptr: 3471 return VISA_3D_SAMPLE_C; 3472 case llvm_sample_dptr: 3473 return VISA_3D_SAMPLE_D; 3474 case llvm_sample_dcptr: 3475 return VISA_3D_SAMPLE_D_C; 3476 case llvm_sample_lptr: 3477 return zeroLOD ? VISA_3D_SAMPLE_LZ : VISA_3D_SAMPLE_L; 3478 case llvm_sample_lcptr: 3479 return zeroLOD ? VISA_3D_SAMPLE_C_LZ : VISA_3D_SAMPLE_L_C; 3480 case llvm_sample_bcptr: 3481 return VISA_3D_SAMPLE_B_C; 3482 case llvm_ld_ptr: 3483 return zeroLOD ? VISA_3D_LD_LZ : VISA_3D_LD; 3484 case llvm_resinfoptr: 3485 return VISA_3D_RESINFO; 3486 case llvm_gather4ptr: 3487 return VISA_3D_GATHER4; 3488 case llvm_gather4Cptr: 3489 return VISA_3D_GATHER4_C; 3490 case llvm_gather4POptr: 3491 return VISA_3D_GATHER4_PO; 3492 case llvm_gather4POCptr: 3493 return VISA_3D_GATHER4_PO_C; 3494 case llvm_sampleinfoptr: 3495 return VISA_3D_SAMPLEINFO; 3496 case llvm_ldmsptr: 3497 case llvm_ldmsptr16bit: 3498 return VISA_3D_LD2DMS_W; 3499 case llvm_ldmcsptr: 3500 return VISA_3D_LD_MCS; 3501 case llvm_lodptr: 3502 return VISA_3D_LOD; 3503 case llvm_sample_killpix: 3504 return VISA_3D_SAMPLE_KILLPIX; 3505 default: 3506 IGC_ASSERT_MESSAGE(0, "wrong sampler subopcode"); 3507 return VISA_3D_SAMPLE; 3508 } 3509 } 3510 IsIntegerType(VISA_Type type)3511 bool CEncoder::IsIntegerType(VISA_Type type) 3512 { 3513 return (type == ISA_TYPE_B || 3514 type == ISA_TYPE_UB || 3515 type == ISA_TYPE_W || 3516 type == ISA_TYPE_UW || 3517 type == ISA_TYPE_D || 3518 type == ISA_TYPE_UD || 3519 type == ISA_TYPE_Q || 3520 type == ISA_TYPE_UQ || 3521 0); 3522 } 3523 IsFloatType(VISA_Type type)3524 bool CEncoder::IsFloatType(VISA_Type type) 3525 { 3526 return (type == ISA_TYPE_F || 3527 type == ISA_TYPE_DF || 3528 0); 3529 } 3530 ConvertSingleSourceChannel(uint srcChannel)3531 VISASourceSingleChannel ConvertSingleSourceChannel(uint srcChannel) 3532 { 3533 switch (srcChannel) 3534 { 3535 case 0: 3536 return VISA_3D_GATHER4_CHANNEL_R; 3537 case 1: 3538 return VISA_3D_GATHER4_CHANNEL_G; 3539 case 2: 3540 return VISA_3D_GATHER4_CHANNEL_B; 3541 case 3: 3542 return VISA_3D_GATHER4_CHANNEL_A; 3543 default: 3544 IGC_ASSERT_MESSAGE(0, "Wrong channel"); 3545 return VISA_3D_GATHER4_CHANNEL_R; 3546 } 3547 } 3548 BeginSubroutine(llvm::Function * F)3549 void CEncoder::BeginSubroutine(llvm::Function* F) 3550 { 3551 InitLabelMap(F); 3552 V(vKernel->AppendVISACFLabelInst(GetFuncLabel(F))); 3553 } 3554 BeginStackFunction(llvm::Function * F)3555 void CEncoder::BeginStackFunction(llvm::Function* F) 3556 { 3557 InitLabelMap(F); 3558 // At this place, the vISA object is changed! 3559 vKernel = GetStackFunction(F); 3560 VISA_LabelOpnd* visaLabel = nullptr; 3561 V(vKernel->CreateVISALabelVar(visaLabel, F->getName().data(), LABEL_SUBROUTINE)); 3562 V(vKernel->AppendVISACFLabelInst(visaLabel)); 3563 } 3564 BeginPayloadSection()3565 void CEncoder::BeginPayloadSection() 3566 { 3567 // Payload Section is created as a function and compiled separately 3568 // from the shader body 3569 VISAFunction* visaFunc = nullptr; 3570 V(vbuilder->AddPayloadSection(visaFunc, "PayloadSection")); 3571 vPayloadSection = visaFunc; 3572 CodeGenContext* context = m_program->GetContext(); 3573 std::string asmName; 3574 if (m_enableVISAdump || context->m_instrTypes.hasDebugInfo) 3575 { 3576 asmName = GetDumpFileName("asm"); 3577 } 3578 else 3579 { 3580 asmName = "kernel.asm"; 3581 } 3582 V(vPayloadSection->AddKernelAttribute("OutputAsmPath", asmName.length(), asmName.c_str())); 3583 3584 VISA_LabelOpnd* functionLabel = nullptr; 3585 V(vPayloadSection->CreateVISALabelVar(functionLabel, "payload", LABEL_SUBROUTINE)); 3586 V(vPayloadSection->AppendVISACFLabelInst(functionLabel)); 3587 vMainKernel = vPayloadSection; 3588 } 3589 AddVISASymbol(std::string & symName,CVariable * cvar)3590 void CEncoder::AddVISASymbol(std::string& symName, CVariable* cvar) 3591 { 3592 SModifier mod; 3593 mod.init(); 3594 VISA_VectorOpnd* visaSymAddr = GetDestinationOperand(cvar, mod); 3595 V(vKernel->AppendVISACFSymbolInst(symName, visaSymAddr)); 3596 } 3597 SaveOption(vISAOptions option,bool val)3598 void CEncoder::SaveOption(vISAOptions option, bool val) 3599 { 3600 OptionValue entry; 3601 entry.type = OpType::ET_BOOL; 3602 entry.vBool = val; 3603 m_visaUserOptions.push_back(std::make_pair(option, entry)); 3604 } SaveOption(vISAOptions option,uint32_t val)3605 void CEncoder::SaveOption(vISAOptions option, uint32_t val) 3606 { 3607 OptionValue entry; 3608 entry.type = OpType::ET_INT32; 3609 entry.vInt32 = val; 3610 m_visaUserOptions.push_back(std::make_pair(option, entry)); 3611 } SaveOption(vISAOptions option,const char * val)3612 void CEncoder::SaveOption(vISAOptions option, const char* val) 3613 { 3614 OptionValue entry; 3615 entry.type = OpType::ET_CSTR; 3616 entry.vCstr = val; 3617 m_visaUserOptions.push_back(std::make_pair(option, entry)); 3618 } SetBuilderOptions(VISABuilder * pbuilder)3619 void CEncoder::SetBuilderOptions(VISABuilder* pbuilder) 3620 { 3621 for (auto OP : m_visaUserOptions) 3622 { 3623 switch (OP.second.type) 3624 { 3625 case OpType::ET_BOOL: 3626 pbuilder->SetOption(OP.first, OP.second.vBool); 3627 break; 3628 case OpType::ET_INT32: 3629 pbuilder->SetOption(OP.first, OP.second.vInt32); 3630 break; 3631 case OpType::ET_CSTR: 3632 pbuilder->SetOption(OP.first, OP.second.vCstr); 3633 break; 3634 default: 3635 IGC_ASSERT_MESSAGE(0, "Undefined user option type"); 3636 break; 3637 } 3638 } 3639 } 3640 InitBuildParams(llvm::SmallVector<std::unique_ptr<char,std::function<void (char *)>>,10> & params)3641 void CEncoder::InitBuildParams(llvm::SmallVector<std::unique_ptr< char, std::function<void(char*)>>, 10>& params) 3642 { 3643 CodeGenContext* context = m_program->GetContext(); 3644 bool isOptDisabled = context->getModuleMetaData()->compOpt.OptDisable; 3645 using param_uptr = std::unique_ptr<char, std::function<void(char*)>>; 3646 auto literal_deleter = [](char* val) {}; 3647 auto dup_deleter = [](char* val) {free(val); }; 3648 // create vbuilder->Compile() params 3649 if (IGC_IS_FLAG_ENABLED(EnableVISADotAll)) 3650 { 3651 params.push_back(param_uptr("-dotAll", literal_deleter)); 3652 } 3653 if (IGC_IS_FLAG_ENABLED(EnableVISADebug) || isOptDisabled) 3654 { 3655 params.push_back(param_uptr("-debug", literal_deleter)); 3656 } 3657 3658 if (context->getModuleMetaData()->compOpt.FastVISACompile) 3659 { 3660 params.push_back(param_uptr("-fasterRA", literal_deleter)); 3661 params.push_back(param_uptr("-noLocalSplit", literal_deleter)); 3662 } 3663 if (IGC_IS_FLAG_ENABLED(EnableGlobalStateBuffer)) 3664 { 3665 params.push_back(param_uptr("-emitCrossThreadOffR0Reloc", literal_deleter)); 3666 } 3667 // Ensure VISA_Opts has the same scope as CreateVISABuilder so that valid 3668 // strings are checked by vISA and freed out of this function. 3669 if (IGC_IS_FLAG_ENABLED(VISAOptions)) 3670 { 3671 std::vector<std::string> VISA_Opts; 3672 const char* DELIMITERS = " \t\n\v\f\r,"; // isspace(c), and comma for igcstandalone 3673 std::string line(IGC_GET_REGKEYSTRING(VISAOptions)); 3674 std::size_t pos = 0; 3675 std::size_t found; 3676 for (; (found = line.find_first_of(DELIMITERS, pos)) != std::string::npos; ++pos) { 3677 // Skip consecutive whitespaces. 3678 if (found == pos) 3679 continue; 3680 VISA_Opts.push_back(line.substr(pos, found - pos)); 3681 pos = found; 3682 } 3683 if (pos < line.length()) 3684 VISA_Opts.push_back(line.substr(pos)); 3685 for (auto& opt : VISA_Opts) { 3686 // note that the memory should be freed once 3687 // params has been read, but since this is only for 3688 // debugging, do not bother freeing memory. 3689 params.push_back(param_uptr(_strdup(opt.c_str()), dup_deleter)); 3690 if (opt == "-output" || opt == "-binary" || opt == "-dumpvisa" || opt == "-dumpcommonisa") 3691 { 3692 m_enableVISAdump = true; 3693 } 3694 } 3695 } 3696 if (IGC_IS_FLAG_DISABLED(ForceDisableShaderDebugHashCodeInKernel) && 3697 (context->m_DriverInfo.EnableShaderDebugHashCodeInKernel() || 3698 IGC_IS_FLAG_ENABLED(ShaderDebugHashCodeInKernel))) 3699 { 3700 auto addHash = [&](char* OptName, QWORD Hash) 3701 { 3702 params.push_back(param_uptr(OptName, literal_deleter)); 3703 std::string Low = std::to_string((DWORD)Hash); 3704 std::string High = std::to_string((DWORD)(Hash >> 32)); 3705 params.push_back(param_uptr(_strdup(Low.c_str()), dup_deleter)); 3706 params.push_back(param_uptr(_strdup(High.c_str()), dup_deleter)); 3707 }; 3708 3709 QWORD AssemblyHash = context->hash.getAsmHash(); 3710 addHash("-hashmovs", AssemblyHash); 3711 3712 QWORD NosHash = context->hash.getNosHash(); 3713 QWORD PsoHash = context->hash.getPsoHash(); 3714 QWORD hashToUse = NosHash != 0 ? NosHash : PsoHash; 3715 if (hashToUse) 3716 addHash("-hashmovs1", hashToUse); 3717 else if (context->hash.getPerShaderPsoHash() != 0) 3718 addHash("-hashmovs1", context->hash.getPerShaderPsoHash()); 3719 } 3720 } InitVISABuilderOptions(TARGET_PLATFORM VISAPlatform,bool canAbortOnSpill,bool hasStackCall,bool enableVISA_IR)3721 void CEncoder::InitVISABuilderOptions(TARGET_PLATFORM VISAPlatform, bool canAbortOnSpill, bool hasStackCall, bool enableVISA_IR) 3722 { 3723 CodeGenContext* context = m_program->GetContext(); 3724 bool KernelDebugEnable = false; 3725 bool ForceNonCoherentStatelessBti = false; 3726 bool AllowSpill = true; 3727 if (context->type == ShaderType::OPENCL_SHADER) 3728 { 3729 auto ClContext = static_cast<OpenCLProgramContext*>(context); 3730 KernelDebugEnable = ClContext->m_InternalOptions.KernelDebugEnable; 3731 ForceNonCoherentStatelessBti = ClContext->m_ShouldUseNonCoherentStatelessBTI; 3732 AllowSpill = !ClContext->m_InternalOptions.NoSpill; 3733 3734 if (ClContext->m_InternalOptions.GTPinReRA) 3735 { 3736 SaveOption(vISA_GTPinReRA, true); 3737 SaveOption(vISA_ReRAPostSchedule, true); 3738 } 3739 if (ClContext->m_InternalOptions.GTPinGRFInfo) 3740 { 3741 SaveOption(vISA_GetFreeGRFInfo, true); 3742 } 3743 if (ClContext->m_InternalOptions.GTPinScratchAreaSize) 3744 { 3745 SaveOption(vISA_GTPinScratchAreaSize, ClContext->m_InternalOptions.GTPinScratchAreaSizeValue); 3746 } 3747 } 3748 3749 bool EnableBarrierInstCounterBits = false; 3750 if (context->type == ShaderType::HULL_SHADER) 3751 { 3752 EnableBarrierInstCounterBits = true; 3753 } 3754 bool preserveR0 = false; 3755 if (context->type == ShaderType::PIXEL_SHADER) 3756 { 3757 preserveR0 = !static_cast<CPixelShader*>(m_program)->IsLastPhase(); 3758 } 3759 bool isOptDisabled = context->getModuleMetaData()->compOpt.OptDisable; 3760 3761 // Set up options. This must be done before creating any variable/instructions 3762 // since some of the options affect IR building. 3763 if (IGC_IS_FLAG_ENABLED(ForceNoFP64bRegioning)) 3764 { 3765 SaveOption(vISA_forceNoFP64bRegioning, true); 3766 } 3767 3768 if (IGC_IS_FLAG_ENABLED(DumpCompilerStats) || context->getModuleMetaData()->compOpt.CaptureCompilerStats) 3769 { 3770 SaveOption(vISA_EnableCompilerStats, true); 3771 } 3772 3773 if (IGC_IS_FLAG_ENABLED(EnableSamplerSplit)) 3774 { 3775 SaveOption(vISA_enableCloneSampleInst, true); 3776 } 3777 3778 3779 if (m_program->m_Platform->getWATable().Wa_14012760189 && IGC_IS_FLAG_ENABLED(EnableEvaluateSamplerSplit)) 3780 { 3781 SaveOption(vISA_cloneEvaluateSampleInst, true); 3782 } 3783 3784 if (IGC_IS_FLAG_ENABLED(ForceFFIDOverwrite)/*|| m_program->m_Platform->WaOverwriteFFID()*/) 3785 { 3786 unsigned int ffid[unsigned(ShaderType::END)] = { 3787 0, 3788 static_cast<unsigned>(context->isPOSH() ? FFID_VSR : FFID_VS), 3789 FFID_HS, 3790 FFID_DS, 3791 FFID_GS, 3792 FFID_PS, 3793 FFID_GP, 3794 FFID_GP 3795 }; 3796 SaveOption(vISA_setFFID, ffid[unsigned(context->type)]); 3797 } 3798 3799 SaveOption(vISA_hasRNEandDenorm, true); 3800 3801 // need to fold ret into the previous RTWrite/URBWrite/etc 3802 if (context->type != ShaderType::OPENCL_SHADER && context->type != ShaderType::COMPUTE_SHADER) 3803 { 3804 { 3805 SaveOption(vISA_foldEOTtoPrevSend, true); 3806 } 3807 } 3808 3809 if (m_program->m_DriverInfo->clearScratchWriteBeforeEOT() && 3810 (context->type == ShaderType::PIXEL_SHADER || context->type == ShaderType::OPENCL_SHADER)) 3811 { 3812 SaveOption(vISA_clearScratchWritesBeforeEOT, true); 3813 } 3814 3815 bool clearHDCWritesBeforeEOT = m_program->m_DriverInfo->UsesSparseAliasedResidency() && 3816 context->platform.WaInsertHDCFenceBeforeEOTWhenSparseAliasedResources(); 3817 clearHDCWritesBeforeEOT |= ((context->type == ShaderType::PIXEL_SHADER) || 3818 (context->type == ShaderType::COMPUTE_SHADER) || 3819 (context->type == ShaderType::OPENCL_SHADER)) && 3820 context->platform.NeedsHDCFenceBeforeEOTInPixelShader(); 3821 clearHDCWritesBeforeEOT |= IGC_IS_FLAG_ENABLED(ForceMemoryFenceBeforeEOT); 3822 3823 if (clearHDCWritesBeforeEOT) 3824 { 3825 SaveOption(vISA_clearHDCWritesBeforeEOT, true); 3826 } 3827 3828 3829 // Disable multi-threaded latencies in the vISA scheduler when not in 3D 3830 if (context->type == ShaderType::OPENCL_SHADER) 3831 { 3832 if (m_program->m_Platform->singleThreadBasedInstScheduling()) 3833 { 3834 SaveOption(vISA_useMultiThreadedLatencies, false); 3835 } 3836 } 3837 3838 auto enableScheduler = [=]() { 3839 // Check if preRA scheduler is disabled from input. 3840 if (isOptDisabled) 3841 return false; 3842 if (context->type == ShaderType::OPENCL_SHADER) { 3843 auto ClContext = static_cast<OpenCLProgramContext*>(context); 3844 if (!ClContext->m_InternalOptions.IntelEnablePreRAScheduling) 3845 return false; 3846 } 3847 3848 // Check reg-key or compiler input 3849 if (IGC_IS_FLAG_ENABLED(ForceVISAPreSched) || context->getModuleMetaData()->csInfo.forcedVISAPreRAScheduler) 3850 return true; 3851 3852 // API check. 3853 bool enableForRetey = m_program->m_DriverInfo->enableVISAPreRASchedulerForRetry() || 3854 context->m_retryManager.AllowVISAPreRAScheduler(); 3855 3856 if (IGC_IS_FLAG_ENABLED(EnableVISAPreSched) && 3857 m_program->m_DriverInfo->enableVISAPreRAScheduler() && 3858 enableForRetey) 3859 return true; 3860 3861 return false; 3862 }; 3863 3864 if (enableScheduler()) 3865 { 3866 SaveOption(vISA_preRA_Schedule, true); 3867 if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAPreSchedCtrl)) 3868 { 3869 SaveOption(vISA_preRA_ScheduleCtrl, Val); 3870 } 3871 else 3872 { 3873 uint32_t V = m_program->m_DriverInfo->getVISAPreRASchedulerCtrl(); 3874 if (m_program->GetHasDPAS()) 3875 { 3876 V = 4; // register pressure only 3877 } 3878 SaveOption(vISA_preRA_ScheduleCtrl, V); 3879 } 3880 3881 uint32_t VISAPreSchedVal = 0; 3882 if (context->type == ShaderType::COMPUTE_SHADER) 3883 VISAPreSchedVal = context->getModuleMetaData()->csInfo.VISAPreSchedRPThreshold; 3884 else if (context->type == ShaderType::PIXEL_SHADER) 3885 VISAPreSchedVal = context->getModuleMetaData()->compOpt.VISAPreSchedRPThreshold; 3886 // registry key setting has higher priority 3887 if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAPreSchedRPThreshold)) 3888 { 3889 SaveOption(vISA_preRA_ScheduleRPThreshold, Val); 3890 } 3891 else if (VISAPreSchedVal) 3892 { 3893 SaveOption(vISA_preRA_ScheduleRPThreshold, VISAPreSchedVal); 3894 } 3895 3896 if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAScheduleStartBBID)) 3897 { 3898 SaveOption(vISA_ScheduleStartBBID, Val); 3899 } 3900 3901 if (uint32_t Val = IGC_GET_FLAG_VALUE(VISAScheduleEndBBID)) 3902 { 3903 SaveOption(vISA_ScheduleEndBBID, Val); 3904 } 3905 } 3906 else 3907 { 3908 SaveOption(vISA_preRA_Schedule, false); 3909 } 3910 3911 if (IGC_IS_FLAG_ENABLED(ReplaceIndirectCallWithJmpi)) 3912 { 3913 SaveOption(vISA_replaceIndirectCallWithJmpi, true); 3914 } 3915 3916 if (IGC_IS_FLAG_ENABLED(FastSpill)) 3917 { 3918 SaveOption(vISA_FastSpill, true); 3919 } 3920 3921 #ifdef _DEBUG 3922 // enable vISA verifier if we are generating vISA IR 3923 SaveOption(vISA_NoVerifyvISA, !enableVISA_IR); 3924 #else 3925 SaveOption(vISA_NoVerifyvISA, true); 3926 #endif 3927 3928 if (context->m_instrTypes.hasDebugInfo) 3929 { 3930 SaveOption(vISA_GenerateDebugInfo, true); 3931 3932 if (context->metrics.Enable()) 3933 { 3934 SaveOption(vISA_GenerateKernelInfo, true); 3935 SaveOption(vISA_EmitLocation, true); 3936 } 3937 } 3938 3939 if (canAbortOnSpill) 3940 { 3941 SaveOption(vISA_AbortOnSpill, true); 3942 if (AvoidRetryOnSmallSpill()) 3943 { 3944 // 2 means #spill/fill is roughly 1% of #inst 3945 // ToDo: tune the threshold 3946 if (m_program->m_dispatchSize == SIMDMode::SIMD8) 3947 SaveOption(vISA_AbortOnSpillThreshold, IGC_GET_FLAG_VALUE(SIMD8_SpillThreshold) * 2); 3948 3949 else if (m_program->m_dispatchSize == SIMDMode::SIMD16) 3950 SaveOption(vISA_AbortOnSpillThreshold, IGC_GET_FLAG_VALUE(SIMD16_SpillThreshold) * 2); 3951 } 3952 } 3953 3954 if (context->type == ShaderType::OPENCL_SHADER && m_program->m_dispatchSize == SIMDMode::SIMD8) 3955 { 3956 // AllowSpill is set to false if -cl-intel-no-spill internal option was passed from OpenCL Runtime. 3957 // It has been implemented to avoid scratch space usage for scheduler kernel. 3958 if (AllowSpill) 3959 { 3960 SaveOption(vISA_AbortOnSpillThreshold, IGC_GET_FLAG_VALUE(SIMD8_SpillThreshold) * 2); 3961 } 3962 } 3963 3964 if ((context->type == ShaderType::OPENCL_SHADER || context->type == ShaderType::COMPUTE_SHADER) && 3965 m_program->m_Platform->preemptionSupported() && IGC_IS_FLAG_ENABLED(EnablePreemption)) 3966 { 3967 SaveOption(vISA_enablePreemption, true); 3968 } 3969 3970 if (IGC_IS_FLAG_ENABLED(forceGlobalRA)) 3971 { 3972 SaveOption(vISA_LocalRA, false); 3973 SaveOption(vISA_LocalBankConflictReduction, false); 3974 } 3975 3976 if (IGC_IS_FLAG_ENABLED(disableVarSplit)) 3977 { 3978 SaveOption(vISA_LocalDeclareSplitInGlobalRA, false); 3979 } 3980 3981 if (IGC_IS_FLAG_ENABLED(disableRemat)) 3982 { 3983 SaveOption(vISA_NoRemat, true); 3984 } 3985 3986 if (ForceNonCoherentStatelessBti || IGC_IS_FLAG_ENABLED(ForceNonCoherentStatelessBTI)) 3987 { 3988 SaveOption(vISA_noncoherentStateless, true); 3989 } 3990 3991 if (IGC_IS_FLAG_ENABLED(DisableIfCvt)) 3992 { 3993 SaveOption(vISA_ifCvt, false); 3994 } 3995 3996 if (IGC_IS_FLAG_ENABLED(EnableVISAStructurizer) && 3997 (m_program->m_Platform->hasSCF() || IGC_IS_FLAG_ENABLED(ForceVISAStructurizer))) 3998 { 3999 SaveOption(vISA_EnableStructurizer, true); 4000 4001 if (IGC_GET_FLAG_VALUE(EnableVISAStructurizer) == FLAG_SCF_UCFOnly) 4002 { 4003 // visa structurizer will generate UCF only. 4004 SaveOption(vISA_StructurizerCF, false); 4005 } 4006 } 4007 4008 if (IGC_IS_FLAG_DISABLED(EnableVISAJmpi)) 4009 { 4010 SaveOption(vISA_EnableScalarJmp, false); 4011 } 4012 4013 if (IGC_IS_FLAG_ENABLED(ForceNoMaskWA)) { 4014 SaveOption(vISA_forceNoMaskWA, true); 4015 // Turn off jmpi as there is no wa for jmpi 4016 SaveOption(vISA_EnableScalarJmp, false); 4017 } 4018 4019 if (m_program->m_Platform->getWATable().Wa_1808850743 || 4020 m_program->m_Platform->getWATable().Wa_1409909237) 4021 { 4022 SaveOption(vISA_noMaskWA, IGC_GET_FLAG_VALUE(NoMaskWA)); 4023 if (IGC_GET_FLAG_VALUE(NoMaskWA) > 0) 4024 { 4025 // Turn off jmpi as there is no wa for jmpi 4026 SaveOption(vISA_EnableScalarJmp, false); 4027 } 4028 } 4029 4030 if (m_program->m_Platform->hasFusedEU() 4031 && IGC_IS_FLAG_ENABLED(EnableCallWA) 4032 && (m_program->HasStackCalls() || m_program->IsIntelSymbolTableVoidProgram())) 4033 { 4034 SaveOption(vISA_fusedCallWA, true); 4035 } 4036 4037 if (IGC_IS_FLAG_ENABLED(DisableCSEL)) 4038 { 4039 SaveOption(vISA_enableCSEL, false); 4040 } 4041 if (IGC_IS_FLAG_ENABLED(DisableFlagOpt)) 4042 { 4043 SaveOption(vISA_LocalFlagOpt, false); 4044 } 4045 4046 if (IGC_IS_FLAG_ENABLED(EnableVISAOutput)) 4047 { 4048 SaveOption(vISA_outputToFile, true); 4049 m_enableVISAdump = true; 4050 } 4051 if (IGC_IS_FLAG_ENABLED(EnableVISABinary)) 4052 { 4053 SaveOption(vISA_GenerateBinary, true); 4054 m_enableVISAdump = true; 4055 } 4056 if (IGC_IS_FLAG_ENABLED(EnableVISADumpCommonISA)) 4057 { 4058 SaveOption(vISA_DumpvISA, true); 4059 SaveOption(vISA_GenerateISAASM, true); 4060 m_enableVISAdump = true; 4061 } 4062 if (IGC_IS_FLAG_ENABLED(EnableVISANoSchedule)) 4063 { 4064 SaveOption(vISA_LocalScheduling, false); 4065 } 4066 if (IGC_IS_FLAG_ENABLED(EnableVISANoBXMLEncoder)) 4067 { 4068 SaveOption(vISA_BXMLEncoder, false); 4069 } 4070 if (IGC_IS_FLAG_ENABLED(DisableMixMode) || 4071 context->getModuleMetaData()->disableMixMode) 4072 { 4073 SaveOption(vISA_DisableMixMode, true); 4074 } 4075 if (IGC_IS_FLAG_ENABLED(ForceMixMode)) 4076 { 4077 SaveOption(vISA_ForceMixMode, true); 4078 } 4079 if (IGC_IS_FLAG_ENABLED(DisableHFMath)) 4080 { 4081 SaveOption(vISA_DisableHFMath, true); 4082 } 4083 4084 if (IGC_IS_FLAG_ENABLED(disableIGASyntax)) 4085 { 4086 SaveOption(vISA_dumpNewSyntax, false); 4087 } 4088 if (IGC_IS_FLAG_ENABLED(disableCompaction)) 4089 { 4090 SaveOption(vISA_Compaction, false); 4091 } 4092 4093 if (auto *regex = IGC_GET_REGKEYSTRING(ShaderDumpFilter)) 4094 { 4095 SaveOption(vISA_ShaderDumpFilter, regex); 4096 } 4097 4098 // In Vulkan and OGL buffer variable memory reads and writes within 4099 // a single shader invocation must be processed in order. 4100 if (m_program->m_DriverInfo->DisableDpSendReordering()) 4101 { 4102 SaveOption(vISA_ReorderDPSendToDifferentBti, false); 4103 } 4104 4105 if (m_program->m_DriverInfo->UseALTMode()) 4106 { 4107 SaveOption(vISA_ChangeMoveType, false); 4108 SaveOption(vISA_ALTMode, true); 4109 } 4110 4111 if (IGC_IS_FLAG_ENABLED(DisableSendS)) 4112 { 4113 SaveOption(vISA_UseSends, false); 4114 } 4115 if (m_program->m_DriverInfo->AllowUnsafeHalf()) 4116 { 4117 SaveOption(vISA_enableUnsafeCP_DF, true); 4118 } 4119 4120 if (IGC_GET_FLAG_VALUE(ReservedRegisterNum) != 0 && (IGC_GET_FLAG_VALUE(TotalGRFNum) != 0)) 4121 { 4122 IGC_ASSERT_MESSAGE(0, "ReservedRegisterNum and TotalGRFNum registry keys cannot be used at the same time"); 4123 } 4124 4125 if (IGC_GET_FLAG_VALUE(ReservedRegisterNum) != 0) 4126 { 4127 SaveOption(vISA_ReservedGRFNum, IGC_GET_FLAG_VALUE(ReservedRegisterNum)); 4128 } 4129 if (IGC_GET_FLAG_VALUE(GRFNumToUse) > 0) 4130 { 4131 SaveOption(vISA_GRFNumToUse, IGC_GET_FLAG_VALUE(GRFNumToUse)); 4132 } 4133 4134 if (IGC_GET_FLAG_VALUE(TotalGRFNum) != 0) 4135 { 4136 SaveOption(vISA_TotalGRFNum, IGC_GET_FLAG_VALUE(TotalGRFNum)); 4137 } 4138 else if (context->type == ShaderType::COMPUTE_SHADER && IGC_GET_FLAG_VALUE(TotalGRFNum4CS) != 0) 4139 { 4140 SaveOption(vISA_TotalGRFNum, IGC_GET_FLAG_VALUE(TotalGRFNum4CS)); 4141 } 4142 else 4143 { 4144 SaveOption(vISA_TotalGRFNum, context->getNumGRFPerThread()); 4145 } 4146 4147 // 4148 // Setting number of GRF and threads per EU is restricted to OCL only 4149 // Number of threads can be set by: 4150 // 1. User input through 4151 // 1.1 compiler option for entire module 4152 // 1.2 kernel annotation for a specific kernel function 4153 // 2. Compiler heuristics 4154 // 4155 if (context->type == ShaderType::OPENCL_SHADER) 4156 { 4157 auto ClContext = static_cast<OpenCLProgramContext*>(context); 4158 if (m_program->m_Platform->supportsStaticRegSharing()) 4159 { 4160 if (ClContext->getNumThreadsPerEU() > 0) 4161 { 4162 // Number of threads per EU is set per module (by compiler option) 4163 SaveOption(vISA_HWThreadNumberPerEU, ClContext->getNumThreadsPerEU()); 4164 } 4165 else if (m_program->getAnnotatedNumThreads() > 0) 4166 { 4167 // Number of threads per EU is set per kernel function (by user annotation) 4168 SaveOption(vISA_HWThreadNumberPerEU, m_program->getAnnotatedNumThreads()); 4169 } 4170 else if (m_program->m_Platform->supportsAutoGRFSelection() && 4171 context->m_DriverInfo.supportsAutoGRFSelection() && 4172 !IGC_IS_FLAG_ENABLED(DisableRegSharingHeuristics) && 4173 !ClContext->m_InternalOptions.Intel128GRFPerThread && 4174 !ClContext->m_InternalOptions.Intel256GRFPerThread) 4175 { 4176 // When user hasn't specified number of threads, we can rely on compiler heuristics 4177 SaveOption(vISA_RegSharingHeuristics, true); 4178 } 4179 } 4180 } 4181 if (IGC_GET_FLAG_VALUE(ForceHWThreadNumberPerEU) != 0) 4182 { 4183 SaveOption(vISA_ForceHWThreadNumberPerEU, IGC_GET_FLAG_VALUE(ForceHWThreadNumberPerEU)); 4184 } 4185 4186 if (IGC_IS_FLAG_ENABLED(EnableHashMovsAtPrologue)) 4187 { 4188 SaveOption(vISA_HashMovsAtPrologue, true); 4189 } 4190 4191 if (IGC_IS_FLAG_ENABLED(SystemThreadEnable)) 4192 { 4193 /* Some tools only use 32bits hash, to maintain compatibility 4194 across lot of unknown tool chains doing Compare for only LowerPart 4195 */ 4196 if (IGC_GET_FLAG_VALUE(ShaderDebugHashCode) == (DWORD)context->hash.getAsmHash()) 4197 { 4198 SaveOption(vISA_setStartBreakPoint, true); 4199 } 4200 } 4201 else if (KernelDebugEnable) 4202 { 4203 SaveOption(vISA_AddKernelID, true); 4204 SaveOption(vISA_setStartBreakPoint, true); 4205 } 4206 4207 auto g4Subset = (uint32_t)IGC_GET_FLAG_VALUE(ShaderDumpEnableG4); 4208 if (g4Subset != 0) 4209 SaveOption(vISA_DumpPassesSubset, g4Subset); 4210 4211 if (EnableBarrierInstCounterBits) 4212 { 4213 SaveOption(VISA_EnableBarrierInstCounterBits, true); 4214 } 4215 if (preserveR0) 4216 { 4217 SaveOption(vISA_ReserveR0, true); 4218 } 4219 if (IGC_IS_FLAG_ENABLED(InitializeRegistersEnable)) 4220 { 4221 SaveOption(vISA_InitPayload, true); 4222 } 4223 if (IGC_IS_FLAG_ENABLED(UseOldSubRoutineAugIntf)) 4224 { 4225 SaveOption(vISA_UseOldSubRoutineAugIntf, true); 4226 } 4227 if (IGC_IS_FLAG_ENABLED(FastCompileRA) && !hasStackCall) 4228 { 4229 SaveOption(vISA_FastCompileRA, true); 4230 } 4231 if (IGC_IS_FLAG_ENABLED(HybridRAWithSpill) && !hasStackCall) 4232 { 4233 SaveOption(vISA_HybridRAWithSpill, true); 4234 } 4235 if (IGC_IS_FLAG_ENABLED(DumpPayloadToScratch)) 4236 { 4237 SaveOption(vISA_dumpPayload, true); 4238 } 4239 if (IGC_IS_FLAG_ENABLED(ExpandPlane)) 4240 { 4241 SaveOption(vISA_expandPlane, true); 4242 } 4243 if (IGC_IS_FLAG_ENABLED(EnableBCR)) 4244 { 4245 SaveOption(vISA_enableBCR, true); 4246 } 4247 if (IGC_IS_FLAG_ENABLED(ForceBCR)) 4248 { 4249 SaveOption(vISA_forceBCR, true); 4250 } 4251 if (IGC_IS_FLAG_ENABLED(forceSamplerHeader)) 4252 { 4253 SaveOption(vISA_forceSamplerHeader, true); 4254 } 4255 if (IGC_IS_FLAG_ENABLED(EnableIGAEncoder)) 4256 { 4257 SaveOption(vISA_IGAEncoder, true); 4258 } 4259 else 4260 { 4261 SaveOption(vISA_IGAEncoder, false); 4262 } 4263 4264 if (IGC_IS_FLAG_ENABLED(SetA0toTdrForSendc)) 4265 { 4266 SaveOption(vISA_setA0toTdrForSendc, true); 4267 } 4268 4269 if (IGC_IS_FLAG_ENABLED(AvoidDstSrcGRFOverlap)) 4270 { 4271 SaveOption(vISA_DstSrcOverlapWA, true); 4272 } 4273 4274 if (IGC_IS_FLAG_ENABLED(AvoidSrc1Src2Overlap)) 4275 { 4276 SaveOption(vISA_Src1Src2OverlapWA, true); 4277 } 4278 4279 if (IGC_IS_FLAG_ENABLED(UseLinearScanRA)) 4280 { 4281 SaveOption(vISA_LinearScan, true); 4282 } 4283 4284 if (IGC_IS_FLAG_ENABLED(EnableIGASWSB)) 4285 { 4286 SaveOption(vISA_EnableIGASWSB, true); 4287 } 4288 4289 if (IGC_IS_FLAG_ENABLED(EnableQuickTokenAlloc)) 4290 { 4291 SaveOption(vISA_QuickTokenAllocation, true); 4292 } 4293 4294 if (IGC_IS_FLAG_ENABLED(EnableSWSBStitch) || 4295 (context->type == ShaderType::PIXEL_SHADER && 4296 static_cast<CPixelShader*>(m_program)->GetPhase() == PSPHASE_PIXEL)) 4297 { 4298 SaveOption(vISA_SWSBStitch, true); 4299 } 4300 4301 if (IGC_IS_FLAG_ENABLED(DisableRegDistDep)) 4302 { 4303 SaveOption(vISA_disableRegDistDep, true); 4304 } 4305 4306 if (IGC_IS_FLAG_ENABLED(EnableForceDebugSWSB) || 4307 IGC_IS_FLAG_ENABLED(EnableSWSBInstStall) || 4308 IGC_IS_FLAG_ENABLED(EnableSWSBTokenBarrier)) 4309 { 4310 if (IGC_IS_FLAG_ENABLED(EnableSWSBInstStall)) 4311 { 4312 SaveOption(vISA_SWSBInstStall, IGC_GET_FLAG_VALUE(EnableSWSBInstStall)); 4313 SaveOption(vISA_SWSBInstStallEnd, IGC_GET_FLAG_VALUE(EnableSWSBInstStallEnd)); 4314 } 4315 4316 if (IGC_IS_FLAG_ENABLED(EnableSWSBTokenBarrier)) 4317 { 4318 SaveOption(vISA_SWSBTokenBarrier, IGC_GET_FLAG_VALUE(EnableSWSBTokenBarrier)); 4319 } 4320 4321 if (IGC_IS_FLAG_ENABLED(EnableForceDebugSWSB)) 4322 { 4323 SaveOption(vISA_forceDebugSWSB, true); 4324 } 4325 SaveOption(vISA_Compaction, false); 4326 } 4327 4328 if (IGC_IS_FLAG_ENABLED(EnableGatherWithImm)) 4329 { 4330 SaveOption(vISA_EnableGatherWithImm, true); 4331 } 4332 4333 if (IGC_IS_FLAG_ENABLED(EnableGroupScheduleForBC)) 4334 { 4335 SaveOption(vISA_EnableGroupScheduleForBC, true); 4336 } 4337 4338 if (VISAPlatform == XeHP_SDV && IGC_IS_FLAG_ENABLED(DPASTokenReduction)) 4339 { 4340 SaveOption(vISA_EnableDPASTokenReduction, true); 4341 } 4342 4343 if (IGC_IS_FLAG_ENABLED(DisableThreeALUPipes)) 4344 { 4345 SaveOption(vISA_EnableALUThreePipes, false); 4346 } 4347 4348 SaveOption(vISA_useInlineData, m_program->passNOSInlineData()); 4349 4350 if (m_program->m_Platform->supportLoadThreadPayloadForCompute()) 4351 { 4352 SaveOption(vISA_loadThreadPayload, m_program->loadThreadPayload()); 4353 } 4354 else 4355 { 4356 SaveOption(vISA_loadThreadPayload, false); 4357 } 4358 4359 4360 4361 if (IGC_IS_FLAG_ENABLED(EnablerReadSuppressionWA) && 4362 VISAPlatform >= GENX_TGLLP) 4363 { 4364 SaveOption(vISA_InsertDummyMovForHWRSWA, true); 4365 if (IGC_IS_FLAG_ENABLED(DPASReadSuppressionWA)) 4366 { 4367 SaveOption(vISA_InsertDummyMovForDPASRSWA, true); 4368 } 4369 if (IGC_GET_FLAG_VALUE(RSWARegNum) != 0) 4370 { 4371 SaveOption(vISA_registerHWRSWA, IGC_GET_FLAG_VALUE(RSWARegNum)); 4372 } 4373 } 4374 4375 if (IGC_GET_FLAG_VALUE(SWSBTokenNum) != 0) 4376 { 4377 SaveOption(vISA_SWSBTokenNum, IGC_GET_FLAG_VALUE(SWSBTokenNum)); 4378 } 4379 4380 if (IGC_IS_FLAG_ENABLED(EnableAccSub)) 4381 { 4382 SaveOption(vISA_accSubstitution, true); 4383 uint32_t numAcc = IGC_GET_FLAG_VALUE(NumGeneralAcc); 4384 4385 IGC_ASSERT_MESSAGE(0 <= numAcc, "number of general acc should be [1-16] if set"); 4386 IGC_ASSERT_MESSAGE(numAcc <= 16, "number of general acc should be [1-16] if set"); 4387 4388 if (numAcc > 0) 4389 { 4390 SaveOption(vISA_numGeneralAcc, numAcc); 4391 } 4392 4393 if (IGC_IS_FLAG_ENABLED(HasDoubleAcc)) 4394 { 4395 SaveOption(vISA_hasDoubleAcc, true); 4396 } 4397 } 4398 else 4399 { 4400 SaveOption(vISA_accSubstitution, false); 4401 } 4402 4403 if (IGC_IS_FLAG_ENABLED(GlobalSendVarSplit)) 4404 { 4405 SaveOption(vISA_GlobalSendVarSplit, true); 4406 } 4407 4408 if (m_program->m_Platform->canFuseTypedWrite()) 4409 { 4410 SaveOption(vISA_FuseTypedWrites, true); 4411 } 4412 4413 if (IGC_IS_FLAG_ENABLED(ShaderDumpEnable) && IGC_IS_FLAG_ENABLED(InterleaveSourceShader)) 4414 { 4415 SaveOption(vISA_EmitLocation, true); 4416 } 4417 4418 if (IGC_IS_FLAG_ENABLED(ShaderDumpEnable)) 4419 { 4420 SaveOption(vISA_SBIDDepLoc, true); 4421 } 4422 4423 // Enable SendFusion for SIMD8 4424 // TODO: Re-enable SendFusion when VMask is enabled. The hardware should support this, but 4425 // more investigation needs to be done on whether simply replacing sr0.2 with sr0.3 is enough. 4426 if (IGC_IS_FLAG_ENABLED(EnableSendFusion) && 4427 !(context->type == ShaderType::PIXEL_SHADER && static_cast<CPixelShader*>(m_program)->NeedVMask()) && 4428 m_program->GetContext()->platform.supportSplitSend() && 4429 m_program->m_dispatchSize == SIMDMode::SIMD8 && 4430 (IGC_GET_FLAG_VALUE(EnableSendFusion) == FLAG_LEVEL_2 || // 2: force send fusion 4431 context->m_DriverInfo.AllowSendFusion())) 4432 { 4433 SaveOption(vISA_EnableSendFusion, true); 4434 if (IGC_IS_FLAG_ENABLED(EnableAtomicFusion) && 4435 context->type == ShaderType::OPENCL_SHADER) 4436 { 4437 SaveOption(vISA_EnableAtomicFusion, true); 4438 } 4439 } 4440 4441 // With statelessToStatefull on, it is possible that two different BTI messages 4442 // (two kernel arguments) might refer to the same memory. To be safe, turn off 4443 // visa DPSend reordering. 4444 if (IGC_IS_FLAG_ENABLED(EnableStatelessToStatefull) && 4445 context->type == ShaderType::OPENCL_SHADER) 4446 { 4447 SaveOption(vISA_ReorderDPSendToDifferentBti, false); 4448 } 4449 4450 if (m_program->m_Platform->WaDisableSendSrcDstOverlap()) 4451 { 4452 SaveOption(vISA_noSendSrcDstOverlap, true); 4453 } 4454 4455 if (m_program->m_Platform->WaDisableSendSrcDstOverlap()) 4456 { 4457 SaveOption(vISA_noSendSrcDstOverlap, true); 4458 } 4459 4460 // Set to stitch all functions to all kernels in a VISABuidler 4461 SaveOption(vISA_noStitchExternFunc, false); 4462 4463 // Turning off optimizations as much as possible to have the fastest compilation 4464 if ((IsStage1FastestCompile(context->m_CgFlag, context->m_StagingCtx) || 4465 IGC_GET_FLAG_VALUE(ForceFastestSIMD)) && 4466 (m_program->m_DriverInfo->SupportFastestStage1() || 4467 IGC_IS_FLAG_ENABLED(EnableFastestForVulkan))) 4468 { 4469 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) == FCEXP_NO_EXPRIMENT) 4470 { 4471 SaveOption(vISA_LocalScheduling, false); 4472 SaveOption(vISA_preRA_Schedule, false); 4473 SaveOption(vISA_SpillSpaceCompression, false); 4474 SaveOption(vISA_LVN, false); 4475 SaveOption(vISA_QuickTokenAllocation, true); 4476 if (IGC_IS_FLAG_DISABLED(FastestWALinearScanForCS) || 4477 context->type != ShaderType::COMPUTE_SHADER) 4478 { 4479 SaveOption(vISA_LinearScan, true); 4480 } 4481 } 4482 else 4483 { 4484 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_FASTSPILL) 4485 SaveOption(vISA_FastSpill, true); 4486 4487 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_LOCAL_SCHEDULING) 4488 SaveOption(vISA_LocalScheduling, false); 4489 4490 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_PRERA_SCHEDULING) 4491 SaveOption(vISA_preRA_Schedule, false); 4492 4493 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_NO_REMAT) 4494 SaveOption(vISA_NoRemat, true); 4495 4496 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_SPILL_COMPRESSION) 4497 SaveOption(vISA_SpillSpaceCompression, false); 4498 4499 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_LOCAL_DECL_SPLIT_GLOBAL_RA) 4500 SaveOption(vISA_LocalDeclareSplitInGlobalRA, false); 4501 4502 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_DISABLE_LVN) 4503 SaveOption(vISA_LVN, false); 4504 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_QUICKTOKEN_ALLOC) 4505 SaveOption(vISA_QuickTokenAllocation, true); 4506 if (((IGC_IS_FLAG_DISABLED(FastestWALinearScanForCS) || 4507 context->type != ShaderType::COMPUTE_SHADER)) && 4508 (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_LINEARSCAN)) 4509 SaveOption(vISA_LinearScan, true); // use linearScan 4510 4511 if (IGC_GET_FLAG_VALUE(FastestS1Experiments) & FCEXP_1PASSRA) 4512 SaveOption(vISA_FastCompileRA, true); // use 1 iteration RA 4513 } 4514 } 4515 4516 } // InitVISABuilderOptions 4517 4518 // Get a unqiue label for inline asm instruction blocks at the module level. 4519 // For each call to asm("..."), user can input the "%=" string format to generate a unique label for that call. 4520 // In this case we would generate "__4_000" for the 1st usage of "%=" in an asm block in the 5th function of the module. GetUniqueInlineAsmLabel()4521 std::string CEncoder::GetUniqueInlineAsmLabel() 4522 { 4523 std::stringstream ss; 4524 ss << GetCompilerLabelPrefix() << labelFunctionIndex << "_" << 4525 std::setw(3) << std::setfill('0') << labelInlineAsmCounter++; 4526 return ss.str(); 4527 } 4528 4529 // Creates a module/program-unique label prefix. 4530 // E.g. the 3rd label of the 5th function would be 4531 // "__4_002". Ugly, yes, but you shouldn't see it as this is the 4532 // fallback case. Short, unique, and debuggable.... 4533 // Release-internal/debug will have better names. CreateShortLabel(unsigned labelIndex) const4534 std::string CEncoder::CreateShortLabel(unsigned labelIndex) const 4535 { 4536 std::stringstream ss; 4537 ss << GetCompilerLabelPrefix() << labelFunctionIndex << "_" << 4538 std::setw(3) << std::setfill('0') << labelIndex; 4539 return ss.str(); 4540 } 4541 4542 // Converts an LLVM label L into a name appropriate for vISA's label rules 4543 // * remove illegal chracters for vISA 4544 // * contrains the length while maintaining uniqueness 4545 // The format is something that contains both function index and the 4546 // label name passed in. 4547 // 4548 // If enabled the output will be: 4549 // _[FUNCTION-INDEX]_[LABEL-INDEX](_[LLVM-NAME])? 4550 // i.e. if the LLVM name is empty we omit that whole suffix CreateVisaLabelName(const llvm::StringRef & L)4551 CName CEncoder::CreateVisaLabelName(const llvm::StringRef &L) 4552 { 4553 #ifndef IGC_MAP_LLVM_NAMES_TO_VISA 4554 return CreateShortLabel(labelCounter++); 4555 #else // IGC_MAP_LLVM_NAMES_TO_VISA 4556 static const size_t MAX_LLVM_NAME = 250; 4557 4558 auto sanitizeChar = [](char c) { 4559 return isalnum(c) || c == '_' ? c : '_'; 4560 }; 4561 4562 // The vISA backend constrains this to around 256 characters. 4563 // (1) Function names can be extremely long (currFunctionName). 4564 // DPC++ with template gunk can be hundreds of characters. 4565 // If the names are too long, punt and use a function index. 4566 // Functions cannot be integers, thus the function part cannot 4567 // collide if we use this replacement. 4568 // (2) LLVM labels (L) can be extremely long. E.g. LLVM chains 4569 // together names synthetically and can get to >900 chars. 4570 // In this case, we prefix a label index and suffix as much of 4571 // the LLVM label on as possible. 4572 std::stringstream lbl; 4573 lbl << GetCompilerLabelPrefix(); 4574 if (!currFunctionName.empty() && currFunctionName.size() < 128) { 4575 const char *s = currFunctionName.getVisaCString(); 4576 while (*s) 4577 lbl << sanitizeChar(*s++); 4578 } else { 4579 lbl << std::setw(2) << std::setfill('0') << labelFunctionIndex; 4580 } 4581 // since the label name could be the empty string, and to keep things 4582 // simple, we unconditionally use the label counter (and increment it) 4583 lbl << "_" << std::setw(3) << std::setfill('0') << 4584 labelCounter++; 4585 4586 size_t charsLeft = MAX_LLVM_NAME - (size_t)lbl.tellp(); 4587 size_t nLeft = std::min(charsLeft, L.size()); 4588 if (L.size() > 0 && nLeft > 0) { 4589 // if not the empty string then add a separator 4590 lbl << "_"; 4591 nLeft--; 4592 } 4593 // suffix as many characters of the label as we can 4594 for (size_t i = 0; i < nLeft; i++) { 4595 lbl << sanitizeChar(L[i]); 4596 } 4597 4598 return lbl.str(); 4599 #endif // IGC_MAP_LLVM_NAMES_TO_VISA 4600 } 4601 InitLabelMap(const llvm::Function * F)4602 void CEncoder::InitLabelMap(const llvm::Function* F) 4603 { 4604 labelMap.clear(); 4605 labelMap.resize(F->size(), nullptr); 4606 labelCounter = 0; 4607 labelInlineAsmCounter = 0; 4608 labelFunctionIndex++; 4609 currFunctionName = F->getName(); 4610 labelNameMap.clear(); 4611 labelNameMap.reserve(F->size()); 4612 for (auto BI = F->begin(), BE = F->end(); BI != BE; BI++) 4613 { 4614 labelNameMap.emplace_back(CreateVisaLabelName(BI->getName())); 4615 } 4616 } 4617 InitEncoder(bool canAbortOnSpill,bool hasStackCall,bool hasInlineAsmCall,VISAKernel * prevKernel)4618 void CEncoder::InitEncoder(bool canAbortOnSpill, bool hasStackCall, bool hasInlineAsmCall, VISAKernel* prevKernel) 4619 { 4620 m_aliasesMap.clear(); 4621 m_encoderState.m_SubSpanDestination = false; 4622 CodeGenContext* context = m_program->GetContext(); 4623 m_encoderState.m_secondHalf = false; 4624 m_encoderState.m_secondNibble = false; 4625 m_enableVISAdump = false; 4626 m_nestLevelForcedNoMaskRegion = 0; 4627 m_hasInlineAsm = hasInlineAsmCall; 4628 4629 InitLabelMap(m_program->entry); 4630 4631 vbuilder = nullptr; 4632 vAsmTextBuilder = nullptr; 4633 TARGET_PLATFORM VISAPlatform = GetVISAPlatform(&(context->platform)); 4634 4635 SetVISAWaTable(m_program->m_Platform->getWATable()); 4636 4637 llvm::SmallVector<const char*, 10> params; 4638 llvm::SmallVector<std::unique_ptr< char, std::function<void(char*)>>, 10> params2; 4639 if (!m_hasInlineAsm) 4640 { 4641 // Asm text writer mode doesnt need dump params 4642 InitBuildParams(params2); 4643 for (size_t i = 0; i < params2.size(); i++) 4644 { 4645 params.push_back((params2[i].get())); 4646 } 4647 } 4648 4649 COMPILER_TIME_START(m_program->GetContext(), TIME_CG_vISACompile); 4650 bool enableVISADump = IGC_IS_FLAG_ENABLED(EnableVISASlowpath) || IGC_IS_FLAG_ENABLED(ShaderDumpEnable); 4651 auto builderMode = m_hasInlineAsm ? vISA_ASM_WRITER : vISA_DEFAULT; 4652 auto builderOpt = (enableVISADump || m_hasInlineAsm) ? VISA_BUILDER_BOTH : VISA_BUILDER_GEN; 4653 V(CreateVISABuilder(vbuilder, builderMode, builderOpt, VISAPlatform, params.size(), params.data(), 4654 &m_vISAWaTable)); 4655 4656 if (IsCodePatchCandidate()) 4657 { 4658 SetHasPrevKernel(prevKernel != nullptr); 4659 } 4660 InitVISABuilderOptions(VISAPlatform, canAbortOnSpill, hasStackCall, builderOpt == VISA_BUILDER_BOTH); 4661 4662 // Pass all build options to builder 4663 SetBuilderOptions(vbuilder); 4664 4665 vKernel = nullptr; 4666 4667 std::string kernelName = std::string(m_program->entry->getName()); 4668 if (context->m_instrTypes.hasDebugInfo) 4669 { 4670 // This metadata node is added by TransformBlocks pass for device side 4671 // enqueue feature of OCL2.0+. 4672 // The problem is that for device side enqueue, kernel name used in 4673 // IGC differs the one used to create JIT kernel. This leads to different 4674 // kernel names in .elf file and .dbg file. So dbgmerge tool cannot 4675 // merge the two together. With this metadata node we create a mapping 4676 // between the two names and when debug info is enabled, make JIT use 4677 // same name as IGC. 4678 // Names earlier - 4679 // ParentKernel_dispatch_0 in dbg and 4680 // __ParentKernel_block_invoke in elf 4681 // when kernel name is ParentKernel 4682 // 4683 auto md = m_program->entry->getParent()->getNamedMetadata("igc.device.enqueue"); 4684 if (md) 4685 { 4686 for (unsigned int i = 0; i < md->getNumOperands(); i++) 4687 { 4688 auto mdOpnd = md->getOperand(i); 4689 auto first = dyn_cast_or_null<MDString>(mdOpnd->getOperand(1)); 4690 if (first && 4691 first->getString().equals(kernelName)) 4692 { 4693 auto second = dyn_cast_or_null<MDString>(mdOpnd->getOperand(0)); 4694 if (second) 4695 { 4696 kernelName = second->getString().str(); 4697 } 4698 } 4699 } 4700 } 4701 } 4702 4703 std::string asmName; 4704 if (m_enableVISAdump || context->m_instrTypes.hasDebugInfo) 4705 { 4706 asmName = GetDumpFileName("asm"); 4707 } 4708 else 4709 { 4710 kernelName = "kernel"; 4711 asmName = "kernel.asm"; 4712 } 4713 4714 V(vbuilder->AddKernel(vKernel, kernelName.c_str())); 4715 V(vbuilder->SetPrevKernel(prevKernel)); 4716 V(vKernel->AddKernelAttribute("OutputAsmPath", asmName.length(), asmName.c_str())); 4717 4718 SetDispatchSimdSize(); 4719 SetSpillMemOffset(); 4720 4721 vMainKernel = vKernel; 4722 4723 auto gtpin_init = context->gtpin_init; 4724 if (gtpin_init) 4725 { 4726 vKernel->SetGTPinInit(gtpin_init); 4727 } 4728 4729 // Right now only 1 main function in the kernel 4730 VISA_LabelOpnd* functionLabel = nullptr; 4731 V(vKernel->CreateVISALabelVar(functionLabel, "_main", LABEL_SUBROUTINE)); 4732 V(vKernel->AppendVISACFLabelInst(functionLabel)); 4733 4734 V(vKernel->CreateVISASurfaceVar(dummySurface, "", 1)); 4735 4736 V(vKernel->CreateVISASamplerVar(samplervar, "", 1)); 4737 4738 // Set float denorm modes and rounding modes as default 4739 initCR(vKernel); 4740 } 4741 SetDispatchSimdSize()4742 void CEncoder::SetDispatchSimdSize() 4743 { 4744 IGC_ASSERT(nullptr != vKernel); 4745 uint8_t dispatchSIMD = (uint8_t)numLanes(m_program->m_dispatchSize); 4746 V(vKernel->AddKernelAttribute("SimdSize", 1, &dispatchSIMD)); 4747 } 4748 SetSpillMemOffset()4749 void CEncoder::SetSpillMemOffset() 4750 { 4751 IGC_ASSERT(nullptr != vKernel); 4752 uint scratchSpaceSizeTemp = m_program->m_ScratchSpaceSize; 4753 4754 4755 if (scratchSpaceSizeTemp > 0) { 4756 V(vKernel->AddKernelAttribute("SpillMemOffset", 4, &scratchSpaceSizeTemp)); 4757 } 4758 } 4759 SetStackFunctionArgSize(uint size)4760 void CEncoder::SetStackFunctionArgSize(uint size) 4761 { 4762 uint8_t sz = (uint8_t)size; 4763 IGC_ASSERT(nullptr != vKernel); 4764 V(vKernel->AddKernelAttribute("ArgSize", 1, &sz)); 4765 } 4766 SetStackFunctionRetSize(uint size)4767 void CEncoder::SetStackFunctionRetSize(uint size) 4768 { 4769 uint8_t sz = (uint8_t)size; 4770 IGC_ASSERT(nullptr != vKernel); 4771 V(vKernel->AddKernelAttribute("RetValSize", 1, &sz)); 4772 } 4773 SetExternFunctionFlag()4774 void CEncoder::SetExternFunctionFlag() 4775 { 4776 IGC_ASSERT(nullptr != vKernel); 4777 V(vKernel->AddKernelAttribute("Extern", 0, nullptr)); 4778 } 4779 CopyEncoderState()4780 SEncoderState CEncoder::CopyEncoderState() 4781 { 4782 return m_encoderState; 4783 } 4784 SetEncoderState(SEncoderState & newState)4785 void CEncoder::SetEncoderState(SEncoderState& newState) 4786 { 4787 m_encoderState = newState; 4788 } 4789 GetVISAAlign(CVariable * var)4790 VISA_Align CEncoder:: GetVISAAlign(CVariable* var) 4791 { 4792 VISA_Align align; 4793 switch (var->GetAlign()) 4794 { 4795 case EALIGN_BYTE: align = ALIGN_BYTE; 4796 break; 4797 case EALIGN_WORD: align = ALIGN_WORD; 4798 break; 4799 case EALIGN_DWORD: align = ALIGN_DWORD; 4800 break; 4801 case EALIGN_QWORD: align = ALIGN_QWORD; 4802 break; 4803 case EALIGN_OWORD: align = ALIGN_OWORD; 4804 break; 4805 case EALIGN_HWORD: align = ALIGN_HWORD; 4806 break; 4807 case EALIGN_32WORD: align = ALIGN_32WORD; 4808 break; 4809 case EALIGN_64WORD: align = ALIGN_64WORD; 4810 break; 4811 default: 4812 align = ALIGN_UNDEF; 4813 IGC_ASSERT(0); 4814 break; 4815 } 4816 return align; 4817 } 4818 GetVISAVariable(CVariable * var)4819 VISA_GenVar* CEncoder::GetVISAVariable(CVariable* var) 4820 { 4821 if (m_encoderState.m_secondHalf) 4822 { 4823 if (var->GetNumberInstance() == 2) 4824 { 4825 return var->visaGenVariable[1]; 4826 } 4827 } 4828 return var->visaGenVariable[0]; 4829 } 4830 GetVISAVariable(CVariable * var,e_instance instance)4831 VISA_GenVar* CEncoder::GetVISAVariable(CVariable* var, e_instance instance) 4832 { 4833 VISA_GenVar* result = GetVISAVariable(var); 4834 4835 if (instance != EINSTANCE_UNSPECIFIED && 4836 var->GetNumberInstance() == 2) 4837 { 4838 if (instance == EINSTANCE_FIRST_HALF) 4839 { 4840 result = var->visaGenVariable[0]; 4841 } 4842 else 4843 { 4844 result = var->visaGenVariable[1]; 4845 } 4846 } 4847 return result; 4848 } 4849 GetVISAPredefinedVar(CVariable * pVar,PreDefined_Vars var)4850 void CEncoder::GetVISAPredefinedVar(CVariable* pVar, PreDefined_Vars var) 4851 { 4852 vKernel->GetPredefinedVar(pVar->visaGenVariable[0], var); 4853 switch (var) { 4854 case PREDEFINED_NULL: 4855 case PREDEFINED_TSC: 4856 case PREDEFINED_SR0: 4857 case PREDEFINED_CR0: 4858 case PREDEFINED_CE0: 4859 case PREDEFINED_DBG: 4860 // Creating alias to ARF is not allowed. 4861 return; 4862 default: 4863 break; 4864 } 4865 4866 VISA_GenVar* pAliasGenVar = nullptr; 4867 4868 // Create alias to the specified pre-defined variable to match the 4869 // requested types and elements.. 4870 vKernel->CreateVISAGenVar( 4871 pAliasGenVar, 4872 pVar->getVisaCString(), 4873 pVar->GetNumberElement(), 4874 pVar->GetType(), 4875 ALIGN_HWORD, 4876 pVar->visaGenVariable[0], 4877 pVar->GetAliasOffset()); 4878 4879 pVar->visaGenVariable[0] = pAliasGenVar; 4880 } 4881 CreateVISAVar(CVariable * var)4882 void CEncoder::CreateVISAVar(CVariable* var) 4883 { 4884 IGC_ASSERT(nullptr != var); 4885 4886 if (var->GetAlias() != NULL) 4887 { 4888 var->ResolveAlias(); 4889 // In case the alias is an exact copy or just a sub variable just re-use the variable 4890 if (var->GetAlias()->GetType() == var->GetType()) 4891 { 4892 for (uint i = 0; i < var->GetNumberInstance(); i++) 4893 { 4894 var->visaGenVariable[i] = var->GetAlias()->visaGenVariable[i]; 4895 } 4896 } 4897 else 4898 { 4899 SAlias alias(var->GetAlias(), var->GetType()); 4900 auto aliasPair = m_aliasesMap.insert(std::pair<SAlias, CVariable*>(alias, var)); 4901 if (aliasPair.second == false) 4902 { 4903 for (uint i = 0; i < var->GetNumberInstance(); i++) 4904 { 4905 var->visaGenVariable[i] = aliasPair.first->second->visaGenVariable[i]; 4906 } 4907 } 4908 else 4909 { 4910 IGC_ASSERT_MESSAGE(var->GetType() != ISA_TYPE_BOOL, "boolean cannot have alias"); 4911 for (uint i = 0; i < var->GetNumberInstance(); i++) 4912 { 4913 // Since we no longer use the built-in alias offset mechanism, 4914 // we have to create the aliases to be of at least the size of the 4915 // original variable (in bytes) 4916 // Otherwise, we may end up a situation where we have an alias with 4917 // an offset (m_aliasOffset, that we don't notify vISA about), 4918 // and make an out-of-bounds access. 4919 // This is the opposite of the calculation that happens in 4920 // CVariable::CVariable. 4921 4922 const unsigned int denominator = CEncoder::GetCISADataTypeSize(var->GetType()); 4923 IGC_ASSERT(denominator); 4924 uint16_t nbElement = 4925 var->GetAlias()->GetNumberElement() * 4926 CEncoder::GetCISADataTypeSize(var->GetAlias()->GetType()) / 4927 denominator; 4928 4929 V(vKernel->CreateVISAGenVar( 4930 var->visaGenVariable[i], 4931 var->getVisaCString(), 4932 nbElement, 4933 var->GetType(), 4934 GetVISAAlign(var->GetAlias()), // Use parent's align as we create an alias of the parent. 4935 var->GetAlias()->visaGenVariable[i], 4936 0)); 4937 } 4938 } 4939 } 4940 } 4941 else 4942 { 4943 uint num_elts = var->GetNumberElement(); 4944 if (var->GetVarType() == EVARTYPE_GENERAL) 4945 { 4946 var->visaGenVariable[0] = nullptr; 4947 var->visaGenVariable[1] = nullptr; 4948 IGC_ASSERT_MESSAGE(var->GetType() != ISA_TYPE_BOOL, "boolean cannot be general var"); 4949 for (uint i = 0; i < var->GetNumberInstance(); i++) 4950 { 4951 V(vKernel->CreateVISAGenVar( 4952 var->visaGenVariable[i], 4953 var->getVisaCString(), 4954 num_elts, 4955 var->GetType(), 4956 GetVISAAlign(var))); 4957 } 4958 } 4959 else if (var->GetVarType() == EVARTYPE_PREDICATE) 4960 { 4961 unsigned short nb = int_cast<unsigned short>(num_elts) * var->GetNumberInstance(); 4962 V(vKernel->CreateVISAPredVar( 4963 var->visaPredVariable, 4964 "", 4965 nb)); 4966 } 4967 else 4968 { 4969 // when both array and index are uniform so is the destination address variable 4970 uint nb = (var->IsUniform() && var->IsVectorUniform()) ? 1 : var->GetNumberElement(); 4971 V(vKernel->CreateVISAAddrVar(var->visaAddrVariable, "", nb)); 4972 } 4973 } 4974 } 4975 DeclareInput(CVariable * var,uint offset,uint instance)4976 void CEncoder::DeclareInput(CVariable* var, uint offset, uint instance) 4977 { 4978 // Avoid declaring more inputs/outputs than available registers 4979 if (offset + var->GetSize() >= vKernel->getNumRegTotal() * getGRFSize()) 4980 return; 4981 V(vKernel->CreateVISAInputVar( 4982 var->visaGenVariable[instance], 4983 int_cast<unsigned short>(offset), 4984 int_cast<unsigned short>(var->GetSize()))); 4985 } 4986 MarkAsOutput(CVariable * var)4987 void CEncoder::MarkAsOutput(CVariable* var) 4988 { 4989 for (unsigned int i = 0; i < var->GetNumberInstance(); i++) 4990 { 4991 V(vKernel->AddAttributeToVar(var->visaGenVariable[i], "Output", 0, nullptr)); 4992 } 4993 } 4994 MarkAsPayloadLiveOut(CVariable * var)4995 void CEncoder::MarkAsPayloadLiveOut(CVariable* var) 4996 { 4997 for (unsigned int i = 0; i < var->GetNumberInstance(); i++) 4998 { 4999 V(vKernel->AddAttributeToVar(var->visaGenVariable[i], "PayloadLiveOut", 0, nullptr)); 5000 } 5001 } 5002 AvoidRetryOnSmallSpill() const5003 bool CEncoder::AvoidRetryOnSmallSpill() const 5004 { 5005 CodeGenContext* context = m_program->GetContext(); 5006 return context->type == ShaderType::PIXEL_SHADER && 5007 (m_program->m_dispatchSize == SIMDMode::SIMD8 || m_program->m_dispatchSize == SIMDMode::SIMD16) && 5008 context->m_retryManager.IsFirstTry(); 5009 } 5010 CreateKernelSymbol(const std::string & kernelName,unsigned offset,unsigned size,SProgramOutput::ZEBinFuncSymbolTable & symbols)5011 void CEncoder::CreateKernelSymbol(const std::string& kernelName, unsigned offset, 5012 unsigned size, SProgramOutput::ZEBinFuncSymbolTable& symbols) 5013 { 5014 // kernel symbols are local symbols 5015 symbols.local.emplace_back(vISA::GenSymType::S_KERNEL, offset, size, kernelName); 5016 } 5017 CreateSymbolTable(ValueToSymbolList & symbolTableList)5018 void CEncoder::CreateSymbolTable(ValueToSymbolList& symbolTableList) 5019 { 5020 Module* pModule = m_program->GetContext()->getModule(); 5021 ModuleMetaData* modMD = m_program->GetContext()->getModuleMetaData(); 5022 5023 for (auto& F : pModule->getFunctionList()) 5024 { 5025 // Find all variant function declarations 5026 if (F.isDeclaration() && F.hasFnAttribute("variant-function-decl")) 5027 { 5028 // Parse the function name string 5029 auto [symStr, fName, vecLen] = IGC::ParseVectorVariantFunctionString(F.getName()); 5030 5031 Function* VFDef = pModule->getFunction(fName); 5032 if (VFDef && numLanes(m_program->m_dispatchSize) == vecLen) 5033 { 5034 auto Iter = stackFuncMap.find(VFDef); 5035 IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found"); 5036 5037 vISA::GenSymEntry fEntry; 5038 IGC_ASSERT(F.getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH); 5039 strcpy_s(fEntry.s_name, vISA::MAX_SYMBOL_NAME_LENGTH, F.getName().str().c_str()); 5040 5041 // Query vISA for the function's byte offset within the compiled module 5042 // The actual binary offset data should point to the function definition 5043 VISAFunction* visaFunc = Iter->second; 5044 fEntry.s_type = vISA::GenSymType::S_FUNC; 5045 fEntry.s_offset = (uint32_t)visaFunc->getGenOffset(); 5046 fEntry.s_size = (uint32_t)visaFunc->getGenSize(); 5047 5048 symbolTableList.push_back(std::make_pair(&F, fEntry)); 5049 } 5050 } 5051 // Ignore variant function definitions 5052 else if (F.hasFnAttribute("variant-function-def")) 5053 { 5054 IGC_ASSERT_MESSAGE(F.use_empty(), "This function should never be accessed directly"); 5055 continue; 5056 } 5057 // Find all functions in the module we need to export as symbols 5058 else if (F.hasFnAttribute("referenced-indirectly") && (!F.isDeclaration() || !F.use_empty())) 5059 { 5060 vISA::GenSymEntry fEntry; 5061 IGC_ASSERT(F.getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH); 5062 strcpy_s(fEntry.s_name, vISA::MAX_SYMBOL_NAME_LENGTH, F.getName().str().c_str()); 5063 5064 bool isTrue = false; 5065 if (F.isDeclaration() || isTrue) 5066 { 5067 // If the function is only declared, set as undefined type 5068 fEntry.s_type = vISA::GenSymType::S_UNDEF; 5069 fEntry.s_offset = 0; 5070 fEntry.s_size = 0; 5071 } 5072 else 5073 { 5074 auto Iter = stackFuncMap.find(&F); 5075 IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found"); 5076 5077 // Query vISA for the function's byte offset within the compiled module 5078 VISAFunction* visaFunc = Iter->second; 5079 fEntry.s_type = vISA::GenSymType::S_FUNC; 5080 fEntry.s_offset = (uint32_t)visaFunc->getGenOffset(); 5081 fEntry.s_size = (uint32_t)visaFunc->getGenSize(); 5082 } 5083 symbolTableList.push_back(std::make_pair(&F, fEntry)); 5084 } 5085 } 5086 5087 // Export global symbols 5088 for (auto global : modMD->inlineProgramScopeOffsets) 5089 { 5090 GlobalVariable* pGlobal = global.first; 5091 5092 // Export the symbol if global is external/common linkage, or has uses in the module 5093 bool needSymbol = pGlobal->use_empty() 5094 ? (modMD->compOpt.EnableTakeGlobalAddress && (pGlobal->hasCommonLinkage() || pGlobal->hasExternalLinkage())) 5095 : true; 5096 5097 if (needSymbol) 5098 { 5099 StringRef name = pGlobal->getName(); 5100 unsigned addrSpace = pGlobal->getType()->getAddressSpace(); 5101 IGC_ASSERT(name.size() <= vISA::MAX_SYMBOL_NAME_LENGTH); 5102 5103 vISA::GenSymEntry sEntry; 5104 memset(sEntry.s_name, '0', vISA::MAX_SYMBOL_NAME_LENGTH); 5105 strcpy_s(sEntry.s_name, vISA::MAX_SYMBOL_NAME_LENGTH, name.str().c_str()); 5106 MDNode* md = pGlobal->getMetadata("ConstSampler"); 5107 if (md) 5108 { 5109 // Constant Sampler: s_offset contains the sampler ID 5110 sEntry.s_type = vISA::GenSymType::S_CONST_SAMPLER; 5111 sEntry.s_size = 0; 5112 sEntry.s_offset = static_cast<uint32_t>(global.second); 5113 } 5114 else 5115 { 5116 sEntry.s_type = (addrSpace == ADDRESS_SPACE_GLOBAL) ? vISA::GenSymType::S_GLOBAL_VAR : vISA::GenSymType::S_GLOBAL_VAR_CONST; 5117 sEntry.s_size = int_cast<uint32_t>(pModule->getDataLayout().getTypeAllocSize(pGlobal->getType()->getPointerElementType())); 5118 sEntry.s_offset = static_cast<uint32_t>(global.second); 5119 } 5120 symbolTableList.push_back(std::make_pair(pGlobal, sEntry)); 5121 } 5122 } 5123 } 5124 CreateSymbolTable(void * & buffer,unsigned & bufferSize,unsigned & tableEntries)5125 void CEncoder::CreateSymbolTable(void*& buffer, unsigned& bufferSize, unsigned& tableEntries) 5126 { 5127 buffer = nullptr; 5128 bufferSize = 0; 5129 tableEntries = 0; 5130 5131 ValueToSymbolList symbolTableList; 5132 CreateSymbolTable(symbolTableList); 5133 5134 // Get the data for patch token 5135 if (!symbolTableList.empty()) 5136 { 5137 std::vector<vISA::GenSymEntry> tempBufferData; 5138 // Collect the data just for the symbol table entries 5139 for (auto I : symbolTableList) 5140 { 5141 auto symbolEntry = I.second; 5142 tempBufferData.push_back(symbolEntry); 5143 } 5144 5145 tableEntries = tempBufferData.size(); 5146 bufferSize = tableEntries * sizeof(vISA::GenSymEntry); 5147 buffer = malloc(bufferSize); 5148 IGC_ASSERT_MESSAGE(nullptr != buffer, "Symbol table cannot be allocated"); 5149 memcpy_s(buffer, bufferSize, tempBufferData.data(), bufferSize); 5150 } 5151 } 5152 CreateSymbolTable(SProgramOutput::ZEBinFuncSymbolTable & funcSyms,SOpenCLProgramInfo::ZEBinProgramSymbolTable & programSyms)5153 void CEncoder::CreateSymbolTable(SProgramOutput::ZEBinFuncSymbolTable& funcSyms, 5154 SOpenCLProgramInfo::ZEBinProgramSymbolTable& programSyms) 5155 { 5156 ValueToSymbolList symbolTableList; 5157 CreateSymbolTable(symbolTableList); 5158 5159 // Get the data for zebin 5160 for (auto I : symbolTableList) 5161 { 5162 Value* symbolValue = I.first; 5163 auto symbolEntry = I.second; 5164 5165 if (Function* F = dyn_cast<Function>(symbolValue)) 5166 { 5167 funcSyms.function.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, F->getName().str()); 5168 } 5169 else if (GlobalVariable* G = dyn_cast<GlobalVariable>(symbolValue)) 5170 { 5171 // const sampler 5172 if (symbolEntry.s_type == vISA::GenSymType::S_CONST_SAMPLER) { 5173 funcSyms.sampler.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str()); 5174 } 5175 // global variables 5176 else if (symbolEntry.s_type == vISA::GenSymType::S_GLOBAL_VAR) { 5177 programSyms.global.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str()); 5178 } 5179 // global constants and string literals 5180 else { 5181 Constant* initializer = G->getInitializer(); 5182 ConstantDataSequential* cds = dyn_cast<ConstantDataSequential>(initializer); 5183 if (cds && (cds->isCString() || cds->isString())) 5184 programSyms.globalStringConst.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str()); 5185 else 5186 programSyms.globalConst.emplace_back((vISA::GenSymType)symbolEntry.s_type, symbolEntry.s_offset, symbolEntry.s_size, G->getName().str()); 5187 } 5188 } 5189 else 5190 { 5191 IGC_ASSERT(0); 5192 } 5193 } 5194 } 5195 CreateRelocationTable(void * & buffer,unsigned & bufferSize,unsigned & tableEntries)5196 void CEncoder::CreateRelocationTable(void*& buffer, unsigned& bufferSize, unsigned& tableEntries) 5197 { 5198 // for patch-token-based binary format 5199 buffer = nullptr; 5200 bufferSize = 0; 5201 tableEntries = 0; 5202 5203 // vISA will directly return the buffer with GenRelocEntry layout 5204 IGC_ASSERT(nullptr != vMainKernel); 5205 V(vMainKernel->GetGenRelocEntryBuffer(buffer, bufferSize, tableEntries)); 5206 IGC_ASSERT((sizeof(vISA::GenRelocEntry) * tableEntries) == bufferSize); 5207 } 5208 CreateRelocationTable(SProgramOutput::RelocListTy & relocations)5209 void CEncoder::CreateRelocationTable(SProgramOutput::RelocListTy& relocations) 5210 { 5211 // for ZEBinary format 5212 IGC_ASSERT(nullptr != vMainKernel); 5213 V(vMainKernel->GetRelocations(relocations)); 5214 } 5215 CreateFuncAttributeTable(void * & buffer,unsigned & bufferSize,unsigned & tableEntries,SProgramOutput::FuncAttrListTy & attrs)5216 void CEncoder::CreateFuncAttributeTable(void*& buffer, unsigned& bufferSize, 5217 unsigned& tableEntries, SProgramOutput::FuncAttrListTy& attrs) 5218 { 5219 buffer = nullptr; 5220 bufferSize = 0; 5221 tableEntries = 0; 5222 5223 std::vector<vISA::GenFuncAttribEntry> attribTable; 5224 for (auto it : funcAttributeMap) 5225 { 5226 vISA::GenFuncAttribEntry entry; 5227 Function* F = it.first; 5228 IGC_ASSERT(nullptr != F); 5229 IGC_ASSERT(F->getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH); 5230 strcpy_s(entry.f_name, vISA::MAX_SYMBOL_NAME_LENGTH, F->getName().str().c_str()); 5231 entry.f_isKernel = it.second.isKernel ? 1 : 0; 5232 entry.f_hasBarrier = it.second.hasBarrier ? 1 : 0; 5233 entry.f_privateMemPerThread = (uint32_t) (it.second.argumentStackSize + it.second.allocaStackSize); 5234 5235 // Get spill mem usage from visa 5236 VISAKernel* visaFunc = nullptr; 5237 if (it.second.isKernel) 5238 { 5239 visaFunc = vMainKernel; 5240 } 5241 else 5242 { 5243 auto Iter = stackFuncMap.find(F); 5244 IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found"); 5245 visaFunc = Iter->second; 5246 } 5247 FINALIZER_INFO* jitInfo; 5248 visaFunc->GetJitInfo(jitInfo); 5249 entry.f_spillMemPerThread = jitInfo->spillMemUsed; 5250 5251 attrs.emplace_back(entry.f_isKernel, entry.f_hasBarrier, entry.f_privateMemPerThread, 5252 entry.f_spillMemPerThread, F->getName().str()); 5253 attribTable.push_back(entry); 5254 } 5255 5256 if (!attribTable.empty()) 5257 { 5258 tableEntries = attribTable.size(); 5259 bufferSize = tableEntries * sizeof(vISA::GenFuncAttribEntry); 5260 buffer = malloc(bufferSize); 5261 IGC_ASSERT_MESSAGE(nullptr != buffer, "Table cannot be allocated"); 5262 memcpy_s(buffer, bufferSize, attribTable.data(), bufferSize); 5263 } 5264 } 5265 Compile(bool hasSymbolTable)5266 void CEncoder::Compile(bool hasSymbolTable) 5267 { 5268 IGC_ASSERT(nullptr != m_program); 5269 CodeGenContext* const context = m_program->GetContext(); 5270 SProgramOutput* const pOutput = m_program->ProgramOutput(); 5271 5272 if (m_program->m_dispatchSize == SIMDMode::SIMD8) 5273 { 5274 MEM_SNAPSHOT(IGC::SMS_AFTER_CISACreateDestroy_SIMD8); 5275 } 5276 else if (m_program->m_dispatchSize == SIMDMode::SIMD16) 5277 { 5278 MEM_SNAPSHOT(IGC::SMS_AFTER_CISACreateDestroy_SIMD16); 5279 } 5280 else if (m_program->m_dispatchSize == SIMDMode::SIMD32) 5281 { 5282 MEM_SNAPSHOT(IGC::SMS_AFTER_CISACreateDestroy_SIMD32); 5283 } 5284 5285 int vIsaCompile = 0; 5286 VISAKernel* pMainKernel = nullptr; 5287 5288 // ShaderOverride for .visaasm files 5289 std::vector<std::string> visaOverrideFiles; 5290 bool visaAsmOverride = false; 5291 std::string kernelName; 5292 if (IGC_IS_FLAG_ENABLED(ShaderOverride)) 5293 { 5294 // Kernel count is one per visaBuilder 5295 // Function count depends on stackFuncMap size 5296 int kernelCount = 1; 5297 int functionCount = stackFuncMap.size(); 5298 int count = kernelCount + functionCount; 5299 IGC::Debug::OutputFolderName folder = IGC::Debug::GetShaderOverridePath(); 5300 Debug::DumpName name = IGC::Debug::GetDumpNameObj(m_program, "visaasm"); 5301 kernelName = name.GetKernelName(); 5302 5303 visaOverrideFiles.push_back(name.AbsolutePath(folder)); 5304 5305 for (int i = 0; i < functionCount; i++) 5306 { 5307 std::string tmpVisaFile = name.AbsolutePath(folder); 5308 std::string::size_type asmNameEnd = tmpVisaFile.find_last_of('.'); 5309 tmpVisaFile = tmpVisaFile.substr(0, asmNameEnd); 5310 std::stringstream asmName; 5311 asmName << tmpVisaFile; 5312 asmName << "_f"; 5313 asmName << i; 5314 asmName << ".visaasm"; 5315 visaOverrideFiles.push_back(asmName.str()); 5316 } 5317 5318 if (visaOverrideFiles.size() == count) 5319 { 5320 for (const std::string& file : visaOverrideFiles) 5321 { 5322 FILE* tempFile = fopen(file.c_str(), "r"); 5323 if (tempFile) 5324 { 5325 visaAsmOverride = true; 5326 fclose(tempFile); 5327 } 5328 else 5329 { 5330 visaAsmOverride = false; 5331 if (functionCount > 0) 5332 { 5333 std::string message = "Cannot open overridden file! Put all .visaasm files in ShaderOverride dir."; 5334 appendToShaderOverrideLogFile(message, "WARNING: "); 5335 } 5336 break; 5337 5338 } 5339 } 5340 } 5341 } 5342 5343 // Compile generated VISA text string for inlineAsm 5344 if (m_hasInlineAsm || visaAsmOverride) 5345 { 5346 llvm::SmallVector<const char*, 10> params; 5347 llvm::SmallVector<std::unique_ptr< char, std::function<void(char*)>>, 10> params2; 5348 InitBuildParams(params2); 5349 for (const auto &ptr : params2) 5350 { 5351 params.push_back(ptr.get()); 5352 } 5353 5354 // Create a new builder for parsing the visaasm 5355 TARGET_PLATFORM VISAPlatform = GetVISAPlatform(&(context->platform)); 5356 V(CreateVISABuilder(vAsmTextBuilder, vISA_ASM_READER, VISA_BUILDER_BOTH, VISAPlatform, 5357 params.size(), params.data(), &m_vISAWaTable)); 5358 // Use the same build options as before, except that we enable vISA verifier to catch 5359 // potential errors in user inline assembly 5360 SetBuilderOptions(vAsmTextBuilder); 5361 vAsmTextBuilder->SetOption(vISA_NoVerifyvISA, false); 5362 5363 bool vISAAsmParseError = false; 5364 // Parse the generated VISA text 5365 if (visaAsmOverride) 5366 { 5367 for (const std::string& tmpVisaFile : visaOverrideFiles) 5368 { 5369 llvm::SmallVector<char, 1024> visaAsmNameVector; 5370 std::string visaAsmName = GetDumpFileName(""); 5371 5372 StringRef visaAsmNameRef(visaAsmName.c_str()); 5373 StringRef tmpVisaFileRef(tmpVisaFile.c_str()); 5374 StringRef directory = llvm::sys::path::parent_path(visaAsmNameRef); 5375 StringRef filename = llvm::sys::path::filename(tmpVisaFileRef); 5376 5377 llvm::sys::path::append(visaAsmNameVector, directory, filename); 5378 visaAsmName = std::string(visaAsmNameVector.begin(), visaAsmNameVector.end()); 5379 5380 auto result = vAsmTextBuilder->ParseVISAText(tmpVisaFile.c_str()); 5381 appendToShaderOverrideLogFile(visaAsmName, "OVERRIDEN: "); 5382 vISAAsmParseError = (result != 0); 5383 if (vISAAsmParseError) { 5384 IGC_ASSERT_MESSAGE(0, "visaasm file parse error!"); 5385 break; 5386 } 5387 } 5388 // After call to ParseVISAText, we have new VISAKernel, which don't have asm path set. 5389 // So we need to set the OutputAsmPath attribute of overridden kernel, 5390 // otherwise, we will not get .visaasm dump and .asm file dump 5391 auto kernelName = IGC::Debug::GetDumpNameObj(m_program, "").GetKernelName(); 5392 std::string asmName = GetDumpFileName("asm"); 5393 auto overriddenKernel = vAsmTextBuilder->GetVISAKernel(kernelName); 5394 overriddenKernel->AddKernelAttribute("OutputAsmPath", asmName.length(), asmName.c_str()); 5395 5396 // We need to update stackFuncMap for the symbol table for the overridden object, 5397 // because stackFuncMap contains information about functions for original object. 5398 // Only the IndirectlyCalled functions should be updated, 5399 // because these functions can be used in CreateSymbolTable. 5400 // Other normal stack call functions aren't used in CreateSymbolTable. 5401 if (hasSymbolTable && stackFuncMap.size() > 0) 5402 { 5403 Module* pModule = m_program->GetContext()->getModule(); 5404 for (auto& F : pModule->getFunctionList()) 5405 { 5406 if (F.hasFnAttribute("referenced-indirectly") && (!F.isDeclaration() || !F.use_empty())) 5407 { 5408 auto Iter = stackFuncMap.find(&F); 5409 IGC_ASSERT_MESSAGE(Iter != stackFuncMap.end(), "vISA function not found"); 5410 5411 VISAFunction* original = Iter->second; 5412 stackFuncMap[&F] = static_cast<VISAFunction*>(vAsmTextBuilder->GetVISAKernel(original->getFunctionName())); 5413 } 5414 } 5415 } 5416 } 5417 else 5418 { 5419 std::string parseTextFile = GetDumpFileName("inline.visaasm"); 5420 auto result = vAsmTextBuilder->ParseVISAText(vbuilder->GetAsmTextStream().str(), parseTextFile); 5421 if (result != 0) 5422 { 5423 std::string output; 5424 raw_string_ostream S(output); 5425 S << "parsing vISA inline assembly failed:\n" << vAsmTextBuilder->GetCriticalMsg(); 5426 S.flush(); 5427 context->EmitError(output.c_str(), nullptr); 5428 vISAAsmParseError = true; 5429 } 5430 } 5431 5432 if (vISAAsmParseError) 5433 { 5434 COMPILER_TIME_END(m_program->GetContext(), TIME_CG_vISACompile); 5435 return; 5436 } 5437 else 5438 { 5439 if (!visaAsmOverride) 5440 { 5441 // vISA verifier is already invoked in ParseVISAText earlier 5442 vAsmTextBuilder->SetOption(vISA_NoVerifyvISA, true); 5443 } 5444 pMainKernel = vAsmTextBuilder->GetVISAKernel(kernelName); 5445 std::stringstream ss; 5446 vIsaCompile = vAsmTextBuilder->Compile( 5447 m_enableVISAdump ? GetDumpFileName("isa").c_str() : "", 5448 (context->m_compileToVISAOnly) ? &ss : nullptr, 5449 context->m_compileToVISAOnly); 5450 } 5451 } 5452 //Compile to generate the V-ISA binary 5453 else 5454 { 5455 pMainKernel = vMainKernel; 5456 std::stringstream ss; 5457 vIsaCompile = vbuilder->Compile( 5458 m_enableVISAdump ? GetDumpFileName("isa").c_str() : "", 5459 (context->m_compileToVISAOnly) ? &ss : nullptr, 5460 context->m_compileToVISAOnly); 5461 } 5462 5463 COMPILER_TIME_END(m_program->GetContext(), TIME_CG_vISACompile); 5464 5465 #if GET_TIME_STATS 5466 // handle the vISA time counters differently here 5467 if (context->m_compilerTimeStats) 5468 { 5469 context->m_compilerTimeStats->recordVISATimers(); 5470 } 5471 #endif 5472 KERNEL_INFO* vISAstats; 5473 pMainKernel->GetKernelInfo(vISAstats); 5474 // Collect metrics from vISA 5475 context->metrics.CollectRegStats(vISAstats); 5476 5477 FINALIZER_INFO* jitInfo = nullptr; 5478 pMainKernel->GetJitInfo(jitInfo); 5479 5480 // Depend on vISA information about barriers presence to make sure that it's 5481 // always set properly, even if a barrier is used as a part of Inline vISA code only. 5482 if (jitInfo->usesBarrier) 5483 { 5484 m_program->SetHasBarrier(); 5485 } 5486 5487 if (jitInfo->isSpill) 5488 { 5489 context->m_retryManager.SetSpillSize(jitInfo->numGRFSpillFill); 5490 m_program->m_spillSize = jitInfo->numGRFSpillFill; 5491 m_program->m_spillCost = 5492 float(jitInfo->numGRFSpillFill) / jitInfo->numAsmCount; 5493 5494 context->m_retryManager.numInstructions = jitInfo->numAsmCount; 5495 } 5496 5497 if (IGC_IS_FLAG_ENABLED(DumpCompilerStats)) 5498 { 5499 CompilerStats CompilerStats; 5500 pMainKernel->GetCompilerStats(CompilerStats); 5501 CompilerStatsUtils::RecordCodeGenCompilerStats(context, m_program->m_dispatchSize, CompilerStats, jitInfo); 5502 } 5503 5504 if (vIsaCompile == -1) 5505 { 5506 IGC_ASSERT_MESSAGE(0, "CM failure in vbuilder->Compile()"); 5507 } 5508 else if (vIsaCompile == -2) 5509 { 5510 IGC_ASSERT_MESSAGE(0, "CM user error in vbuilder->Compile()"); 5511 } 5512 else if (vIsaCompile == -3) // CM early terminates on spill 5513 { 5514 #if (GET_SHADER_STATS) 5515 if (m_program->m_dispatchSize == SIMDMode::SIMD8) 5516 { 5517 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_EARLYEXIT8, 1); 5518 } 5519 else if (m_program->m_dispatchSize == SIMDMode::SIMD16) 5520 { 5521 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_EARLYEXIT16, 1); 5522 } 5523 else if (m_program->m_dispatchSize == SIMDMode::SIMD32) 5524 { 5525 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_EARLYEXIT32, 1); 5526 } 5527 #endif 5528 context->SetSIMDInfo(SIMD_SKIP_SPILL, m_program->m_dispatchSize, m_program->m_ShaderDispatchMode); 5529 return; 5530 } 5531 5532 if (m_program->m_dispatchSize == SIMDMode::SIMD8) 5533 { 5534 MEM_SNAPSHOT(IGC::SMS_AFTER_vISACompile_SIMD8); 5535 SimdSize8++; 5536 } 5537 else if (m_program->m_dispatchSize == SIMDMode::SIMD16) 5538 { 5539 MEM_SNAPSHOT(IGC::SMS_AFTER_vISACompile_SIMD16); 5540 SimdSize16++; 5541 } 5542 else if (m_program->m_dispatchSize == SIMDMode::SIMD32) 5543 { 5544 MEM_SNAPSHOT(IGC::SMS_AFTER_vISACompile_SIMD32); 5545 SimdSize32++; 5546 } 5547 5548 if (m_program->m_dispatchSize == SIMDMode::SIMD16) 5549 { 5550 uint sendStallCycle = 0; 5551 uint staticCycle = 0; 5552 for (uint i = 0; i < jitInfo->BBNum; i++) 5553 { 5554 sendStallCycle += jitInfo->BBInfo[i].sendStallCycle; 5555 staticCycle += jitInfo->BBInfo[i].staticCycle; 5556 } 5557 m_program->m_sendStallCycle = sendStallCycle; 5558 m_program->m_staticCycle = staticCycle; 5559 } 5560 5561 if (jitInfo->isSpill && (AvoidRetryOnSmallSpill() || jitInfo->avoidRetry)) 5562 { 5563 context->m_retryManager.Disable(); 5564 } 5565 5566 #if (GET_SHADER_STATS && !PRINT_DETAIL_SHADER_STATS) 5567 if (m_program->m_dispatchSize == SIMDMode::SIMD8) 5568 { 5569 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_INST_COUNT, jitInfo->numAsmCount); 5570 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_SPILL8, (int)jitInfo->isSpill); 5571 } 5572 else if (m_program->m_dispatchSize == SIMDMode::SIMD16) 5573 { 5574 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_INST_COUNT_SIMD16, jitInfo->numAsmCount); 5575 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_SPILL16, (int)jitInfo->isSpill); 5576 } 5577 else if (m_program->m_dispatchSize == SIMDMode::SIMD32) 5578 { 5579 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_INST_COUNT_SIMD32, jitInfo->numAsmCount); 5580 COMPILER_SHADER_STATS_SET(m_program->m_shaderStats, STATS_ISA_SPILL32, (int)jitInfo->isSpill); 5581 } 5582 #endif 5583 5584 if (context->m_compileToVISAOnly) { 5585 return; 5586 } 5587 5588 void* genxbin = nullptr; 5589 int size = 0, binSize = 0; 5590 bool binOverride = false; 5591 5592 V(pMainKernel->GetGenxBinary(genxbin, binSize)); 5593 if (IGC_IS_FLAG_ENABLED(ShaderOverride)) 5594 { 5595 Debug::DumpName name = IGC::Debug::GetDumpNameObj(m_program, "asm"); 5596 std::string binFileName = name.overridePath(); 5597 5598 overrideShaderIGA(context->platform.getPlatformInfo(), genxbin, binSize, binFileName, binOverride); 5599 5600 if (!binOverride) 5601 { 5602 name = IGC::Debug::GetDumpNameObj(m_program, "dat"); 5603 binFileName = name.overridePath(); 5604 overrideShaderBinary(genxbin, binSize, binFileName, binOverride); 5605 } 5606 5607 } 5608 5609 IGC_ASSERT(genxbin != nullptr); 5610 size = binSize; 5611 5612 // the kernel has to be padded to have a size aligned on 64 bytes 5613 size_t padding = iSTD::GetAlignmentOffset(size, 64);//m_program->m_Platform->getKernelPointerAlignSize()); 5614 void* kernel = nullptr; 5615 if (size!=0) 5616 { 5617 kernel = IGC::aligned_malloc(size + padding, 16 /* sizeof(DQWORD) */); 5618 memcpy_s(kernel, size + padding, genxbin, binSize); 5619 // pad out the rest with 0s 5620 memset(static_cast<char*>(kernel) + size, 0, padding); 5621 } 5622 if (binOverride) 5623 { 5624 free(genxbin); 5625 } 5626 else 5627 { 5628 freeBlock(genxbin); 5629 } 5630 5631 void* dbgInfo = nullptr; 5632 unsigned int dbgSize = 0; 5633 if (context->m_instrTypes.hasDebugInfo || m_enableVISAdump) 5634 { 5635 void* genxdbgInfo = nullptr; 5636 V(pMainKernel->GetGenxDebugInfo(genxdbgInfo, dbgSize)); 5637 if (m_enableVISAdump) 5638 { 5639 // passing VISAOptions: -generateDebugInfo should 5640 // cause dbg file to be generated, even when 5641 // hasDebugInfo = false. 5642 if (context->m_instrTypes.hasDebugInfo) 5643 { 5644 // assertion check makes sense only if debug info 5645 // is present in input. 5646 IGC_ASSERT(nullptr != genxdbgInfo); 5647 IGC_ASSERT(0 < dbgSize); 5648 } 5649 if (dbgSize > 0) 5650 { 5651 // dump dbg file only if it not empty 5652 std::string debugFileNameStr = IGC::Debug::GetDumpName(m_program, "dbg"); 5653 FILE* const dbgFile = fopen(debugFileNameStr.c_str(), "wb+"); 5654 if (nullptr != dbgFile) 5655 { 5656 fwrite(genxdbgInfo, dbgSize, 1, dbgFile); 5657 fclose(dbgFile); 5658 } 5659 } 5660 } 5661 5662 dbgInfo = IGC::aligned_malloc(dbgSize, sizeof(void*)); 5663 5664 memcpy_s(dbgInfo, dbgSize, genxdbgInfo, dbgSize); 5665 5666 freeBlock(genxdbgInfo); 5667 } 5668 5669 pOutput->m_programBin = kernel; 5670 pOutput->m_programSize = size + padding; 5671 pOutput->m_unpaddedProgramSize = size; 5672 pOutput->m_scratchSpaceUsedBySpills = 0; // initializing 5673 pOutput->m_debugDataGenISA = dbgInfo; 5674 pOutput->m_debugDataGenISASize = dbgSize; 5675 pOutput->m_InstructionCount = jitInfo->numAsmCount; 5676 pOutput->m_BasicBlockCount = jitInfo->BBNum; 5677 if (context->getModuleMetaData()->compOpt.CaptureCompilerStats) 5678 { 5679 ReportCompilerStatistics(pMainKernel, pOutput); 5680 } 5681 5682 pMainKernel->GetGTPinBuffer(pOutput->m_gtpinBuffer, pOutput->m_gtpinBufferSize); 5683 5684 bool ZEBinEnabled = IGC_IS_FLAG_ENABLED(EnableZEBinary) || context->getCompilerOption().EnableZEBinary; 5685 5686 if (hasSymbolTable) 5687 { 5688 if (ZEBinEnabled) 5689 { 5690 // we can only support zebin symbols for OPENCL_SHADER for now 5691 IGC_ASSERT(context->type == ShaderType::OPENCL_SHADER); 5692 auto cl_context = static_cast<OpenCLProgramContext*>(context); 5693 CreateSymbolTable(pOutput->m_symbols, 5694 cl_context->m_programInfo.m_zebinSymbolTable); 5695 } 5696 else 5697 { 5698 CreateSymbolTable(pOutput->m_funcSymbolTable, 5699 pOutput->m_funcSymbolTableSize, 5700 pOutput->m_funcSymbolTableEntries); 5701 } 5702 } 5703 5704 if (ZEBinEnabled) 5705 { 5706 // create symbols for kernel. 5707 // The kernel Symbol has the same name as the kernel, and offset 5708 // pointed to 0. 5709 CreateKernelSymbol(m_program->entry->getName().str(), 0, 5710 (unsigned)pMainKernel->getGenSize(), pOutput->m_symbols); 5711 5712 // Emit symbol "_entry' as the actual kernel start. Maybe we can 5713 // consider to use the value of the _main label in this case. Now 5714 // set the symbol value as the max offset next to the per-thread 5715 // prolog, the cross-thread prolog, or the compute-FFID prolog. 5716 unsigned actual_kernel_start_off = 5717 std::max(std::max(jitInfo->offsetToSkipPerThreadDataLoad, 5718 jitInfo->offsetToSkipCrossThreadDataLoad), 5719 jitInfo->offsetToSkipSetFFIDGP1); 5720 CreateKernelSymbol("_entry", actual_kernel_start_off, 5721 (unsigned)pMainKernel->getGenSize() - actual_kernel_start_off, pOutput->m_symbols); 5722 } 5723 5724 if (ZEBinEnabled) 5725 { 5726 CreateRelocationTable(pOutput->m_relocs); 5727 } 5728 else 5729 { 5730 CreateRelocationTable(pOutput->m_funcRelocationTable, 5731 pOutput->m_funcRelocationTableSize, 5732 pOutput->m_funcRelocationTableEntries); 5733 } 5734 5735 if (IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching)) 5736 { 5737 CreateFuncAttributeTable(pOutput->m_funcAttributeTable, 5738 pOutput->m_funcAttributeTableSize, 5739 pOutput->m_funcAttributeTableEntries, 5740 pOutput->m_funcAttrs); 5741 } 5742 5743 if (jitInfo->isSpill == true) 5744 { 5745 pOutput->m_scratchSpaceUsedBySpills = jitInfo->spillMemUsed; 5746 } 5747 5748 pOutput->setScratchSpaceUsedByShader(m_program->m_ScratchSpaceSize); 5749 5750 pOutput->m_scratchSpaceUsedByGtpin = jitInfo->numBytesScratchGtpin; 5751 5752 pOutput->m_offsetToSkipPerThreadDataLoad = jitInfo->offsetToSkipPerThreadDataLoad; 5753 5754 pOutput->m_offsetToSkipSetFFIDGP = jitInfo->offsetToSkipSetFFIDGP; 5755 5756 pOutput->m_numGRFTotal = jitInfo->numGRFTotal; 5757 } 5758 DestroyVISABuilder()5759 void CEncoder::DestroyVISABuilder() 5760 { 5761 if (vAsmTextBuilder != nullptr) 5762 { 5763 V(::DestroyVISABuilder(vAsmTextBuilder)); 5764 vAsmTextBuilder = nullptr; 5765 } 5766 V(::DestroyVISABuilder(vbuilder)); 5767 vbuilder = nullptr; 5768 } 5769 Copy(CVariable * dst,CVariable * src)5770 void CEncoder::Copy(CVariable* dst, CVariable* src) 5771 { 5772 IGC_ASSERT(nullptr != dst); 5773 IGC_ASSERT(nullptr != src); 5774 // undef value are not copied 5775 if (!src->IsUndef() || IGC_IS_FLAG_ENABLED(InitializeUndefValueEnable)) 5776 { 5777 CVariable* rawDst = dst; 5778 IGC_ASSERT(GetCISADataTypeSize(src->GetType()) == GetCISADataTypeSize(dst->GetType())); 5779 bool isVecImm = src->IsImmediate() && (src->GetType() == ISA_TYPE_UV || 5780 src->GetType() == ISA_TYPE_V || 5781 src->GetType() == ISA_TYPE_VF); 5782 if (src->GetType() != dst->GetType() && !isVecImm) 5783 { 5784 rawDst = m_program->BitCast(dst, src->GetType()); 5785 } 5786 DataMov(ISA_MOV, rawDst, src); 5787 } 5788 } 5789 BoolToInt(CVariable * dst,CVariable * src)5790 void CEncoder::BoolToInt(CVariable* dst, CVariable* src) 5791 { 5792 IGC_ASSERT(nullptr != dst); 5793 IGC_ASSERT(nullptr != src); 5794 IGC_ASSERT(src->GetType() == ISA_TYPE_BOOL); 5795 5796 VISA_Type dstType = dst->GetType(); 5797 IGC_ASSERT((dstType == ISA_TYPE_UD) || (dstType == ISA_TYPE_D) || (dstType == ISA_TYPE_UB) || (dstType == ISA_TYPE_B) || (dstType == ISA_TYPE_UW) || (dstType == ISA_TYPE_W)); 5798 5799 // undef value are not copied 5800 if (!src->IsUndef() || IGC_IS_FLAG_ENABLED(InitializeUndefValueEnable)) { 5801 // Casting 'dst' to BOOL is unnecessary. 5802 DataMov(ISA_MOV, dst, src); 5803 } 5804 } 5805 GatherA64(CVariable * dst,CVariable * offset,unsigned elemSize,unsigned numElems)5806 void CEncoder::GatherA64( 5807 CVariable* dst, 5808 CVariable* offset, 5809 unsigned elemSize, 5810 unsigned numElems) 5811 { 5812 IGC_ASSERT_MESSAGE((elemSize == 8) || (elemSize == 32) || (elemSize == 64), 5813 "Only B/DW/QW-sized elements are supported!"); 5814 IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4) || ((numElems == 8) && ((elemSize == 32) || m_program->m_Platform->has8ByteA64ByteScatteredMessage())), 5815 "Only 1/2/4/8 elements are supported!"); 5816 5817 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 5818 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 5819 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 5820 5821 SIMDMode thisSM = offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize; 5822 if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && thisSM == SIMDMode::SIMD16) 5823 { 5824 // BDW A64 gather does not support SIMD16, split it into 2 SIMD8 5825 VISA_EMask_Ctrl execMask = GetAluEMask(offset); 5826 VISA_Exec_Size fromExecSize = EXEC_SIZE_16; 5827 VISA_Exec_Size toExecSize = EXEC_SIZE_8; 5828 5829 if (numElems == 1 || elemSize == 8) 5830 { // No mov instructions (for packing) are needed. 5831 for (unsigned p = 0; p < 2; ++p) 5832 { 5833 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 5834 dstOpnd = GetRawDestination(dst, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, dst)); 5835 5836 V(vKernel->AppendVISASvmGatherInst( 5837 predOpnd, 5838 SplitEMask(fromExecSize, toExecSize, p, execMask), 5839 toExecSize, 5840 visaBlockType(elemSize), 5841 visaBlockNum(numElems), 5842 addressOpnd, dstOpnd)); 5843 } 5844 } 5845 else 5846 { 5847 // Do two SIMD8 gather and then merge (pack) the two simd8 results 5848 // to form the single simd16 payload. 5849 CVariable* V0, * V1; 5850 uint16_t newNumElems = (uint16_t)8 * numElems; 5851 V0 = m_program->GetNewVariable( 5852 newNumElems, 5853 dst->GetType(), 5854 dst->GetAlign(), 5855 dst->IsUniform(), 5856 dst->getName()); 5857 V1 = m_program->GetNewVariable( 5858 newNumElems, 5859 dst->GetType(), 5860 dst->GetAlign(), 5861 dst->IsUniform(), 5862 dst->getName()); 5863 5864 for (unsigned p = 0; p < 2; ++p) 5865 { 5866 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 5867 dstOpnd = GetRawDestination(p == 0 ? V0 : V1); 5868 5869 V(vKernel->AppendVISASvmGatherInst( 5870 predOpnd, 5871 SplitEMask(fromExecSize, toExecSize, p, execMask), 5872 toExecSize, 5873 visaBlockType(elemSize), 5874 visaBlockNum(numElems), 5875 addressOpnd, dstOpnd)); 5876 } 5877 5878 uint32_t dstOfstBytes = dst->GetAliasOffset() + m_encoderState.m_dstOperand.subVar * getGRFSize(); 5879 MergePayloadToHigherSIMD(V0, V1, numElems, dst, dstOfstBytes, 16); 5880 } 5881 return; 5882 } 5883 5884 V(vKernel->AppendVISASvmGatherInst(predOpnd, GetAluEMask(offset), 5885 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize), 5886 visaBlockType(elemSize), 5887 visaBlockNum(numElems), 5888 addressOpnd, dstOpnd)); 5889 } 5890 ScatterA64(CVariable * src,CVariable * offset,unsigned elemSize,unsigned numElems)5891 void CEncoder::ScatterA64(CVariable* src, 5892 CVariable* offset, 5893 unsigned elemSize, 5894 unsigned numElems) { 5895 IGC_ASSERT_MESSAGE((elemSize == 8) || (elemSize == 32) || (elemSize == 64), 5896 "Only B/DW/QW-sized elements are supported!"); 5897 IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4) || ((numElems == 8) && ((elemSize == 32) || m_program->m_Platform->has8ByteA64ByteScatteredMessage())), 5898 "Only 1/2/4/8 elements are supported!"); 5899 5900 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 5901 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 5902 VISA_RawOpnd* srcOpnd = GetRawSource(src); 5903 5904 SIMDMode thisSM = offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize; 5905 if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && thisSM == SIMDMode::SIMD16) 5906 { 5907 // BDW A64 scatter does not support SIMD16, split it into 2 SIMD8 5908 VISA_EMask_Ctrl execMask = GetAluEMask(offset); 5909 VISA_Exec_Size fromExecSize = EXEC_SIZE_16; 5910 VISA_Exec_Size toExecSize = EXEC_SIZE_8; 5911 5912 if (numElems == 1 || elemSize == 8) 5913 { // No unpacking (mov instructions) are needed. 5914 for (unsigned p = 0; p < 2; ++p) 5915 { 5916 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 5917 srcOpnd = GetRawSource(src, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, src)); 5918 V(vKernel->AppendVISASvmScatterInst( 5919 predOpnd, 5920 SplitEMask(fromExecSize, toExecSize, p, execMask), 5921 toExecSize, 5922 visaBlockType(elemSize), 5923 visaBlockNum(numElems), 5924 addressOpnd, srcOpnd)); 5925 } 5926 } 5927 else 5928 { 5929 // Unpacking the original simd16 data payload to form the two simd8 5930 // data payload by splitting the original simd16 data payload. 5931 CVariable* V0, * V1; 5932 uint16_t newNumElems = (uint16_t)8 * numElems; 5933 V0 = m_program->GetNewVariable( 5934 newNumElems, 5935 src->GetType(), 5936 src->GetAlign(), 5937 src->IsUniform(), 5938 CName::NONE); 5939 V1 = m_program->GetNewVariable( 5940 newNumElems, 5941 src->GetType(), 5942 src->GetAlign(), 5943 src->IsUniform(), 5944 CName::NONE); 5945 // Starting offset is calculated from AliasOffset only (subVar not used). 5946 uint32_t srcOfstBytes = src->GetAliasOffset(); 5947 SplitPayloadToLowerSIMD(src, srcOfstBytes, numElems, V0, V1, 16); 5948 5949 for (unsigned p = 0; p < 2; ++p) 5950 { 5951 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 5952 srcOpnd = GetRawSource(p == 0 ? V0 : V1); 5953 5954 V(vKernel->AppendVISASvmScatterInst( 5955 predOpnd, 5956 SplitEMask(fromExecSize, toExecSize, p, execMask), 5957 toExecSize, 5958 visaBlockType(elemSize), 5959 visaBlockNum(numElems), 5960 addressOpnd, srcOpnd)); 5961 } 5962 } 5963 return; 5964 } 5965 5966 V(vKernel->AppendVISASvmScatterInst(predOpnd, GetAluEMask(offset), 5967 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : m_encoderState.m_simdSize), 5968 visaBlockType(elemSize), 5969 visaBlockNum(numElems), 5970 addressOpnd, srcOpnd)); 5971 this->m_program->IncStatelessWritesCount(); 5972 } 5973 ByteGather(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)5974 void CEncoder::ByteGather( 5975 CVariable* dst, 5976 const ResourceDescriptor& resource, 5977 CVariable* offset, 5978 unsigned elemSize, 5979 unsigned numElems) { 5980 IGC_ASSERT_MESSAGE(elemSize == 8, "Only BYTE element is supported!"); 5981 IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4), 5982 "Only 1/2/4 elements are supported!"); 5983 5984 // Extend the offset to 64bits and use the A64 gather message if needed 5985 if ((resource.m_surfaceType == ESURFACE_STATELESS) && 5986 (m_program->m_DriverInfo->NeedWAToTransformA32MessagesToA64()) && 5987 (m_program->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages != 0)) 5988 { 5989 SEncoderState gatherState = CopyEncoderState(); 5990 Push(); 5991 5992 CVariable* offset64 = m_program->GetNewVariable( 5993 offset->GetNumberElement(), 5994 ISA_TYPE_UQ, 5995 EALIGN_GRF, 5996 offset->IsUniform(), 5997 offset->GetNumberInstance(), 5998 CName(offset->getName(), "_64b")); 5999 6000 CVariable* offset32UD = m_program->BitCast(offset, ISA_TYPE_UD); 6001 6002 if (offset->IsUniform()) 6003 { 6004 uint elements = offset->GetNumberElement(); 6005 SetUniformSIMDSize(lanesToSIMDMode(elements)); 6006 SetNoMask(); 6007 SetSrcRegion(0, elements, elements, 1); 6008 } 6009 6010 Cast(offset64, offset32UD); 6011 Push(); 6012 6013 SetEncoderState(gatherState); 6014 GatherA64(dst, offset64, elemSize, numElems); 6015 return; 6016 6017 } 6018 6019 VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource); 6020 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6021 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6022 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 6023 6024 VISA_VectorOpnd* globalOffsetOpnd = 0; 6025 int val = 0; 6026 V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD)); 6027 6028 V(vKernel->AppendVISASurfAccessScatterScaledInst(ISA_GATHER_SCALED, 6029 predOpnd, 6030 GetAluEMask(offset), 6031 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : 6032 m_encoderState.m_simdSize), 6033 visaBlockNum(numElems), 6034 surfaceOpnd, 6035 globalOffsetOpnd, 6036 addressOpnd, dstOpnd)); 6037 } 6038 ByteScatter(CVariable * src,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)6039 void CEncoder::ByteScatter( 6040 CVariable* src, 6041 const ResourceDescriptor& resource, 6042 CVariable* offset, 6043 unsigned elemSize, 6044 unsigned numElems) 6045 { 6046 IGC_ASSERT_MESSAGE(elemSize == 8, "Only BYTE element is supported!"); 6047 IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4), 6048 "Only 1/2/4 elements are supported!"); 6049 6050 // Extend the offset to 64bits and use the A64 gather message if needed 6051 if ((resource.m_surfaceType == ESURFACE_STATELESS) && 6052 (m_program->m_DriverInfo->NeedWAToTransformA32MessagesToA64()) && 6053 (m_program->m_Platform->getWATable().WaNoA32ByteScatteredStatelessMessages != 0)) 6054 { 6055 SEncoderState gatherState = CopyEncoderState(); 6056 Push(); 6057 6058 CVariable* offset64 = m_program->GetNewVariable( 6059 offset->GetNumberElement(), 6060 ISA_TYPE_UQ, 6061 EALIGN_GRF, 6062 offset->IsUniform(), 6063 offset->GetNumberInstance(), 6064 CName(offset->getName(), "_64b")); 6065 6066 CVariable* offset32UD = m_program->BitCast(offset, ISA_TYPE_UD); 6067 6068 if (offset->IsUniform()) 6069 { 6070 uint elements = offset->GetNumberElement(); 6071 SetUniformSIMDSize(lanesToSIMDMode(elements)); 6072 SetNoMask(); 6073 SetSrcRegion(0, elements, elements, 1); 6074 } 6075 6076 Cast(offset64, offset32UD); 6077 Push(); 6078 6079 SetEncoderState(gatherState); 6080 ScatterA64(src, offset64, elemSize, numElems); 6081 return; 6082 6083 } 6084 6085 VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource); 6086 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6087 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6088 VISA_RawOpnd* srcOpnd = GetRawSource(src); 6089 6090 VISA_VectorOpnd* globalOffsetOpnd = 0; 6091 int val = 0; 6092 V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD)); 6093 6094 V(vKernel->AppendVISASurfAccessScatterScaledInst(ISA_SCATTER_SCALED, 6095 predOpnd, 6096 GetAluEMask(offset), 6097 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : 6098 m_encoderState.m_simdSize), 6099 visaBlockNum(numElems), 6100 surfaceOpnd, 6101 globalOffsetOpnd, 6102 addressOpnd, srcOpnd)); 6103 } 6104 Gather4ScaledNd(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset,unsigned nd)6105 void CEncoder::Gather4ScaledNd(CVariable* dst, 6106 const ResourceDescriptor& resource, 6107 CVariable* offset, 6108 unsigned nd) { 6109 6110 VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource); 6111 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6112 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6113 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 6114 6115 VISA_VectorOpnd* globalOffsetOpnd = 0; 6116 int val = 0; 6117 V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD)); 6118 6119 V(vKernel->AppendVISASurfAccessGather4Scatter4ScaledInst( 6120 ISA_GATHER4_SCALED, 6121 predOpnd, 6122 GetAluEMask(dst), 6123 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : 6124 m_encoderState.m_simdSize), 6125 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6126 surfaceOpnd, 6127 globalOffsetOpnd, 6128 addressOpnd, dstOpnd)); 6129 } 6130 getNumChannels(CVariable * var) const6131 uint32_t CEncoder::getNumChannels(CVariable* var) const 6132 { 6133 IGC_ASSERT(nullptr != var); 6134 unsigned nd = var->GetSize(); 6135 if (var->IsUniform()) 6136 { 6137 IGC_ASSERT_MESSAGE(nd <= getGRFSize(), "Unknown Variable Size!"); 6138 return 1; 6139 } 6140 else 6141 { 6142 static_assert(0 < SIZE_DWORD); 6143 6144 switch (m_encoderState.m_simdSize) 6145 { 6146 case SIMDMode::SIMD8: 6147 return nd / (8 * SIZE_DWORD); 6148 case SIMDMode::SIMD16: 6149 return nd / (16 * SIZE_DWORD); 6150 case SIMDMode::SIMD32: 6151 return nd / (32 * SIZE_DWORD); 6152 default: 6153 IGC_ASSERT_MESSAGE(0, "Unknown SIMD size!"); 6154 return 1; 6155 } 6156 } 6157 return 1; 6158 } 6159 Gather4Scaled(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset)6160 void CEncoder::Gather4Scaled(CVariable* dst, 6161 const ResourceDescriptor& resource, 6162 CVariable* offset) 6163 { 6164 unsigned nd = getNumChannels(dst); 6165 Gather4ScaledNd(dst, resource, offset, nd); 6166 } 6167 Scatter4Scaled(CVariable * src,const ResourceDescriptor & resource,CVariable * offset)6168 void CEncoder::Scatter4Scaled(CVariable* src, 6169 const ResourceDescriptor& resource, 6170 CVariable* offset) { 6171 unsigned nd = getNumChannels(src); 6172 6173 VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource); 6174 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6175 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6176 VISA_RawOpnd* srcOpnd = GetRawSource(src); 6177 6178 VISA_VectorOpnd* globalOffsetOpnd = 0; 6179 int val = 0; 6180 V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD)); 6181 6182 V(vKernel->AppendVISASurfAccessGather4Scatter4ScaledInst( 6183 ISA_SCATTER4_SCALED, 6184 predOpnd, 6185 GetAluEMask(src), 6186 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : 6187 m_encoderState.m_simdSize), 6188 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6189 surfaceOpnd, 6190 globalOffsetOpnd, 6191 addressOpnd, srcOpnd)); 6192 if (ESURFACE_STATELESS == resource.m_surfaceType) 6193 { 6194 this->m_program->IncStatelessWritesCount(); 6195 } 6196 } 6197 Gather4A64(CVariable * dst,CVariable * offset)6198 void CEncoder::Gather4A64(CVariable* dst, CVariable* offset) { 6199 IGC_ASSERT(nullptr != dst); 6200 IGC_ASSERT_MESSAGE(dst->GetElemSize() == 4, "Gather4 must have 4-byte element"); 6201 6202 uint32_t dstOfstBytes = m_encoderState.m_dstOperand.subVar * getGRFSize() + dst->GetAliasOffset(); 6203 unsigned nd = dst->GetSize(); 6204 switch (m_encoderState.m_simdSize) { 6205 case SIMDMode::SIMD8: 6206 nd = nd / (8 * SIZE_DWORD); 6207 break; 6208 case SIMDMode::SIMD16: 6209 nd = nd / (16 * SIZE_DWORD); 6210 break; 6211 case SIMDMode::SIMD32: 6212 nd = nd / (32 * SIZE_DWORD); 6213 break; 6214 default: 6215 IGC_ASSERT_MESSAGE(0, "Unknown SIMD size!"); 6216 return; 6217 } 6218 6219 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6220 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6221 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 6222 6223 VISA_VectorOpnd* globalOffsetOpnd = 0; 6224 int val = 0; 6225 V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD)); 6226 6227 if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && m_encoderState.m_simdSize == SIMDMode::SIMD16) 6228 { 6229 // BDW A64 untyped does not support SIMD16, split it into 2 SIMD8 6230 VISA_EMask_Ctrl execMask = GetAluEMask(offset); 6231 VISA_Exec_Size fromExecSize = EXEC_SIZE_16; 6232 VISA_Exec_Size toExecSize = EXEC_SIZE_8; 6233 6234 if (nd == 1) 6235 { 6236 // No packing is needed. 6237 for (unsigned p = 0; p < 2; ++p) 6238 { 6239 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 6240 dstOpnd = GetRawDestination(dst, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, dst)); 6241 6242 V(vKernel->AppendVISASvmGather4ScaledInst( 6243 predOpnd, 6244 SplitEMask(fromExecSize, toExecSize, p, execMask), 6245 toExecSize, 6246 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6247 globalOffsetOpnd, 6248 addressOpnd, dstOpnd)); 6249 } 6250 } 6251 else 6252 { 6253 // Packing the two SIMD8 data payload to form the SIMD16 data payload 6254 // by merging the two simd8 data payload. 6255 CVariable* V0, * V1; 6256 uint16_t newNumElems = (uint16_t)8 * nd; 6257 V0 = m_program->GetNewVariable( 6258 newNumElems, 6259 dst->GetType(), 6260 dst->GetAlign(), 6261 dst->IsUniform(), 6262 CName(dst->getName(),"_M0")); 6263 V1 = m_program->GetNewVariable( 6264 newNumElems, 6265 dst->GetType(), 6266 dst->GetAlign(), 6267 dst->IsUniform(), 6268 CName(dst->getName(),"_M8")); 6269 6270 for (unsigned p = 0; p < 2; ++p) 6271 { 6272 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 6273 dstOpnd = GetRawDestination(p == 0 ? V0 : V1); 6274 6275 V(vKernel->AppendVISASvmGather4ScaledInst( 6276 predOpnd, 6277 SplitEMask(fromExecSize, toExecSize, p, execMask), 6278 toExecSize, 6279 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6280 globalOffsetOpnd, 6281 addressOpnd, dstOpnd)); 6282 } 6283 6284 MergePayloadToHigherSIMD(V0, V1, nd, dst, dstOfstBytes, 16); 6285 } 6286 return; 6287 } 6288 6289 V(vKernel->AppendVISASvmGather4ScaledInst( 6290 predOpnd, 6291 GetAluEMask(dst), 6292 visaExecSize(m_encoderState.m_simdSize), 6293 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6294 globalOffsetOpnd, 6295 addressOpnd, dstOpnd)); 6296 } 6297 Scatter4A64(CVariable * src,CVariable * offset)6298 void CEncoder::Scatter4A64(CVariable* src, CVariable* offset) { 6299 IGC_ASSERT(nullptr != src); 6300 IGC_ASSERT_MESSAGE(src->GetElemSize() == 4, "scatter4 must have 4-byte element"); 6301 6302 uint32_t srcOfstBytes = src->GetAliasOffset(); 6303 unsigned nd = src->GetSize(); 6304 switch (m_encoderState.m_simdSize) { 6305 case SIMDMode::SIMD8: 6306 nd = nd / (8 * SIZE_DWORD); 6307 break; 6308 case SIMDMode::SIMD16: 6309 nd = nd / (16 * SIZE_DWORD); 6310 break; 6311 case SIMDMode::SIMD32: 6312 nd = nd / (32 * SIZE_DWORD); 6313 break; 6314 default: 6315 IGC_ASSERT_MESSAGE(0, "unknown SIMD size"); 6316 return; 6317 } 6318 6319 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6320 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6321 VISA_RawOpnd* srcOpnd = GetRawSource(src); 6322 6323 VISA_VectorOpnd* globalOffsetOpnd = 0; 6324 int val = 0; 6325 V(vKernel->CreateVISAImmediate(globalOffsetOpnd, &val, ISA_TYPE_UD)); 6326 6327 if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE && m_encoderState.m_simdSize == SIMDMode::SIMD16) 6328 { 6329 // BDW A64 untyped does not support SIMD16, split it into 2 SIMD8 6330 VISA_EMask_Ctrl execMask = GetAluEMask(src); 6331 VISA_Exec_Size fromExecSize = EXEC_SIZE_16; 6332 VISA_Exec_Size toExecSize = EXEC_SIZE_8; 6333 6334 if (nd == 1) 6335 { 6336 // No need to do unpacking 6337 for (unsigned p = 0; p < 2; ++p) 6338 { 6339 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 6340 srcOpnd = GetRawSource(src, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, src)); 6341 6342 V(vKernel->AppendVISASvmScatter4ScaledInst( 6343 predOpnd, 6344 SplitEMask(fromExecSize, toExecSize, p, execMask), 6345 toExecSize, 6346 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6347 globalOffsetOpnd, 6348 addressOpnd, srcOpnd)); 6349 } 6350 } 6351 else 6352 { 6353 // Unpacking is needed from the original SIMD16 data payload to form 6354 // two SIMD8 data payload by spliting the original simd16 data payload. 6355 CVariable* V0, * V1; 6356 uint16_t newNumElems = (uint16_t)8 * nd; 6357 V0 = m_program->GetNewVariable( 6358 newNumElems, 6359 src->GetType(), 6360 src->GetAlign(), 6361 src->IsUniform(), 6362 CName(src->getName(),"_M0")); 6363 V1 = m_program->GetNewVariable( 6364 newNumElems, 6365 src->GetType(), 6366 src->GetAlign(), 6367 src->IsUniform(), 6368 CName(src->getName(),"_M8")); 6369 6370 SplitPayloadToLowerSIMD(src, srcOfstBytes, nd, V0, V1, 16); 6371 6372 for (unsigned p = 0; p < 2; ++p) 6373 { 6374 addressOpnd = GetRawSource(offset, GetRawOpndSplitOffset(fromExecSize, toExecSize, p, offset)); 6375 srcOpnd = GetRawSource(p == 0 ? V0 : V1); 6376 6377 V(vKernel->AppendVISASvmScatter4ScaledInst( 6378 predOpnd, 6379 SplitEMask(fromExecSize, toExecSize, p, execMask), 6380 toExecSize, 6381 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6382 globalOffsetOpnd, 6383 addressOpnd, srcOpnd)); 6384 } 6385 } 6386 return; 6387 } 6388 6389 V(vKernel->AppendVISASvmScatter4ScaledInst( 6390 predOpnd, 6391 GetAluEMask(src), 6392 visaExecSize(m_encoderState.m_simdSize), 6393 ConvertChannelMaskToVisaType(BIT(nd) - 1), 6394 globalOffsetOpnd, 6395 addressOpnd, srcOpnd)); 6396 } 6397 AtomicRawA64(AtomicOp atomic_op,const ResourceDescriptor & resource,CVariable * dst,CVariable * offset,CVariable * src0,CVariable * src1,unsigned short bitwidth)6398 void CEncoder::AtomicRawA64( 6399 AtomicOp atomic_op, 6400 const ResourceDescriptor& resource, 6401 CVariable* dst, 6402 CVariable* offset, 6403 CVariable* src0, 6404 CVariable* src1, 6405 unsigned short bitwidth) 6406 { 6407 // For cmpxchg, we have to change the order of arguments. 6408 if (atomic_op == EATOMIC_CMPXCHG) { 6409 std::swap(src0, src1); 6410 } 6411 6412 VISAAtomicOps atomicOpcode = convertAtomicOpEnumToVisa(atomic_op); 6413 6414 if (m_encoderState.m_simdSize == SIMDMode::SIMD16) 6415 { 6416 // Split SIMD16 atomic ops into two SIMD8 ones. 6417 VISA_EMask_Ctrl execMask = ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask); 6418 VISA_Exec_Size fromExecSize = visaExecSize(m_encoderState.m_simdSize); 6419 VISA_Exec_Size toExecSize = SplitExecSize(fromExecSize, 2); 6420 6421 for (unsigned thePart = 0; thePart != 2; ++thePart) 6422 { 6423 CVariable* rawOpndVar = nullptr; 6424 uint32_t rawOpndOffset = 0; 6425 bool isFirstHalf = thePart == 0; 6426 6427 std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(offset, isFirstHalf, execMask); 6428 VISA_RawOpnd* addressOpnd = GetRawSource(rawOpndVar, rawOpndOffset); 6429 std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(src0, isFirstHalf, execMask); 6430 VISA_RawOpnd* src0Opnd = GetRawSource(rawOpndVar, rawOpndOffset); 6431 std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(src1, isFirstHalf, execMask); 6432 VISA_RawOpnd* src1Opnd = GetRawSource(rawOpndVar, rawOpndOffset); 6433 6434 // dst needs special handling since its move has to come after the send 6435 VISA_RawOpnd* dstOpnd = nullptr; 6436 bool needsTmpDst = !isFirstHalf && dst && (dst->GetElemSize() * 8) % getGRFSize() != 0; 6437 if (!needsTmpDst) 6438 { 6439 std::tie(rawOpndVar, rawOpndOffset) = splitRawOperand(dst, isFirstHalf, execMask); 6440 dstOpnd = GetRawDestination(rawOpndVar, rawOpndOffset); 6441 } 6442 else 6443 { 6444 rawOpndVar = m_program->GetNewVariable( 6445 8, 6446 dst->GetType(), 6447 CVariable::getAlignment(getGRFSize()), 6448 CName(dst->getName(), "_RET")); 6449 dstOpnd = GetRawDestination(rawOpndVar, 0); 6450 } 6451 6452 V(vKernel->AppendVISASvmAtomicInst(GetFlagOperand(m_encoderState.m_flag), 6453 SplitEMask(fromExecSize, toExecSize, thePart, execMask), 6454 toExecSize, atomicOpcode, bitwidth, 6455 addressOpnd, src0Opnd, src1Opnd, dstOpnd)); 6456 this->m_program->IncStatelessWritesCount(); 6457 6458 if (needsTmpDst) 6459 { 6460 SModifier mod; 6461 mod.init(); 6462 mod.subReg = 8; 6463 auto dstOpnd = GetDestinationOperand(dst, mod); 6464 6465 mod.init(); 6466 auto srcOpnd = GetSourceOperand(rawOpndVar, mod); 6467 6468 V(vKernel->AppendVISADataMovementInst( 6469 ISA_MOV, nullptr, false, 6470 SplitEMask(EXEC_SIZE_16, EXEC_SIZE_8, 1, execMask), 6471 EXEC_SIZE_8, dstOpnd, srcOpnd)); 6472 } 6473 } 6474 6475 return; 6476 } 6477 6478 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6479 VISA_RawOpnd* src0Opnd = GetRawSource(src0); 6480 VISA_RawOpnd* src1Opnd = GetRawSource(src1); 6481 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 6482 6483 V(vKernel->AppendVISASvmAtomicInst(GetFlagOperand(m_encoderState.m_flag), 6484 ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask), 6485 visaExecSize(m_encoderState.m_simdSize), 6486 atomicOpcode, 6487 bitwidth, 6488 addressOpnd, 6489 src0Opnd, 6490 src1Opnd, 6491 dstOpnd)); 6492 this->m_program->IncStatelessWritesCount(); 6493 } 6494 Wait()6495 void CEncoder::Wait() 6496 { 6497 V(vKernel->AppendVISAWaitInst(nullptr)); 6498 } 6499 SendVmeIme(CVariable * bindingTableIndex,unsigned char streamMode,unsigned char searchControlMode,CVariable * uniInputVar,CVariable * imeInputVar,CVariable * ref0Var,CVariable * ref1Var,CVariable * costCenterVar,CVariable * outputVar)6500 void CEncoder::SendVmeIme(CVariable* bindingTableIndex, 6501 unsigned char streamMode, 6502 unsigned char searchControlMode, 6503 CVariable* uniInputVar, 6504 CVariable* imeInputVar, 6505 CVariable* ref0Var, 6506 CVariable* ref1Var, 6507 CVariable* costCenterVar, 6508 CVariable* outputVar) { 6509 6510 VISA_StateOpndHandle* surface = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex); 6511 VISA_RawOpnd* uniInput = GetRawSource(uniInputVar); 6512 VISA_RawOpnd* imeInput = GetRawSource(imeInputVar); 6513 VISA_RawOpnd* ref0 = GetRawSource(ref0Var); 6514 VISA_RawOpnd* ref1 = GetRawSource(ref1Var); 6515 VISA_RawOpnd* costCenter = GetRawSource(costCenterVar); 6516 VISA_RawOpnd* output = GetRawDestination(outputVar); 6517 V(vKernel->AppendVISAMiscVME_IME(surface, streamMode, searchControlMode, uniInput, imeInput, ref0, ref1, costCenter, output)); 6518 } 6519 SendVmeFbr(CVariable * bindingTableIndex,CVariable * uniInputVar,CVariable * fbrInputVar,CVariable * FBRMbModeVar,CVariable * FBRSubMbShapeVar,CVariable * FBRSubPredModeVar,CVariable * outputVar)6520 void CEncoder::SendVmeFbr(CVariable* bindingTableIndex, 6521 CVariable* uniInputVar, 6522 CVariable* fbrInputVar, 6523 CVariable* FBRMbModeVar, 6524 CVariable* FBRSubMbShapeVar, 6525 CVariable* FBRSubPredModeVar, 6526 CVariable* outputVar) { 6527 VISA_StateOpndHandle* surface = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex); 6528 VISA_RawOpnd* UNIInput = GetRawSource(uniInputVar); 6529 VISA_RawOpnd* FBRInput = GetRawSource(fbrInputVar); 6530 VISA_VectorOpnd* FBRMbMode = GetSourceOperand(FBRMbModeVar, m_encoderState.m_srcOperand[0]); 6531 VISA_VectorOpnd* FBRSubMbShape = GetSourceOperand(FBRSubMbShapeVar, m_encoderState.m_srcOperand[1]); 6532 VISA_VectorOpnd* FBRSubPredMode = GetSourceOperand(FBRSubPredModeVar, m_encoderState.m_srcOperand[2]); 6533 VISA_RawOpnd* output = GetRawDestination(outputVar); 6534 6535 V(vKernel->AppendVISAMiscVME_FBR(surface, UNIInput, FBRInput, FBRMbMode, FBRSubMbShape, FBRSubPredMode, output)); 6536 } 6537 SendVmeSic(CVariable * bindingTableIndex,CVariable * uniInputVar,CVariable * sicInputVar,CVariable * outputVar)6538 void CEncoder::SendVmeSic( 6539 CVariable* bindingTableIndex, 6540 CVariable* uniInputVar, 6541 CVariable* sicInputVar, 6542 CVariable* outputVar) 6543 { 6544 VISA_StateOpndHandle* surface = GetVISASurfaceOpnd(ESURFACE_NORMAL, bindingTableIndex); 6545 VISA_RawOpnd* UNIInput = GetRawSource(uniInputVar); 6546 VISA_RawOpnd* SICInput = GetRawSource(sicInputVar); 6547 VISA_RawOpnd* output = GetRawDestination(outputVar); 6548 6549 V(vKernel->AppendVISAMiscVME_SIC(surface, UNIInput, SICInput, output)); 6550 } 6551 SendVideoAnalytic(llvm::GenIntrinsicInst * inst,CVariable * vaResult,CVariable * coords,CVariable * size,CVariable * srcImg,CVariable * sampler)6552 void CEncoder::SendVideoAnalytic( 6553 llvm::GenIntrinsicInst* inst, 6554 CVariable* vaResult, 6555 CVariable* coords, 6556 CVariable* size, 6557 CVariable* srcImg, 6558 CVariable* sampler) 6559 { 6560 VISA_RawOpnd* vaOutput = GetRawDestination(vaResult); 6561 6562 SModifier mod0 = m_encoderState.m_srcOperand[0]; 6563 SModifier mod1 = m_encoderState.m_srcOperand[1]; 6564 6565 mod0.specialRegion = mod1.specialRegion = true; 6566 mod0.region[0] = mod1.region[0] = 0; 6567 mod0.region[1] = mod1.region[1] = 1; 6568 mod0.region[2] = mod1.region[2] = 0; 6569 mod0.subReg = 0; 6570 mod0.subVar = 0; 6571 6572 if (coords->IsUniform()) 6573 { 6574 mod1.subReg = 1; 6575 mod1.subVar = 0; 6576 } 6577 else 6578 { 6579 mod1.subReg = 0; 6580 mod1.subVar = 2; 6581 } 6582 6583 VISA_VectorOpnd* uOffset = GetSourceOperand(coords, mod0); 6584 VISA_VectorOpnd* vOffset = GetSourceOperand(coords, mod1); 6585 6586 if (size && size->IsUniform()) 6587 { 6588 mod1.subReg = 1; 6589 mod1.subVar = 0; 6590 } 6591 else 6592 { 6593 mod1.subReg = 0; 6594 mod1.subVar = 2; 6595 } 6596 6597 VISA_VectorOpnd* wSize = (size ? GetSourceOperand(size, mod0) : NULL); 6598 VISA_VectorOpnd* hSize = (size ? GetSourceOperand(size, mod1) : NULL); 6599 6600 // So far we support only one VA function per kernel, and other sample 6601 // messages are not supported when there is VA function within the kernel. 6602 // So, for now it should be fine to always use bti 0 for VA functions. 6603 DWORD btiIndex = 0; 6604 DWORD mmfMode = 0; 6605 6606 VISA_StateOpndHandle* surfaceOpnd = GetBTIOperand(btiIndex); 6607 VISA_StateOpndHandle* samplerHnd = GetSamplerOperand(sampler); 6608 VISA_VectorOpnd* mmModeOpnd = NULL; 6609 6610 EDMode erodeDilateMode = VA_DILATE; 6611 EDExecMode execMode = VA_ED_64x4; 6612 bool isBigKernel = true; 6613 6614 if (m_program->m_Platform->GetPlatformFamily() == IGFX_GEN8_CORE) 6615 { 6616 isBigKernel = false; 6617 } 6618 6619 switch (inst->getIntrinsicID()) 6620 { 6621 case GenISAIntrinsic::GenISA_vaErode: 6622 erodeDilateMode = VA_ERODE; 6623 case GenISAIntrinsic::GenISA_vaDilate: 6624 V(vKernel->AppendVISAVAErodeDilate(erodeDilateMode, samplerHnd, surfaceOpnd, uOffset, vOffset, execMode, vaOutput)); 6625 break; 6626 case GenISAIntrinsic::GenISA_vaMinMaxFilter: 6627 V(vKernel->CreateVISAImmediate(mmModeOpnd, &mmfMode, ISA_TYPE_UD)); 6628 V(vKernel->AppendVISAVAMinMaxFilter(samplerHnd, surfaceOpnd, uOffset, vOffset, AVS_16_FULL, VA_MMF_16x4, mmModeOpnd, vaOutput)); 6629 break; 6630 case GenISAIntrinsic::GenISA_vaConvolveGRF_16x1: 6631 V(vKernel->AppendVISAVAConvolve(samplerHnd, surfaceOpnd, uOffset, vOffset, VA_CONV_16x1, isBigKernel, vaOutput)); 6632 break; 6633 case GenISAIntrinsic::GenISA_vaConvolve: 6634 case GenISAIntrinsic::GenISA_vaConvolveGRF_16x4: 6635 V(vKernel->AppendVISAVAConvolve(samplerHnd, surfaceOpnd, uOffset, vOffset, VA_CONV_16x4, isBigKernel, vaOutput)); 6636 break; 6637 case GenISAIntrinsic::GenISA_vaMinMax: 6638 V(vKernel->CreateVISAImmediate(mmModeOpnd, &mmfMode, ISA_TYPE_UD)); 6639 V(vKernel->AppendVISAVAMinMax(surfaceOpnd, uOffset, vOffset, mmModeOpnd, vaOutput)); 6640 break; 6641 case GenISAIntrinsic::GenISA_vaCentroid: 6642 V(vKernel->AppendVISAVACentroid(surfaceOpnd, uOffset, vOffset, wSize, vaOutput)); 6643 break; 6644 case GenISAIntrinsic::GenISA_vaBoolCentroid: 6645 case GenISAIntrinsic::GenISA_vaBoolSum: 6646 V(vKernel->AppendVISAVABooleanCentroid(surfaceOpnd, uOffset, vOffset, wSize, hSize, vaOutput)); 6647 break; 6648 default: 6649 IGC_ASSERT_MESSAGE(0, "Trying to emit unrecognized video analytic instruction (listed above)"); 6650 break; 6651 }; 6652 } 6653 SetVISAWaTable(WA_TABLE const & waTable)6654 void CEncoder::SetVISAWaTable(WA_TABLE const& waTable) 6655 { 6656 // Copy from driver WA table to VISA WA table, 6657 // then update the conditional W/A 6658 m_vISAWaTable = waTable; 6659 6660 if (m_program->GetShaderType() != ShaderType::PIXEL_SHADER && 6661 m_program->GetShaderType() != ShaderType::COMPUTE_SHADER && 6662 m_program->GetShaderType() != ShaderType::OPENCL_SHADER) 6663 { 6664 m_vISAWaTable.WaClearTDRRegBeforeEOTForNonPS = waTable.WaClearTDRRegBeforeEOTForNonPS; 6665 } 6666 else 6667 { 6668 m_vISAWaTable.WaClearTDRRegBeforeEOTForNonPS = false; 6669 } 6670 6671 if (IGC_IS_FLAG_DISABLED(ForceSendsSupportOnSKLA0)) 6672 { 6673 m_vISAWaTable.WaDisableSendsSrc0DstOverlap = waTable.WaDisableSendsSrc0DstOverlap; 6674 } 6675 else 6676 { 6677 m_vISAWaTable.WaDisableSendsSrc0DstOverlap = false; 6678 } 6679 6680 TODO("Limit this C0 WA as required to only Compute , as it causes hangs in some 3D Workloads"); 6681 if (IGC_IS_FLAG_DISABLED(DisableWaSendSEnableIndirectMsgDesc) && 6682 (m_program->GetShaderType() == ShaderType::COMPUTE_SHADER || 6683 m_program->GetShaderType() == ShaderType::OPENCL_SHADER)) 6684 { 6685 m_vISAWaTable.WaSendSEnableIndirectMsgDesc = waTable.WaSendSEnableIndirectMsgDesc; 6686 } 6687 else 6688 { 6689 m_vISAWaTable.WaSendSEnableIndirectMsgDesc = false; 6690 } 6691 6692 if (IGC_IS_FLAG_DISABLED(DisableWaDisableSIMD16On3SrcInstr)) 6693 { 6694 m_vISAWaTable.WaDisableSIMD16On3SrcInstr = waTable.WaDisableSIMD16On3SrcInstr; 6695 } 6696 else 6697 { 6698 m_vISAWaTable.WaDisableSIMD16On3SrcInstr = false; 6699 } 6700 } 6701 GetRowAndColOffset(CVariable * var,unsigned int subVar,unsigned int subReg,unsigned char & rowOff,unsigned char & colOff)6702 void CEncoder::GetRowAndColOffset(CVariable* var, unsigned int subVar, unsigned int subReg, unsigned char& rowOff, unsigned char& colOff) 6703 { 6704 IGC_ASSERT(nullptr != var); 6705 unsigned int varTypeSize = GetCISADataTypeSize(var->GetType()); 6706 unsigned int offset = var->GetAliasOffset() + subVar * getGRFSize() + subReg * varTypeSize; 6707 IGC_ASSERT(0 < getGRFSize()); 6708 IGC_ASSERT(0 < varTypeSize); 6709 IGC_ASSERT_MESSAGE((offset % getGRFSize()) % varTypeSize == 0, "offset has to be aligned on element size"); 6710 rowOff = int_cast<unsigned char>(offset / getGRFSize()); 6711 colOff = int_cast<unsigned char>((offset % getGRFSize()) / varTypeSize); 6712 } 6713 Loc(unsigned int line)6714 void CEncoder::Loc(unsigned int line) 6715 { 6716 V(vKernel->AppendVISAMiscLOC(line)); 6717 } 6718 File(std::string & s)6719 void CEncoder::File(std::string& s) 6720 { 6721 V(vKernel->AppendVISAMiscFileInst(s.c_str())); 6722 } 6723 Lifetime(VISAVarLifetime StartOrEnd,CVariable * dst)6724 void CEncoder::Lifetime(VISAVarLifetime StartOrEnd, CVariable* dst) 6725 { 6726 SModifier noMod; // Default is no mod. 6727 noMod.init(); 6728 VISA_VectorOpnd* srcOpnd = GetSourceOperand(dst, noMod); 6729 V(vKernel->AppendVISALifetime(StartOrEnd, srcOpnd)); 6730 } 6731 DebugLinePlaceholder()6732 void CEncoder::DebugLinePlaceholder() 6733 { 6734 V(vKernel->AppendVISADebugLinePlaceholder()); 6735 } 6736 ConvertPrecisionToVisaType(PrecisionType P)6737 GenPrecision ConvertPrecisionToVisaType(PrecisionType P) 6738 { 6739 switch (P) { 6740 default: break; 6741 case PrecisionType::S2: return GenPrecision::S2; 6742 case PrecisionType::S4: return GenPrecision::S4; 6743 case PrecisionType::S8: return GenPrecision::S8; 6744 case PrecisionType::U2: return GenPrecision::U2; 6745 case PrecisionType::U4: return GenPrecision::U4; 6746 case PrecisionType::U8: return GenPrecision::U8; 6747 case PrecisionType::BF16: return GenPrecision::BF16; 6748 case PrecisionType::FP16: return GenPrecision::FP16; 6749 } 6750 6751 return GenPrecision::INVALID; 6752 } 6753 6754 dpas(CVariable * dst,CVariable * input,CVariable * weight,PrecisionType weight_precision,CVariable * activation,PrecisionType activation_precision,uint8_t systolicDepth,uint8_t repeatCount,bool IsDpasw)6755 void CEncoder::dpas( 6756 CVariable* dst, CVariable* input, 6757 CVariable* weight, PrecisionType weight_precision, 6758 CVariable* activation, PrecisionType activation_precision, 6759 uint8_t systolicDepth, uint8_t repeatCount, 6760 bool IsDpasw) 6761 { 6762 SModifier noMod; // Default is no mod. 6763 noMod.init(); 6764 // PrecisionType to GenPrecision 6765 GenPrecision src1Precision = ConvertPrecisionToVisaType(weight_precision); 6766 GenPrecision src2Precision = ConvertPrecisionToVisaType(activation_precision); 6767 6768 VISA_EMask_Ctrl execMask = GetAluEMask(dst); 6769 VISA_Exec_Size execSize = EXEC_SIZE_8; 6770 { 6771 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 6772 VISA_RawOpnd* srcOpnd0 = GetRawSource(input); 6773 VISA_RawOpnd* srcOpnd1 = GetRawSource(weight); 6774 VISA_VectorOpnd* srcOpnd2 = GetSourceOperand(activation, noMod); 6775 V(vKernel->AppendVISADpasInst( 6776 IsDpasw ? ISA_DPASW : ISA_DPAS, 6777 execMask, 6778 execSize, 6779 dstOpnd, 6780 srcOpnd0, 6781 srcOpnd1, 6782 srcOpnd2, 6783 src2Precision, 6784 src1Precision, 6785 systolicDepth, 6786 repeatCount)); 6787 } 6788 } 6789 QWGather(CVariable * dst,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)6790 void CEncoder::QWGather(CVariable* dst, 6791 const ResourceDescriptor& resource, 6792 CVariable* offset, 6793 unsigned elemSize, 6794 unsigned numElems) 6795 { 6796 IGC_ASSERT_MESSAGE(elemSize == 64, "Only QWord element is supported!"); 6797 IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4), 6798 "Only 1/2/4 elements are supported!"); 6799 6800 VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource); 6801 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6802 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6803 VISA_RawOpnd* dstOpnd = GetRawDestination(dst); 6804 6805 V(vKernel->AppendVISAQwordGatherInst( 6806 predOpnd, 6807 GetAluEMask(offset), 6808 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : 6809 m_encoderState.m_simdSize), 6810 visaBlockNum(numElems), 6811 surfaceOpnd, 6812 addressOpnd, dstOpnd)); 6813 } 6814 QWScatter(CVariable * src,const ResourceDescriptor & resource,CVariable * offset,unsigned elemSize,unsigned numElems)6815 void CEncoder::QWScatter(CVariable* src, 6816 const ResourceDescriptor& resource, 6817 CVariable* offset, 6818 unsigned elemSize, 6819 unsigned numElems) 6820 { 6821 IGC_ASSERT_MESSAGE(elemSize == 64, "Only QWord element is supported"); 6822 IGC_ASSERT_MESSAGE((numElems == 1) || (numElems == 2) || (numElems == 4), 6823 "Only 1/2/4 elements are supported!"); 6824 6825 VISA_StateOpndHandle* surfaceOpnd = GetVISASurfaceOpnd(resource); 6826 VISA_PredOpnd* predOpnd = GetFlagOperand(m_encoderState.m_flag); 6827 VISA_RawOpnd* addressOpnd = GetRawSource(offset); 6828 VISA_RawOpnd* srcOpnd = GetRawSource(src); 6829 6830 V(vKernel->AppendVISAQwordScatterInst( 6831 predOpnd, 6832 GetAluEMask(offset), 6833 visaExecSize(offset->IsUniform() ? lanesToSIMDMode(offset->GetNumberElement()) : 6834 m_encoderState.m_simdSize), 6835 visaBlockNum(numElems), 6836 surfaceOpnd, 6837 addressOpnd, srcOpnd)); 6838 } 6839 6840 GetVariableName(CVariable * var)6841 std::string CEncoder::GetVariableName(CVariable* var) 6842 { 6843 IGC_ASSERT(nullptr != var); 6844 if (var->IsImmediate()) 6845 { 6846 std::stringstream temp; 6847 temp << "0x" << std::hex << var->GetImmediateValue() << ":" << CISATypeTable[var->GetType()].typeName; 6848 return temp.str(); 6849 } 6850 switch (var->GetVarType()) 6851 { 6852 case EVARTYPE_GENERAL: 6853 return vKernel->getVarName(GetVISAVariable(var)); 6854 case EVARTYPE_PREDICATE: 6855 return vKernel->getVarName(var->visaPredVariable); 6856 case EVARTYPE_ADDRESS: 6857 return vKernel->getVarName(var->visaAddrVariable); 6858 case EVARTYPE_SURFACE: 6859 return vKernel->getVarName(var->visaSurfVariable); 6860 case EVARTYPE_SAMPLER: 6861 return vKernel->getVarName(var->visaSamplerVariable); 6862 default: 6863 IGC_ASSERT_MESSAGE(0, "Unknown var type"); 6864 return ""; 6865 } 6866 } 6867 GetDumpFileName(std::string extension)6868 std::string CEncoder::GetDumpFileName(std::string extension) 6869 { 6870 std::string filename = IGC::Debug::GetDumpName(m_program, extension.c_str()); 6871 return filename; 6872 } 6873 6874 ReportCompilerStatistics(VISAKernel * pMainKernel,SProgramOutput * pOutput)6875 void CEncoder::ReportCompilerStatistics(VISAKernel* pMainKernel, SProgramOutput* pOutput) 6876 { 6877 CompilerStats compilerStats; 6878 pMainKernel->GetCompilerStats(compilerStats); 6879 int simdsize = GetThreadCount(m_program->m_dispatchSize); 6880 6881 6882 // set optional statistics 6883 if (compilerStats.Find(CompilerStats::numCyclesStr())) 6884 { 6885 pOutput->m_NumCycles.emplace(compilerStats.GetI64(CompilerStats::numCyclesStr(), simdsize)); 6886 } 6887 6888 if (compilerStats.Find(CompilerStats::numGRFFillStr())) 6889 { 6890 pOutput->m_NumGRFFill.emplace(compilerStats.GetI64(CompilerStats::numGRFFillStr(), simdsize)); 6891 } 6892 6893 if (compilerStats.Find(CompilerStats::numGRFSpillStr())) 6894 { 6895 pOutput->m_NumGRFSpill.emplace(compilerStats.GetI64(CompilerStats::numGRFSpillStr(), simdsize)); 6896 } 6897 6898 if (compilerStats.Find(CompilerStats::numSendStr())) 6899 { 6900 pOutput->m_NumSends.emplace(compilerStats.GetI64(CompilerStats::numSendStr(), simdsize)); 6901 } 6902 FINALIZER_INFO* jitInfo = nullptr; 6903 if (0 == pMainKernel->GetJitInfo(jitInfo)) 6904 { 6905 uint sendStallCycle = 0; 6906 for (uint i = 0; i < jitInfo->BBNum; i++) 6907 { 6908 sendStallCycle += jitInfo->BBInfo[i].sendStallCycle; 6909 } 6910 pOutput->m_NumSendStallCycles.emplace(sendStallCycle); 6911 } 6912 } 6913 GetThreadCount(SIMDMode simdMode)6914 int CEncoder::GetThreadCount(SIMDMode simdMode) 6915 { 6916 int simdsize = 0; 6917 switch (m_program->m_dispatchSize) 6918 { 6919 case SIMDMode::SIMD8: 6920 simdsize = 8; 6921 break; 6922 case SIMDMode::SIMD16: 6923 simdsize = 16; 6924 break; 6925 case SIMDMode::SIMD32: 6926 simdsize = 32; 6927 break; 6928 default: 6929 break; 6930 } 6931 return simdsize; 6932 } 6933 6934 } 6935