1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #include "MergeURBWrites.hpp"
10 #include "Compiler/CodeGenPublic.h"
11 #include "common/LLVMWarningsPush.hpp"
12 #include <llvm/Pass.h>
13 #include <llvm/IR/BasicBlock.h>
14 #include <llvm/IR/Instruction.h>
15 #include <llvm/IR/IntrinsicInst.h>
16 #include <llvm/IR/Constants.h>
17 #include "common/LLVMWarningsPop.hpp"
18 #include "GenISAIntrinsics/GenIntrinsics.h"
19 #include "GenISAIntrinsics/GenIntrinsicInst.h"
20 #include "Probe/Assertion.h"
21 #include "IGCPassSupport.h"
22
23 namespace IGC
24 {
25 using namespace llvm;
26
27 /// Used to store information about instructions and their positions in the current BB.
28 class InstWithIndex
29 {
30 public:
31 // Initialize pointer to null and place to -1 to recognize "empty" entries
InstWithIndex()32 InstWithIndex() : m_inst(nullptr), m_place(-1)
33 {
34 }
35
InstWithIndex(CallInst * inst,int place)36 InstWithIndex(CallInst* inst, int place) :
37 m_inst(inst),
38 m_place(place)
39 {
40 }
41
GetPlace() const42 int GetPlace() const { return m_place; }
GetInst() const43 CallInst* GetInst() const { return m_inst; }
44
45 private:
46 CallInst* m_inst;
47 int m_place;
48 };
49
50 class MergeURBWrites : public FunctionPass
51 {
52 public:
53 static char ID;
54
MergeURBWrites()55 MergeURBWrites() :
56 FunctionPass(ID)
57 {
58 initializeMergeURBWritesPass(*PassRegistry::getPassRegistry());
59 }
60
61 virtual bool runOnFunction(Function& F);
62
getAnalysisUsage(AnalysisUsage & AU) const63 virtual void getAnalysisUsage(AnalysisUsage& AU) const
64 {
65 AU.setPreservesCFG();
66 AU.addRequired<CodeGenContextWrapper>();
67 }
68
getPassName() const69 virtual llvm::StringRef getPassName() const { return "MergeURBWrites"; }
70
71 private:
72 /// Stores all URB write instructions in a vector.
73 /// Also merges partial (channel granularity) writes to the same offset.
74 void FillWriteList(BasicBlock& BB);
75
76 // Tries to merge two writes to adjacent offsets into a single instruction.
77 // Does this by going through write list containing stored write instructions
78 // in order of offsets and merging adjacent writes.
79 void MergeInstructions();
80 // represents the map (urb index) --> (instruction, instruction index in BB)
81 // The key consists of a dynamic URB base offset (key.first) and
82 // an immediate offset from this dynamic base
83 // Dynamic URB base offset is null if URB offset is constant.
84 std::map<std::pair<Value*, unsigned int>, InstWithIndex> m_writeList;
85
86 bool m_bbModified;
87 };
88
89 char MergeURBWrites::ID = 0;
90
GetChannelMask(CallInst * inst)91 static unsigned int GetChannelMask(CallInst* inst)
92 {
93 return int_cast<unsigned int>(cast<ConstantInt>(inst->getOperand(1))->getZExtValue());
94 }
95
96 /// This optimization merges shorter writes to URB to get a smaller number of longer writes
97 /// which is more efficient.
98 /// Current implementation can:
99 /// 1) merge consecutive writes of length 4 to a single write of length 8
100 /// 2) merge writes at the same offset with channel masks
101 ///
102 /// The implementation works as follows: we maintain a map (urb offset) --> instruction
103 /// (kept as a vector) that gets filled with information taken from URBWrite instructions.
104 /// If we encounter two instructions writing at the same offset, we merge channel mask and
105 /// overwrite the earlier channel writes for repeated writes.
106 /// In the second phase, we go through the index list and replace two writes at adjacent
107 /// locations with one.
108 ///
109 /// for now, we don't handle the following cases:
110 /// 1) channel mask is a runtime value
111 /// 2) handling of writes of size >4
112 /// so e.g. we don't handle |aaaa|bbbbbbbb|cccc| -> |aaaabbbb|bbbbcccc|
113 /// this will be addressed in the future.
114 ///
runOnFunction(Function & F)115 bool MergeURBWrites::runOnFunction(Function& F)
116 {
117 bool fModified = false;
118 for (auto& BB : F)
119 {
120 FillWriteList(BB);
121 MergeInstructions();
122 fModified |= m_bbModified;
123 }
124 return fModified;
125 }
126
FillWriteList(BasicBlock & BB)127 void MergeURBWrites::FillWriteList(BasicBlock& BB)
128 {
129 m_bbModified = false;
130 m_writeList.clear();
131 int instCounter = 0; // counts the instruction in the BB
132 for (auto iit = BB.begin(); iit != BB.end(); ++iit, ++instCounter)
133 {
134 auto intrinsic = dyn_cast<GenIntrinsicInst>(iit);
135 if (intrinsic == nullptr) continue;
136
137 GenISAIntrinsic::ID IID = intrinsic->getIntrinsicID();
138 if ((IID == GenISAIntrinsic::GenISA_URBReadOutput) ||
139 (IID == GenISAIntrinsic::GenISA_threadgroupbarrier))
140 {
141 MergeInstructions();
142 m_writeList.clear();
143 }
144 // if not URBWrite intrinsic, we are not interested
145 if (IID != GenISAIntrinsic::GenISA_URBWrite)
146 {
147 continue;
148 }
149
150 // intrinsic has the format: URB_write (%offset, %mask, %data0, ... , %data7)
151 ConstantInt* pImmediateMask = dyn_cast<ConstantInt>(iit->getOperand(1));
152 if (pImmediateMask == nullptr || (GetChannelMask(intrinsic) > 0x0F))
153 {
154 // for now, we don't handle the following cases:
155 // 1) mask is a runtime value
156 // 2) handling of writes of size >4
157 // so e.g. we don't handle |aaaa|bbbbbbbb|cccc| -> |aaaabbbb|bbbbcccc|
158 // this will be addressed in the future
159 continue;
160 }
161
162 std::pair<Value*, unsigned int> baseAndOffset =
163 GetURBBaseAndOffset(iit->getOperand(0));
164
165 auto it = m_writeList.find(baseAndOffset);
166 // we encountered an instruction writing at the same offset,
167 // most likely we write RTAI, VPAI or PSIZE to vertex header
168 // or we overwrite the old value
169 if (it != m_writeList.end())
170 {
171 const InstWithIndex& instWithIndex = it->second;
172 auto oldMask = GetChannelMask(instWithIndex.GetInst());
173 auto newMask = GetChannelMask(intrinsic);
174 // assume the write lengths are <=4
175 // if we have writes to the same channel, we retain the later one,
176 // discarding the earlier one
177 if (oldMask <= 0x0F && newMask <= 0x0F)
178 {
179 // get difference oldMask-newMask to determine if we need to take any old operands
180 auto takeFromOlderMask = oldMask & (~newMask);
181 // update the mask stored in operand #1 of the second instruction
182 auto mergedMask = oldMask | newMask;
183 auto mergedMaskValue = llvm::ConstantInt::get(
184 llvm::Type::getInt32Ty(BB.getContext()),
185 mergedMask);
186 intrinsic->setOperand(1, mergedMaskValue);
187 // move data operands from the older instruction to the newer one
188 unsigned int opIndex = 0;
189 while (takeFromOlderMask != 0)
190 {
191 if (takeFromOlderMask & 1)
192 {
193 intrinsic->setOperand(
194 opIndex + 2,
195 instWithIndex.GetInst()->getOperand(opIndex + 2));
196 }
197 ++opIndex;
198 takeFromOlderMask = takeFromOlderMask >> 1;
199 }
200 // after transferring the operands, remove the old instruction and store the new one
201 instWithIndex.GetInst()->eraseFromParent();
202 m_bbModified = true;
203 m_writeList[baseAndOffset] = InstWithIndex(intrinsic, instCounter);
204 }
205 }
206 else
207 {
208 // adding new write at this offset
209 m_writeList[baseAndOffset] = InstWithIndex(intrinsic, instCounter);
210 }
211 }
212 } // FillWriteList()
213
MergeInstructions()214 void MergeURBWrites::MergeInstructions()
215 {
216 // nothing to do for an empty list
217 if (m_writeList.size() == 0)
218 {
219 return;
220 }
221
222 // vector of URBWrite8 in order of offsets
223 llvm::SmallVector<InstWithIndex, 16> URBWrite8;
224
225 auto last = std::prev(m_writeList.end());
226 for (auto ii = m_writeList.begin(); ii != m_writeList.end() && ii != last; ++ii)
227 {
228 auto next = std::next(ii);
229 if (ii->second.GetInst() == nullptr || next->second.GetInst() == nullptr)
230 {
231 //nothing to do, no write at current or next offset
232 continue;
233 }
234
235 // ii->first.first is the dynamic URB base offset, may be nullptr
236 // ii->first.second is the immediate constant offset from ii->first.first
237 if (ii->first.first != next->first.first)
238 {
239 // nothing to do, different base URB offset
240 continue;
241 }
242 if (ii->first.second + 1 != next->first.second)
243 {
244 // nothing to do, not a consecutive URB access
245 continue;
246 }
247 // We have two instructions, merge them by moving operands from the one appearing
248 // earlier in the BB to the one appearing later and increasing write length.
249 //
250 // From this (instructions "in order"):
251 // earlierInst : URBWrite(offset, mask1, d0, d1, d2, d3, xx, xx, xx, xx)
252 // laterInst : URBWrite(offset+1, mask2, g0, g1, g2, g3, xx, xx, xx, xx)
253 //
254 // we need to get this
255 // laterInst : URBWrite(offset, mask1 | mask2<<4, d0, d1, d2, d3, g0, g1, g2, g3)
256 //
257 // and from this ("reversed order"):
258 // earlierInst : URBWrite(offset+1, mask1, d0, d1, d2, d3, xx, xx, xx, xx)
259 // laterInst : URBWrite(offset, mask2, g0, g1, g2, g3, xx, xx, xx, xx)
260 //
261 // we need to get this:
262 // laterInst : URBWrite(offset, mask2 | mask1<<4, g0, g1, g2, g3, d0, d1, d2, d3)
263 //
264 // Note that earlier, later refers to the placement of the instruction
265 // in the basic block while ii, next iterators refer to the placement
266 // in the vector indexed by urb offsets, so 'ii' corresponds to 'offset'
267 // and 'next' corresponds to 'offset+1'.
268 //
269 // determine which instruction is appearing earlier in the BB
270 const bool inOrder = ii->second.GetPlace() < next->second.GetPlace();
271 CallInst* earlierInst = inOrder ? ii->second.GetInst() : next->second.GetInst();
272 CallInst* laterInst = !inOrder ? ii->second.GetInst() : next->second.GetInst();
273
274 // merge per-channel write masks
275 auto lowWriteMask = GetChannelMask(ii->second.GetInst());
276 auto highWriteMask = GetChannelMask(next->second.GetInst());
277 IGC_ASSERT(lowWriteMask <= 0x0F);
278 IGC_ASSERT(highWriteMask <= 0x0F);
279 auto mergedMask = lowWriteMask | (highWriteMask << 4);
280
281
282 // Move the data operands from the earlier instruction to the later instruction.
283 // If instructions are in order, we need to add new operands to positions 2..5 while
284 // moving the existing data operands from 2...5 to the higher positions 6...9.
285 // If in reversed order, we just need to add new operands to positions 6...9.
286 const unsigned int displacement = inOrder ? 4 : 0;
287 const unsigned int displacement2 = inOrder ? 0 : 4;
288 for (unsigned int k = 2; k < 6; ++k)
289 {
290 // move existing operand if necessary
291 laterInst->setOperand(k + displacement, laterInst->getOperand(k));
292 // get the new one from the other instruction
293 laterInst->setOperand(k + displacement2, earlierInst->getOperand(k));
294 }
295
296 // now take the smaller of the two offsets from the instruction in the current slot
297 laterInst->setOperand(0, ii->second.GetInst()->getOperand(0));
298 // and update the mask operand
299 auto mergedMaskVal = llvm::ConstantInt::get(
300 llvm::Type::getInt32Ty(laterInst->getParent()->getContext()),
301 mergedMask);
302 laterInst->setOperand(1, mergedMaskVal);
303
304 // earlier instruction is no longer needed
305 earlierInst->eraseFromParent();
306 m_bbModified = true;
307 ++ii; // skip the next slot since we just considered it as 'next'
308 if (nullptr == ii->first.first) // if URB offset is immediate const
309 {
310 URBWrite8.push_back(laterInst == ii->second.GetInst() ? ii->second : next->second);
311 }
312 } // for
313
314 } // MergeInstructions
315
316
317 #define PASS_FLAG "igc-merge-urb-writes"
318 #define PASS_DESCRIPTION "merges urb writes"
319 #define PASS_CFG_ONLY false
320 #define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(MergeURBWrites,PASS_FLAG,PASS_DESCRIPTION,PASS_CFG_ONLY,PASS_ANALYSIS)321 IGC_INITIALIZE_PASS_BEGIN(MergeURBWrites, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
322 IGC_INITIALIZE_PASS_END(MergeURBWrites, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
323
324 llvm::FunctionPass* createMergeURBWritesPass()
325 {
326 return new MergeURBWrites();
327 }
328
329 } // namespace IGC
330