1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #include "MergeURBWrites.hpp"
10 #include "Compiler/CodeGenPublic.h"
11 #include "common/LLVMWarningsPush.hpp"
12 #include <llvm/Pass.h>
13 #include <llvm/IR/BasicBlock.h>
14 #include <llvm/IR/Instruction.h>
15 #include <llvm/IR/IntrinsicInst.h>
16 #include <llvm/IR/Constants.h>
17 #include "common/LLVMWarningsPop.hpp"
18 #include "GenISAIntrinsics/GenIntrinsics.h"
19 #include "GenISAIntrinsics/GenIntrinsicInst.h"
20 #include "Probe/Assertion.h"
21 #include "IGCPassSupport.h"
22 
23 namespace IGC
24 {
25 using namespace llvm;
26 
27 /// Used to store information about instructions and their positions in the current BB.
28 class InstWithIndex
29 {
30 public:
31     // Initialize pointer to null and place to -1 to recognize "empty" entries
InstWithIndex()32     InstWithIndex() : m_inst(nullptr), m_place(-1)
33     {
34     }
35 
InstWithIndex(CallInst * inst,int place)36     InstWithIndex(CallInst* inst, int place) :
37         m_inst(inst),
38         m_place(place)
39     {
40     }
41 
GetPlace() const42     int GetPlace() const { return m_place; }
GetInst() const43     CallInst* GetInst() const { return m_inst; }
44 
45 private:
46     CallInst* m_inst;
47     int m_place;
48 };
49 
50 class MergeURBWrites : public FunctionPass
51 {
52 public:
53     static char ID;
54 
MergeURBWrites()55     MergeURBWrites() :
56         FunctionPass(ID)
57     {
58         initializeMergeURBWritesPass(*PassRegistry::getPassRegistry());
59     }
60 
61     virtual bool runOnFunction(Function& F);
62 
getAnalysisUsage(AnalysisUsage & AU) const63     virtual void getAnalysisUsage(AnalysisUsage& AU) const
64     {
65         AU.setPreservesCFG();
66         AU.addRequired<CodeGenContextWrapper>();
67     }
68 
getPassName() const69     virtual llvm::StringRef getPassName() const { return "MergeURBWrites"; }
70 
71 private:
72     /// Stores all URB write instructions in a vector.
73     /// Also merges partial (channel granularity) writes to the same offset.
74     void FillWriteList(BasicBlock& BB);
75 
76     // Tries to merge two writes to adjacent offsets into a single instruction.
77     // Does this by going through write list containing stored write instructions
78     // in order of offsets and merging adjacent writes.
79     void MergeInstructions();
80     // represents the map (urb index) --> (instruction, instruction index in BB)
81     // The key consists of a dynamic URB base offset (key.first) and
82     // an immediate offset from this dynamic base
83     // Dynamic URB base offset is null if URB offset is constant.
84     std::map<std::pair<Value*, unsigned int>, InstWithIndex> m_writeList;
85 
86     bool m_bbModified;
87 };
88 
89 char MergeURBWrites::ID = 0;
90 
GetChannelMask(CallInst * inst)91 static unsigned int GetChannelMask(CallInst* inst)
92 {
93     return int_cast<unsigned int>(cast<ConstantInt>(inst->getOperand(1))->getZExtValue());
94 }
95 
96 /// This optimization merges shorter writes to URB to get a smaller number of longer writes
97 /// which is more efficient.
98 /// Current implementation can:
99 /// 1) merge consecutive writes of length 4 to a single write of length 8
100 /// 2) merge writes at the same offset with channel masks
101 ///
102 /// The implementation works as follows: we maintain a map (urb offset) --> instruction
103 /// (kept as a vector) that gets filled with information taken from URBWrite instructions.
104 /// If we encounter two instructions writing at the same offset, we merge channel mask and
105 /// overwrite the earlier channel writes for repeated writes.
106 /// In the second phase, we go through the index list and replace two writes at adjacent
107 /// locations with one.
108 ///
109 /// for now, we don't handle the following cases:
110 /// 1) channel mask is a runtime value
111 /// 2) handling of writes of size >4
112 ///    so e.g. we don't handle |aaaa|bbbbbbbb|cccc| -> |aaaabbbb|bbbbcccc|
113 /// this will be addressed in the future.
114 ///
runOnFunction(Function & F)115 bool MergeURBWrites::runOnFunction(Function& F)
116 {
117     bool fModified = false;
118     for (auto& BB : F)
119     {
120         FillWriteList(BB);
121         MergeInstructions();
122         fModified |= m_bbModified;
123     }
124     return fModified;
125 }
126 
FillWriteList(BasicBlock & BB)127 void MergeURBWrites::FillWriteList(BasicBlock& BB)
128 {
129     m_bbModified = false;
130     m_writeList.clear();
131     int instCounter = 0; // counts the instruction in the BB
132     for (auto iit = BB.begin(); iit != BB.end(); ++iit, ++instCounter)
133     {
134         auto intrinsic = dyn_cast<GenIntrinsicInst>(iit);
135         if (intrinsic == nullptr) continue;
136 
137         GenISAIntrinsic::ID IID = intrinsic->getIntrinsicID();
138         if ((IID == GenISAIntrinsic::GenISA_URBReadOutput) ||
139             (IID == GenISAIntrinsic::GenISA_threadgroupbarrier))
140         {
141             MergeInstructions();
142             m_writeList.clear();
143         }
144         // if not URBWrite intrinsic, we are not interested
145         if (IID != GenISAIntrinsic::GenISA_URBWrite)
146         {
147             continue;
148         }
149 
150         // intrinsic has the format: URB_write (%offset, %mask, %data0, ... , %data7)
151         ConstantInt* pImmediateMask = dyn_cast<ConstantInt>(iit->getOperand(1));
152         if (pImmediateMask == nullptr || (GetChannelMask(intrinsic) > 0x0F))
153         {
154             // for now, we don't handle the following cases:
155             // 1) mask is a runtime value
156             // 2) handling of writes of size >4
157             //    so e.g. we don't handle |aaaa|bbbbbbbb|cccc| -> |aaaabbbb|bbbbcccc|
158             // this will be addressed in the future
159             continue;
160         }
161 
162         std::pair<Value*, unsigned int> baseAndOffset =
163             GetURBBaseAndOffset(iit->getOperand(0));
164 
165         auto it = m_writeList.find(baseAndOffset);
166         // we encountered an instruction writing at the same offset,
167         // most likely we write RTAI, VPAI or PSIZE to vertex header
168         // or we overwrite the old value
169         if (it != m_writeList.end())
170         {
171             const InstWithIndex& instWithIndex = it->second;
172             auto oldMask = GetChannelMask(instWithIndex.GetInst());
173             auto newMask = GetChannelMask(intrinsic);
174             // assume the write lengths are <=4
175             // if we have writes to the same channel, we retain the later one,
176             // discarding the earlier one
177             if (oldMask <= 0x0F && newMask <= 0x0F)
178             {
179                 // get difference oldMask-newMask to determine if we need to take any old operands
180                 auto takeFromOlderMask = oldMask & (~newMask);
181                 // update the mask stored in operand #1 of the second instruction
182                 auto mergedMask = oldMask | newMask;
183                 auto mergedMaskValue = llvm::ConstantInt::get(
184                     llvm::Type::getInt32Ty(BB.getContext()),
185                     mergedMask);
186                 intrinsic->setOperand(1, mergedMaskValue);
187                 // move data operands from the older instruction to the newer one
188                 unsigned int opIndex = 0;
189                 while (takeFromOlderMask != 0)
190                 {
191                     if (takeFromOlderMask & 1)
192                     {
193                         intrinsic->setOperand(
194                             opIndex + 2,
195                             instWithIndex.GetInst()->getOperand(opIndex + 2));
196                     }
197                     ++opIndex;
198                     takeFromOlderMask = takeFromOlderMask >> 1;
199                 }
200                 // after transferring the operands, remove the old instruction and store the new one
201                 instWithIndex.GetInst()->eraseFromParent();
202                 m_bbModified = true;
203                 m_writeList[baseAndOffset] = InstWithIndex(intrinsic, instCounter);
204             }
205         }
206         else
207         {
208             // adding new write at this offset
209             m_writeList[baseAndOffset] = InstWithIndex(intrinsic, instCounter);
210         }
211     }
212 } // FillWriteList()
213 
MergeInstructions()214 void MergeURBWrites::MergeInstructions()
215 {
216     // nothing to do for an empty list
217     if (m_writeList.size() == 0)
218     {
219         return;
220     }
221 
222     // vector of URBWrite8 in order of offsets
223     llvm::SmallVector<InstWithIndex, 16> URBWrite8;
224 
225     auto last = std::prev(m_writeList.end());
226     for (auto ii = m_writeList.begin(); ii != m_writeList.end() && ii != last; ++ii)
227     {
228         auto next = std::next(ii);
229         if (ii->second.GetInst() == nullptr || next->second.GetInst() == nullptr)
230         {
231             //nothing to do, no write at current or next offset
232             continue;
233         }
234 
235         // ii->first.first is the dynamic URB base offset, may be nullptr
236         // ii->first.second is the immediate constant offset from ii->first.first
237         if (ii->first.first != next->first.first)
238         {
239             // nothing to do, different base URB offset
240             continue;
241         }
242         if (ii->first.second + 1 != next->first.second)
243         {
244             // nothing to do, not a consecutive URB access
245             continue;
246         }
247         // We have two instructions, merge them by moving operands from the one appearing
248         // earlier in the BB to the one appearing later and increasing write length.
249         //
250         // From this (instructions "in order"):
251         // earlierInst : URBWrite(offset,   mask1, d0, d1, d2, d3, xx, xx, xx, xx)
252         // laterInst   : URBWrite(offset+1, mask2, g0, g1, g2, g3, xx, xx, xx, xx)
253         //
254         // we need to get this
255         // laterInst   : URBWrite(offset, mask1 | mask2<<4, d0, d1, d2, d3, g0, g1, g2, g3)
256         //
257         // and from this ("reversed order"):
258         // earlierInst : URBWrite(offset+1,   mask1, d0, d1, d2, d3, xx, xx, xx, xx)
259         // laterInst   : URBWrite(offset, mask2, g0, g1, g2, g3, xx, xx, xx, xx)
260         //
261         // we need to get this:
262         // laterInst   : URBWrite(offset, mask2 | mask1<<4, g0, g1, g2, g3, d0, d1, d2, d3)
263         //
264         // Note that earlier, later refers to the placement of the instruction
265         // in the basic block while ii, next iterators refer to the placement
266         // in the vector indexed by urb offsets, so 'ii' corresponds to 'offset'
267         // and 'next' corresponds to 'offset+1'.
268         //
269         // determine which instruction is appearing earlier in the BB
270         const bool inOrder = ii->second.GetPlace() < next->second.GetPlace();
271         CallInst* earlierInst = inOrder ? ii->second.GetInst() : next->second.GetInst();
272         CallInst* laterInst = !inOrder ? ii->second.GetInst() : next->second.GetInst();
273 
274         // merge per-channel write masks
275         auto lowWriteMask = GetChannelMask(ii->second.GetInst());
276         auto highWriteMask = GetChannelMask(next->second.GetInst());
277         IGC_ASSERT(lowWriteMask <= 0x0F);
278         IGC_ASSERT(highWriteMask <= 0x0F);
279         auto mergedMask = lowWriteMask | (highWriteMask << 4);
280 
281 
282         // Move the data operands from the earlier instruction to the later instruction.
283         // If instructions are in order, we need to add new operands to positions 2..5 while
284         // moving the existing data operands from 2...5 to the higher positions 6...9.
285         // If in reversed order, we just need to add new operands to positions 6...9.
286         const unsigned int displacement = inOrder ? 4 : 0;
287         const unsigned int displacement2 = inOrder ? 0 : 4;
288         for (unsigned int k = 2; k < 6; ++k)
289         {
290             // move existing operand if necessary
291             laterInst->setOperand(k + displacement, laterInst->getOperand(k));
292             // get the new one from the other instruction
293             laterInst->setOperand(k + displacement2, earlierInst->getOperand(k));
294         }
295 
296         // now take the smaller of the two offsets from the instruction in the current slot
297         laterInst->setOperand(0, ii->second.GetInst()->getOperand(0));
298         // and update the mask operand
299         auto mergedMaskVal = llvm::ConstantInt::get(
300             llvm::Type::getInt32Ty(laterInst->getParent()->getContext()),
301             mergedMask);
302         laterInst->setOperand(1, mergedMaskVal);
303 
304         // earlier instruction is no longer needed
305         earlierInst->eraseFromParent();
306         m_bbModified = true;
307         ++ii; // skip the next slot since we just considered it as 'next'
308         if (nullptr == ii->first.first) // if URB offset is immediate const
309         {
310             URBWrite8.push_back(laterInst == ii->second.GetInst() ? ii->second : next->second);
311         }
312     } // for
313 
314 } // MergeInstructions
315 
316 
317 #define PASS_FLAG "igc-merge-urb-writes"
318 #define PASS_DESCRIPTION "merges urb writes"
319 #define PASS_CFG_ONLY false
320 #define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(MergeURBWrites,PASS_FLAG,PASS_DESCRIPTION,PASS_CFG_ONLY,PASS_ANALYSIS)321 IGC_INITIALIZE_PASS_BEGIN(MergeURBWrites, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
322 IGC_INITIALIZE_PASS_END(MergeURBWrites, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
323 
324 llvm::FunctionPass* createMergeURBWritesPass()
325 {
326     return new MergeURBWrites();
327 }
328 
329 } // namespace IGC
330