1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file blend_jit.cpp
24  *
25  * @brief Implementation of the blend jitter
26  *
27  * Notes:
28  *
29  ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "blend_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36 
37 #include "util/compiler.h"
38 
39 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
40 #define QUANTIZE_THRESHOLD 2
41 
42 using namespace llvm;
43 using namespace SwrJit;
44 
45 //////////////////////////////////////////////////////////////////////////
46 /// Interface to Jitting a blend shader
47 //////////////////////////////////////////////////////////////////////////
48 struct BlendJit : public Builder
49 {
BlendJitBlendJit50     BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
51 
52     template <bool Color, bool Alpha>
GenerateBlendFactorBlendJit53     void GenerateBlendFactor(SWR_BLEND_FACTOR factor,
54                              Value*           constColor[4],
55                              Value*           src[4],
56                              Value*           src1[4],
57                              Value*           dst[4],
58                              Value*           result[4])
59     {
60         Value* out[4];
61 
62         switch (factor)
63         {
64         case BLENDFACTOR_ONE:
65             out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
66             break;
67         case BLENDFACTOR_SRC_COLOR:
68             out[0] = src[0];
69             out[1] = src[1];
70             out[2] = src[2];
71             out[3] = src[3];
72             break;
73         case BLENDFACTOR_SRC_ALPHA:
74             out[0] = out[1] = out[2] = out[3] = src[3];
75             break;
76         case BLENDFACTOR_DST_ALPHA:
77             out[0] = out[1] = out[2] = out[3] = dst[3];
78             break;
79         case BLENDFACTOR_DST_COLOR:
80             out[0] = dst[0];
81             out[1] = dst[1];
82             out[2] = dst[2];
83             out[3] = dst[3];
84             break;
85         case BLENDFACTOR_SRC_ALPHA_SATURATE:
86             out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
87             out[3]                   = VIMMED1(1.0f);
88             break;
89         case BLENDFACTOR_CONST_COLOR:
90             out[0] = constColor[0];
91             out[1] = constColor[1];
92             out[2] = constColor[2];
93             out[3] = constColor[3];
94             break;
95         case BLENDFACTOR_CONST_ALPHA:
96             out[0] = out[1] = out[2] = out[3] = constColor[3];
97             break;
98         case BLENDFACTOR_SRC1_COLOR:
99             out[0] = src1[0];
100             out[1] = src1[1];
101             out[2] = src1[2];
102             out[3] = src1[3];
103             break;
104         case BLENDFACTOR_SRC1_ALPHA:
105             out[0] = out[1] = out[2] = out[3] = src1[3];
106             break;
107         case BLENDFACTOR_ZERO:
108             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
109             break;
110         case BLENDFACTOR_INV_SRC_COLOR:
111             out[0] = FSUB(VIMMED1(1.0f), src[0]);
112             out[1] = FSUB(VIMMED1(1.0f), src[1]);
113             out[2] = FSUB(VIMMED1(1.0f), src[2]);
114             out[3] = FSUB(VIMMED1(1.0f), src[3]);
115             break;
116         case BLENDFACTOR_INV_SRC_ALPHA:
117             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
118             break;
119         case BLENDFACTOR_INV_DST_ALPHA:
120             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
121             break;
122         case BLENDFACTOR_INV_DST_COLOR:
123             out[0] = FSUB(VIMMED1(1.0f), dst[0]);
124             out[1] = FSUB(VIMMED1(1.0f), dst[1]);
125             out[2] = FSUB(VIMMED1(1.0f), dst[2]);
126             out[3] = FSUB(VIMMED1(1.0f), dst[3]);
127             break;
128         case BLENDFACTOR_INV_CONST_COLOR:
129             out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
130             out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
131             out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
132             out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
133             break;
134         case BLENDFACTOR_INV_CONST_ALPHA:
135             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
136             break;
137         case BLENDFACTOR_INV_SRC1_COLOR:
138             out[0] = FSUB(VIMMED1(1.0f), src1[0]);
139             out[1] = FSUB(VIMMED1(1.0f), src1[1]);
140             out[2] = FSUB(VIMMED1(1.0f), src1[2]);
141             out[3] = FSUB(VIMMED1(1.0f), src1[3]);
142             break;
143         case BLENDFACTOR_INV_SRC1_ALPHA:
144             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
145             break;
146         default:
147             SWR_INVALID("Unsupported blend factor: %d", factor);
148             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
149             break;
150         }
151 
152         if (Color)
153         {
154             result[0] = out[0];
155             result[1] = out[1];
156             result[2] = out[2];
157         }
158 
159         if (Alpha)
160         {
161             result[3] = out[3];
162         }
163     }
164 
ClampBlendJit165     void Clamp(SWR_FORMAT format, Value* src[4])
166     {
167         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
168         SWR_TYPE               type = info.type[0];
169 
170         switch (type)
171         {
172         default:
173             break;
174 
175         case SWR_TYPE_UNORM:
176             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
177             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
178             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
179             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
180             break;
181 
182         case SWR_TYPE_SNORM:
183             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
184             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
185             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
186             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
187             break;
188 
189         case SWR_TYPE_UNKNOWN:
190             SWR_INVALID("Unsupported format type: %d", type);
191         }
192     }
193 
ApplyDefaultsBlendJit194     void ApplyDefaults(SWR_FORMAT format, Value* src[4])
195     {
196         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
197 
198         bool valid[] = {false, false, false, false};
199         for (uint32_t c = 0; c < info.numComps; ++c)
200         {
201             valid[info.swizzle[c]] = true;
202         }
203 
204         for (uint32_t c = 0; c < 4; ++c)
205         {
206             if (!valid[c])
207             {
208                 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
209             }
210         }
211     }
212 
ApplyUnusedDefaultsBlendJit213     void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
214     {
215         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
216 
217         for (uint32_t c = 0; c < info.numComps; ++c)
218         {
219             if (info.type[c] == SWR_TYPE_UNUSED)
220             {
221                 src[info.swizzle[c]] =
222                     BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
223             }
224         }
225     }
226 
QuantizeBlendJit227     void Quantize(SWR_FORMAT format, Value* src[4])
228     {
229         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
230         for (uint32_t c = 0; c < info.numComps; ++c)
231         {
232             if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
233             {
234                 uint32_t swizComp = info.swizzle[c];
235                 float    factor   = (float)((1 << info.bpc[c]) - 1);
236                 switch (info.type[c])
237                 {
238                 case SWR_TYPE_UNORM:
239                     src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
240                     src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
241                     src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
242                     break;
243                 default:
244                     SWR_INVALID("Unsupported format type: %d", info.type[c]);
245                 }
246             }
247         }
248     }
249 
250     template <bool Color, bool Alpha>
BlendFuncBlendJit251     void BlendFunc(SWR_BLEND_OP blendOp,
252                    Value*       src[4],
253                    Value*       srcFactor[4],
254                    Value*       dst[4],
255                    Value*       dstFactor[4],
256                    Value*       result[4])
257     {
258         Value* out[4];
259         Value* srcBlend[4];
260         Value* dstBlend[4];
261         for (uint32_t i = 0; i < 4; ++i)
262         {
263             srcBlend[i] = FMUL(src[i], srcFactor[i]);
264             dstBlend[i] = FMUL(dst[i], dstFactor[i]);
265         }
266 
267         switch (blendOp)
268         {
269         case BLENDOP_ADD:
270             out[0] = FADD(srcBlend[0], dstBlend[0]);
271             out[1] = FADD(srcBlend[1], dstBlend[1]);
272             out[2] = FADD(srcBlend[2], dstBlend[2]);
273             out[3] = FADD(srcBlend[3], dstBlend[3]);
274             break;
275 
276         case BLENDOP_SUBTRACT:
277             out[0] = FSUB(srcBlend[0], dstBlend[0]);
278             out[1] = FSUB(srcBlend[1], dstBlend[1]);
279             out[2] = FSUB(srcBlend[2], dstBlend[2]);
280             out[3] = FSUB(srcBlend[3], dstBlend[3]);
281             break;
282 
283         case BLENDOP_REVSUBTRACT:
284             out[0] = FSUB(dstBlend[0], srcBlend[0]);
285             out[1] = FSUB(dstBlend[1], srcBlend[1]);
286             out[2] = FSUB(dstBlend[2], srcBlend[2]);
287             out[3] = FSUB(dstBlend[3], srcBlend[3]);
288             break;
289 
290         case BLENDOP_MIN:
291             out[0] = VMINPS(src[0], dst[0]);
292             out[1] = VMINPS(src[1], dst[1]);
293             out[2] = VMINPS(src[2], dst[2]);
294             out[3] = VMINPS(src[3], dst[3]);
295             break;
296 
297         case BLENDOP_MAX:
298             out[0] = VMAXPS(src[0], dst[0]);
299             out[1] = VMAXPS(src[1], dst[1]);
300             out[2] = VMAXPS(src[2], dst[2]);
301             out[3] = VMAXPS(src[3], dst[3]);
302             break;
303 
304         default:
305             SWR_INVALID("Unsupported blend operation: %d", blendOp);
306             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
307             break;
308         }
309 
310         if (Color)
311         {
312             result[0] = out[0];
313             result[1] = out[1];
314             result[2] = out[2];
315         }
316 
317         if (Alpha)
318         {
319             result[3] = out[3];
320         }
321     }
322 
LogicOpFuncBlendJit323     void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
324     {
325         // Op: (s == PS output, d = RT contents)
326         switch (logicOp)
327         {
328         case LOGICOP_CLEAR:
329             result[0] = VIMMED1(0);
330             result[1] = VIMMED1(0);
331             result[2] = VIMMED1(0);
332             result[3] = VIMMED1(0);
333             break;
334 
335         case LOGICOP_NOR:
336             // ~(s | d)
337             result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
338             result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
339             result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
340             result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
341             break;
342 
343         case LOGICOP_AND_INVERTED:
344             // ~s & d
345             // todo: use avx andnot instr when I can find the intrinsic to call
346             result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
347             result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
348             result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
349             result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
350             break;
351 
352         case LOGICOP_COPY_INVERTED:
353             // ~s
354             result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
355             result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
356             result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
357             result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
358             break;
359 
360         case LOGICOP_AND_REVERSE:
361             // s & ~d
362             // todo: use avx andnot instr when I can find the intrinsic to call
363             result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
364             result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
365             result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
366             result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
367             break;
368 
369         case LOGICOP_INVERT:
370             // ~d
371             result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
372             result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
373             result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
374             result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
375             break;
376 
377         case LOGICOP_XOR:
378             // s ^ d
379             result[0] = XOR(src[0], dst[0]);
380             result[1] = XOR(src[1], dst[1]);
381             result[2] = XOR(src[2], dst[2]);
382             result[3] = XOR(src[3], dst[3]);
383             break;
384 
385         case LOGICOP_NAND:
386             // ~(s & d)
387             result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
388             result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
389             result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
390             result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
391             break;
392 
393         case LOGICOP_AND:
394             // s & d
395             result[0] = AND(src[0], dst[0]);
396             result[1] = AND(src[1], dst[1]);
397             result[2] = AND(src[2], dst[2]);
398             result[3] = AND(src[3], dst[3]);
399             break;
400 
401         case LOGICOP_EQUIV:
402             // ~(s ^ d)
403             result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
404             result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
405             result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
406             result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
407             break;
408 
409         case LOGICOP_NOOP:
410             result[0] = dst[0];
411             result[1] = dst[1];
412             result[2] = dst[2];
413             result[3] = dst[3];
414             break;
415 
416         case LOGICOP_OR_INVERTED:
417             // ~s | d
418             result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
419             result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
420             result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
421             result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
422             break;
423 
424         case LOGICOP_COPY:
425             result[0] = src[0];
426             result[1] = src[1];
427             result[2] = src[2];
428             result[3] = src[3];
429             break;
430 
431         case LOGICOP_OR_REVERSE:
432             // s | ~d
433             result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
434             result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
435             result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
436             result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
437             break;
438 
439         case LOGICOP_OR:
440             // s | d
441             result[0] = OR(src[0], dst[0]);
442             result[1] = OR(src[1], dst[1]);
443             result[2] = OR(src[2], dst[2]);
444             result[3] = OR(src[3], dst[3]);
445             break;
446 
447         case LOGICOP_SET:
448             result[0] = VIMMED1(0xFFFFFFFF);
449             result[1] = VIMMED1(0xFFFFFFFF);
450             result[2] = VIMMED1(0xFFFFFFFF);
451             result[3] = VIMMED1(0xFFFFFFFF);
452             break;
453 
454         default:
455             SWR_INVALID("Unsupported logic operation: %d", logicOp);
456             result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
457             break;
458         }
459     }
460 
461     void
AlphaTestBlendJit462     AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
463     {
464         // load uint32_t reference
465         Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));
466 
467         // load alpha
468         Value* pAlpha = LOAD(ppAlpha, {0, 0});
469 
470         Value* pTest = nullptr;
471         if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
472         {
473             // convert float alpha to unorm8
474             Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
475             pAlphaU8        = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
476 
477             // compare
478             switch (state.alphaTestFunction)
479             {
480             case ZFUNC_ALWAYS:
481                 pTest = VIMMED1(true);
482                 break;
483             case ZFUNC_NEVER:
484                 pTest = VIMMED1(false);
485                 break;
486             case ZFUNC_LT:
487                 pTest = ICMP_ULT(pAlphaU8, pRef);
488                 break;
489             case ZFUNC_EQ:
490                 pTest = ICMP_EQ(pAlphaU8, pRef);
491                 break;
492             case ZFUNC_LE:
493                 pTest = ICMP_ULE(pAlphaU8, pRef);
494                 break;
495             case ZFUNC_GT:
496                 pTest = ICMP_UGT(pAlphaU8, pRef);
497                 break;
498             case ZFUNC_NE:
499                 pTest = ICMP_NE(pAlphaU8, pRef);
500                 break;
501             case ZFUNC_GE:
502                 pTest = ICMP_UGE(pAlphaU8, pRef);
503                 break;
504             default:
505                 SWR_INVALID("Invalid alpha test function");
506                 break;
507             }
508         }
509         else
510         {
511             // cast ref to float
512             pRef = BITCAST(pRef, mSimdFP32Ty);
513 
514             // compare
515             switch (state.alphaTestFunction)
516             {
517             case ZFUNC_ALWAYS:
518                 pTest = VIMMED1(true);
519                 break;
520             case ZFUNC_NEVER:
521                 pTest = VIMMED1(false);
522                 break;
523             case ZFUNC_LT:
524                 pTest = FCMP_OLT(pAlpha, pRef);
525                 break;
526             case ZFUNC_EQ:
527                 pTest = FCMP_OEQ(pAlpha, pRef);
528                 break;
529             case ZFUNC_LE:
530                 pTest = FCMP_OLE(pAlpha, pRef);
531                 break;
532             case ZFUNC_GT:
533                 pTest = FCMP_OGT(pAlpha, pRef);
534                 break;
535             case ZFUNC_NE:
536                 pTest = FCMP_ONE(pAlpha, pRef);
537                 break;
538             case ZFUNC_GE:
539                 pTest = FCMP_OGE(pAlpha, pRef);
540                 break;
541             default:
542                 SWR_INVALID("Invalid alpha test function");
543                 break;
544             }
545         }
546 
547         // load current mask
548         Value* pMask = LOAD(ppMask);
549 
550         // convert to int1 mask
551         pMask = MASK(pMask);
552 
553         // and with alpha test result
554         pMask = AND(pMask, pTest);
555 
556         // convert back to vector mask
557         pMask = VMASK(pMask);
558 
559         // store new mask
560         STORE(pMask, ppMask);
561     }
562 
CreateBlendJit563     Function* Create(const BLEND_COMPILE_STATE& state)
564     {
565         std::stringstream fnName("BLND_",
566                                  std::ios_base::in | std::ios_base::out | std::ios_base::ate);
567         fnName << ComputeCRC(0, &state, sizeof(state));
568 
569         // blend function signature
570         // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);
571 
572         std::vector<Type*> args{
573             PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*
574         };
575 
576         // std::vector<Type*> args{
577         //    PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT*
578         //};
579 
580         FunctionType* fTy       = FunctionType::get(IRB()->getVoidTy(), args, false);
581         Function*     blendFunc = Function::Create(
582             fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
583         blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
584 
585         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
586 
587         IRB()->SetInsertPoint(entry);
588 
589         // arguments
590         auto   argitr        = blendFunc->arg_begin();
591         Value* pBlendContext = &*argitr++;
592         pBlendContext->setName("pBlendContext");
593         Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});
594         pBlendState->setName("pBlendState");
595         Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});
596         pSrc->setName("src");
597         Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});
598         pSrc1->setName("src1");
599         Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});
600         pSrc0Alpha->setName("src0alpha");
601         Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});
602         sampleNum->setName("sampleNum");
603         Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});
604         pDst->setName("pDst");
605         Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});
606         pResult->setName("result");
607         Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});
608         ppoMask->setName("ppoMask");
609         Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});
610         ppMask->setName("pMask");
611 
612         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
613                       "Unsupported hot tile format");
614         Value* dst[4];
615         Value* constantColor[4];
616         Value* src[4];
617         Value* src1[4];
618         Value* result[4];
619         for (uint32_t i = 0; i < 4; ++i)
620         {
621             // load hot tile
622             dst[i] = LOAD(pDst, {0, i});
623 
624             // load constant color
625             constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));
626 
627             // load src
628             src[i] = LOAD(pSrc, {0, i});
629 
630             // load src1
631             src1[i] = LOAD(pSrc1, {0, i});
632         }
633         Value* currentSampleMask = VIMMED1(-1);
634         if (state.desc.alphaToCoverageEnable)
635         {
636             Value*   pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
637             uint32_t bits        = (1 << state.desc.numSamples) - 1;
638             currentSampleMask    = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
639             currentSampleMask    = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
640         }
641 
642         // alpha test
643         if (state.desc.alphaTestEnable)
644         {
645             // Gather for archrast stats
646             STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
647             AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
648         }
649         else
650         {
651             // Gather for archrast stats
652             STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
653         }
654 
655         // color blend
656         if (state.blendState.blendEnable)
657         {
658             // Gather for archrast stats
659             STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
660 
661             // clamp sources
662             Clamp(state.format, src);
663             Clamp(state.format, src1);
664             Clamp(state.format, dst);
665             Clamp(state.format, constantColor);
666 
667             // apply defaults to hottile contents to take into account missing components
668             ApplyDefaults(state.format, dst);
669 
670             // Force defaults for unused 'X' components
671             ApplyUnusedDefaults(state.format, dst);
672 
673             // Quantize low precision components
674             Quantize(state.format, dst);
675 
676             // special case clamping for R11G11B10_float which has no sign bit
677             if (state.format == R11G11B10_FLOAT)
678             {
679                 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
680                 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
681                 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
682                 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
683             }
684 
685             Value* srcFactor[4];
686             Value* dstFactor[4];
687             if (state.desc.independentAlphaBlendEnable)
688             {
689                 GenerateBlendFactor<true, false>(
690                     state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
691                 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,
692                                                  constantColor,
693                                                  src,
694                                                  src1,
695                                                  dst,
696                                                  srcFactor);
697 
698                 GenerateBlendFactor<true, false>(
699                     state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
700                 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,
701                                                  constantColor,
702                                                  src,
703                                                  src1,
704                                                  dst,
705                                                  dstFactor);
706 
707                 BlendFunc<true, false>(
708                     state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
709                 BlendFunc<false, true>(
710                     state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
711             }
712             else
713             {
714                 GenerateBlendFactor<true, true>(
715                     state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
716                 GenerateBlendFactor<true, true>(
717                     state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
718 
719                 BlendFunc<true, true>(
720                     state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
721             }
722 
723             // store results out
724             for (uint32_t i = 0; i < 4; ++i)
725             {
726                 STORE(result[i], pResult, {0, i});
727             }
728         }
729         else
730         {
731             // Gather for archrast stats
732             STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
733         }
734 
735         if (state.blendState.logicOpEnable)
736         {
737             const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
738             Value*                 vMask[4];
739             float                  scale[4];
740 
741             if (!state.blendState.blendEnable)
742             {
743                 Clamp(state.format, src);
744                 Clamp(state.format, dst);
745             }
746 
747             for (uint32_t i = 0; i < 4; i++)
748             {
749                 if (info.type[i] == SWR_TYPE_UNUSED)
750                 {
751                     continue;
752                 }
753 
754                 if (info.bpc[i] >= 32)
755                 {
756                     vMask[i] = VIMMED1(0xFFFFFFFF);
757                     scale[i] = 0xFFFFFFFF;
758                 }
759                 else
760                 {
761                     vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
762                     if (info.type[i] == SWR_TYPE_SNORM)
763                         scale[i] = (1 << (info.bpc[i] - 1)) - 1;
764                     else
765                         scale[i] = (1 << info.bpc[i]) - 1;
766                 }
767 
768                 switch (info.type[i])
769                 {
770                 default:
771                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
772                     break;
773 
774                 case SWR_TYPE_UNKNOWN:
775                 case SWR_TYPE_UNUSED:
776                     FALLTHROUGH;
777 
778                 case SWR_TYPE_UINT:
779                 case SWR_TYPE_SINT:
780                     src[i] = BITCAST(src[i], mSimdInt32Ty);
781                     dst[i] = BITCAST(dst[i], mSimdInt32Ty);
782                     break;
783                 case SWR_TYPE_SNORM:
784                     src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
785                     dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
786                     break;
787                 case SWR_TYPE_UNORM:
788                     src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
789                     dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
790                     break;
791                 }
792             }
793 
794             LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
795 
796             // store results out
797             for (uint32_t i = 0; i < 4; ++i)
798             {
799                 if (info.type[i] == SWR_TYPE_UNUSED)
800                 {
801                     continue;
802                 }
803 
804                 // clear upper bits from PS output not in RT format after doing logic op
805                 result[i] = AND(result[i], vMask[i]);
806 
807                 switch (info.type[i])
808                 {
809                 default:
810                     SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
811                     break;
812 
813                 case SWR_TYPE_UNKNOWN:
814                 case SWR_TYPE_UNUSED:
815                     FALLTHROUGH;
816 
817                 case SWR_TYPE_UINT:
818                 case SWR_TYPE_SINT:
819                     result[i] = BITCAST(result[i], mSimdFP32Ty);
820                     break;
821                 case SWR_TYPE_SNORM:
822                     result[i] = SHL(result[i], C(32 - info.bpc[i]));
823                     result[i] = ASHR(result[i], C(32 - info.bpc[i]));
824                     result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
825                     break;
826                 case SWR_TYPE_UNORM:
827                     result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
828                     break;
829                 }
830 
831                 STORE(result[i], pResult, {0, i});
832             }
833         }
834 
835         if (state.desc.oMaskEnable)
836         {
837             assert(!(state.desc.alphaToCoverageEnable));
838             // load current mask
839             Value* oMask      = LOAD(ppoMask);
840             currentSampleMask = AND(oMask, currentSampleMask);
841         }
842 
843         if (state.desc.sampleMaskEnable)
844         {
845             Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});
846             currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
847         }
848 
849         if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
850             state.desc.oMaskEnable)
851         {
852             // load coverage mask and mask off any lanes with no samples
853             Value* pMask        = LOAD(ppMask);
854             Value* sampleMasked = SHL(C(1), sampleNum);
855             currentSampleMask   = AND(currentSampleMask, VBROADCAST(sampleMasked));
856             currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
857             Value* outputMask = AND(pMask, currentSampleMask);
858             // store new mask
859             STORE(outputMask, GEP(ppMask, C(0)));
860         }
861 
862         RET_VOID();
863 
864         JitManager::DumpToFile(blendFunc, "");
865 
866         ::FunctionPassManager passes(JM()->mpCurrentModule);
867 
868         passes.add(createBreakCriticalEdgesPass());
869         passes.add(createCFGSimplificationPass());
870         passes.add(createEarlyCSEPass());
871         passes.add(createPromoteMemoryToRegisterPass());
872         passes.add(createCFGSimplificationPass());
873         passes.add(createEarlyCSEPass());
874         passes.add(createInstructionCombiningPass());
875 #if LLVM_VERSION_MAJOR <= 11
876         passes.add(createConstantPropagationPass());
877 #endif
878         passes.add(createSCCPPass());
879         passes.add(createAggressiveDCEPass());
880 
881         passes.add(createLowerX86Pass(this));
882 
883         passes.run(*blendFunc);
884 
885         JitManager::DumpToFile(blendFunc, "optimized");
886 
887         return blendFunc;
888     }
889 };
890 
//////////////////////////////////////////////////////////////////////////
/// @brief JITs from blend shader IR
/// @param hJitMgr - JitManager handle
/// @param hFunc   - LLVM function IR
/// @return PFN_BLEND_JIT_FUNC - pointer to blend code
JitBlendFunc(HANDLE hJitMgr,const HANDLE hFunc)896 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
897 {
898     const llvm::Function* func    = (const llvm::Function*)hFunc;
899     JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
900     PFN_BLEND_JIT_FUNC    pfnBlend;
901     pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
902     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
903     // add new IR to the module
904     pJitMgr->mIsModuleFinalized = true;
905 
906     return pfnBlend;
907 }
908 
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles blend shader
/// @param hJitMgr - JitManager handle
/// @param state   - blend state to build function from
/// @return PFN_BLEND_JIT_FUNC - pointer to the compiled blend code
JitCompileBlend(HANDLE hJitMgr,const BLEND_COMPILE_STATE & state)913 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE                     hJitMgr,
914                                                       const BLEND_COMPILE_STATE& state)
915 {
916     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
917 
918     pJitMgr->SetupNewModule();
919 
920     BlendJit theJit(pJitMgr);
921     HANDLE   hFunc = theJit.Create(state);
922 
923     return JitBlendFunc(hJitMgr, hFunc);
924 }
925