1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file blend_jit.cpp
24 *
25 * @brief Implementation of the blend jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "blend_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 #include "util/compiler.h"
38
// Components with bit-widths <= QUANTIZE_THRESHOLD will be quantized.
// Declared as a typed, file-local constant rather than a preprocessor macro so
// that it obeys normal scoping rules; the name and value are unchanged.
static constexpr int QUANTIZE_THRESHOLD = 2;
41
42 using namespace llvm;
43 using namespace SwrJit;
44
45 //////////////////////////////////////////////////////////////////////////
46 /// Interface to Jitting a blend shader
47 //////////////////////////////////////////////////////////////////////////
48 struct BlendJit : public Builder
49 {
BlendJitBlendJit50 BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
51
52 template <bool Color, bool Alpha>
GenerateBlendFactorBlendJit53 void GenerateBlendFactor(SWR_BLEND_FACTOR factor,
54 Value* constColor[4],
55 Value* src[4],
56 Value* src1[4],
57 Value* dst[4],
58 Value* result[4])
59 {
60 Value* out[4];
61
62 switch (factor)
63 {
64 case BLENDFACTOR_ONE:
65 out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
66 break;
67 case BLENDFACTOR_SRC_COLOR:
68 out[0] = src[0];
69 out[1] = src[1];
70 out[2] = src[2];
71 out[3] = src[3];
72 break;
73 case BLENDFACTOR_SRC_ALPHA:
74 out[0] = out[1] = out[2] = out[3] = src[3];
75 break;
76 case BLENDFACTOR_DST_ALPHA:
77 out[0] = out[1] = out[2] = out[3] = dst[3];
78 break;
79 case BLENDFACTOR_DST_COLOR:
80 out[0] = dst[0];
81 out[1] = dst[1];
82 out[2] = dst[2];
83 out[3] = dst[3];
84 break;
85 case BLENDFACTOR_SRC_ALPHA_SATURATE:
86 out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
87 out[3] = VIMMED1(1.0f);
88 break;
89 case BLENDFACTOR_CONST_COLOR:
90 out[0] = constColor[0];
91 out[1] = constColor[1];
92 out[2] = constColor[2];
93 out[3] = constColor[3];
94 break;
95 case BLENDFACTOR_CONST_ALPHA:
96 out[0] = out[1] = out[2] = out[3] = constColor[3];
97 break;
98 case BLENDFACTOR_SRC1_COLOR:
99 out[0] = src1[0];
100 out[1] = src1[1];
101 out[2] = src1[2];
102 out[3] = src1[3];
103 break;
104 case BLENDFACTOR_SRC1_ALPHA:
105 out[0] = out[1] = out[2] = out[3] = src1[3];
106 break;
107 case BLENDFACTOR_ZERO:
108 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
109 break;
110 case BLENDFACTOR_INV_SRC_COLOR:
111 out[0] = FSUB(VIMMED1(1.0f), src[0]);
112 out[1] = FSUB(VIMMED1(1.0f), src[1]);
113 out[2] = FSUB(VIMMED1(1.0f), src[2]);
114 out[3] = FSUB(VIMMED1(1.0f), src[3]);
115 break;
116 case BLENDFACTOR_INV_SRC_ALPHA:
117 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
118 break;
119 case BLENDFACTOR_INV_DST_ALPHA:
120 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
121 break;
122 case BLENDFACTOR_INV_DST_COLOR:
123 out[0] = FSUB(VIMMED1(1.0f), dst[0]);
124 out[1] = FSUB(VIMMED1(1.0f), dst[1]);
125 out[2] = FSUB(VIMMED1(1.0f), dst[2]);
126 out[3] = FSUB(VIMMED1(1.0f), dst[3]);
127 break;
128 case BLENDFACTOR_INV_CONST_COLOR:
129 out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
130 out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
131 out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
132 out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
133 break;
134 case BLENDFACTOR_INV_CONST_ALPHA:
135 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
136 break;
137 case BLENDFACTOR_INV_SRC1_COLOR:
138 out[0] = FSUB(VIMMED1(1.0f), src1[0]);
139 out[1] = FSUB(VIMMED1(1.0f), src1[1]);
140 out[2] = FSUB(VIMMED1(1.0f), src1[2]);
141 out[3] = FSUB(VIMMED1(1.0f), src1[3]);
142 break;
143 case BLENDFACTOR_INV_SRC1_ALPHA:
144 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
145 break;
146 default:
147 SWR_INVALID("Unsupported blend factor: %d", factor);
148 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
149 break;
150 }
151
152 if (Color)
153 {
154 result[0] = out[0];
155 result[1] = out[1];
156 result[2] = out[2];
157 }
158
159 if (Alpha)
160 {
161 result[3] = out[3];
162 }
163 }
164
ClampBlendJit165 void Clamp(SWR_FORMAT format, Value* src[4])
166 {
167 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
168 SWR_TYPE type = info.type[0];
169
170 switch (type)
171 {
172 default:
173 break;
174
175 case SWR_TYPE_UNORM:
176 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
177 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
178 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
179 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
180 break;
181
182 case SWR_TYPE_SNORM:
183 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
184 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
185 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
186 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
187 break;
188
189 case SWR_TYPE_UNKNOWN:
190 SWR_INVALID("Unsupported format type: %d", type);
191 }
192 }
193
ApplyDefaultsBlendJit194 void ApplyDefaults(SWR_FORMAT format, Value* src[4])
195 {
196 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
197
198 bool valid[] = {false, false, false, false};
199 for (uint32_t c = 0; c < info.numComps; ++c)
200 {
201 valid[info.swizzle[c]] = true;
202 }
203
204 for (uint32_t c = 0; c < 4; ++c)
205 {
206 if (!valid[c])
207 {
208 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
209 }
210 }
211 }
212
ApplyUnusedDefaultsBlendJit213 void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
214 {
215 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
216
217 for (uint32_t c = 0; c < info.numComps; ++c)
218 {
219 if (info.type[c] == SWR_TYPE_UNUSED)
220 {
221 src[info.swizzle[c]] =
222 BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
223 }
224 }
225 }
226
QuantizeBlendJit227 void Quantize(SWR_FORMAT format, Value* src[4])
228 {
229 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
230 for (uint32_t c = 0; c < info.numComps; ++c)
231 {
232 if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
233 {
234 uint32_t swizComp = info.swizzle[c];
235 float factor = (float)((1 << info.bpc[c]) - 1);
236 switch (info.type[c])
237 {
238 case SWR_TYPE_UNORM:
239 src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
240 src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
241 src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
242 break;
243 default:
244 SWR_INVALID("Unsupported format type: %d", info.type[c]);
245 }
246 }
247 }
248 }
249
250 template <bool Color, bool Alpha>
BlendFuncBlendJit251 void BlendFunc(SWR_BLEND_OP blendOp,
252 Value* src[4],
253 Value* srcFactor[4],
254 Value* dst[4],
255 Value* dstFactor[4],
256 Value* result[4])
257 {
258 Value* out[4];
259 Value* srcBlend[4];
260 Value* dstBlend[4];
261 for (uint32_t i = 0; i < 4; ++i)
262 {
263 srcBlend[i] = FMUL(src[i], srcFactor[i]);
264 dstBlend[i] = FMUL(dst[i], dstFactor[i]);
265 }
266
267 switch (blendOp)
268 {
269 case BLENDOP_ADD:
270 out[0] = FADD(srcBlend[0], dstBlend[0]);
271 out[1] = FADD(srcBlend[1], dstBlend[1]);
272 out[2] = FADD(srcBlend[2], dstBlend[2]);
273 out[3] = FADD(srcBlend[3], dstBlend[3]);
274 break;
275
276 case BLENDOP_SUBTRACT:
277 out[0] = FSUB(srcBlend[0], dstBlend[0]);
278 out[1] = FSUB(srcBlend[1], dstBlend[1]);
279 out[2] = FSUB(srcBlend[2], dstBlend[2]);
280 out[3] = FSUB(srcBlend[3], dstBlend[3]);
281 break;
282
283 case BLENDOP_REVSUBTRACT:
284 out[0] = FSUB(dstBlend[0], srcBlend[0]);
285 out[1] = FSUB(dstBlend[1], srcBlend[1]);
286 out[2] = FSUB(dstBlend[2], srcBlend[2]);
287 out[3] = FSUB(dstBlend[3], srcBlend[3]);
288 break;
289
290 case BLENDOP_MIN:
291 out[0] = VMINPS(src[0], dst[0]);
292 out[1] = VMINPS(src[1], dst[1]);
293 out[2] = VMINPS(src[2], dst[2]);
294 out[3] = VMINPS(src[3], dst[3]);
295 break;
296
297 case BLENDOP_MAX:
298 out[0] = VMAXPS(src[0], dst[0]);
299 out[1] = VMAXPS(src[1], dst[1]);
300 out[2] = VMAXPS(src[2], dst[2]);
301 out[3] = VMAXPS(src[3], dst[3]);
302 break;
303
304 default:
305 SWR_INVALID("Unsupported blend operation: %d", blendOp);
306 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
307 break;
308 }
309
310 if (Color)
311 {
312 result[0] = out[0];
313 result[1] = out[1];
314 result[2] = out[2];
315 }
316
317 if (Alpha)
318 {
319 result[3] = out[3];
320 }
321 }
322
LogicOpFuncBlendJit323 void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
324 {
325 // Op: (s == PS output, d = RT contents)
326 switch (logicOp)
327 {
328 case LOGICOP_CLEAR:
329 result[0] = VIMMED1(0);
330 result[1] = VIMMED1(0);
331 result[2] = VIMMED1(0);
332 result[3] = VIMMED1(0);
333 break;
334
335 case LOGICOP_NOR:
336 // ~(s | d)
337 result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
338 result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
339 result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
340 result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
341 break;
342
343 case LOGICOP_AND_INVERTED:
344 // ~s & d
345 // todo: use avx andnot instr when I can find the intrinsic to call
346 result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
347 result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
348 result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
349 result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
350 break;
351
352 case LOGICOP_COPY_INVERTED:
353 // ~s
354 result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
355 result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
356 result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
357 result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
358 break;
359
360 case LOGICOP_AND_REVERSE:
361 // s & ~d
362 // todo: use avx andnot instr when I can find the intrinsic to call
363 result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
364 result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
365 result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
366 result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
367 break;
368
369 case LOGICOP_INVERT:
370 // ~d
371 result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
372 result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
373 result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
374 result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
375 break;
376
377 case LOGICOP_XOR:
378 // s ^ d
379 result[0] = XOR(src[0], dst[0]);
380 result[1] = XOR(src[1], dst[1]);
381 result[2] = XOR(src[2], dst[2]);
382 result[3] = XOR(src[3], dst[3]);
383 break;
384
385 case LOGICOP_NAND:
386 // ~(s & d)
387 result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
388 result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
389 result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
390 result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
391 break;
392
393 case LOGICOP_AND:
394 // s & d
395 result[0] = AND(src[0], dst[0]);
396 result[1] = AND(src[1], dst[1]);
397 result[2] = AND(src[2], dst[2]);
398 result[3] = AND(src[3], dst[3]);
399 break;
400
401 case LOGICOP_EQUIV:
402 // ~(s ^ d)
403 result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
404 result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
405 result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
406 result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
407 break;
408
409 case LOGICOP_NOOP:
410 result[0] = dst[0];
411 result[1] = dst[1];
412 result[2] = dst[2];
413 result[3] = dst[3];
414 break;
415
416 case LOGICOP_OR_INVERTED:
417 // ~s | d
418 result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
419 result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
420 result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
421 result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
422 break;
423
424 case LOGICOP_COPY:
425 result[0] = src[0];
426 result[1] = src[1];
427 result[2] = src[2];
428 result[3] = src[3];
429 break;
430
431 case LOGICOP_OR_REVERSE:
432 // s | ~d
433 result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
434 result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
435 result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
436 result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
437 break;
438
439 case LOGICOP_OR:
440 // s | d
441 result[0] = OR(src[0], dst[0]);
442 result[1] = OR(src[1], dst[1]);
443 result[2] = OR(src[2], dst[2]);
444 result[3] = OR(src[3], dst[3]);
445 break;
446
447 case LOGICOP_SET:
448 result[0] = VIMMED1(0xFFFFFFFF);
449 result[1] = VIMMED1(0xFFFFFFFF);
450 result[2] = VIMMED1(0xFFFFFFFF);
451 result[3] = VIMMED1(0xFFFFFFFF);
452 break;
453
454 default:
455 SWR_INVALID("Unsupported logic operation: %d", logicOp);
456 result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
457 break;
458 }
459 }
460
461 void
AlphaTestBlendJit462 AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
463 {
464 // load uint32_t reference
465 Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));
466
467 // load alpha
468 Value* pAlpha = LOAD(ppAlpha, {0, 0});
469
470 Value* pTest = nullptr;
471 if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
472 {
473 // convert float alpha to unorm8
474 Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
475 pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
476
477 // compare
478 switch (state.alphaTestFunction)
479 {
480 case ZFUNC_ALWAYS:
481 pTest = VIMMED1(true);
482 break;
483 case ZFUNC_NEVER:
484 pTest = VIMMED1(false);
485 break;
486 case ZFUNC_LT:
487 pTest = ICMP_ULT(pAlphaU8, pRef);
488 break;
489 case ZFUNC_EQ:
490 pTest = ICMP_EQ(pAlphaU8, pRef);
491 break;
492 case ZFUNC_LE:
493 pTest = ICMP_ULE(pAlphaU8, pRef);
494 break;
495 case ZFUNC_GT:
496 pTest = ICMP_UGT(pAlphaU8, pRef);
497 break;
498 case ZFUNC_NE:
499 pTest = ICMP_NE(pAlphaU8, pRef);
500 break;
501 case ZFUNC_GE:
502 pTest = ICMP_UGE(pAlphaU8, pRef);
503 break;
504 default:
505 SWR_INVALID("Invalid alpha test function");
506 break;
507 }
508 }
509 else
510 {
511 // cast ref to float
512 pRef = BITCAST(pRef, mSimdFP32Ty);
513
514 // compare
515 switch (state.alphaTestFunction)
516 {
517 case ZFUNC_ALWAYS:
518 pTest = VIMMED1(true);
519 break;
520 case ZFUNC_NEVER:
521 pTest = VIMMED1(false);
522 break;
523 case ZFUNC_LT:
524 pTest = FCMP_OLT(pAlpha, pRef);
525 break;
526 case ZFUNC_EQ:
527 pTest = FCMP_OEQ(pAlpha, pRef);
528 break;
529 case ZFUNC_LE:
530 pTest = FCMP_OLE(pAlpha, pRef);
531 break;
532 case ZFUNC_GT:
533 pTest = FCMP_OGT(pAlpha, pRef);
534 break;
535 case ZFUNC_NE:
536 pTest = FCMP_ONE(pAlpha, pRef);
537 break;
538 case ZFUNC_GE:
539 pTest = FCMP_OGE(pAlpha, pRef);
540 break;
541 default:
542 SWR_INVALID("Invalid alpha test function");
543 break;
544 }
545 }
546
547 // load current mask
548 Value* pMask = LOAD(ppMask);
549
550 // convert to int1 mask
551 pMask = MASK(pMask);
552
553 // and with alpha test result
554 pMask = AND(pMask, pTest);
555
556 // convert back to vector mask
557 pMask = VMASK(pMask);
558
559 // store new mask
560 STORE(pMask, ppMask);
561 }
562
CreateBlendJit563 Function* Create(const BLEND_COMPILE_STATE& state)
564 {
565 std::stringstream fnName("BLND_",
566 std::ios_base::in | std::ios_base::out | std::ios_base::ate);
567 fnName << ComputeCRC(0, &state, sizeof(state));
568
569 // blend function signature
570 // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);
571
572 std::vector<Type*> args{
573 PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*
574 };
575
576 // std::vector<Type*> args{
577 // PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT*
578 //};
579
580 FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
581 Function* blendFunc = Function::Create(
582 fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
583 blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
584
585 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
586
587 IRB()->SetInsertPoint(entry);
588
589 // arguments
590 auto argitr = blendFunc->arg_begin();
591 Value* pBlendContext = &*argitr++;
592 pBlendContext->setName("pBlendContext");
593 Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});
594 pBlendState->setName("pBlendState");
595 Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});
596 pSrc->setName("src");
597 Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});
598 pSrc1->setName("src1");
599 Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});
600 pSrc0Alpha->setName("src0alpha");
601 Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});
602 sampleNum->setName("sampleNum");
603 Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});
604 pDst->setName("pDst");
605 Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});
606 pResult->setName("result");
607 Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});
608 ppoMask->setName("ppoMask");
609 Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});
610 ppMask->setName("pMask");
611
612 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
613 "Unsupported hot tile format");
614 Value* dst[4];
615 Value* constantColor[4];
616 Value* src[4];
617 Value* src1[4];
618 Value* result[4];
619 for (uint32_t i = 0; i < 4; ++i)
620 {
621 // load hot tile
622 dst[i] = LOAD(pDst, {0, i});
623
624 // load constant color
625 constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));
626
627 // load src
628 src[i] = LOAD(pSrc, {0, i});
629
630 // load src1
631 src1[i] = LOAD(pSrc1, {0, i});
632 }
633 Value* currentSampleMask = VIMMED1(-1);
634 if (state.desc.alphaToCoverageEnable)
635 {
636 Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
637 uint32_t bits = (1 << state.desc.numSamples) - 1;
638 currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
639 currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
640 }
641
642 // alpha test
643 if (state.desc.alphaTestEnable)
644 {
645 // Gather for archrast stats
646 STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
647 AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
648 }
649 else
650 {
651 // Gather for archrast stats
652 STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
653 }
654
655 // color blend
656 if (state.blendState.blendEnable)
657 {
658 // Gather for archrast stats
659 STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
660
661 // clamp sources
662 Clamp(state.format, src);
663 Clamp(state.format, src1);
664 Clamp(state.format, dst);
665 Clamp(state.format, constantColor);
666
667 // apply defaults to hottile contents to take into account missing components
668 ApplyDefaults(state.format, dst);
669
670 // Force defaults for unused 'X' components
671 ApplyUnusedDefaults(state.format, dst);
672
673 // Quantize low precision components
674 Quantize(state.format, dst);
675
676 // special case clamping for R11G11B10_float which has no sign bit
677 if (state.format == R11G11B10_FLOAT)
678 {
679 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
680 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
681 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
682 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
683 }
684
685 Value* srcFactor[4];
686 Value* dstFactor[4];
687 if (state.desc.independentAlphaBlendEnable)
688 {
689 GenerateBlendFactor<true, false>(
690 state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
691 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,
692 constantColor,
693 src,
694 src1,
695 dst,
696 srcFactor);
697
698 GenerateBlendFactor<true, false>(
699 state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
700 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,
701 constantColor,
702 src,
703 src1,
704 dst,
705 dstFactor);
706
707 BlendFunc<true, false>(
708 state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
709 BlendFunc<false, true>(
710 state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
711 }
712 else
713 {
714 GenerateBlendFactor<true, true>(
715 state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
716 GenerateBlendFactor<true, true>(
717 state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
718
719 BlendFunc<true, true>(
720 state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
721 }
722
723 // store results out
724 for (uint32_t i = 0; i < 4; ++i)
725 {
726 STORE(result[i], pResult, {0, i});
727 }
728 }
729 else
730 {
731 // Gather for archrast stats
732 STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
733 }
734
735 if (state.blendState.logicOpEnable)
736 {
737 const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
738 Value* vMask[4];
739 float scale[4];
740
741 if (!state.blendState.blendEnable)
742 {
743 Clamp(state.format, src);
744 Clamp(state.format, dst);
745 }
746
747 for (uint32_t i = 0; i < 4; i++)
748 {
749 if (info.type[i] == SWR_TYPE_UNUSED)
750 {
751 continue;
752 }
753
754 if (info.bpc[i] >= 32)
755 {
756 vMask[i] = VIMMED1(0xFFFFFFFF);
757 scale[i] = 0xFFFFFFFF;
758 }
759 else
760 {
761 vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
762 if (info.type[i] == SWR_TYPE_SNORM)
763 scale[i] = (1 << (info.bpc[i] - 1)) - 1;
764 else
765 scale[i] = (1 << info.bpc[i]) - 1;
766 }
767
768 switch (info.type[i])
769 {
770 default:
771 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
772 break;
773
774 case SWR_TYPE_UNKNOWN:
775 case SWR_TYPE_UNUSED:
776 FALLTHROUGH;
777
778 case SWR_TYPE_UINT:
779 case SWR_TYPE_SINT:
780 src[i] = BITCAST(src[i], mSimdInt32Ty);
781 dst[i] = BITCAST(dst[i], mSimdInt32Ty);
782 break;
783 case SWR_TYPE_SNORM:
784 src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
785 dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
786 break;
787 case SWR_TYPE_UNORM:
788 src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
789 dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
790 break;
791 }
792 }
793
794 LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
795
796 // store results out
797 for (uint32_t i = 0; i < 4; ++i)
798 {
799 if (info.type[i] == SWR_TYPE_UNUSED)
800 {
801 continue;
802 }
803
804 // clear upper bits from PS output not in RT format after doing logic op
805 result[i] = AND(result[i], vMask[i]);
806
807 switch (info.type[i])
808 {
809 default:
810 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
811 break;
812
813 case SWR_TYPE_UNKNOWN:
814 case SWR_TYPE_UNUSED:
815 FALLTHROUGH;
816
817 case SWR_TYPE_UINT:
818 case SWR_TYPE_SINT:
819 result[i] = BITCAST(result[i], mSimdFP32Ty);
820 break;
821 case SWR_TYPE_SNORM:
822 result[i] = SHL(result[i], C(32 - info.bpc[i]));
823 result[i] = ASHR(result[i], C(32 - info.bpc[i]));
824 result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
825 break;
826 case SWR_TYPE_UNORM:
827 result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
828 break;
829 }
830
831 STORE(result[i], pResult, {0, i});
832 }
833 }
834
835 if (state.desc.oMaskEnable)
836 {
837 assert(!(state.desc.alphaToCoverageEnable));
838 // load current mask
839 Value* oMask = LOAD(ppoMask);
840 currentSampleMask = AND(oMask, currentSampleMask);
841 }
842
843 if (state.desc.sampleMaskEnable)
844 {
845 Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});
846 currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
847 }
848
849 if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
850 state.desc.oMaskEnable)
851 {
852 // load coverage mask and mask off any lanes with no samples
853 Value* pMask = LOAD(ppMask);
854 Value* sampleMasked = SHL(C(1), sampleNum);
855 currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked));
856 currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
857 Value* outputMask = AND(pMask, currentSampleMask);
858 // store new mask
859 STORE(outputMask, GEP(ppMask, C(0)));
860 }
861
862 RET_VOID();
863
864 JitManager::DumpToFile(blendFunc, "");
865
866 ::FunctionPassManager passes(JM()->mpCurrentModule);
867
868 passes.add(createBreakCriticalEdgesPass());
869 passes.add(createCFGSimplificationPass());
870 passes.add(createEarlyCSEPass());
871 passes.add(createPromoteMemoryToRegisterPass());
872 passes.add(createCFGSimplificationPass());
873 passes.add(createEarlyCSEPass());
874 passes.add(createInstructionCombiningPass());
875 #if LLVM_VERSION_MAJOR <= 11
876 passes.add(createConstantPropagationPass());
877 #endif
878 passes.add(createSCCPPass());
879 passes.add(createAggressiveDCEPass());
880
881 passes.add(createLowerX86Pass(this));
882
883 passes.run(*blendFunc);
884
885 JitManager::DumpToFile(blendFunc, "optimized");
886
887 return blendFunc;
888 }
889 };
890
891 //////////////////////////////////////////////////////////////////////////
892 /// @brief JITs from fetch shader IR
893 /// @param hJitMgr - JitManager handle
894 /// @param func - LLVM function IR
895 /// @return PFN_FETCH_FUNC - pointer to fetch code
JitBlendFunc(HANDLE hJitMgr,const HANDLE hFunc)896 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
897 {
898 const llvm::Function* func = (const llvm::Function*)hFunc;
899 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
900 PFN_BLEND_JIT_FUNC pfnBlend;
901 pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
902 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
903 // add new IR to the module
904 pJitMgr->mIsModuleFinalized = true;
905
906 return pfnBlend;
907 }
908
909 //////////////////////////////////////////////////////////////////////////
910 /// @brief JIT compiles blend shader
911 /// @param hJitMgr - JitManager handle
912 /// @param state - blend state to build function from
JitCompileBlend(HANDLE hJitMgr,const BLEND_COMPILE_STATE & state)913 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr,
914 const BLEND_COMPILE_STATE& state)
915 {
916 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
917
918 pJitMgr->SetupNewModule();
919
920 BlendJit theJit(pJitMgr);
921 HANDLE hFunc = theJit.Create(state);
922
923 return JitBlendFunc(hJitMgr, hFunc);
924 }
925