1 // [Blend2D]
2 // 2D Vector Graphics Powered by a JIT Compiler.
3 //
4 // [License]
5 // Zlib - See LICENSE.md file in the package.
6
7 #include "../api-build_p.h"
8 #if BL_TARGET_ARCH_X86 && !defined(BL_BUILD_NO_JIT)
9
10 #include "../pipegen/compoppart_p.h"
11 #include "../pipegen/fetchpart_p.h"
12 #include "../pipegen/fetchpatternpart_p.h"
13 #include "../pipegen/fetchpixelptrpart_p.h"
14 #include "../pipegen/fetchsolidpart_p.h"
15 #include "../pipegen/pipecompiler_p.h"
16
17 #define C_MEM(CONST) pc->constAsMem(blCommonTable.CONST)
18
19 namespace BLPipeGen {
20
21 // ============================================================================
22 // [BLPipeGen::CompOpPart - Construction / Destruction]
23 // ============================================================================
24
// Constructs a composition part that combines the destination and source
// fetch parts with the composition operator `compOp`. The constructor only
// configures static pipeline properties - the pixel type (RGBA vs A8), the
// maximum number of pixels processed per loop iteration, and the minimum
// destination alignment - it doesn't emit any code.
CompOpPart::CompOpPart(PipeCompiler* pc, uint32_t compOp, FetchPart* dstPart, FetchPart* srcPart) noexcept
  : PipePart(pc, kTypeComposite),
    _compOp(compOp),
    // RGBA pipeline if the destination has color channels, A8 otherwise.
    _pixelType(dstPart->hasRGB() ? Pixel::kTypeRGBA : Pixel::kTypeAlpha),
    _cMaskLoopType(kCMaskLoopTypeNone),
    _maxPixels(1),
    _pixelGranularity(0),
    _minAlignment(1),
    _isInPartialMode(false),
    _hasDa(dstPart->hasAlpha()),
    _hasSa(srcPart->hasAlpha()),
    _cMaskLoopHook(nullptr),
    _solidPre(_pixelType),
    _partialPixel(_pixelType) {

  // Initialize the children of this part.
  _children[kIndexDstPart] = dstPart;
  _children[kIndexSrcPart] = srcPart;
  _childrenCount = 2;

  _maxSimdWidthSupported = 16;

  bool isSolid = srcPart->isSolid();
  uint32_t maxPixels = 0;
  uint32_t pixelLimit = 64;

  // Limit the maximum pixel-step to 4 if the style is not solid and the target
  // is not 64-bit. There's not enough registers to process 8 pixels in parallel
  // in 32-bit mode.
  if (BL_TARGET_ARCH_BITS < 64 && !isSolid && _pixelType != Pixel::kTypeAlpha)
    pixelLimit = 4;

  // Decrease the maximum pixels to 4 if the source is complex to fetch.
  // In such case fetching and processing more pixels would result in
  // emitting bloated pipelines that are not faster compared to pipelines
  // working with just 4 pixels at a time.
  if (dstPart->isComplexFetch() || srcPart->isComplexFetch())
    pixelLimit = 4;

  // Per-operator pixel-step table: cheap operators process 8 pixels at a
  // time, expensive ones (division-based or register-hungry) fewer.
  switch (pixelType()) {
    case Pixel::kTypeRGBA:
      switch (compOp) {
        case BL_COMP_OP_SRC_OVER    : maxPixels = 8; break;
        case BL_COMP_OP_SRC_COPY    : maxPixels = 8; break;
        case BL_COMP_OP_SRC_IN      : maxPixels = 8; break;
        case BL_COMP_OP_SRC_OUT     : maxPixels = 8; break;
        case BL_COMP_OP_SRC_ATOP    : maxPixels = 8; break;
        case BL_COMP_OP_DST_OVER    : maxPixels = 8; break;
        case BL_COMP_OP_DST_IN      : maxPixels = 8; break;
        case BL_COMP_OP_DST_OUT     : maxPixels = 8; break;
        case BL_COMP_OP_DST_ATOP    : maxPixels = 8; break;
        case BL_COMP_OP_XOR         : maxPixels = 8; break;
        case BL_COMP_OP_CLEAR       : maxPixels = 8; break;
        case BL_COMP_OP_PLUS        : maxPixels = 8; break;
        case BL_COMP_OP_MINUS       : maxPixels = 4; break;
        case BL_COMP_OP_MULTIPLY    : maxPixels = 8; break;
        case BL_COMP_OP_SCREEN      : maxPixels = 8; break;
        case BL_COMP_OP_OVERLAY     : maxPixels = 4; break;
        case BL_COMP_OP_DARKEN      : maxPixels = 8; break;
        case BL_COMP_OP_LIGHTEN     : maxPixels = 8; break;
        case BL_COMP_OP_COLOR_DODGE : maxPixels = 1; break;
        case BL_COMP_OP_COLOR_BURN  : maxPixels = 1; break;
        case BL_COMP_OP_LINEAR_BURN : maxPixels = 8; break;
        case BL_COMP_OP_LINEAR_LIGHT: maxPixels = 1; break;
        case BL_COMP_OP_PIN_LIGHT   : maxPixels = 4; break;
        case BL_COMP_OP_HARD_LIGHT  : maxPixels = 4; break;
        case BL_COMP_OP_SOFT_LIGHT  : maxPixels = 1; break;
        case BL_COMP_OP_DIFFERENCE  : maxPixels = 8; break;
        case BL_COMP_OP_EXCLUSION   : maxPixels = 8; break;

        default:
          BL_NOT_REACHED();
      }
      break;

    case Pixel::kTypeAlpha:
      maxPixels = 8;
      break;
  }

  // Decrease to N pixels at a time if the fetch part doesn't support more.
  // This is suboptimal, but can happen if the fetch part is not optimized.
  maxPixels = blMin(maxPixels, pixelLimit, srcPart->maxPixels());

  // Processing 4+ RGBA32 pixels stores whole XMM registers, so require the
  // destination pointer to be 16-byte aligned in that case.
  if (isRGBAType()) {
    if (maxPixels >= 4)
      _minAlignment = 16;
  }

  _maxPixels = uint8_t(maxPixels);
  _mask->reset();
}
117
118 // ============================================================================
119 // [BLPipeGen::CompOpPart - Init / Fini]
120 // ============================================================================
121
// Initializes both fetch parts at the starting position `x`/`y`, propagating
// the pipeline's pixel type and the requested pixel granularity.
void CompOpPart::init(x86::Gp& x, x86::Gp& y, uint32_t pixelGranularity) noexcept {
  _pixelGranularity = uint8_t(pixelGranularity);

  dstPart()->init(x, y, pixelType(), pixelGranularity);
  srcPart()->init(x, y, pixelType(), pixelGranularity);
}
128
// Finalizes both fetch parts and resets the pixel granularity so the part
// can be re-initialized for another fill.
void CompOpPart::fini() noexcept {
  dstPart()->fini();
  srcPart()->fini();

  _pixelGranularity = 0;
}
135
136 // ============================================================================
137 // [BLPipeGen::CompOpPart - Optimization Opportunities]
138 // ============================================================================
139
shouldOptimizeOpaqueFill() const140 bool CompOpPart::shouldOptimizeOpaqueFill() const noexcept {
141 // Should be always optimized if the source is not solid.
142 if (!srcPart()->isSolid())
143 return true;
144
145 // Do not optimize if the CompOp is TypeA. This operator doesn't need any
146 // special handling as the source pixel is multiplied with mask before it's
147 // passed to the compositor.
148 if (compOpFlags() & BL_COMP_OP_FLAG_TYPE_A)
149 return false;
150
151 // We assume that in all other cases there is a benefit of using optimized
152 // `cMask` loop for a fully opaque mask.
153 return true;
154 }
155
shouldJustCopyOpaqueFill() const156 bool CompOpPart::shouldJustCopyOpaqueFill() const noexcept {
157 if (compOp() != BL_COMP_OP_SRC_COPY)
158 return false;
159
160 if (srcPart()->isSolid())
161 return true;
162
163 if (srcPart()->isFetchType(BL_PIPE_FETCH_TYPE_PATTERN_AA_BLIT) &&
164 srcPart()->format() == dstPart()->format())
165 return true;
166
167 return false;
168 }
169
170 // ============================================================================
171 // [BLPipeGen::CompOpPart - Advance]
172 // ============================================================================
173
// Positions both fetch parts at the horizontal coordinate `x`.
void CompOpPart::startAtX(x86::Gp& x) noexcept {
  dstPart()->startAtX(x);
  srcPart()->startAtX(x);
}
178
// Advances both fetch parts horizontally by `diff` pixels (new x in `x`).
void CompOpPart::advanceX(x86::Gp& x, x86::Gp& diff) noexcept {
  dstPart()->advanceX(x, diff);
  srcPart()->advanceX(x, diff);
}
183
// Advances both fetch parts to the next scanline.
void CompOpPart::advanceY() noexcept {
  dstPart()->advanceY();
  srcPart()->advanceY();
}
188
189 // ============================================================================
190 // [BLPipeGen::CompOpPart - Prefetch / Postfetch]
191 // ============================================================================
192
// Prepares both fetch parts for fetching 1 pixel at a time.
void CompOpPart::prefetch1() noexcept {
  dstPart()->prefetch1();
  srcPart()->prefetch1();
}
197
// Enters multi-pixel (N pixels at a time) mode on both fetch parts.
void CompOpPart::enterN() noexcept {
  dstPart()->enterN();
  srcPart()->enterN();
}
202
// Leaves multi-pixel mode on both fetch parts.
void CompOpPart::leaveN() noexcept {
  dstPart()->leaveN();
  srcPart()->leaveN();
}
207
// Prepares both fetch parts for fetching N pixels at a time.
void CompOpPart::prefetchN() noexcept {
  dstPart()->prefetchN();
  srcPart()->prefetchN();
}
212
// Finishes an N-pixels-at-a-time fetch sequence on both fetch parts.
void CompOpPart::postfetchN() noexcept {
  dstPart()->postfetchN();
  srcPart()->postfetchN();
}
217
218 // ============================================================================
219 // [BLPipeGen::CompOpPart - Fetch]
220 // ============================================================================
221
// Fetches `n` destination pixels into `p` in the representation(s) given by
// `flags`. Only 1, 4, and 8 pixels are supported; other values are ignored.
void CompOpPart::dstFetch(Pixel& p, uint32_t flags, uint32_t n) noexcept {
  switch (n) {
    case 1: dstPart()->fetch1(p, flags); break;
    case 4: dstPart()->fetch4(p, flags); break;
    case 8: dstPart()->fetch8(p, flags); break;
  }
}
229
// Fetches `n` source pixels into `p` in the representation(s) given by
// `flags`. Three cases are handled:
//   1. Preprocessed solid color - alias or replicate the cached solid pixel.
//   2. Partial mode - the pixel was already fetched as part of a quad; hand
//      out its current lowest lane (see `nextPartialPixel()`).
//   3. Regular fetch - delegate to the source fetch part.
void CompOpPart::srcFetch(Pixel& p, uint32_t flags, uint32_t n) noexcept {
  // Pixels must match as we have already preconfigured the CompOpPart.
  BL_ASSERT(p.type() == pixelType());

  if (!p.count())
    p.setCount(n);

  // Composition with a preprocessed solid color.
  if (isUsingSolidPre()) {
    Pixel& s = _solidPre;

    // INJECT: materialize the solid color at the cMask loop hook so the
    // conversion is emitted once before the loop, not on every iteration.
    {
      ScopedInjector injector(cc, &_cMaskLoopHook);
      pc->xSatisfySolid(s, flags);
    }

    if (p.isRGBA()) {
      if (flags & Pixel::kImmutable) {
        // Caller won't modify the pixel - alias the solid registers directly.
        if (flags & Pixel::kPC) p.pc.init(s.pc[0]);
        if (flags & Pixel::kUC) p.uc.init(s.uc[0]);
        if (flags & Pixel::kUA) p.ua.init(s.ua[0]);
        if (flags & Pixel::kUIA) p.uia.init(s.uia[0]);
      }
      else {
        // Caller may modify the pixel - copy the solid color into fresh
        // registers, replicated to cover `n` pixels.
        switch (n) {
          case 1:
            if (flags & Pixel::kPC) { p.pc.init(cc->newXmm("pre.pc")); pc->vmov(p.pc[0], s.pc[0]); }
            if (flags & Pixel::kUC) { p.uc.init(cc->newXmm("pre.uc")); pc->vmov(p.uc[0], s.uc[0]); }
            if (flags & Pixel::kUA) { p.ua.init(cc->newXmm("pre.ua")); pc->vmov(p.ua[0], s.ua[0]); }
            if (flags & Pixel::kUIA) { p.uia.init(cc->newXmm("pre.uia")); pc->vmov(p.uia[0], s.uia[0]); }
            break;

          case 4:
            if (flags & Pixel::kPC) {
              pc->newXmmArray(p.pc, 1, "pre.pc");
              pc->vmov(p.pc[0], s.pc[0]);
            }

            if (flags & Pixel::kUC) {
              pc->newXmmArray(p.uc, 2, "pre.uc");
              pc->vmov(p.uc[0], s.uc[0]);
              pc->vmov(p.uc[1], s.uc[0]);
            }

            if (flags & Pixel::kUA) {
              pc->newXmmArray(p.ua, 2, "pre.ua");
              pc->vmov(p.ua[0], s.ua[0]);
              pc->vmov(p.ua[1], s.ua[0]);
            }

            if (flags & Pixel::kUIA) {
              pc->newXmmArray(p.uia, 2, "pre.uia");
              pc->vmov(p.uia[0], s.uia[0]);
              pc->vmov(p.uia[1], s.uia[0]);
            }
            break;

          case 8:
            if (flags & Pixel::kPC) {
              pc->newXmmArray(p.pc, 2, "pre.pc");
              pc->vmov(p.pc[0], s.pc[0]);
              pc->vmov(p.pc[1], s.pc[0]);
            }

            if (flags & Pixel::kUC) {
              pc->newXmmArray(p.uc, 4, "pre.uc");
              pc->vmov(p.uc[0], s.uc[0]);
              pc->vmov(p.uc[1], s.uc[0]);
              pc->vmov(p.uc[2], s.uc[0]);
              pc->vmov(p.uc[3], s.uc[0]);
            }

            if (flags & Pixel::kUA) {
              pc->newXmmArray(p.ua, 4, "pre.ua");
              pc->vmov(p.ua[0], s.ua[0]);
              pc->vmov(p.ua[1], s.ua[0]);
              pc->vmov(p.ua[2], s.ua[0]);
              pc->vmov(p.ua[3], s.ua[0]);
            }

            if (flags & Pixel::kUIA) {
              pc->newXmmArray(p.uia, 4, "pre.uia");
              pc->vmov(p.uia[0], s.uia[0]);
              pc->vmov(p.uia[1], s.uia[0]);
              pc->vmov(p.uia[2], s.uia[0]);
              pc->vmov(p.uia[3], s.uia[0]);
            }
            break;
        }
      }
    }
    else if (p.isAlpha()) {
      // TODO: A8 pipeline.
      BL_ASSERT(false);
    }

    return;
  }

  // Partial mode is designed to fetch pixels on the right side of the
  // border one by one, so it's an error if the pipeline requests more
  // than 1 pixel at a time.
  if (isInPartialMode()) {
    BL_ASSERT(n == 1);

    if (p.isRGBA()) {
      if (!(flags & Pixel::kImmutable)) {
        if (flags & Pixel::kUC) {
          // Unpack the current packed pixel to an unpacked u16 form.
          pc->newXmmArray(p.uc, 1, "uc");
          pc->vmovu8u16(p.uc[0], _partialPixel.pc[0]);
        }
        else {
          pc->newXmmArray(p.pc, 1, "pc");
          pc->vmov(p.pc[0], _partialPixel.pc[0]);
        }
      }
      else {
        p.pc.init(_partialPixel.pc[0]);
      }
    }
    else if (p.isAlpha()) {
      // Extract the lowest 16-bit lane as the scalar source alpha.
      p.sa = cc->newUInt32("sa");
      pc->vextractu16(p.sa, _partialPixel.ua[0], 0);
    }

    pc->xSatisfyPixel(p, flags);
    return;
  }

  // Regular fetch - delegate to the source part.
  switch (n) {
    case 1: srcPart()->fetch1(p, flags); break;
    case 4: srcPart()->fetch4(p, flags); break;
    case 8: srcPart()->fetch8(p, flags); break;
  }
}
366
367 // ============================================================================
368 // [BLPipeGen::CompOpPart - PartialFetch]
369 // ============================================================================
370
// Enters partial mode: fetches a whole granule (4 pixels) of source pixels
// into `_partialPixel` so that subsequent `srcFetch(n == 1)` calls can hand
// them out one by one (see `nextPartialPixel()`). No-op for solid fills.
void CompOpPart::enterPartialMode(uint32_t partialFlags) noexcept {
  // Doesn't apply to solid fills.
  if (isUsingSolidPre())
    return;

  // TODO: [PIPEGEN] We only support partial fetch of 4 pixels at the moment.
  BL_ASSERT(!isInPartialMode());
  BL_ASSERT(pixelGranularity() == 4);

  switch (pixelType()) {
    case Pixel::kTypeRGBA: {
      // Packed RGBA32 - 4 pixels fit into a single XMM register.
      srcFetch(_partialPixel, Pixel::kPC | partialFlags, pixelGranularity());
      break;
    }

    case Pixel::kTypeAlpha: {
      // Unpacked u16 alpha - 4 values in the low half of an XMM register.
      srcFetch(_partialPixel, Pixel::kUA | partialFlags, pixelGranularity());
      break;
    }
  }

  _isInPartialMode = true;
}
394
// Leaves partial mode and releases the cached partial pixel. No-op for
// solid fills (partial mode was never entered).
void CompOpPart::exitPartialMode() noexcept {
  // Doesn't apply to solid fills.
  if (isUsingSolidPre())
    return;

  BL_ASSERT(isInPartialMode());

  _isInPartialMode = false;
  _partialPixel.resetAllExceptType();
}
405
// Shifts the cached partial pixel so the next pixel moves into the lowest
// lane: by 4 bytes for one packed RGBA32 pixel, by 2 bytes for one u16
// alpha value. No-op when not in partial mode.
void CompOpPart::nextPartialPixel() noexcept {
  if (!isInPartialMode())
    return;

  switch (pixelType()) {
    case Pixel::kTypeRGBA: {
      const x86::Vec& pix = _partialPixel.pc[0];
      pc->vsrli128b(pix, pix, 4);
      break;
    }

    case Pixel::kTypeAlpha: {
      const x86::Vec& pix = _partialPixel.ua[0];
      pc->vsrli128b(pix, pix, 2);
      break;
    }
  }
}
424
425 // ============================================================================
426 // [BLPipeGen::CompOpPart - CMask - Init / Fini]
427 // ============================================================================
428
// Initializes constant-mask (cMask) composition from a mask value stored in
// memory: broadcast into a vector for RGBA pipelines, loaded as a byte into
// a GP register for A8 pipelines.
void CompOpPart::cMaskInit(const x86::Mem& mem) noexcept {
  switch (pixelType()) {
    case Pixel::kTypeRGBA: {
      x86::Vec mVec = cc->newXmm("msk");
      x86::Mem m(mem);

      m.setSize(4);
      pc->vbroadcast_u16(mVec, m);
      cMaskInitRGBA32(mVec);
      break;
    }

    case Pixel::kTypeAlpha: {
      x86::Gp mGp = cc->newUInt32("msk");
      pc->load8(mGp, mem);
      cMaskInitA8(mGp, x86::Vec());
      break;
    }

    default:
      BL_NOT_REACHED();
  }
}
452
// Initializes constant-mask composition from a scalar mask `sm_` and/or a
// vector mask `vm_` (either may be invalid). For RGBA pipelines a missing
// vector mask is synthesized by broadcasting the scalar.
void CompOpPart::cMaskInit(const x86::Gp& sm_, const x86::Vec& vm_) noexcept {
  x86::Gp sm(sm_);
  x86::Vec vm(vm_);

  switch (pixelType()) {
    case Pixel::kTypeRGBA: {
      if (!vm.isValid() && sm.isValid()) {
        vm = cc->newXmm("c.vm");
        pc->vbroadcast_u16(vm, sm);
      }

      cMaskInitRGBA32(vm);
      break;
    }

    case Pixel::kTypeAlpha: {
      cMaskInitA8(sm, vm);
      break;
    }

    default:
      BL_NOT_REACHED();
  }
}
477
// Initializes constant-mask composition for a fully opaque mask (no mask
// registers are required - invalid Gp/Vec operands signal "no mask").
void CompOpPart::cMaskInitOpaque() noexcept {
  switch (pixelType()) {
    case Pixel::kTypeRGBA: {
      cMaskInitRGBA32(x86::Vec());
      break;
    }

    case Pixel::kTypeAlpha: {
      cMaskInitA8(x86::Gp(), x86::Vec());
      break;
    }

    default:
      BL_NOT_REACHED();
  }
}
494
// Finalizes constant-mask composition, dispatching to the pixel-type
// specific implementation.
void CompOpPart::cMaskFini() noexcept {
  switch (pixelType()) {
    case Pixel::kTypeAlpha:
      cMaskFiniA8();
      break;

    case Pixel::kTypeRGBA:
      cMaskFiniRGBA32();
      break;

    default:
      BL_NOT_REACHED();
  }
}
509
// Starts a cMask loop of `loopType` and records the current compiler cursor
// as a hook, so setup code can later be injected before the loop body (see
// `ScopedInjector` usage in `srcFetch()`).
void CompOpPart::_cMaskLoopInit(uint32_t loopType) noexcept {
  // Make sure `_cMaskLoopInit()` and `_cMaskLoopFini()` are used as a pair.
  BL_ASSERT(_cMaskLoopType == kCMaskLoopTypeNone);
  BL_ASSERT(_cMaskLoopHook == nullptr);

  _cMaskLoopType = uint8_t(loopType);
  _cMaskLoopHook = cc->cursor();
}
518
// Ends the current cMask loop and clears the injection hook.
void CompOpPart::_cMaskLoopFini() noexcept {
  // Make sure `_cMaskLoopInit()` and `_cMaskLoopFini()` are used as a pair.
  BL_ASSERT(_cMaskLoopType != kCMaskLoopTypeNone);
  BL_ASSERT(_cMaskLoopHook != nullptr);

  _cMaskLoopType = kCMaskLoopTypeNone;
  _cMaskLoopHook = nullptr;
}
527
528 // ============================================================================
529 // [BLPipeGen::CompOpPart - CMask - Generic Loop]
530 // ============================================================================
531
// Emits a generic constant-mask loop over `i` pixels. Degenerates to a
// memset/memcpy loop when the fill is opaque SrcCopy, otherwise emits the
// XMM-based composition loop.
void CompOpPart::cMaskGenericLoop(x86::Gp& i) noexcept {
  if (isLoopOpaque() && shouldJustCopyOpaqueFill()) {
    cMaskMemcpyOrMemsetLoop(i);
    return;
  }

  cMaskGenericLoopXmm(i);
}
540
// Emits the XMM constant-mask loop over `i` pixels. Four variants are
// generated depending on `maxPixels()` (1, 4 or 8 pixels per iteration) and
// whether the destination requires 16-byte alignment. The aligned variants
// first emit a 1-pixel lead-in loop that runs until `dPtr` is aligned, then
// the wide main loop, then a 1-pixel tail loop for the remainder.
void CompOpPart::cMaskGenericLoopXmm(x86::Gp& i) noexcept {
  x86::Gp dPtr = dstPart()->as<FetchPixelPtrPart>()->ptr();

  // 1 pixel at a time.
  if (maxPixels() == 1) {
    Label L_Loop = cc->newLabel();

    prefetch1();

    cc->bind(L_Loop);
    cMaskCompositeAndStore(x86::ptr(dPtr), 1);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp()), i, 1);
    cc->jnz(L_Loop);

    return;
  }

  BL_ASSERT(minAlignment() >= 1);
  int alignmentMask = int(minAlignment()) - 1;

  // 4+ pixels at a time [no alignment].
  if (maxPixels() == 4 && minAlignment() == 1) {
    Label L_Loop1 = cc->newLabel();
    Label L_Loop4 = cc->newLabel();
    Label L_Skip4 = cc->newLabel();
    Label L_Exit = cc->newLabel();

    // Main 4-pixel loop; `i -= 4` up front, carry means fewer than 4 left.
    cc->sub(i, 4);
    cc->jc(L_Skip4);

    enterN();
    prefetchN();

    cc->bind(L_Loop4);
    cMaskCompositeAndStore(x86::ptr(dPtr), 4);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp() * 4), i, 4);
    cc->jnc(L_Loop4);

    postfetchN();
    leaveN();

    // Tail: restore the borrowed 4 and finish 1 pixel at a time.
    cc->bind(L_Skip4);
    prefetch1();
    cc->add(i, 4);
    cc->jz(L_Exit);

    cc->bind(L_Loop1);
    cMaskCompositeAndStore(x86::ptr(dPtr), 1);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp()), i, 1);
    cc->jnz(L_Loop1);

    cc->bind(L_Exit);
    return;
  }

  // 4+ pixels at a time [with alignment].
  if (maxPixels() == 4 && minAlignment() != 1) {
    Label L_Loop1 = cc->newLabel();
    Label L_Loop4 = cc->newLabel();
    Label L_Aligned = cc->newLabel();
    Label L_Exit = cc->newLabel();

    // Lead-in: process single pixels until `dPtr` is aligned.
    pc->uTest(dPtr, alignmentMask);
    cc->jz(L_Aligned);

    prefetch1();

    cc->bind(L_Loop1);
    cMaskCompositeAndStore(x86::ptr(dPtr), 1);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp()), i, 1);
    cc->jz(L_Exit);

    pc->uTest(dPtr, alignmentMask);
    cc->jnz(L_Loop1);

    cc->bind(L_Aligned);
    cc->cmp(i, 4);
    cc->jb(L_Loop1);

    cc->sub(i, 4);
    dstPart()->as<FetchPixelPtrPart>()->setPtrAlignment(16);

    enterN();
    prefetchN();

    // Main aligned 4-pixel loop (16-byte aligned stores).
    cc->bind(L_Loop4);
    cMaskCompositeAndStore(x86::ptr(dPtr), 4, 16);
    cc->add(dPtr, int(dstPart()->bpp() * 4));
    cc->sub(i, 4);
    cc->jnc(L_Loop4);

    postfetchN();
    leaveN();
    dstPart()->as<FetchPixelPtrPart>()->setPtrAlignment(0);

    prefetch1();

    // Tail: finish the remainder 1 pixel at a time.
    cc->add(i, 4);
    cc->jnz(L_Loop1);

    cc->bind(L_Exit);
    return;
  }

  // 8+ pixels at a time [no alignment].
  if (maxPixels() == 8 && minAlignment() == 1) {
    Label L_Loop1 = cc->newLabel();
    Label L_Loop4 = cc->newLabel();
    Label L_Loop8 = cc->newLabel();
    Label L_Skip4 = cc->newLabel();
    Label L_Skip8 = cc->newLabel();
    Label L_Init1 = cc->newLabel();
    Label L_Exit = cc->newLabel();

    // `i` is decremented in two steps of 4 so the remainder after the
    // 8-pixel loop can fall into a single 4-pixel iteration.
    cc->sub(i, 4);
    cc->jc(L_Skip4);

    enterN();
    prefetchN();

    cc->sub(i, 4);
    cc->jc(L_Skip8);

    cc->bind(L_Loop8);
    cMaskCompositeAndStore(x86::ptr(dPtr), 8);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp() * 8), i, 8);
    cc->jnc(L_Loop8);

    cc->bind(L_Skip8);
    cc->add(i, 4);
    cc->jnc(L_Init1);

    cc->bind(L_Loop4);
    cMaskCompositeAndStore(x86::ptr(dPtr), 4);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp() * 4), i, 4);
    cc->jnc(L_Loop4);

    cc->bind(L_Init1);
    postfetchN();
    leaveN();

    // Tail: finish the remainder 1 pixel at a time.
    cc->bind(L_Skip4);
    prefetch1();
    cc->add(i, 4);
    cc->jz(L_Exit);

    cc->bind(L_Loop1);
    cMaskCompositeAndStore(x86::ptr(dPtr), 1);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp()), i, 1);
    cc->jnz(L_Loop1);

    cc->bind(L_Exit);
    return;
  }

  // 8+ pixels at a time [with alignment].
  if (maxPixels() == 8 && minAlignment() != 1) {
    Label L_Loop1 = cc->newLabel();
    Label L_Loop8 = cc->newLabel();
    Label L_Skip8 = cc->newLabel();
    Label L_Skip4 = cc->newLabel();
    Label L_Aligned = cc->newLabel();
    Label L_Exit = cc->newLabel();

    // Lead-in: process single pixels until `dPtr` is aligned.
    cc->test(dPtr.r8(), alignmentMask);
    cc->jz(L_Aligned);

    prefetch1();

    cc->bind(L_Loop1);
    cMaskCompositeAndStore(x86::ptr(dPtr), 1);
    pc->uAdvanceAndDecrement(dPtr, int(dstPart()->bpp()), i, 1);
    cc->jz(L_Exit);

    cc->test(dPtr.r8(), alignmentMask);
    cc->jnz(L_Loop1);

    cc->bind(L_Aligned);
    cc->cmp(i, 4);
    cc->jb(L_Loop1);

    dstPart()->as<FetchPixelPtrPart>()->setPtrAlignment(16);
    enterN();
    prefetchN();

    cc->sub(i, 8);
    cc->jc(L_Skip8);

    // Main aligned 8-pixel loop.
    cc->bind(L_Loop8);
    cMaskCompositeAndStore(x86::ptr(dPtr), 8, minAlignment());
    cc->add(dPtr, int(dstPart()->bpp() * 8));
    cc->sub(i, 8);
    cc->jnc(L_Loop8);

    // Optional single 4-pixel iteration for a remainder of 4..7.
    cc->bind(L_Skip8);
    cc->add(i, 4);
    cc->jnc(L_Skip4);

    cMaskCompositeAndStore(x86::ptr(dPtr), 4, minAlignment());
    cc->add(dPtr, int(dstPart()->bpp() * 4));
    cc->sub(i, 4);
    cc->bind(L_Skip4);

    postfetchN();
    leaveN();
    dstPart()->as<FetchPixelPtrPart>()->setPtrAlignment(0);

    prefetch1();

    // Tail: finish the remainder 1 pixel at a time.
    cc->add(i, 4);
    cc->jnz(L_Loop1);

    cc->bind(L_Exit);
    return;
  }

  BL_NOT_REACHED();
}
759
760 // ============================================================================
761 // [BLPipeGen::CompOpPart - CMask - Granular Loop]
762 // ============================================================================
763
// Emits a constant-mask loop over `i` pixels where `i` is a multiple of the
// pixel granularity. Degenerates to memset/memcpy for opaque SrcCopy.
void CompOpPart::cMaskGranularLoop(x86::Gp& i) noexcept {
  if (isLoopOpaque() && shouldJustCopyOpaqueFill()) {
    cMaskMemcpyOrMemsetLoop(i);
    return;
  }

  cMaskGranularLoopXmm(i);
}
772
// Emits the XMM constant-mask loop for a granular count: `i` is always a
// multiple of the pixel granularity (currently only 4 is supported), so no
// per-pixel alignment lead-in/tail handling is needed.
void CompOpPart::cMaskGranularLoopXmm(x86::Gp& i) noexcept {
  BL_ASSERT(pixelGranularity() == 4);

  x86::Gp dPtr = dstPart()->as<FetchPixelPtrPart>()->ptr();
  if (pixelGranularity() == 4) {
    // 1 pixel at a time: fetch a quad in partial mode, then composite its
    // pixels one by one until a 4-pixel boundary is reached.
    if (maxPixels() == 1) {
      Label L_Loop = cc->newLabel();
      Label L_Step = cc->newLabel();

      cc->bind(L_Loop);
      enterPartialMode();

      cc->bind(L_Step);
      cMaskCompositeAndStore(x86::ptr(dPtr), 1);
      cc->sub(i, 1);
      cc->add(dPtr, int(dstPart()->bpp()));
      nextPartialPixel();

      // Inner loop runs until `i` hits a multiple of 4.
      cc->test(i, 0x3);
      cc->jnz(L_Step);

      exitPartialMode();

      cc->test(i, i);
      cc->jnz(L_Loop);

      return;
    }

    // 4+ pixels at a time.
    if (maxPixels() == 4) {
      Label L_Loop = cc->newLabel();

      cc->bind(L_Loop);
      cMaskCompositeAndStore(x86::ptr(dPtr), 4);
      cc->add(dPtr, int(dstPart()->bpp() * 4));
      cc->sub(i, 4);
      cc->jnz(L_Loop);

      return;
    }

    // 8+ pixels at a time.
    if (maxPixels() == 8) {
      Label L_Loop = cc->newLabel();
      Label L_Skip = cc->newLabel();
      Label L_End = cc->newLabel();

      cc->sub(i, 8);
      cc->jc(L_Skip);

      cc->bind(L_Loop);
      cMaskCompositeAndStore(x86::ptr(dPtr), 8);
      cc->add(dPtr, int(dstPart()->bpp() * 8));
      cc->sub(i, 8);
      cc->jnc(L_Loop);

      cc->bind(L_Skip);
      cc->add(i, 8);
      cc->jz(L_End);

      // 4 remaining pixels.
      cMaskCompositeAndStore(x86::ptr(dPtr), 4);
      cc->add(dPtr, int(dstPart()->bpp() * 4));

      cc->bind(L_End);
      return;
    }
  }

  BL_NOT_REACHED();
}
846
847 // ============================================================================
848 // [BLPipeGen::CompOpPart - CMask - MemCpy / MemSet Loop]
849 // ============================================================================
850
// Emits a loop that fills (solid source) or copies (same-format blit) `i`
// pixels directly, bypassing composition. Only valid when
// `shouldJustCopyOpaqueFill()` is true.
void CompOpPart::cMaskMemcpyOrMemsetLoop(x86::Gp& i) noexcept {
  BL_ASSERT(shouldJustCopyOpaqueFill());
  x86::Gp dPtr = dstPart()->as<FetchPixelPtrPart>()->ptr();

  if (srcPart()->isSolid()) {
    // Optimized solid opaque fill -> MemSet.
    BL_ASSERT(_solidOpt.px.isValid());
    pc->xInlinePixelFillLoop(dPtr, _solidOpt.px, i, 64, dstPart()->bpp(), pixelGranularity());
  }
  else if (srcPart()->isFetchType(BL_PIPE_FETCH_TYPE_PATTERN_AA_BLIT)) {
    // Optimized solid opaque blit -> MemCopy.
    pc->xInlinePixelCopyLoop(dPtr, srcPart()->as<FetchSimplePatternPart>()->f->srcp1, i, 64, dstPart()->bpp(), pixelGranularity(), dstPart()->format());
  }
  else {
    BL_NOT_REACHED();
  }
}
868
869 // ============================================================================
870 // [BLPipeGen::CompOpPart - CMask - Composition Helpers]
871 // ============================================================================
872
// Composites `n` pixels under the active constant mask and stores the
// result to `dPtr_`. For RGBA, pixels are stored packed (4 bytes each);
// `alignment` is forwarded to the 16-byte vector stores. For A8, one pixel
// is a scalar byte store, 4/8 pixels are packed 32/64-bit stores.
void CompOpPart::cMaskCompositeAndStore(const x86::Mem& dPtr_, uint32_t n, uint32_t alignment) noexcept {
  Pixel dPix(pixelType());
  x86::Mem dPtr(dPtr_);

  switch (pixelType()) {
    case Pixel::kTypeRGBA: {
      switch (n) {
        case 1:
          cMaskProcRGBA32Xmm(dPix, 1, Pixel::kPC | Pixel::kImmutable);
          pc->vstorei32(dPtr, dPix.pc[0]);
          break;

        case 4:
          cMaskProcRGBA32Xmm(dPix, 4, Pixel::kPC | Pixel::kImmutable);
          pc->vstorei128x(dPtr, dPix.pc[0], alignment);
          break;

        case 8:
          // 8 packed RGBA32 pixels take two 16-byte stores. The second
          // store reuses pc[0] when the processor returned a single
          // register - presumably both halves are identical then; verify
          // against cMaskProcRGBA32Xmm.
          cMaskProcRGBA32Xmm(dPix, 8, Pixel::kPC | Pixel::kImmutable);
          pc->vstorei128x(dPtr, dPix.pc[0], alignment);
          dPtr.addOffset(16);
          pc->vstorei128x(dPtr, dPix.pc[dPix.pc.size() > 1 ? 1 : 0], alignment);
          break;

        default:
          BL_NOT_REACHED();
      }
      break;
    }

    case Pixel::kTypeAlpha: {
      switch (n) {
        case 1:
          cMaskProcA8Gp(dPix, Pixel::kSA | Pixel::kImmutable);
          pc->store8(dPtr, dPix.sa);
          break;

        case 4:
          cMaskProcA8Xmm(dPix, 4, Pixel::kPA | Pixel::kImmutable);
          pc->vstorei32(dPtr, dPix.pa[0]);
          break;

        case 8:
          cMaskProcA8Xmm(dPix, 8, Pixel::kPA | Pixel::kImmutable);
          pc->vstorei64(dPtr, dPix.pa[0]);
          break;

        default:
          BL_NOT_REACHED();
      }
      break;
    }

    default:
      BL_NOT_REACHED();
  }
}
930
931 // ============================================================================
932 // [BLPipeGen::CompOpPart - VMask - Composition Helpers]
933 // ============================================================================
934
// Composites a single pixel under a variable (per-pixel) scalar mask `msk`.
// For RGBA the scalar mask is moved into a vector and the low 16-bit lane
// is replicated across the low 4 words before delegating to the vector
// implementation; for A8 the scalar path is used directly. `mImmutable`
// tells the callee whether `msk` may be clobbered.
void CompOpPart::vMaskProc(Pixel& out, uint32_t flags, x86::Gp& msk, bool mImmutable) noexcept {
  switch (pixelType()) {
    case Pixel::kTypeRGBA: {
      x86::Vec vm = cc->newXmm("c.vm");
      pc->vmovsi32(vm, msk);
      pc->vswizli16(vm, vm, x86::Predicate::shuf(0, 0, 0, 0));

      VecArray vm_(vm);
      vMaskProcRGBA32Xmm(out, 1, flags, vm_, false);
      break;
    }

    case Pixel::kTypeAlpha: {
      vMaskProcA8Gp(out, flags, msk, mImmutable);
      break;
    }

    default:
      BL_NOT_REACHED();
  }
}
956
957 // ============================================================================
958 // [BLPipeGen::CompOpPart - CMask - Init / Fini - A8]
959 // ============================================================================
960
cMaskInitA8(const x86::Gp & sm_,const x86::Vec & vm_)961 void CompOpPart::cMaskInitA8(const x86::Gp& sm_, const x86::Vec& vm_) noexcept {
962 x86::Gp sm(sm_);
963 x86::Vec vm(vm_);
964
965 bool hasMask = sm.isValid() || vm.isValid();
966 if (hasMask) {
967 // SM must be 32-bit, so make it 32-bit if it's 64-bit for any reason.
968 if (sm.isValid())
969 sm = sm.r32();
970
971 if (vm.isValid() && !sm.isValid()) {
972 sm = cc->newUInt32("sm");
973 pc->vextractu16(vm, sm, 0);
974 }
975
976 _mask->sm = sm;
977 _mask->vm = vm;
978 }
979
980 if (srcPart()->isSolid()) {
981 Pixel& s = srcPart()->as<FetchSolidPart>()->_pixel;
982 SolidPixel& o = _solidOpt;
983 bool convertToVec = true;
984
985 // ------------------------------------------------------------------------
986 // [CMaskInit - A8 - Solid - SrcCopy]
987 // ------------------------------------------------------------------------
988
989 if (compOp() == BL_COMP_OP_SRC_COPY) {
990 if (!hasMask) {
991 // Xa = Sa
992 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
993 o.sa = s.sa;
994
995 if (maxPixels() > 1) {
996 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kPA);
997 o.px = s.pa[0];
998 }
999
1000 convertToVec = false;
1001 }
1002 else {
1003 // Xa = (Sa * m) + 0.5 <Rounding>
1004 // Ya = (1 - m)
1005 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1006
1007 o.sx = cc->newUInt32("p.sx");
1008 o.sy = sm;
1009
1010 pc->uMul(o.sx, s.sa, o.sy);
1011 pc->uAdd(o.sx, o.sx, imm(0x80));
1012 pc->uInv8(o.sy, o.sy);
1013 }
1014 }
1015
1016 // ------------------------------------------------------------------------
1017 // [CMaskInit - A8 - Solid - SrcOver]
1018 // ------------------------------------------------------------------------
1019
1020 else if (compOp() == BL_COMP_OP_SRC_OVER) {
1021 if (!hasMask) {
1022 // Xa = Sa * 1 + 0.5 <Rounding>
1023 // Ya = 1 - Sa
1024 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1025
1026 o.sx = cc->newUInt32("p.sx");
1027 o.sy = sm;
1028
1029 pc->uMov(o.sx, s.sa);
1030 cc->shl(o.sx, 8);
1031 pc->uSub(o.sx, o.sx, s.sa);
1032 pc->uInv8(o.sy, o.sy);
1033 }
1034 else {
1035 // Xa = Sa * m + 0.5 <Rounding>
1036 // Ya = 1 - (Sa * m)
1037 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1038
1039 o.sx = cc->newUInt32("p.sx");
1040 o.sy = sm;
1041
1042 pc->uMul(o.sy, sm, s.sa);
1043 pc->uDiv255(o.sy, o.sy);
1044
1045 pc->uShl(o.sx, o.sy, imm(8));
1046 pc->uSub(o.sx, o.sx, o.sy);
1047 pc->uAdd(o.sx, o.sx, imm(0x80));
1048 pc->uInv8(o.sy, o.sy);
1049 }
1050 }
1051
1052 // ------------------------------------------------------------------------
1053 // [CMaskInit - A8 - Solid - SrcIn]
1054 // ------------------------------------------------------------------------
1055
1056 else if (compOp() == BL_COMP_OP_SRC_IN) {
1057 if (!hasMask) {
1058 // Xa = Sa
1059 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1060
1061 o.sx = s.sa;
1062 if (maxPixels() > 1) {
1063 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA);
1064 o.ux = s.ua[0];
1065 }
1066 }
1067 else {
1068 // Xa = Sa * m + (1 - m)
1069 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1070
1071 o.sx = cc->newUInt32("o.sx");
1072 pc->uMul(o.sx, s.sa, sm);
1073 pc->uDiv255(o.sx, o.sx);
1074 pc->uInv8(sm, sm);
1075 pc->uAdd(o.sx, o.sx, sm);
1076 }
1077 }
1078
1079 // ------------------------------------------------------------------------
1080 // [CMaskInit - A8 - Solid - SrcOut]
1081 // ------------------------------------------------------------------------
1082
1083 else if (compOp() == BL_COMP_OP_SRC_OUT) {
1084 if (!hasMask) {
1085 // Xa = Sa
1086 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1087
1088 o.sx = s.sa;
1089 if (maxPixels() > 1) {
1090 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA);
1091 o.ux = s.ua[0];
1092 }
1093 }
1094 else {
1095 // Xa = Sa * m
1096 // Ya = 1 - m
1097 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1098
1099 o.sx = cc->newUInt32("o.sx");
1100 o.sy = sm;
1101
1102 pc->uMul(o.sx, s.sa, o.sy);
1103 pc->uDiv255(o.sx, o.sx);
1104 pc->uInv8(o.sy, o.sy);
1105 }
1106 }
1107
1108 // ------------------------------------------------------------------------
1109 // [CMaskInit - A8 - Solid - DstOut]
1110 // ------------------------------------------------------------------------
1111
1112 else if (compOp() == BL_COMP_OP_DST_OUT) {
1113 if (!hasMask) {
1114 // Xa = 1 - Sa
1115 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1116
1117 o.sx = cc->newUInt32("o.sx");
1118 pc->uInv8(o.sx, s.sa);
1119
1120 if (maxPixels() > 1) {
1121 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUIA);
1122 o.ux = s.uia[0];
1123 }
1124 }
1125 else {
1126 // Xa = 1 - (Sa * m)
1127 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1128
1129 o.sx = sm;
1130 pc->uMul(o.sx, sm, s.sa);
1131 pc->uDiv255(o.sx, o.sx);
1132 pc->uInv8(o.sx, o.sx);
1133 }
1134 }
1135
1136 // ------------------------------------------------------------------------
1137 // [CMaskInit - A8 - Solid - Xor]
1138 // ------------------------------------------------------------------------
1139
1140 else if (compOp() == BL_COMP_OP_XOR) {
1141 if (!hasMask) {
1142 // Xa = Sa
1143 // Ya = 1 - Xa (SIMD only)
1144 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1145 o.sx = s.sa;
1146
1147 if (maxPixels() > 1) {
1148 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA | Pixel::kUIA);
1149
1150 o.ux = s.ua[0];
1151 o.uy = s.uia[0];
1152 }
1153 }
1154 else {
1155 // Xa = Sa * m
1156 // Ya = 1 - Xa (SIMD only)
1157 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1158
1159 o.sx = cc->newUInt32("o.sx");
1160 pc->uMul(o.sx, sm, s.sa);
1161 pc->uDiv255(o.sx, o.sx);
1162
1163 if (maxPixels() > 1) {
1164 o.ux = cc->newXmm("o.ux");
1165 o.uy = cc->newXmm("o.uy");
1166 pc->vbroadcast_u16(o.ux, o.sx);
1167 pc->vinv255u16(o.uy, o.ux);
1168 }
1169 }
1170 }
1171
1172 // ------------------------------------------------------------------------
1173 // [CMaskInit - A8 - Solid - Plus]
1174 // ------------------------------------------------------------------------
1175
1176 else if (compOp() == BL_COMP_OP_PLUS) {
1177 if (!hasMask) {
1178 // Xa = Sa
1179 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA | Pixel::kPA);
1180 o.sa = s.sa;
1181 o.px = s.pa[0];
1182 convertToVec = false;
1183 }
1184 else {
1185 // Xca = Sca * m
1186 // Xa = Sa * m
1187 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kSA);
1188 o.sx = sm;
1189 pc->uMul(o.sx, o.sx, s.sa);
1190 pc->uDiv255(o.sx, o.sx);
1191
1192 if (maxPixels() > 1) {
1193 o.px = cc->newXmm("o.px");
1194 pc->uMul(o.sx, o.sx, 0x01010101);
1195 pc->vbroadcast_u32(o.px, o.sx);
1196 pc->uShr(o.sx, o.sx, imm(24));
1197 }
1198
1199 convertToVec = false;
1200 }
1201 }
1202
1203 // ------------------------------------------------------------------------
1204 // [CMaskInit - A8 - Solid - Extras]
1205 // ------------------------------------------------------------------------
1206
1207 if (convertToVec && maxPixels() > 1) {
1208 if (o.sx.isValid() && !o.ux.isValid()) {
1209 o.ux = cc->newXmm("p.ux");
1210 pc->vbroadcast_u16(o.ux, o.sx);
1211 }
1212
1213 if (o.sy.isValid() && !o.uy.isValid()) {
1214 o.uy = cc->newXmm("p.uy");
1215 pc->vbroadcast_u16(o.uy, o.sy);
1216 }
1217 }
1218 }
1219 else {
1220 if (sm.isValid() && !vm.isValid() && maxPixels() > 1) {
1221 vm = cc->newXmm("vm");
1222 pc->vbroadcast_u16(vm, sm);
1223 _mask->vm = vm;
1224 }
1225
1226 /*
1227 // ------------------------------------------------------------------------
1228 // [CMaskInit - A8 - NonSolid - SrcCopy]
1229 // ------------------------------------------------------------------------
1230
1231 if (compOp() == BL_COMP_OP_SRC_COPY) {
1232 if (hasMask) {
1233 x86::Xmm vn = cc->newXmm("vn");
1234 pc->vinv255u16(vn, m);
1235 _mask->vec.vn = vn;
1236 }
1237 }
1238 */
1239 }
1240
1241 _cMaskLoopInit(hasMask ? kCMaskLoopTypeMask : kCMaskLoopTypeOpaque);
1242 }
1243
cMaskFiniA8()1244 void CompOpPart::cMaskFiniA8() noexcept {
1245 if (srcPart()->isSolid()) {
1246 _solidOpt.reset();
1247 _solidPre.reset();
1248 }
1249 else {
1250 // TODO: [PIPEGEN] ???
1251 }
1252
1253 _mask->reset();
1254 _cMaskLoopFini();
1255 }
1256
1257 // ============================================================================
1258 // [BLPipeGen::CompOpPart - CMask - Proc - A8]
1259 // ============================================================================
1260
cMaskProcA8Gp(Pixel & out,uint32_t flags)1261 void CompOpPart::cMaskProcA8Gp(Pixel& out, uint32_t flags) noexcept {
1262 out.setCount(1);
1263
1264 bool hasMask = isLoopCMask();
1265
1266 if (srcPart()->isSolid()) {
1267 Pixel d(pixelType());
1268 SolidPixel& o = _solidOpt;
1269
1270 x86::Gp& da = d.sa;
1271 x86::Gp sx = cc->newUInt32("sx");
1272
1273 // ------------------------------------------------------------------------
1274 // [CMaskProc - A8 - SrcCopy]
1275 // ------------------------------------------------------------------------
1276
1277 if (compOp() == BL_COMP_OP_SRC_COPY) {
1278 if (!hasMask) {
1279 // Da' = Xa
1280 out.sa = o.sa;
1281 out.makeImmutable();
1282 }
1283 else {
1284 // Da' = Xa + Da .(1 - m)
1285 dstFetch(d, Pixel::kSA, 1);
1286
1287 pc->uMul(da, da, o.sy),
1288 pc->uAdd(da, da, o.sx);
1289 pc->uMul257hu16(da, da);
1290
1291 out.sa = da;
1292 }
1293
1294 pc->xSatisfyPixel(out, flags);
1295 return;
1296 }
1297
1298 // ------------------------------------------------------------------------
1299 // [CMaskProc - A8 - SrcOver]
1300 // ------------------------------------------------------------------------
1301
1302 if (compOp() == BL_COMP_OP_SRC_OVER) {
1303 // Da' = Xa + Da .Ya
1304 dstFetch(d, Pixel::kSA, 1);
1305
1306 pc->uMul(da, da, o.sy);
1307 pc->uAdd(da, da, o.sx);
1308 pc->uMul257hu16(da, da);
1309
1310 out.sa = da;
1311
1312 pc->xSatisfyPixel(out, flags);
1313 return;
1314 }
1315
1316 // ------------------------------------------------------------------------
1317 // [CMaskProc - A8 - SrcIn / DstOut]
1318 // ------------------------------------------------------------------------
1319
1320 if (compOp() == BL_COMP_OP_SRC_IN || compOp() == BL_COMP_OP_DST_OUT) {
1321 // Da' = Xa.Da
1322 dstFetch(d, Pixel::kSA, 1);
1323
1324 pc->uMul(da, da, o.sx);
1325 pc->uDiv255(da, da);
1326 out.sa = da;
1327
1328 pc->xSatisfyPixel(out, flags);
1329 return;
1330 }
1331
1332 // ------------------------------------------------------------------------
1333 // [CMaskProc - A8 - SrcOut]
1334 // ------------------------------------------------------------------------
1335
1336 if (compOp() == BL_COMP_OP_SRC_OUT) {
1337 if (!hasMask) {
1338 // Da' = Xa.(1 - Da)
1339 dstFetch(d, Pixel::kSA, 1);
1340
1341 pc->uInv8(da, da);
1342 pc->uMul(da, da, o.sx);
1343 pc->uDiv255(da, da);
1344 out.sa = da;
1345 }
1346 else {
1347 // Da' = Xa.(1 - Da) + Da.Ya
1348 dstFetch(d, Pixel::kSA, 1);
1349
1350 pc->uInv8(sx, da);
1351 pc->uMul(sx, sx, o.sx);
1352 pc->uMul(da, da, o.sy);
1353 pc->uAdd(da, da, sx);
1354 pc->uDiv255(da, da);
1355 out.sa = da;
1356 }
1357
1358 pc->xSatisfyPixel(out, flags);
1359 return;
1360 }
1361
1362 // ------------------------------------------------------------------------
1363 // [CMaskProc - A8 - Xor]
1364 // ------------------------------------------------------------------------
1365
1366 if (compOp() == BL_COMP_OP_XOR) {
1367 // Da' = Xa.(1 - Da) + Da.Ya
1368 dstFetch(d, Pixel::kSA, 1);
1369
1370 pc->uMul(sx, da, o.sy);
1371 pc->uInv8(da, da);
1372 pc->uMul(da, da, o.sx);
1373 pc->uAdd(da, da, sx);
1374 pc->uDiv255(da, da);
1375 out.sa = da;
1376
1377 pc->xSatisfyPixel(out, flags);
1378 return;
1379 }
1380
1381 // ------------------------------------------------------------------------
1382 // [CMaskProc - A8 - Plus]
1383 // ------------------------------------------------------------------------
1384
1385 if (compOp() == BL_COMP_OP_PLUS) {
1386 // Da' = Clamp(Da + Xa)
1387 dstFetch(d, Pixel::kSA, 1);
1388
1389 pc->uAddsU8(da, da, o.sx);
1390 out.sa = da;
1391
1392 pc->xSatisfyPixel(out, flags);
1393 return;
1394 }
1395 }
1396
1397 vMaskProcA8Gp(out, flags, _mask->sm, true);
1398 }
1399
cMaskProcA8Xmm(Pixel & out,uint32_t n,uint32_t flags)1400 void CompOpPart::cMaskProcA8Xmm(Pixel& out, uint32_t n, uint32_t flags) noexcept {
1401 out.setCount(n);
1402
1403 bool hasMask = isLoopCMask();
1404
1405 if (srcPart()->isSolid()) {
1406 Pixel d(pixelType());
1407 SolidPixel& o = _solidOpt;
1408
1409 uint32_t kFullN = (n + 7) / 8;
1410
1411 VecArray& da = d.ua;
1412 VecArray xa;
1413 pc->newXmmArray(xa, kFullN, "x");
1414
1415 // ------------------------------------------------------------------------
1416 // [CMaskProc - A8 - SrcCopy]
1417 // ------------------------------------------------------------------------
1418
1419 if (compOp() == BL_COMP_OP_SRC_COPY) {
1420 if (!hasMask) {
1421 // Da' = Xa
1422 out.pa.init(o.px);
1423 out.makeImmutable();
1424 }
1425 else {
1426 // Da' = Xa + Da .(1 - m)
1427 dstFetch(d, Pixel::kUA, n);
1428
1429 pc->vmuli16(da, da, o.uy),
1430 pc->vaddi16(da, da, o.ux);
1431 pc->vmul257hu16(da, da);
1432
1433 out.ua.init(da);
1434 }
1435
1436 pc->xSatisfyPixel(out, flags);
1437 return;
1438 }
1439
1440 // ------------------------------------------------------------------------
1441 // [CMaskProc - A8 - SrcOver]
1442 // ------------------------------------------------------------------------
1443
1444 if (compOp() == BL_COMP_OP_SRC_OVER) {
1445 // Da' = Xa + Da.Ya
1446 dstFetch(d, Pixel::kUA, n);
1447
1448 pc->vmuli16(da, da, o.uy);
1449 pc->vaddi16(da, da, o.ux);
1450 pc->vmul257hu16(da, da);
1451
1452 out.ua.init(da);
1453
1454 pc->xSatisfyPixel(out, flags);
1455 return;
1456 }
1457
1458 // ------------------------------------------------------------------------
1459 // [CMaskProc - A8 - SrcIn / DstOut]
1460 // ------------------------------------------------------------------------
1461
1462 if (compOp() == BL_COMP_OP_SRC_IN || compOp() == BL_COMP_OP_DST_OUT) {
1463 // Da' = Xa.Da
1464 dstFetch(d, Pixel::kUA, n);
1465
1466 pc->vmulu16(da, da, o.ux);
1467 pc->vdiv255u16(da);
1468 out.ua.init(da);
1469
1470 pc->xSatisfyPixel(out, flags);
1471 return;
1472 }
1473
1474 // ------------------------------------------------------------------------
1475 // [CMaskProc - A8 - SrcOut]
1476 // ------------------------------------------------------------------------
1477
1478 if (compOp() == BL_COMP_OP_SRC_OUT) {
1479 if (!hasMask) {
1480 // Da' = Xa.(1 - Da)
1481 dstFetch(d, Pixel::kUA, n);
1482
1483 pc->vinv255u16(da, da);
1484 pc->vmulu16(da, da, o.ux);
1485 pc->vdiv255u16(da);
1486 out.ua.init(da);
1487 }
1488 else {
1489 // Da' = Xa.(1 - Da) + Da.Ya
1490 dstFetch(d, Pixel::kUA, n);
1491
1492 pc->vinv255u16(xa, da);
1493 pc->vmulu16(xa, xa, o.ux);
1494 pc->vmulu16(da, da, o.uy);
1495 pc->vaddi16(da, da, xa);
1496 pc->vdiv255u16(da);
1497 out.ua.init(da);
1498 }
1499
1500 pc->xSatisfyPixel(out, flags);
1501 return;
1502 }
1503
1504 // ------------------------------------------------------------------------
1505 // [CMaskProc - A8 - Xor]
1506 // ------------------------------------------------------------------------
1507
1508 if (compOp() == BL_COMP_OP_XOR) {
1509 // Da' = Xa.(1 - Da) + Da.Ya
1510 dstFetch(d, Pixel::kUA, n);
1511
1512 pc->vmulu16(xa, da, o.uy);
1513 pc->vinv255u16(da, da);
1514 pc->vmulu16(da, da, o.ux);
1515 pc->vaddi16(da, da, xa);
1516 pc->vdiv255u16(da);
1517 out.ua.init(da);
1518
1519 pc->xSatisfyPixel(out, flags);
1520 return;
1521 }
1522
1523 // ------------------------------------------------------------------------
1524 // [CMaskProc - A8 - Plus]
1525 // ------------------------------------------------------------------------
1526
1527 if (compOp() == BL_COMP_OP_PLUS) {
1528 // Da' = Clamp(Da + Xa)
1529 dstFetch(d, Pixel::kPA, n);
1530
1531 pc->vaddsu8(d.pa, d.pa, o.px);
1532 out.pa.init(d.pa);
1533
1534 pc->xSatisfyPixel(out, flags);
1535 return;
1536 }
1537 }
1538
1539 VecArray vm;
1540 if (_mask->vm.isValid())
1541 vm.init(_mask->vm);
1542 vMaskProcA8Xmm(out, n, flags, vm, true);
1543 }
1544
1545 // ============================================================================
1546 // [BLPipeGen::CompOpPart - VMask Proc - A8 (Scalar)]
1547 // ============================================================================
1548
// Composites a single A8 pixel with a variable (per-pixel) scalar mask using
// general-purpose registers. Dispatches on the composition operator; each
// branch documents the alpha formula it implements (Da = destination alpha,
// Sa = source alpha, m = mask).
//
// @param out Output pixel; receives the resulting alpha in `out.sa`.
// @param flags Pixel flags forwarded to `xSatisfyPixel()`.
// @param msk Scalar mask register; an invalid register means "no mask".
// @param mImmutable If true, `msk` must hold its original value on return,
//   so any in-place inversion of the mask is undone before returning.
void CompOpPart::vMaskProcA8Gp(Pixel& out, uint32_t flags, x86::Gp& msk, bool mImmutable) noexcept {
  bool hasMask = msk.isValid();

  Pixel d(Pixel::kTypeAlpha);
  Pixel s(Pixel::kTypeAlpha);

  // Scratch registers used by the two-term formulas below.
  x86::Gp x = cc->newUInt32("@x");
  x86::Gp y = cc->newUInt32("@y");

  x86::Gp& da = d.sa;
  x86::Gp& sa = s.sa;

  out.setCount(1);

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcCopy]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_COPY) {
    if (!hasMask) {
      // Da' = Sa
      srcFetch(out, flags, 1);
    }
    else {
      // Da' = Sa.m + Da.(1 - m)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      // The mask is inverted in place to compute Da.(1 - m) ...
      pc->uMul(sa, sa, msk);
      pc->uInv8(msk, msk);
      pc->uMul(da, da, msk);

      // ... and restored here if the caller requires it intact.
      if (mImmutable)
        pc->uInv8(msk, msk);

      pc->uAdd(da, da, sa);
      pc->uDiv255(da, da);

      out.sa = da;
    }

    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcOver]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_OVER) {
    if (!hasMask) {
      // Da' = Sa + Da.(1 - Sa)
      srcFetch(s, Pixel::kSA | Pixel::kImmutable, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uInv8(x, sa);
      pc->uMul(da, da, x);
      pc->uDiv255(da, da);
      pc->uAdd(da, da, sa);
    }
    else {
      // Da' = Sa.m + Da.(1 - Sa.m)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uMul(sa, sa, msk);
      pc->uDiv255(sa, sa);
      pc->uInv8(x, sa);
      pc->uMul(da, da, x);
      pc->uDiv255(da, da);
      pc->uAdd(da, da, sa);
    }

    out.sa = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcIn]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_IN) {
    if (!hasMask) {
      // Da' = Sa.Da
      srcFetch(s, Pixel::kSA | Pixel::kImmutable, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uMul(da, da, sa);
      pc->uDiv255(da, da);
    }
    else {
      // Da' = Da.(Sa.m) + Da.(1 - m)
      //     = Da.(Sa.m + 1 - m)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      // (Sa.m + 255 - m) keeps the factor in 0..255 before the final division.
      pc->uMul(sa, sa, msk);
      pc->uDiv255(sa, sa);
      pc->uAdd(sa, sa, imm(255));
      pc->uSub(sa, sa, msk);
      pc->uMul(da, da, sa);
      pc->uDiv255(da, da);
    }

    out.sa = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcOut]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_OUT) {
    if (!hasMask) {
      // Da' = Sa.(1 - Da)
      srcFetch(s, Pixel::kSA | Pixel::kImmutable, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uInv8(da, da);
      pc->uMul(da, da, sa);
      pc->uDiv255(da, da);
    }
    else {
      // Da' = Sa.m.(1 - Da) + Da.(1 - m)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uMul(sa, sa, msk);
      pc->uDiv255(sa, sa);

      // The mask is inverted in place to compute Da.(1 - m) ...
      pc->uInv8(x, da);
      pc->uInv8(msk, msk);
      pc->uMul(sa, sa, x);
      pc->uMul(da, da, msk);

      // ... and restored here if the caller requires it intact.
      if (mImmutable)
        pc->uInv8(msk, msk);

      pc->uAdd(da, da, sa);
      pc->uDiv255(da, da);
    }

    out.sa = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - DstOut]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_DST_OUT) {
    if (!hasMask) {
      // Da' = Da.(1 - Sa)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uInv8(sa, sa);
      pc->uMul(da, da, sa);
      pc->uDiv255(da, da);
    }
    else {
      // Da' = Da.(1 - Sa.m)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uMul(sa, sa, msk);
      pc->uDiv255(sa, sa);
      pc->uInv8(sa, sa);
      pc->uMul(da, da, sa);
      pc->uDiv255(da, da);
    }

    out.sa = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Xor]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_XOR) {
    if (!hasMask) {
      // Da' = Da.(1 - Sa) + Sa.(1 - Da)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uInv8(y, sa);
      pc->uInv8(x, da);

      pc->uMul(da, da, y);
      pc->uMul(sa, sa, x);
      pc->uAdd(da, da, sa);
      pc->uDiv255(da, da);
    }
    else {
      // Da' = Da.(1 - Sa.m) + Sa.m.(1 - Da)
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      // Pre-multiply the source alpha by the mask, then apply the
      // unmasked Xor formula.
      pc->uMul(sa, sa, msk);
      pc->uDiv255(sa, sa);

      pc->uInv8(y, sa);
      pc->uInv8(x, da);

      pc->uMul(da, da, y);
      pc->uMul(sa, sa, x);
      pc->uAdd(da, da, sa);
      pc->uDiv255(da, da);
    }

    out.sa = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Plus]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_PLUS) {
    // Da' = Clamp(Da + Sa)
    // Da' = Clamp(Da + Sa.m)
    if (hasMask) {
      srcFetch(s, Pixel::kSA, 1);
      dstFetch(d, Pixel::kSA, 1);

      pc->uMul(sa, sa, msk);
      pc->uDiv255(sa, sa);
    }
    else {
      srcFetch(s, Pixel::kSA | Pixel::kImmutable, 1);
      dstFetch(d, Pixel::kSA, 1);
    }

    // Saturating add clamps the result to 255.
    pc->uAddsU8(da, da, sa);

    out.sa = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Invert]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_INTERNAL_ALPHA_INV) {
    // Da' = 1 - Da
    // Da' = Da.(1 - m) + (1 - Da).m
    if (hasMask) {
      dstFetch(d, Pixel::kSA, 1);
      pc->uInv8(x, msk);
      pc->uMul(x, x, da);
      pc->uInv8(da, da);
      pc->uMul(da, da, msk);
      pc->uAdd(da, da, x);
      pc->uDiv255(da, da);
    }
    else {
      dstFetch(d, Pixel::kSA, 1);
      pc->uInv8(da, da);
    }

    out.sa = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Invalid]
  // --------------------------------------------------------------------------

  // Every operator supported by the A8 pipeline must be handled above.
  BL_NOT_REACHED();
}
1827
1828 // ============================================================================
1829 // [BLPipeGen::CompOpPart - VMask - Proc - A8 (XMM)]
1830 // ============================================================================
1831
// Composites `n` A8 pixels with a variable (per-pixel) mask using XMM
// registers (`kFullN = (n + 7) / 8` registers of unpacked 16-bit alpha
// lanes). This is the SIMD counterpart of `vMaskProcA8Gp()` and implements
// the same per-operator alpha formulas (Da = destination alpha, Sa = source
// alpha, m = mask).
//
// @param out Output pixel; receives unpacked alphas in `out.ua` (packed
//   `out.pa` for Plus).
// @param n Number of pixels to process.
// @param flags Pixel flags forwarded to `xSatisfyPixel()`.
// @param vm Vector mask array; empty means "no mask".
// @param mImmutable If true, `vm` must hold its original value on return,
//   so any in-place inversion of the mask is undone before returning.
void CompOpPart::vMaskProcA8Xmm(Pixel& out, uint32_t n, uint32_t flags, VecArray& vm, bool mImmutable) noexcept {
  bool hasMask = !vm.empty();
  uint32_t kFullN = (n + 7) / 8;

  // Scratch vector arrays used by the two-term formulas below.
  VecArray xv, yv;
  pc->newXmmArray(xv, kFullN, "x");
  pc->newXmmArray(yv, kFullN, "y");

  Pixel d(Pixel::kTypeAlpha);
  Pixel s(Pixel::kTypeAlpha);

  VecArray& da = d.ua;
  VecArray& sa = s.ua;

  out.setCount(n);

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcCopy]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_COPY) {
    if (!hasMask) {
      // Da' = Sa
      srcFetch(out, flags, n);
    }
    else {
      // Da' = Sa.m + Da.(1 - m)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      // The mask is inverted in place to compute Da.(1 - m) ...
      pc->vmulu16(sa, sa, vm);
      pc->vinv255u16(vm, vm);
      pc->vmulu16(da, da, vm);

      // ... and restored here if the caller requires it intact.
      if (mImmutable)
        pc->vinv255u16(vm, vm);

      pc->vaddi16(da, da, sa);
      pc->vdiv255u16(da);

      out.ua = da;
    }

    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcOver]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_OVER) {
    if (!hasMask) {
      // Da' = Sa + Da.(1 - Sa)
      srcFetch(s, Pixel::kUA | Pixel::kImmutable, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vinv255u16(xv, sa);
      pc->vmulu16(da, da, xv);
      pc->vdiv255u16(da);
      pc->vaddi16(da, da, sa);
    }
    else {
      // Da' = Sa.m + Da.(1 - Sa.m)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vmulu16(sa, sa, vm);
      pc->vdiv255u16(sa);
      pc->vinv255u16(xv, sa);
      pc->vmulu16(da, da, xv);
      pc->vdiv255u16(da);
      pc->vaddi16(da, da, sa);
    }

    out.ua = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcIn]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_IN) {
    if (!hasMask) {
      // Da' = Sa.Da
      srcFetch(s, Pixel::kUA | Pixel::kImmutable, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vmulu16(da, da, sa);
      pc->vdiv255u16(da);
    }
    else {
      // Da' = Da.(Sa.m) + Da.(1 - m)
      //     = Da.(Sa.m + 1 - m)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      // (Sa.m + 255 - m) per 16-bit lane; 0x00FF is the vector constant 255.
      pc->vmulu16(sa, sa, vm);
      pc->vdiv255u16(sa);
      pc->vaddi16(sa, sa, C_MEM(i128_00FF00FF00FF00FF));
      pc->vsubi16(sa, sa, vm);
      pc->vmulu16(da, da, sa);
      pc->vdiv255u16(da);
    }

    out.ua = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - SrcOut]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_SRC_OUT) {
    if (!hasMask) {
      // Da' = Sa.(1 - Da)
      srcFetch(s, Pixel::kUA | Pixel::kImmutable, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vinv255u16(da, da);
      pc->vmulu16(da, da, sa);
      pc->vdiv255u16(da);
    }
    else {
      // Da' = Sa.m.(1 - Da) + Da.(1 - m)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vmulu16(sa, sa, vm);
      pc->vdiv255u16(sa);

      // The mask is inverted in place to compute Da.(1 - m) ...
      pc->vinv255u16(xv, da);
      pc->vinv255u16(vm, vm);
      pc->vmulu16(sa, sa, xv);
      pc->vmulu16(da, da, vm);

      // ... and restored here if the caller requires it intact.
      if (mImmutable)
        pc->vinv255u16(vm, vm);

      pc->vaddi16(da, da, sa);
      pc->vdiv255u16(da);
    }

    out.ua = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - DstOut]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_DST_OUT) {
    if (!hasMask) {
      // Da' = Da.(1 - Sa)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vinv255u16(sa, sa);
      pc->vmulu16(da, da, sa);
      pc->vdiv255u16(da);
    }
    else {
      // Da' = Da.(1 - Sa.m)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vmulu16(sa, sa, vm);
      pc->vdiv255u16(sa);
      pc->vinv255u16(sa, sa);
      pc->vmulu16(da, da, sa);
      pc->vdiv255u16(da);
    }

    out.ua = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Xor]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_XOR) {
    if (!hasMask) {
      // Da' = Da.(1 - Sa) + Sa.(1 - Da)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      pc->vinv255u16(yv, sa);
      pc->vinv255u16(xv, da);

      pc->vmulu16(da, da, yv);
      pc->vmulu16(sa, sa, xv);
      pc->vaddi16(da, da, sa);
      pc->vdiv255u16(da);
    }
    else {
      // Da' = Da.(1 - Sa.m) + Sa.m.(1 - Da)
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kUA, n);

      // Pre-multiply the source alpha by the mask, then apply the
      // unmasked Xor formula.
      pc->vmulu16(sa, sa, vm);
      pc->vdiv255u16(sa);

      pc->vinv255u16(yv, sa);
      pc->vinv255u16(xv, da);

      pc->vmulu16(da, da, yv);
      pc->vmulu16(sa, sa, xv);
      pc->vaddi16(da, da, sa);
      pc->vdiv255u16(da);
    }

    out.ua = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Plus]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_PLUS) {
    // Da' = Clamp(Da + Sa)
    // Da' = Clamp(Da + Sa.m)
    if (hasMask) {
      srcFetch(s, Pixel::kUA, n);
      dstFetch(d, Pixel::kPA, n);

      pc->vmulu16(sa, sa, vm);
      pc->vdiv255u16(sa);

      // Re-pack the masked 16-bit alphas to 8-bit for the saturating add.
      s.pa = sa.even();
      pc->vpacki16u8(s.pa, s.pa, sa.odd());
    }
    else {
      srcFetch(s, Pixel::kPA | Pixel::kImmutable, n);
      dstFetch(d, Pixel::kPA, n);
    }

    // Saturating add clamps each lane to 255.
    pc->vaddsu8(d.pa, d.pa, s.pa);
    out.pa = d.pa;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Invert]
  // --------------------------------------------------------------------------

  if (compOp() == BL_COMP_OP_INTERNAL_ALPHA_INV) {
    // Da' = 1 - Da
    // Da' = Da.(1 - m) + (1 - Da).m
    if (hasMask) {
      dstFetch(d, Pixel::kUA, n);
      pc->vinv255u16(xv, vm);
      pc->vmulu16(xv, xv, da);
      pc->vinv255u16(da, da);
      pc->vmulu16(da, da, vm);
      pc->vaddi16(da, da, xv);
      pc->vdiv255u16(da);
    }
    else {
      dstFetch(d, Pixel::kUA, n);
      pc->vinv255u16(da, da);
    }

    out.ua = da;
    pc->xSatisfyPixel(out, flags);
    return;
  }

  // --------------------------------------------------------------------------
  // [VMask - A8 - Invalid]
  // --------------------------------------------------------------------------

  // Every operator supported by the A8 pipeline must be handled above.
  BL_NOT_REACHED();
}
2114
2115 // ============================================================================
2116 // [BLPipeGen::CompOpPart - CMask - Init / Fini - RGBA]
2117 // ============================================================================
2118
cMaskInitRGBA32(const x86::Vec & vm)2119 void CompOpPart::cMaskInitRGBA32(const x86::Vec& vm) noexcept {
2120 bool hasMask = vm.isValid();
2121 bool useDa = hasDa();
2122
2123 if (srcPart()->isSolid()) {
2124 Pixel& s = srcPart()->as<FetchSolidPart>()->_pixel;
2125 SolidPixel& o = _solidOpt;
2126
2127 // ------------------------------------------------------------------------
2128 // [CMaskInit - RGBA32 - Solid - SrcCopy]
2129 // ------------------------------------------------------------------------
2130
2131 if (compOp() == BL_COMP_OP_SRC_COPY) {
2132 if (!hasMask) {
2133 // Xca = Sca
2134 // Xa = Sa
2135 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kPC);
2136 o.px = s.pc[0];
2137 }
2138 else {
2139 // Xca = (Sca * m) + 0.5 <Rounding>
2140 // Xa = (Sa * m) + 0.5 <Rounding>
2141 // Im = (1 - m)
2142 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2143
2144 o.ux = cc->newXmm("p.ux");
2145 o.vn = vm;
2146
2147 pc->vmulu16(o.ux, s.uc[0], o.vn);
2148 pc->vaddi16(o.ux, o.ux, pc->constAsXmm(blCommonTable.i128_0080008000800080));
2149 pc->vinv255u16(o.vn, o.vn);
2150 }
2151 }
2152
2153 // ------------------------------------------------------------------------
2154 // [CMaskInit - RGBA32 - Solid - SrcOver]
2155 // ------------------------------------------------------------------------
2156
2157 else if (compOp() == BL_COMP_OP_SRC_OVER) {
2158 if (!hasMask) {
2159 // Xca = Sca * 1 + 0.5 <Rounding>
2160 // Xa = Sa * 1 + 0.5 <Rounding>
2161 // Yca = 1 - Sa
2162 // Ya = 1 - Sa
2163 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kUIA | Pixel::kImmutable);
2164
2165 o.ux = cc->newXmm("p.ux");
2166 o.uy = s.uia[0];
2167
2168 pc->vslli16(o.ux, s.uc[0], 8);
2169 pc->vsubi16(o.ux, o.ux, s.uc[0]);
2170 pc->vaddi16(o.ux, o.ux, pc->constAsXmm(blCommonTable.i128_0080008000800080));
2171
2172 cc->alloc(o.uy);
2173 }
2174 else {
2175 // Xca = Sca * m + 0.5 <Rounding>
2176 // Xa = Sa * m + 0.5 <Rounding>
2177 // Yca = 1 - (Sa * m)
2178 // Ya = 1 - (Sa * m)
2179 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kImmutable);
2180
2181 o.ux = cc->newXmm("p.ux");
2182 o.uy = cc->newXmm("p.uy");
2183
2184 pc->vmulu16(o.uy, s.uc[0], vm);
2185 pc->vdiv255u16(o.uy);
2186
2187 pc->vslli16(o.ux, o.uy, 8);
2188 pc->vsubi16(o.ux, o.ux, o.uy);
2189 pc->vaddi16(o.ux, o.ux, pc->constAsXmm(blCommonTable.i128_0080008000800080));
2190
2191 pc->vswizli16(o.uy, o.uy, x86::Predicate::shuf(3, 3, 3, 3));
2192 pc->vswizhi16(o.uy, o.uy, x86::Predicate::shuf(3, 3, 3, 3));
2193 pc->vinv255u16(o.uy, o.uy);
2194 }
2195 }
2196
2197 // ------------------------------------------------------------------------
2198 // [CMaskInit - RGBA32 - Solid - SrcIn / SrcOut]
2199 // ------------------------------------------------------------------------
2200
2201 else if (compOp() == BL_COMP_OP_SRC_IN || compOp() == BL_COMP_OP_SRC_OUT) {
2202 if (!hasMask) {
2203 // Xca = Sca
2204 // Xa = Sa
2205 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2206
2207 o.ux = s.uc[0];
2208 cc->alloc(o.ux);
2209 }
2210 else {
2211 // Xca = Sca * m
2212 // Xa = Sa * m
2213 // Im = 1 - m
2214 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2215
2216 o.ux = cc->newXmm("o.uc0");
2217 o.vn = vm;
2218
2219 pc->vmulu16(o.ux, s.uc[0], vm);
2220 pc->vdiv255u16(o.ux);
2221 pc->vinv255u16(vm, vm);
2222 }
2223 }
2224
2225 // ------------------------------------------------------------------------
2226 // [CMaskInit - RGBA32 - Solid - SrcAtop / Xor / Darken / Lighten]
2227 // ------------------------------------------------------------------------
2228
2229 else if (compOp() == BL_COMP_OP_SRC_ATOP || compOp() == BL_COMP_OP_XOR || compOp() == BL_COMP_OP_DARKEN || compOp() == BL_COMP_OP_LIGHTEN) {
2230 if (!hasMask) {
2231 // Xca = Sca
2232 // Xa = Sa
2233 // Yca = 1 - Sa
2234 // Ya = 1 - Sa
2235 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kUIA);
2236
2237 o.ux = s.uc[0];
2238 o.uy = s.uia[0];
2239
2240 cc->alloc(o.ux);
2241 cc->alloc(o.uy);
2242 }
2243 else {
2244 // Xca = Sca * m
2245 // Xa = Sa * m
2246 // Yca = 1 - (Sa * m)
2247 // Ya = 1 - (Sa * m)
2248 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2249
2250 o.ux = cc->newXmm("o.ux");
2251 o.uy = vm;
2252
2253 pc->vmulu16(o.ux, s.uc[0], o.uy);
2254 pc->vdiv255u16(o.ux);
2255
2256 pc->vswizli16(o.uy, o.ux, x86::Predicate::shuf(3, 3, 3, 3));
2257 pc->vswizi32(o.uy, o.uy, x86::Predicate::shuf(0, 0, 0, 0));
2258 pc->vinv255u16(o.uy, o.uy);
2259 }
2260 }
2261
2262 // ------------------------------------------------------------------------
2263 // [CMaskInit - RGBA32 - Solid - Dst]
2264 // ------------------------------------------------------------------------
2265
2266 else if (compOp() == BL_COMP_OP_DST_COPY) {
2267 BL_NOT_REACHED();
2268 }
2269
2270 // ------------------------------------------------------------------------
2271 // [CMaskInit - RGBA32 - Solid - DstOver]
2272 // ------------------------------------------------------------------------
2273
2274 else if (compOp() == BL_COMP_OP_DST_OVER) {
2275 if (!hasMask) {
2276 // Xca = Sca
2277 // Xa = Sa
2278 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2279
2280 o.ux = s.uc[0];
2281 cc->alloc(o.ux);
2282 }
2283 else {
2284 // Xca = Sca * m
2285 // Xa = Sa * m
2286 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2287
2288 o.ux = cc->newXmm("o.uc0");
2289 pc->vmulu16(o.ux, s.uc[0], vm);
2290 pc->vdiv255u16(o.ux);
2291 }
2292 }
2293
2294 // ------------------------------------------------------------------------
2295 // [CMaskInit - RGBA32 - Solid - DstIn]
2296 // ------------------------------------------------------------------------
2297
2298 else if (compOp() == BL_COMP_OP_DST_IN) {
2299 if (!hasMask) {
2300 // Xca = Sa
2301 // Xa = Sa
2302 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA);
2303
2304 o.ux = s.ua[0];
2305 cc->alloc(o.ux);
2306 }
2307 else {
2308 // Xca = 1 - m.(1 - Sa)
2309 // Xa = 1 - m.(1 - Sa)
2310 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA);
2311
2312 o.ux = cc->newXmm("o.ux");
2313 pc->vmov(o.ux, s.ua[0]);
2314
2315 pc->vinv255u16(o.ux, o.ux);
2316 pc->vmulu16(o.ux, o.ux, vm);
2317 pc->vdiv255u16(o.ux);
2318 pc->vinv255u16(o.ux, o.ux);
2319 }
2320 }
2321
2322 // ------------------------------------------------------------------------
2323 // [CMaskInit - RGBA32 - Solid - DstOut]
2324 // ------------------------------------------------------------------------
2325
2326 else if (compOp() == BL_COMP_OP_DST_OUT) {
2327 if (!hasMask) {
2328 // Xca = 1 - Sa
2329 // Xa = 1 - Sa
2330 if (useDa) {
2331 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUIA);
2332
2333 o.ux = s.uia[0];
2334 cc->alloc(o.ux);
2335 }
2336 // Xca = 1 - Sa
2337 // Xa = 1
2338 else {
2339 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA);
2340
2341 o.ux = cc->newXmm("ux");
2342 pc->vmov(o.ux, s.ua[0]);
2343 pc->vNegRgb8W(o.ux, o.ux);
2344 }
2345 }
2346 else {
2347 // Xca = 1 - (Sa * m)
2348 // Xa = 1 - (Sa * m)
2349 if (useDa) {
2350 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA);
2351
2352 o.ux = vm;
2353 pc->vmulu16(o.ux, o.ux, s.ua[0]);
2354 pc->vdiv255u16(o.ux);
2355 pc->vinv255u16(o.ux, o.ux);
2356 }
2357 // Xca = 1 - (Sa * m)
2358 // Xa = 1
2359 else {
2360 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUA);
2361
2362 o.ux = vm;
2363 pc->vmulu16(o.ux, o.ux, s.ua[0]);
2364 pc->vdiv255u16(o.ux);
2365 pc->vinv255u16(o.ux, o.ux);
2366 pc->vFillAlpha255W(o.ux, o.ux);
2367 }
2368 }
2369 }
2370
2371 // ------------------------------------------------------------------------
2372 // [CMaskInit - RGBA32 - Solid - DstAtop]
2373 // ------------------------------------------------------------------------
2374
2375 else if (compOp() == BL_COMP_OP_DST_ATOP) {
2376 if (!hasMask) {
2377 // Xca = Sca
2378 // Xa = Sa
2379 // Yca = Sa
2380 // Ya = Sa
2381 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kUA);
2382
2383 o.ux = s.uc[0];
2384 o.uy = s.ua[0];
2385
2386 cc->alloc(o.ux);
2387 cc->alloc(o.uy);
2388 }
2389 else {
2390 // Xca = Sca * m
2391 // Xa = Sa * m
2392 // Yca = 1 - m.(1 - Sa)
2393 // Ya = 1 - m.(1 - Sa)
2394 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kUA);
2395
2396 o.ux = cc->newXmm("o.ux");
2397 o.uy = cc->newXmm("o.uy");
2398
2399 pc->vmov(o.uy, s.ua[0]);
2400 pc->vinv255u16(o.uy, o.uy);
2401
2402 pc->vmulu16(o.ux, s.uc[0], vm);
2403 pc->vmulu16(o.uy, o.uy, vm);
2404
2405 pc->vdiv255u16_2x(o.ux, o.uy);
2406 pc->vinv255u16(o.uy, o.uy);
2407 }
2408 }
2409
2410 // ------------------------------------------------------------------------
2411 // [CMaskInit - RGBA32 - Solid - Plus]
2412 // ------------------------------------------------------------------------
2413
2414 else if (compOp() == BL_COMP_OP_PLUS) {
2415 if (!hasMask) {
2416 // Xca = Sca
2417 // Xa = Sa
2418 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kPC);
2419
2420 o.px = s.pc[0];
2421 cc->alloc(o.px);
2422 }
2423 else {
2424 // Xca = Sca * m
2425 // Xa = Sa * m
2426 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2427 o.px = cc->newXmm("px");
2428
2429 pc->vmulu16(o.px, s.uc[0], vm);
2430 pc->vdiv255u16(o.px);
2431 pc->vpacki16u8(o.px, o.px, o.px);
2432 }
2433 }
2434
2435 // ------------------------------------------------------------------------
2436 // [CMaskInit - RGBA32 - Solid - Minus]
2437 // ------------------------------------------------------------------------
2438
2439 else if (compOp() == BL_COMP_OP_MINUS) {
2440 if (!hasMask) {
2441 // Xca = Sca
2442 // Xa = 0
2443 // Yca = Sca
2444 // Ya = Sa
2445 if (useDa) {
2446 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2447
2448 o.ux = cc->newXmm("ux");
2449 o.uy = s.uc[0];
2450
2451 cc->alloc(o.uy);
2452 pc->vmov(o.ux, o.uy);
2453 pc->vZeroAlphaW(o.ux, o.ux);
2454 }
2455 else {
2456 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kPC);
2457 o.px = cc->newXmm("px");
2458 pc->vmov(o.px, s.pc[0]);
2459 pc->vZeroAlphaB(o.px, o.px);
2460 }
2461 }
2462 else {
2463 // Xca = Sca
2464 // Xa = 0
2465 // Yca = Sca
2466 // Ya = Sa
2467 // M = m <Alpha channel is set to 256>
2468 // Im = 1 - m <Alpha channel is set to 0 >
2469 if (useDa) {
2470 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2471
2472 o.ux = cc->newXmm("ux");
2473 o.uy = cc->newXmm("uy");
2474 o.vm = vm;
2475 o.vn = cc->newXmm("vn");
2476
2477 pc->vZeroAlphaW(o.ux, s.uc[0]);
2478 pc->vmov(o.uy, s.uc[0]);
2479
2480 pc->vinv255u16(o.vn, o.vm);
2481 pc->vZeroAlphaW(o.vm, o.vm);
2482 pc->vZeroAlphaW(o.vn, o.vn);
2483 pc->vFillAlpha255W(o.vm, o.vm);
2484 }
2485 else {
2486 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2487
2488 o.ux = cc->newXmm("ux");
2489 o.vm = vm;
2490 o.vn = cc->newXmm("vn");
2491
2492 pc->vZeroAlphaW(o.ux, s.uc[0]);
2493 pc->vinv255u16(o.vn, o.vm);
2494 }
2495 }
2496 }
2497
2498 // ------------------------------------------------------------------------
2499 // [CMaskInit - RGBA32 - Solid - Multiply]
2500 // ------------------------------------------------------------------------
2501
2502 else if (compOp() == BL_COMP_OP_MULTIPLY) {
2503 if (!hasMask) {
2504 // Xca = Sca
2505 // Xa = Sa
2506 // Yca = Sca + (1 - Sa)
2507 // Ya = Sa + (1 - Sa)
2508 if (useDa) {
2509 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kUIA);
2510
2511 o.ux = s.uc[0];
2512 o.uy = cc->newXmm("uy");
2513
2514 cc->alloc(o.ux);
2515 pc->vmov(o.uy, s.uia[0]);
2516 pc->vaddi16(o.uy, o.uy, o.ux);
2517 }
2518 // Yca = Sca + (1 - Sa)
2519 // Ya = Sa + (1 - Sa)
2520 else {
2521 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kUIA);
2522
2523 o.uy = cc->newXmm("uy");
2524 pc->vmov(o.uy, s.uia[0]);
2525 pc->vaddi16(o.uy, o.uy, s.uc[0]);
2526 }
2527 }
2528 else {
2529 // Xca = Sca * m
2530 // Xa = Sa * m
2531 // Yca = Sca * m + (1 - Sa * m)
2532 // Ya = Sa * m + (1 - Sa * m)
2533 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2534
2535 o.ux = cc->newXmm("ux");
2536 o.uy = cc->newXmm("uy");
2537
2538 pc->vmulu16(o.ux, s.uc[0], vm);
2539 pc->vdiv255u16(o.ux);
2540
2541 pc->vswizli16(o.uy, o.ux, x86::Predicate::shuf(3, 3, 3, 3));
2542 pc->vinv255u16(o.uy, o.uy);
2543 pc->vswizi32(o.uy, o.uy, x86::Predicate::shuf(0, 0, 0, 0));
2544 pc->vaddi16(o.uy, o.uy, o.ux);
2545 }
2546 }
2547
2548 // ------------------------------------------------------------------------
2549 // [CMaskInit - RGBA32 - Solid - Screen]
2550 // ------------------------------------------------------------------------
2551
2552 else if (compOp() == BL_COMP_OP_SCREEN) {
2553 if (!hasMask) {
2554 // Xca = Sca * 1 + 0.5 <Rounding>
2555 // Xa = Sa * 1 + 0.5 <Rounding>
2556 // Yca = 1 - Sca
2557 // Ya = 1 - Sa
2558
2559 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2560
2561 o.ux = cc->newXmm("p.ux");
2562 o.uy = cc->newXmm("p.uy");
2563
2564 pc->vinv255u16(o.uy, o.ux);
2565 pc->vslli16(o.ux, s.uc[0], 8);
2566 pc->vsubi16(o.ux, o.ux, s.uc[0]);
2567 pc->vaddi16(o.ux, o.ux, pc->constAsXmm(blCommonTable.i128_0080008000800080));
2568
2569 cc->alloc(o.uy);
2570 }
2571 else {
2572 // Xca = Sca * m + 0.5 <Rounding>
2573 // Xa = Sa * m + 0.5 <Rounding>
2574 // Yca = 1 - (Sca * m)
2575 // Ya = 1 - (Sa * m)
2576 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2577
2578 o.ux = cc->newXmm("p.ux");
2579 o.uy = cc->newXmm("p.uy");
2580
2581 pc->vmulu16(o.uy, s.uc[0], vm);
2582 pc->vdiv255u16(o.uy);
2583
2584 pc->vslli16(o.ux, o.uy, 8);
2585 pc->vsubi16(o.ux, o.ux, o.uy);
2586 pc->vaddi16(o.ux, o.ux, pc->constAsXmm(blCommonTable.i128_0080008000800080));
2587 pc->vinv255u16(o.uy, o.uy);
2588 }
2589 }
2590
2591 // ------------------------------------------------------------------------
2592 // [CMaskInit - RGBA32 - Solid - LinearBurn / Difference / Exclusion]
2593 // ------------------------------------------------------------------------
2594
2595 else if (compOp() == BL_COMP_OP_LINEAR_BURN || compOp() == BL_COMP_OP_DIFFERENCE || compOp() == BL_COMP_OP_EXCLUSION) {
2596 if (!hasMask) {
2597 // Xca = Sca
2598 // Xa = Sa
2599 // Yca = Sa
2600 // Ya = Sa
2601 srcPart()->as<FetchSolidPart>()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC | Pixel::kUA);
2602
2603 o.ux = s.uc[0];
2604 o.uy = s.ua[0];
2605
2606 cc->alloc(o.ux);
2607 cc->alloc(o.uy);
2608 }
2609 else {
2610 // Xca = Sca * m
2611 // Xa = Sa * m
2612 // Yca = Sa * m
2613 // Ya = Sa * m
2614 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2615
2616 o.ux = cc->newXmm("ux");
2617 o.uy = cc->newXmm("uy");
2618
2619 pc->vmulu16(o.ux, s.uc[0], vm);
2620 pc->vdiv255u16(o.ux);
2621
2622 pc->vswizli16(o.uy, o.ux, x86::Predicate::shuf(3, 3, 3, 3));
2623 pc->vswizi32(o.uy, o.uy, x86::Predicate::shuf(0, 0, 0, 0));
2624 }
2625 }
2626
2627 // ------------------------------------------------------------------------
2628 // [CMaskInit - RGBA32 - Solid - TypeA (Non-Opaque)]
2629 // ------------------------------------------------------------------------
2630
2631 else if ((compOpFlags() & BL_COMP_OP_FLAG_TYPE_A) && hasMask) {
2632 // Multiply the source pixel with the mask if `TypeA`.
2633 srcPart()->as<FetchSolidPart>()->initSolidFlags(Pixel::kUC);
2634
2635 Pixel& pre = _solidPre;
2636 pre.setCount(1);
2637 pre.uc.init(cc->newXmm("pre.uc"));
2638
2639 pc->vmulu16(pre.uc[0], s.uc[0], vm);
2640 pc->vdiv255u16(pre.uc[0]);
2641 }
2642
2643 // ------------------------------------------------------------------------
2644 // [CMaskInit - RGBA32 - Solid - No Optimizations]
2645 // ------------------------------------------------------------------------
2646
2647 else {
2648 // No optimization. The compositor will simply use the mask provided.
2649 _mask->vm = vm;
2650 }
2651 }
2652 else {
2653 _mask->vm = vm;
2654
2655 // ------------------------------------------------------------------------
2656 // [CMaskInit - RGBA32 - NonSolid - SrcCopy]
2657 // ------------------------------------------------------------------------
2658
2659 if (compOp() == BL_COMP_OP_SRC_COPY) {
2660 if (hasMask) {
2661 _mask->vn = cc->newXmm("vn");
2662 pc->vinv255u16(_mask->vn, vm);
2663 }
2664 }
2665 }
2666
2667 _cMaskLoopInit(hasMask ? kCMaskLoopTypeMask : kCMaskLoopTypeOpaque);
2668 }
2669
cMaskFiniRGBA32()2670 void CompOpPart::cMaskFiniRGBA32() noexcept {
2671 if (srcPart()->isSolid()) {
2672 _solidOpt.reset();
2673 _solidPre.reset();
2674 }
2675 else {
2676 // TODO: [PIPEGEN]
2677 }
2678
2679 _mask->reset();
2680 _cMaskLoopFini();
2681 }
2682
2683 // ============================================================================
2684 // [BLPipeGen::CompOpPart - CMask - Proc - RGBA]
2685 // ============================================================================
2686
cMaskProcRGBA32Xmm(Pixel & out,uint32_t n,uint32_t flags)2687 void CompOpPart::cMaskProcRGBA32Xmm(Pixel& out, uint32_t n, uint32_t flags) noexcept {
2688 bool hasMask = isLoopCMask();
2689
2690 uint32_t kFullN = (n + 1) / 2;
2691 uint32_t kUseHi = n > 1;
2692
2693 out.setCount(n);
2694
2695 if (srcPart()->isSolid()) {
2696 Pixel d(pixelType());
2697 SolidPixel& o = _solidOpt;
2698 VecArray xv, yv, zv;
2699
2700 pc->newXmmArray(xv, kFullN, "x");
2701 pc->newXmmArray(yv, kFullN, "y");
2702 pc->newXmmArray(zv, kFullN, "z");
2703
2704 bool useDa = hasDa();
2705
2706 // ------------------------------------------------------------------------
2707 // [CMaskProc - RGBA32 - SrcCopy]
2708 // ------------------------------------------------------------------------
2709
2710 if (compOp() == BL_COMP_OP_SRC_COPY) {
2711 // Dca' = Xca
2712 // Da' = Xa
2713 if (!hasMask) {
2714 out.pc.init(o.px);
2715 out.makeImmutable();
2716 }
2717 // Dca' = Xca + Dca.(1 - m)
2718 // Da' = Xa + Da .(1 - m)
2719 else {
2720 dstFetch(d, Pixel::kUC, n);
2721 VecArray& dv = d.uc;
2722 pc->vmulu16(dv, dv, o.vn);
2723 pc->vaddi16(dv, dv, o.ux);
2724 pc->vmul257hu16(dv, dv);
2725 out.uc.init(dv);
2726 }
2727
2728 pc->xSatisfyPixel(out, flags);
2729 return;
2730 }
2731
2732 // ------------------------------------------------------------------------
2733 // [CMaskProc - RGBA32 - SrcOver / Screen]
2734 // ------------------------------------------------------------------------
2735
2736 if (compOp() == BL_COMP_OP_SRC_OVER || compOp() == BL_COMP_OP_SCREEN) {
2737 // Dca' = Xca + Dca.Yca
2738 // Da' = Xa + Da .Ya
2739 dstFetch(d, Pixel::kUC, n);
2740 VecArray& dv = d.uc;
2741
2742 pc->vmulu16(dv, dv, o.uy);
2743 pc->vaddi16(dv, dv, o.ux);
2744 pc->vmul257hu16(dv, dv);
2745
2746 out.uc.init(dv);
2747 pc->xSatisfyPixel(out, flags);
2748
2749 return;
2750 }
2751
2752 // ------------------------------------------------------------------------
2753 // [CMaskProc - RGBA32 - SrcIn]
2754 // ------------------------------------------------------------------------
2755
2756 if (compOp() == BL_COMP_OP_SRC_IN) {
2757 // Dca' = Xca.Da
2758 // Da' = Xa .Da
2759 if (!hasMask) {
2760 dstFetch(d, Pixel::kUA, n);
2761 VecArray& dv = d.ua;
2762
2763 pc->vmulu16(dv, dv, o.ux);
2764 pc->vdiv255u16(dv);
2765 out.uc.init(dv);
2766 }
2767 // Dca' = Xca.Da + Dca.(1 - m)
2768 // Da' = Xa .Da + Da .(1 - m)
2769 else {
2770 dstFetch(d, Pixel::kUC | Pixel::kUA, n);
2771 VecArray& dv = d.uc;
2772 VecArray& da = d.ua;
2773
2774 pc->vmulu16(dv, dv, o.vn);
2775 pc->vmulu16(da, da, o.ux);
2776 pc->vaddi16(dv, dv, da);
2777 pc->vdiv255u16(dv);
2778 out.uc.init(dv);
2779 }
2780
2781 pc->xSatisfyPixel(out, flags);
2782 return;
2783 }
2784
2785 // ------------------------------------------------------------------------
2786 // [CMaskProc - RGBA32 - SrcOut]
2787 // ------------------------------------------------------------------------
2788
2789 if (compOp() == BL_COMP_OP_SRC_OUT) {
2790 // Dca' = Xca.(1 - Da)
2791 // Da' = Xa .(1 - Da)
2792 if (!hasMask) {
2793 dstFetch(d, Pixel::kUIA, n);
2794 VecArray& dv = d.uia;
2795
2796 pc->vmulu16(dv, dv, o.ux);
2797 pc->vdiv255u16(dv);
2798 out.uc.init(dv);
2799 }
2800 // Dca' = Xca.(1 - Da) + Dca.(1 - m)
2801 // Da' = Xa .(1 - Da) + Da .(1 - m)
2802 else {
2803 dstFetch(d, Pixel::kUC, n);
2804 VecArray& dv = d.uc;
2805
2806 pc->vExpandAlpha16(xv, dv, kUseHi);
2807 pc->vinv255u16(dv, dv);
2808 pc->vmulu16(xv, xv, o.ux);
2809 pc->vmulu16(dv, dv, o.vn);
2810 pc->vaddi16(dv, dv, xv);
2811 pc->vdiv255u16(dv);
2812 out.uc.init(dv);
2813 }
2814
2815 pc->xSatisfyPixel(out, flags);
2816 return;
2817 }
2818
2819 // ------------------------------------------------------------------------
2820 // [CMaskProc - RGBA32 - SrcAtop]
2821 // ------------------------------------------------------------------------
2822
2823 if (compOp() == BL_COMP_OP_SRC_ATOP) {
2824 // Dca' = Xca.Da + Dca.Yca
2825 // Da' = Xa .Da + Da .Ya
2826 dstFetch(d, Pixel::kUC, n);
2827 VecArray& dv = d.uc;
2828
2829 pc->vExpandAlpha16(xv, dv, kUseHi);
2830 pc->vmulu16(dv, dv, o.uy);
2831 pc->vmulu16(xv, xv, o.ux);
2832
2833 pc->vaddi16(dv, dv, xv);
2834 pc->vdiv255u16(dv);
2835
2836 out.uc.init(dv);
2837 pc->xSatisfyPixel(out, flags);
2838 return;
2839 }
2840
2841 // ------------------------------------------------------------------------
2842 // [CMaskProc - RGBA32 - Dst]
2843 // ------------------------------------------------------------------------
2844
2845 if (compOp() == BL_COMP_OP_DST_COPY) {
2846 // Dca' = Dca
2847 // Da' = Da
2848 BL_NOT_REACHED();
2849 }
2850
2851 // ------------------------------------------------------------------------
2852 // [CMaskProc - RGBA32 - DstOver]
2853 // ------------------------------------------------------------------------
2854
2855 if (compOp() == BL_COMP_OP_DST_OVER) {
2856 // Dca' = Xca.(1 - Da) + Dca
2857 // Da' = Xa .(1 - Da) + Da
2858 dstFetch(d, Pixel::kPC | Pixel::kUIA, n);
2859 VecArray& dv = d.uia;
2860
2861 pc->vmulu16(dv, dv, o.ux);
2862 pc->vdiv255u16(dv);
2863
2864 VecArray dh = dv.even();
2865 pc->vpacki16u8(dh, dh, dv.odd());
2866 pc->vaddi32(dh, dh, d.pc);
2867
2868 out.pc.init(dh);
2869 pc->xSatisfyPixel(out, flags);
2870 return;
2871 }
2872
2873 // ------------------------------------------------------------------------
2874 // [CMaskProc - RGBA32 - DstIn / DstOut]
2875 // ------------------------------------------------------------------------
2876
2877 if (compOp() == BL_COMP_OP_DST_IN || compOp() == BL_COMP_OP_DST_OUT) {
2878 // Dca' = Xca.Dca
2879 // Da' = Xa .Da
2880 dstFetch(d, Pixel::kUC, n);
2881 VecArray& dv = d.uc;
2882
2883 pc->vmulu16(dv, dv, o.ux);
2884 pc->vdiv255u16(dv);
2885
2886 out.uc.init(dv);
2887 pc->xSatisfyPixel(out, flags);
2888 return;
2889 }
2890
2891 // ------------------------------------------------------------------------
2892 // [CMaskProc - RGBA32 - DstAtop / Xor / Multiply]
2893 // ------------------------------------------------------------------------
2894
2895 if (compOp() == BL_COMP_OP_DST_ATOP || compOp() == BL_COMP_OP_XOR || compOp() == BL_COMP_OP_MULTIPLY) {
2896 // Dca' = Xca.(1 - Da) + Dca.Yca
2897 // Da' = Xa .(1 - Da) + Da .Ya
2898 if (useDa) {
2899 dstFetch(d, Pixel::kUC, n);
2900 VecArray& dv = d.uc;
2901
2902 pc->vExpandAlpha16(xv, dv, kUseHi);
2903 pc->vmulu16(dv, dv, o.uy);
2904 pc->vinv255u16(xv, xv);
2905 pc->vmulu16(xv, xv, o.ux);
2906
2907 pc->vaddi16(dv, dv, xv);
2908 pc->vdiv255u16(dv);
2909 out.uc.init(dv);
2910 }
2911 // Dca' = Dca.Yca
2912 // Da' = Da .Ya
2913 else {
2914 dstFetch(d, Pixel::kUC, n);
2915 VecArray& dv = d.uc;
2916
2917 pc->vmulu16(dv, dv, o.uy);
2918 pc->vdiv255u16(dv);
2919 out.uc.init(dv);
2920 }
2921
2922 pc->xSatisfyPixel(out, flags);
2923 return;
2924 }
2925
2926 // ------------------------------------------------------------------------
2927 // [CMaskProc - RGBA32 - Plus]
2928 // ------------------------------------------------------------------------
2929
2930 if (compOp() == BL_COMP_OP_PLUS) {
2931 // Dca' = Clamp(Dca + Sca)
2932 // Da' = Clamp(Da + Sa )
2933 dstFetch(d, Pixel::kPC, n);
2934 VecArray& dv = d.pc;
2935
2936 pc->vaddsu8(dv, dv, o.px);
2937 out.pc.init(dv);
2938
2939 pc->xSatisfyPixel(out, flags);
2940 return;
2941 }
2942
2943 // ------------------------------------------------------------------------
2944 // [CMaskProc - RGBA32 - Minus]
2945 // ------------------------------------------------------------------------
2946
2947 if (compOp() == BL_COMP_OP_MINUS) {
2948 if (!hasMask) {
2949 // Dca' = Clamp(Dca - Xca) + Yca.(1 - Da)
2950 // Da' = Da + Ya.(1 - Da)
2951 if (useDa) {
2952 dstFetch(d, Pixel::kUC, n);
2953 VecArray& dv = d.uc;
2954
2955 pc->vExpandAlpha16(xv, dv, kUseHi);
2956 pc->vinv255u16(xv, xv);
2957 pc->vmulu16(xv, xv, o.uy);
2958 pc->vsubsu16(dv, dv, o.ux);
2959 pc->vdiv255u16(xv);
2960
2961 pc->vaddi16(dv, dv, xv);
2962 out.uc.init(dv);
2963 }
2964 // Dca' = Clamp(Dca - Xca)
2965 // Da' = <unchanged>
2966 else {
2967 dstFetch(d, Pixel::kPC, n);
2968 VecArray& dh = d.pc;
2969
2970 pc->vsubsu8(dh, dh, o.px);
2971 out.pc.init(dh);
2972 }
2973 }
2974 else {
2975 // Dca' = (Clamp(Dca - Xca) + Yca.(1 - Da)).m + Dca.(1 - m)
2976 // Da' = Da + Ya.(1 - Da)
2977 if (useDa) {
2978 dstFetch(d, Pixel::kUC, n);
2979 VecArray& dv = d.uc;
2980
2981 pc->vExpandAlpha16(xv, dv, kUseHi);
2982 pc->vinv255u16(xv, xv);
2983 pc->vmulu16(yv, dv, o.vn);
2984 pc->vsubsu16(dv, dv, o.ux);
2985 pc->vmulu16(xv, xv, o.uy);
2986 pc->vdiv255u16(xv);
2987 pc->vaddi16(dv, dv, xv);
2988 pc->vmulu16(dv, dv, o.vm);
2989
2990 pc->vaddi16(dv, dv, yv);
2991 pc->vdiv255u16(dv);
2992 out.uc.init(dv);
2993 }
2994 // Dca' = Clamp(Dca - Xca).m + Dca.(1 - m)
2995 // Da' = <unchanged>
2996 else {
2997 dstFetch(d, Pixel::kUC, n);
2998 VecArray& dv = d.uc;
2999
3000 pc->vmulu16(yv, dv, o.vn);
3001 pc->vsubsu16(dv, dv, o.ux);
3002 pc->vmulu16(dv, dv, o.vm);
3003
3004 pc->vaddi16(dv, dv, yv);
3005 pc->vdiv255u16(dv);
3006 out.uc.init(dv);
3007 }
3008 }
3009
3010 pc->xSatisfyPixel(out, flags);
3011 return;
3012 }
3013
3014 // ------------------------------------------------------------------------
3015 // [CMaskProc - RGBA32 - Darken / Lighten]
3016 // ------------------------------------------------------------------------
3017
3018 if (compOp() == BL_COMP_OP_DARKEN || compOp() == BL_COMP_OP_LIGHTEN) {
3019 // Dca' = minmax(Dca + Xca.(1 - Da), Xca + Dca.Yca)
3020 // Da' = Xa + Da.Ya
3021 dstFetch(d, Pixel::kUC, n);
3022 VecArray& dv = d.uc;
3023
3024 pc->vExpandAlpha16(xv, dv, kUseHi);
3025 pc->vinv255u16(xv, xv);
3026 pc->vmulu16(xv, xv, o.ux);
3027 pc->vdiv255u16(xv);
3028 pc->vaddi16(xv, xv, dv);
3029 pc->vmulu16(dv, dv, o.uy);
3030 pc->vdiv255u16(dv);
3031 pc->vaddi16(dv, dv, o.ux);
3032
3033 if (compOp() == BL_COMP_OP_DARKEN)
3034 pc->vminu8(dv, dv, xv);
3035 else
3036 pc->vmaxu8(dv, dv, xv);
3037
3038 out.uc.init(dv);
3039 pc->xSatisfyPixel(out, flags);
3040 return;
3041 }
3042
3043 // ------------------------------------------------------------------------
3044 // [CMaskProc - RGBA32 - LinearBurn]
3045 // ------------------------------------------------------------------------
3046
3047 if (compOp() == BL_COMP_OP_LINEAR_BURN) {
3048 // Dca' = Dca + Xca - Yca.Da
3049 // Da' = Da + Xa - Ya .Da
3050 dstFetch(d, Pixel::kUC, n);
3051 VecArray& dv = d.uc;
3052
3053 pc->vExpandAlpha16(xv, dv, kUseHi);
3054 pc->vmulu16(xv, xv, o.uy);
3055 pc->vaddi16(dv, dv, o.ux);
3056 pc->vdiv255u16(xv);
3057 pc->vsubsu16(dv, dv, xv);
3058
3059 out.uc.init(dv);
3060 pc->xSatisfyPixel(out, flags);
3061 return;
3062 }
3063
3064 // ------------------------------------------------------------------------
3065 // [CMaskProc - RGBA32 - Difference]
3066 // ------------------------------------------------------------------------
3067
3068 if (compOp() == BL_COMP_OP_DIFFERENCE) {
3069 // Dca' = Dca + Sca - 2.min(Sca.Da, Dca.Sa)
3070 // Da' = Da + Sa - min(Sa .Da, Da .Sa)
3071 dstFetch(d, Pixel::kUC, n);
3072 VecArray& dv = d.uc;
3073
3074 pc->vExpandAlpha16(xv, dv, kUseHi);
3075 pc->vmulu16(yv, o.uy, dv);
3076 pc->vmulu16(xv, xv, o.ux);
3077 pc->vaddi16(dv, dv, o.ux);
3078 pc->vminu16(yv, yv, xv);
3079 pc->vdiv255u16(yv);
3080 pc->vsubi16(dv, dv, yv);
3081 pc->vZeroAlphaW(yv, yv);
3082 pc->vsubi16(dv, dv, yv);
3083
3084 out.uc.init(dv);
3085 pc->xSatisfyPixel(out, flags);
3086 return;
3087 }
3088
3089 // ------------------------------------------------------------------------
3090 // [CMaskProc - RGBA32 - Exclusion]
3091 // ------------------------------------------------------------------------
3092
3093 if (compOp() == BL_COMP_OP_EXCLUSION) {
3094 // Dca' = Dca + Xca - 2.Xca.Dca
3095 // Da' = Da + Xa - Xa.Da
3096 dstFetch(d, Pixel::kUC, n);
3097 VecArray& dv = d.uc;
3098
3099 pc->vmulu16(xv, dv, o.ux);
3100 pc->vaddi16(dv, dv, o.ux);
3101 pc->vdiv255u16(xv);
3102 pc->vsubi16(dv, dv, xv);
3103 pc->vZeroAlphaW(xv, xv);
3104 pc->vsubi16(dv, dv, xv);
3105
3106 out.uc.init(dv);
3107 pc->xSatisfyPixel(out, flags);
3108 return;
3109 }
3110 }
3111
3112 VecArray vm;
3113 if (_mask->vm.isValid())
3114 vm.init(_mask->vm);
3115
3116 vMaskProcRGBA32Xmm(out, n, flags, vm, true);
3117 }
3118
3119 // ============================================================================
3120 // [BLPipeGen::CompOpPart - VMask - RGBA32 (XMM)]
3121 // ============================================================================
3122
vMaskProcRGBA32Xmm(Pixel & out,uint32_t n,uint32_t flags,VecArray & vm,bool mImmutable)3123 void CompOpPart::vMaskProcRGBA32Xmm(Pixel& out, uint32_t n, uint32_t flags, VecArray& vm, bool mImmutable) noexcept {
3124 bool hasMask = !vm.empty();
3125
3126 bool useDa = hasDa();
3127 bool useSa = hasSa() || hasMask || isLoopCMask();
3128
3129 uint32_t kFullN = (n + 1) / 2;
3130 uint32_t kUseHi = (n > 1);
3131 uint32_t kSplit = (kFullN == 1) ? 1 : 2;
3132
3133 VecArray xv, yv, zv;
3134 pc->newXmmArray(xv, kFullN, "x");
3135 pc->newXmmArray(yv, kFullN, "y");
3136 pc->newXmmArray(zv, kFullN, "z");
3137
3138 Pixel d(Pixel::kTypeRGBA);
3139 Pixel s(Pixel::kTypeRGBA);
3140
3141 out.setCount(n);
3142
3143 // --------------------------------------------------------------------------
3144 // [VMaskProc - RGBA32 - SrcCopy]
3145 // --------------------------------------------------------------------------
3146
3147 if (compOp() == BL_COMP_OP_SRC_COPY) {
3148 // Composition:
3149 // Da - Optional.
3150 // Sa - Optional.
3151
3152 if (!hasMask) {
3153 // Dca' = Sca
3154 // Da' = Sa
3155 srcFetch(out, flags, n);
3156 }
3157 else {
3158 // Dca' = Sca.m + Dca.(1 - m)
3159 // Da' = Sa .m + Da .(1 - m)
3160 srcFetch(s, Pixel::kUC, n);
3161 dstFetch(d, Pixel::kUC, n);
3162
3163 VecArray& vs = s.uc;
3164 VecArray& vd = d.uc;
3165 VecArray vn;
3166
3167 pc->vmulu16(vs, vs, vm);
3168 vMaskProcRGBA32InvertMask(vn, vm);
3169
3170 pc->vmulu16(vd, vd, vn);
3171 pc->vaddi16(vd, vd, vs);
3172 vMaskProcRGBA32InvertDone(vn, mImmutable);
3173
3174 pc->vdiv255u16(vd);
3175 out.uc.init(vd);
3176 }
3177
3178 pc->xSatisfyPixel(out, flags);
3179 return;
3180 }
3181
3182 // --------------------------------------------------------------------------
3183 // [VMaskProc - RGBA32 - SrcOver]
3184 // --------------------------------------------------------------------------
3185
3186 if (compOp() == BL_COMP_OP_SRC_OVER) {
3187 // Composition:
3188 // Da - Optional.
3189 // Sa - Required, otherwise SRC_COPY.
3190
3191 if (!hasMask) {
3192 // Dca' = Sca + Dca.(1 - Sa)
3193 // Da' = Sa + Da .(1 - Sa)
3194 srcFetch(s, Pixel::kPC | Pixel::kUIA | Pixel::kImmutable, n);
3195 dstFetch(d, Pixel::kUC, n);
3196
3197 VecArray& uv = s.uia;
3198 VecArray& dv = d.uc;
3199
3200 pc->vmulu16(dv, dv, uv);
3201 pc->vdiv255u16(dv);
3202
3203 VecArray dh = dv.even();
3204 pc->vpacki16u8(dh, dh, dv.odd());
3205 pc->vaddi32(dh, dh, s.pc);
3206
3207 out.pc.init(dh);
3208 }
3209 else {
3210 // Dca' = Sca.m + Dca.(1 - Sa.m)
3211 // Da' = Sa .m + Da .(1 - Sa.m)
3212 srcFetch(s, Pixel::kUC, n);
3213 dstFetch(d, Pixel::kUC, n);
3214
3215 VecArray& sv = s.uc;
3216 VecArray& dv = d.uc;
3217
3218 pc->vmulu16(sv, sv, vm);
3219 pc->vdiv255u16(sv);
3220
3221 pc->vExpandAlpha16(xv, sv, kUseHi);
3222 pc->vinv255u16(xv, xv);
3223 pc->vmulu16(dv, dv, xv);
3224 pc->vdiv255u16(dv);
3225
3226 pc->vaddi16(dv, dv, sv);
3227 out.uc.init(dv);
3228 }
3229
3230 pc->xSatisfyPixel(out, flags);
3231 return;
3232 }
3233
3234 // --------------------------------------------------------------------------
3235 // [VMaskProc - RGBA32 - SrcIn]
3236 // --------------------------------------------------------------------------
3237
3238 if (compOp() == BL_COMP_OP_SRC_IN) {
3239 // Composition:
3240 // Da - Required, otherwise SRC_COPY.
3241 // Sa - Optional.
3242
3243 if (!hasMask) {
3244 // Dca' = Sca.Da
3245 // Da' = Sa .Da
3246 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3247 dstFetch(d, Pixel::kUA, n);
3248
3249 VecArray& sv = s.uc;
3250 VecArray& dv = d.ua;
3251
3252 pc->vmulu16(dv, dv, sv);
3253 pc->vdiv255u16(dv);
3254 out.uc.init(dv);
3255 }
3256 else {
3257 // Dca' = Sca.m.Da + Dca.(1 - m)
3258 // Da' = Sa .m.Da + Da .(1 - m)
3259 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3260 dstFetch(d, Pixel::kUC, n);
3261
3262 VecArray& sv = s.uc;
3263 VecArray& dv = d.uc;
3264
3265 pc->vExpandAlpha16(xv, dv, kUseHi);
3266 pc->vmulu16(xv, xv, sv);
3267 pc->vdiv255u16(xv);
3268 pc->vmulu16(xv, xv, vm);
3269 vMaskProcRGBA32InvertMask(vm, vm);
3270
3271 pc->vmulu16(dv, dv, vm);
3272 vMaskProcRGBA32InvertDone(vm, mImmutable);
3273
3274 pc->vaddi16(dv, dv, xv);
3275 pc->vdiv255u16(dv);
3276 out.uc.init(dv);
3277 }
3278
3279 pc->xSatisfyPixel(out, flags);
3280 return;
3281 }
3282
3283 // --------------------------------------------------------------------------
3284 // [VMaskProc - RGBA32 - SrcOut]
3285 // --------------------------------------------------------------------------
3286
3287 if (compOp() == BL_COMP_OP_SRC_OUT) {
3288 // Composition:
3289 // Da - Required, otherwise CLEAR.
3290 // Sa - Optional.
3291
3292 if (!hasMask) {
3293 // Dca' = Sca.(1 - Da)
3294 // Da' = Sa .(1 - Da)
3295 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3296 dstFetch(d, Pixel::kUIA, n);
3297
3298 VecArray& sv = s.uc;
3299 VecArray& dv = d.uia;
3300
3301 pc->vmulu16(dv, dv, sv);
3302 pc->vdiv255u16(dv);
3303 out.uc.init(dv);
3304 }
3305 else {
3306 // Dca' = Sca.m.(1 - Da) + Dca.(1 - m)
3307 // Da' = Sa .m.(1 - Da) + Da .(1 - m)
3308 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3309 dstFetch(d, Pixel::kUC, n);
3310
3311 VecArray& sv = s.uc;
3312 VecArray& dv = d.uc;
3313
3314 pc->vExpandAlpha16(xv, dv, kUseHi);
3315 pc->vinv255u16(xv, xv);
3316
3317 pc->vmulu16(xv, xv, sv);
3318 pc->vdiv255u16(xv);
3319 pc->vmulu16(xv, xv, vm);
3320 vMaskProcRGBA32InvertMask(vm, vm);
3321
3322 pc->vmulu16(dv, dv, vm);
3323 vMaskProcRGBA32InvertDone(vm, mImmutable);
3324
3325 pc->vaddi16(dv, dv, xv);
3326 pc->vdiv255u16(dv);
3327 out.uc.init(dv);
3328 }
3329
3330 pc->xSatisfyPixel(out, flags);
3331 return;
3332 }
3333
3334 // --------------------------------------------------------------------------
3335 // [VMaskProc - RGBA32 - SrcAtop]
3336 // --------------------------------------------------------------------------
3337
3338 if (compOp() == BL_COMP_OP_SRC_ATOP) {
3339 // Composition:
3340 // Da - Required.
3341 // Sa - Required.
3342
3343 if (!hasMask) {
3344 // Dca' = Sca.Da + Dca.(1 - Sa)
3345 // Da' = Sa .Da + Da .(1 - Sa) = Da
3346 srcFetch(s, Pixel::kUC | Pixel::kUIA | Pixel::kImmutable, n);
3347 dstFetch(d, Pixel::kUC, n);
3348
3349 VecArray& sv = s.uc;
3350 VecArray& uv = s.uia;
3351 VecArray& dv = d.uc;
3352
3353 pc->vExpandAlpha16(xv, dv, kUseHi);
3354 pc->vmulu16(dv, dv, uv);
3355 pc->vmulu16(xv, xv, sv);
3356 pc->vaddi16(dv, dv, xv);
3357 pc->vdiv255u16(dv);
3358
3359 out.uc.init(dv);
3360 }
3361 else {
3362 // Dca' = Sca.Da.m + Dca.(1 - Sa.m)
3363 // Da' = Sa .Da.m + Da .(1 - Sa.m) = Da
3364 srcFetch(s, Pixel::kUC, n);
3365 dstFetch(d, Pixel::kUC, n);
3366
3367 VecArray& sv = s.uc;
3368 VecArray& dv = d.uc;
3369
3370 pc->vmulu16(sv, sv, vm);
3371 pc->vdiv255u16(sv);
3372
3373 pc->vExpandAlpha16(xv, sv, kUseHi);
3374 pc->vinv255u16(xv, xv);
3375 pc->vExpandAlpha16(yv, dv, kUseHi);
3376 pc->vmulu16(dv, dv, xv);
3377 pc->vmulu16(yv, yv, sv);
3378 pc->vaddi16(dv, dv, yv);
3379 pc->vdiv255u16(dv);
3380
3381 out.uc.init(dv);
3382 }
3383
3384 pc->xSatisfyPixel(out, flags);
3385 return;
3386 }
3387
3388 // --------------------------------------------------------------------------
3389 // [VMaskProc - RGBA32 - Dst]
3390 // --------------------------------------------------------------------------
3391
3392 if (compOp() == BL_COMP_OP_DST_COPY) {
3393 // Dca' = Dca
3394 // Da' = Da
3395 BL_NOT_REACHED();
3396 }
3397
3398 // --------------------------------------------------------------------------
3399 // [VMaskProc - RGBA32 - DstOver]
3400 // --------------------------------------------------------------------------
3401
3402 if (compOp() == BL_COMP_OP_DST_OVER) {
3403 // Composition:
3404 // Da - Required, otherwise DST_COPY.
3405 // Sa - Optional.
3406
3407 if (!hasMask) {
3408 // Dca' = Dca + Sca.(1 - Da)
3409 // Da' = Da + Sa .(1 - Da)
3410 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3411 dstFetch(d, Pixel::kPC | Pixel::kUIA, n);
3412
3413 VecArray& sv = s.uc;
3414 VecArray& dv = d.uia;
3415
3416 pc->vmulu16(dv, dv, sv);
3417 pc->vdiv255u16(dv);
3418
3419 VecArray dh = dv.even();
3420 pc->vpacki16u8(dh, dh, dv.odd());
3421 pc->vaddi32(dh, dh, d.pc);
3422
3423 out.pc.init(dh);
3424 }
3425 else {
3426 // Dca' = Dca + Sca.m.(1 - Da)
3427 // Da' = Da + Sa .m.(1 - Da)
3428 srcFetch(s, Pixel::kUC, n);
3429 dstFetch(d, Pixel::kPC | Pixel::kUIA, n);
3430
3431 VecArray& sv = s.uc;
3432 VecArray& dv = d.uia;
3433
3434 pc->vmulu16(sv, sv, vm);
3435 pc->vdiv255u16(sv);
3436
3437 pc->vmulu16(dv, dv, sv);
3438 pc->vdiv255u16(dv);
3439
3440 VecArray dh = dv.even();
3441 pc->vpacki16u8(dh, dh, dv.odd());
3442 pc->vaddi32(dh, dh, d.pc);
3443
3444 out.pc.init(dh);
3445 }
3446
3447 pc->xSatisfyPixel(out, flags);
3448 return;
3449 }
3450
3451 // --------------------------------------------------------------------------
3452 // [VMaskProc - RGBA32 - DstIn]
3453 // --------------------------------------------------------------------------
3454
3455 if (compOp() == BL_COMP_OP_DST_IN) {
3456 // Composition:
3457 // Da - Optional.
3458 // Sa - Required, otherwise DST_COPY.
3459
3460 if (!hasMask) {
3461 // Dca' = Dca.Sa
3462 // Da' = Da .Sa
3463 srcFetch(s, Pixel::kUA | Pixel::kImmutable, n);
3464 dstFetch(d, Pixel::kUC, n);
3465
3466 VecArray& sv = s.ua;
3467 VecArray& dv = d.uc;
3468
3469 pc->vmulu16(dv, dv, sv);
3470 pc->vdiv255u16(dv);
3471 out.uc.init(dv);
3472 }
3473 else {
3474 // Dca' = Dca.(1 - m.(1 - Sa))
3475 // Da' = Da .(1 - m.(1 - Sa))
3476 srcFetch(s, Pixel::kUIA, n);
3477 dstFetch(d, Pixel::kUC, n);
3478
3479 VecArray& sv = s.uia;
3480 VecArray& dv = d.uc;
3481
3482 pc->vmulu16(sv, sv, vm);
3483 pc->vdiv255u16(sv);
3484 pc->vinv255u16(sv, sv);
3485
3486 pc->vmulu16(dv, dv, sv);
3487 pc->vdiv255u16(dv);
3488 out.uc.init(dv);
3489 }
3490
3491 pc->xSatisfyPixel(out, flags);
3492 return;
3493 }
3494
3495 // --------------------------------------------------------------------------
3496 // [VMaskProc - RGBA32 - DstOut]
3497 // --------------------------------------------------------------------------
3498
3499 if (compOp() == BL_COMP_OP_DST_OUT) {
3500 // Composition:
3501 // Da - Optional.
3502 // Sa - Required, otherwise CLEAR.
3503
3504 if (!hasMask) {
3505 // Dca' = Dca.(1 - Sa)
3506 // Da' = Da .(1 - Sa)
3507 srcFetch(s, Pixel::kUIA | Pixel::kImmutable, n);
3508 dstFetch(d, Pixel::kUC, n);
3509
3510 VecArray& sv = s.uia;
3511 VecArray& dv = d.uc;
3512
3513 pc->vmulu16(dv, dv, sv);
3514 pc->vdiv255u16(dv);
3515 out.uc.init(dv);
3516 }
3517 else {
3518 // Dca' = Dca.(1 - Sa.m)
3519 // Da' = Da .(1 - Sa.m)
3520 srcFetch(s, Pixel::kUA, n);
3521 dstFetch(d, Pixel::kUC, n);
3522
3523 VecArray& sv = s.ua;
3524 VecArray& dv = d.uc;
3525
3526 pc->vmulu16(sv, sv, vm);
3527 pc->vdiv255u16(sv);
3528 pc->vinv255u16(sv, sv);
3529
3530 pc->vmulu16(dv, dv, sv);
3531 pc->vdiv255u16(dv);
3532 out.uc.init(dv);
3533 }
3534
3535 pc->xSatisfyPixel(out, flags);
3536 if (!useDa) pc->vFillAlpha(out);
3537 return;
3538 }
3539
3540 // --------------------------------------------------------------------------
3541 // [VMaskProc - RGBA32 - DstAtop]
3542 // --------------------------------------------------------------------------
3543
3544 if (compOp() == BL_COMP_OP_DST_ATOP) {
3545 // Composition:
3546 // Da - Required.
3547 // Sa - Required.
3548
3549 if (!hasMask) {
3550 // Dca' = Dca.Sa + Sca.(1 - Da)
3551 // Da' = Da .Sa + Sa .(1 - Da)
3552 srcFetch(s, Pixel::kUC | Pixel::kUA | Pixel::kImmutable, n);
3553 dstFetch(d, Pixel::kUC, n);
3554
3555 VecArray& sv = s.uc;
3556 VecArray& uv = s.ua;
3557 VecArray& dv = d.uc;
3558
3559 pc->vExpandAlpha16(xv, dv, kUseHi);
3560 pc->vmulu16(dv, dv, uv);
3561 pc->vinv255u16(xv, xv);
3562 pc->vmulu16(xv, xv, sv);
3563
3564 pc->vaddi16(dv, dv, xv);
3565 pc->vdiv255u16(dv);
3566 out.uc.init(dv);
3567 }
3568 else {
3569 // Dca' = Dca.(1 - m.(1 - Sa)) + Sca.m.(1 - Da)
3570 // Da' = Da .(1 - m.(1 - Sa)) + Sa .m.(1 - Da)
3571 srcFetch(s, Pixel::kUC | Pixel::kUIA, n);
3572 dstFetch(d, Pixel::kUC, n);
3573
3574 VecArray& sv = s.uc;
3575 VecArray& uv = s.uia;
3576 VecArray& dv = d.uc;
3577
3578 pc->vExpandAlpha16(xv, dv, kUseHi);
3579 pc->vmulu16(sv, sv, vm);
3580 pc->vmulu16(uv, uv, vm);
3581
3582 pc->vdiv255u16(sv);
3583 pc->vdiv255u16(uv);
3584 pc->vinv255u16(xv, xv);
3585 pc->vinv255u16(uv, uv);
3586 pc->vmulu16(xv, xv, sv);
3587 pc->vmulu16(dv, dv, uv);
3588
3589 pc->vaddi16(dv, dv, xv);
3590 pc->vdiv255u16(dv);
3591 out.uc.init(dv);
3592 }
3593
3594 pc->xSatisfyPixel(out, flags);
3595 return;
3596 }
3597
3598 // --------------------------------------------------------------------------
3599 // [VMaskProc - RGBA32 - Xor]
3600 // --------------------------------------------------------------------------
3601
3602 if (compOp() == BL_COMP_OP_XOR) {
3603 // Composition:
3604 // Da - Required.
3605 // Sa - Required.
3606
3607 if (!hasMask) {
3608 // Dca' = Dca.(1 - Sa) + Sca.(1 - Da)
3609 // Da' = Da .(1 - Sa) + Sa .(1 - Da)
3610 srcFetch(s, Pixel::kUC | Pixel::kUIA | Pixel::kImmutable, n);
3611 dstFetch(d, Pixel::kUC, n);
3612
3613 VecArray& sv = s.uc;
3614 VecArray& uv = s.uia;
3615 VecArray& dv = d.uc;
3616
3617 pc->vExpandAlpha16(xv, dv, kUseHi);
3618 pc->vmulu16(dv, dv, uv);
3619 pc->vinv255u16(xv, xv);
3620 pc->vmulu16(xv, xv, sv);
3621
3622 pc->vaddi16(dv, dv, xv);
3623 pc->vdiv255u16(dv);
3624 out.uc.init(dv);
3625 }
3626 else {
3627 // Dca' = Dca.(1 - Sa.m) + Sca.m.(1 - Da)
3628 // Da' = Da .(1 - Sa.m) + Sa .m.(1 - Da)
3629 srcFetch(s, Pixel::kUC, n);
3630 dstFetch(d, Pixel::kUC, n);
3631
3632 VecArray& sv = s.uc;
3633 VecArray& dv = d.uc;
3634
3635 pc->vmulu16(sv, sv, vm);
3636 pc->vdiv255u16(sv);
3637
3638 pc->vExpandAlpha16(xv, sv, kUseHi);
3639 pc->vExpandAlpha16(yv, dv, kUseHi);
3640 pc->vinv255u16(xv, xv);
3641 pc->vinv255u16(yv, yv);
3642 pc->vmulu16(dv, dv, xv);
3643 pc->vmulu16(sv, sv, yv);
3644
3645 pc->vaddi16(dv, dv, sv);
3646 pc->vdiv255u16(dv);
3647 out.uc.init(dv);
3648 }
3649
3650 pc->xSatisfyPixel(out, flags);
3651 return;
3652 }
3653
3654 // --------------------------------------------------------------------------
3655 // [VMaskProc - RGBA32 - Plus]
3656 // --------------------------------------------------------------------------
3657
3658 if (compOp() == BL_COMP_OP_PLUS) {
3659 if (!hasMask) {
3660 // Dca' = Clamp(Dca + Sca)
3661 // Da' = Clamp(Da + Sa )
3662 srcFetch(s, Pixel::kPC | Pixel::kImmutable, n);
3663 dstFetch(d, Pixel::kPC, n);
3664
3665 VecArray& sh = s.pc;
3666 VecArray& dh = d.pc;
3667
3668 pc->vaddsu8(dh, dh, sh);
3669 out.pc.init(dh);
3670 }
3671 else {
3672 // Dca' = Clamp(Dca + Sca.m)
3673 // Da' = Clamp(Da + Sa .m)
3674 srcFetch(s, Pixel::kUC, n);
3675 dstFetch(d, Pixel::kPC, n);
3676
3677 VecArray& sv = s.uc;
3678 VecArray& dh = d.pc;
3679
3680 pc->vmulu16(sv, sv, vm);
3681 pc->vdiv255u16(sv);
3682
3683 VecArray sh = sv.even();
3684 pc->vpacki16u8(sh, sh, sv.odd());
3685 pc->vaddsu8(dh, dh, sh);
3686
3687 out.pc.init(dh);
3688 }
3689
3690 pc->xSatisfyPixel(out, flags);
3691 return;
3692 }
3693
3694 // --------------------------------------------------------------------------
3695 // [VMaskProc - RGBA32 - Minus]
3696 // --------------------------------------------------------------------------
3697
3698 if (compOp() == BL_COMP_OP_MINUS) {
3699 if (!hasMask) {
3700 // Dca' = Clamp(Dca - Sca) + Sca.(1 - Da)
3701 // Da' = Da + Sa.(1 - Da)
3702 if (useDa) {
3703 srcFetch(s, Pixel::kUC, n);
3704 dstFetch(d, Pixel::kUC, n);
3705
3706 VecArray& sv = s.uc;
3707 VecArray& dv = d.uc;
3708
3709 pc->vExpandAlpha16(xv, dv, kUseHi);
3710 pc->vinv255u16(xv, xv);
3711 pc->vmulu16(xv, xv, sv);
3712 pc->vZeroAlphaW(sv, sv);
3713 pc->vdiv255u16(xv);
3714
3715 pc->vsubsu16(dv, dv, sv);
3716 pc->vaddi16(dv, dv, xv);
3717 out.uc.init(dv);
3718 }
3719 // Dca' = Clamp(Dca - Sca)
3720 // Da' = <unchanged>
3721 else {
3722 srcFetch(s, Pixel::kPC, n);
3723 dstFetch(d, Pixel::kPC, n);
3724
3725 VecArray& sh = s.pc;
3726 VecArray& dh = d.pc;
3727
3728 pc->vZeroAlphaB(sh, sh);
3729 pc->vsubsu8(dh, dh, sh);
3730
3731 out.pc.init(dh);
3732 }
3733 }
3734 else {
3735 // Dca' = (Clamp(Dca - Sca) + Sca.(1 - Da)).m + Dca.(1 - m)
3736 // Da' = Da + Sa.m.(1 - Da)
3737 if (useDa) {
3738 srcFetch(s, Pixel::kUC, n);
3739 dstFetch(d, Pixel::kUC, n);
3740
3741 VecArray& sv = s.uc;
3742 VecArray& dv = d.uc;
3743
3744 pc->vExpandAlpha16(xv, dv, kUseHi);
3745 pc->vmov(yv, dv);
3746 pc->vinv255u16(xv, xv);
3747 pc->vsubsu16(dv, dv, sv);
3748 pc->vmulu16(sv, sv, xv);
3749
3750 pc->vZeroAlphaW(dv, dv);
3751 pc->vdiv255u16(sv);
3752 pc->vaddi16(dv, dv, sv);
3753 pc->vmulu16(dv, dv, vm);
3754
3755 pc->vZeroAlphaW(vm, vm);
3756 pc->vinv255u16(vm, vm);
3757
3758 pc->vmulu16(yv, yv, vm);
3759
3760 if (mImmutable) {
3761 pc->vinv255u16(vm[0], vm[0]);
3762 pc->vswizi32(vm[0], vm[0], x86::Predicate::shuf(2, 2, 0, 0));
3763 }
3764
3765 pc->vaddi16(dv, dv, yv);
3766 pc->vdiv255u16(dv);
3767 out.uc.init(dv);
3768 }
3769 // Dca' = Clamp(Dca - Sca).m + Dca.(1 - m)
3770 // Da' = <unchanged>
3771 else {
3772 srcFetch(s, Pixel::kUC, n);
3773 dstFetch(d, Pixel::kUC, n);
3774
3775 VecArray& sv = s.uc;
3776 VecArray& dv = d.uc;
3777
3778 pc->vinv255u16(xv, vm);
3779 pc->vZeroAlphaW(sv, sv);
3780
3781 pc->vmulu16(xv, xv, dv);
3782 pc->vsubsu16(dv, dv, sv);
3783 pc->vmulu16(dv, dv, vm);
3784
3785 pc->vaddi16(dv, dv, xv);
3786 pc->vdiv255u16(dv);
3787 out.uc.init(dv);
3788 }
3789 }
3790
3791 pc->xSatisfyPixel(out, flags);
3792 return;
3793 }
3794
3795 // --------------------------------------------------------------------------
3796 // [VMaskProc - RGBA32 - Multiply]
3797 // --------------------------------------------------------------------------
3798
3799 if (compOp() == BL_COMP_OP_MULTIPLY) {
3800 if (!hasMask) {
3801 // Dca' = Dca.(Sca + 1 - Sa) + Sca.(1 - Da)
3802 // Da' = Da .(Sa + 1 - Sa) + Sa .(1 - Da)
3803 if (useDa && useSa) {
3804 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3805 dstFetch(d, Pixel::kUC, n);
3806
3807 VecArray& sv = s.uc;
3808 VecArray& dv = d.uc;
3809
3810 // SPLIT.
3811 for (unsigned int i = 0; i < kSplit; i++) {
3812 VecArray sh = sv.even_odd(i);
3813 VecArray dh = dv.even_odd(i);
3814 VecArray xh = xv.even_odd(i);
3815 VecArray yh = yv.even_odd(i);
3816
3817 pc->vExpandAlpha16(yh, sh, kUseHi);
3818 pc->vExpandAlpha16(xh, dh, kUseHi);
3819 pc->vinv255u16(yh, yh);
3820 pc->vaddi16(yh, yh, sh);
3821 pc->vinv255u16(xh, xh);
3822 pc->vmulu16(dh, dh, yh);
3823 pc->vmulu16(xh, xh, sh);
3824 pc->vaddi16(dh, dh, xh);
3825 }
3826
3827 pc->vdiv255u16(dv);
3828 out.uc.init(dv);
3829 }
3830 // Dca' = Sc.(Dca + 1 - Da)
3831 // Da' = 1 .(Da + 1 - Da) = 1
3832 else if (useDa) {
3833 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3834 dstFetch(d, Pixel::kUC, n);
3835
3836 VecArray& sv = s.uc;
3837 VecArray& dv = d.uc;
3838
3839 pc->vExpandAlpha16(xv, dv, kUseHi);
3840 pc->vinv255u16(xv, xv);
3841 pc->vaddi16(dv, dv, xv);
3842 pc->vmulu16(dv, dv, sv);
3843
3844 pc->vdiv255u16(dv);
3845 out.uc.init(dv);
3846 }
3847 // Dc' = Dc.(Sca + 1 - Sa)
3848 // Da' = Da.(Sa + 1 - Sa)
3849 else if (hasSa()) {
3850 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3851 dstFetch(d, Pixel::kUC, n);
3852
3853 VecArray& sv = s.uc;
3854 VecArray& dv = d.uc;
3855
3856 pc->vExpandAlpha16(xv, sv, kUseHi);
3857 pc->vinv255u16(xv, xv);
3858 pc->vaddi16(xv, xv, sv);
3859 pc->vmulu16(dv, dv, xv);
3860
3861 pc->vdiv255u16(dv);
3862 out.uc.init(dv);
3863 }
3864 // Dc' = Dc.Sc
3865 // Da' = Da.Sa
3866 else {
3867 srcFetch(s, Pixel::kUC | Pixel::kImmutable, n);
3868 dstFetch(d, Pixel::kUC, n);
3869
3870 VecArray& sv = s.uc;
3871 VecArray& dv = d.uc;
3872
3873 pc->vmulu16(dv, dv, sv);
3874 pc->vdiv255u16(dv);
3875 out.uc.init(dv);
3876 }
3877 }
3878 else {
3879 // Dca' = Dca.(Sca.m + 1 - Sa.m) + Sca.m.(1 - Da)
3880 // Da' = Da .(Sa .m + 1 - Sa.m) + Sa .m.(1 - Da)
3881 if (useDa) {
3882 srcFetch(s, Pixel::kUC, n);
3883 dstFetch(d, Pixel::kUC, n);
3884
3885 VecArray& sv = s.uc;
3886 VecArray& dv = d.uc;
3887
3888 pc->vmulu16(sv, sv, vm);
3889 pc->vdiv255u16(sv);
3890
3891 // SPLIT.
3892 for (unsigned int i = 0; i < kSplit; i++) {
3893 VecArray sh = sv.even_odd(i);
3894 VecArray dh = dv.even_odd(i);
3895 VecArray xh = xv.even_odd(i);
3896 VecArray yh = yv.even_odd(i);
3897
3898 pc->vExpandAlpha16(yh, sh, kUseHi);
3899 pc->vExpandAlpha16(xh, dh, kUseHi);
3900 pc->vinv255u16(yh, yh);
3901 pc->vaddi16(yh, yh, sh);
3902 pc->vinv255u16(xh, xh);
3903 pc->vmulu16(dh, dh, yh);
3904 pc->vmulu16(xh, xh, sh);
3905 pc->vaddi16(dh, dh, xh);
3906 }
3907
3908 pc->vdiv255u16(dv);
3909 out.uc.init(dv);
3910 }
3911 else {
3912 srcFetch(s, Pixel::kUC, n);
3913 dstFetch(d, Pixel::kUC, n);
3914
3915 VecArray& sv = s.uc;
3916 VecArray& dv = d.uc;
3917
3918 pc->vmulu16(sv, sv, vm);
3919 pc->vdiv255u16(sv);
3920
3921 pc->vExpandAlpha16(xv, sv, kUseHi);
3922 pc->vinv255u16(xv, xv);
3923 pc->vaddi16(xv, xv, sv);
3924 pc->vmulu16(dv, dv, xv);
3925
3926 pc->vdiv255u16(dv);
3927 out.uc.init(dv);
3928 }
3929 }
3930
3931 pc->xSatisfyPixel(out, flags);
3932 return;
3933 }
3934
3935 // --------------------------------------------------------------------------
3936 // [VMaskProc - RGBA32 - Overlay]
3937 // --------------------------------------------------------------------------
3938
3939 if (compOp() == BL_COMP_OP_OVERLAY) {
3940 srcFetch(s, Pixel::kUC, n);
3941 dstFetch(d, Pixel::kUC, n);
3942
3943 VecArray& sv = s.uc;
3944 VecArray& dv = d.uc;
3945
3946 if (hasMask) {
3947 pc->vmulu16(sv, sv, vm);
3948 pc->vdiv255u16(sv);
3949 useSa = true;
3950 }
3951
3952 if (useSa) {
3953 // if (2.Dca < Da)
3954 // Dca' = Dca + Sca - (Dca.Sa + Sca.Da - 2.Sca.Dca)
3955 // Da' = Da + Sa - (Da .Sa + Sa .Da - 2.Sa .Da ) - Sa.Da
3956 // Da' = Da + Sa - Sa.Da
3957 // else
3958 // Dca' = Dca + Sca + (Dca.Sa + Sca.Da - 2.Sca.Dca) - Sa.Da
3959 // Da' = Da + Sa + (Da .Sa + Sa .Da - 2.Sa .Da ) - Sa.Da
3960 // Da' = Da + Sa - Sa.Da
3961
3962 for (unsigned int i = 0; i < kSplit; i++) {
3963 VecArray sh = sv.even_odd(i);
3964 VecArray dh = dv.even_odd(i);
3965
3966 VecArray xh = xv.even_odd(i);
3967 VecArray yh = yv.even_odd(i);
3968 VecArray zh = zv.even_odd(i);
3969
3970 if (!useDa)
3971 pc->vFillAlpha255W(dh, dh);
3972
3973 pc->vExpandAlpha16(xh, dh, kUseHi);
3974 pc->vExpandAlpha16(yh, sh, kUseHi);
3975
3976 pc->vmulu16(xh, xh, sh); // Sca.Da
3977 pc->vmulu16(yh, yh, dh); // Dca.Sa
3978 pc->vmulu16(zh, dh, sh); // Dca.Sca
3979
3980 pc->vaddi16(sh, sh, dh); // Dca + Sca
3981 pc->vsubi16(xh, xh, zh); // Sca.Da - Dca.Sca
3982 pc->vZeroAlphaW(zh, zh);
3983 pc->vaddi16(xh, xh, yh); // Dca.Sa + Sca.Da - Dca.Sca
3984 pc->vExpandAlpha16(yh, dh, kUseHi); // Da
3985 pc->vsubi16(xh, xh, zh); // [C=Dca.Sa + Sca.Da - 2.Dca.Sca] [A=Sa.Da]
3986
3987 pc->vslli16(dh, dh, 1); // 2.Dca
3988 pc->vcmpgti16(yh, yh, dh); // 2.Dca < Da
3989 pc->vdiv255u16(xh);
3990 pc->vor(yh, yh, C_MEM(i128_FFFF000000000000));
3991
3992 pc->vExpandAlpha16(zh, xh, kUseHi);
3993 // if (2.Dca < Da)
3994 // X = [C = -(Dca.Sa + Sca.Da - 2.Sca.Dca)] [A = -Sa.Da]
3995 // else
3996 // X = [C = (Dca.Sa + Sca.Da - 2.Sca.Dca)] [A = -Sa.Da]
3997 pc->vxor(xh, xh, yh);
3998 pc->vsubi16(xh, xh, yh);
3999
4000 // if (2.Dca < Da)
4001 // Y = [C = 0] [A = 0]
4002 // else
4003 // Y = [C = Sa.Da] [A = 0]
4004 pc->vandnot_a(yh, yh, zh);
4005
4006 pc->vaddi16(sh, sh, xh);
4007 pc->vsubi16(sh, sh, yh);
4008 }
4009
4010 out.uc.init(sv);
4011 }
4012 else if (useDa) {
4013 // if (2.Dca < Da)
4014 // Dca' = Sc.(1 + 2.Dca - Da)
4015 // Da' = 1
4016 // else
4017 // Dca' = 2.Dca - Da + Sc.(1 - (2.Dca - Da))
4018 // Da' = 1
4019
4020 pc->vExpandAlpha16(xv, dv, kUseHi); // Da
4021 pc->vslli16(dv, dv, 1); // 2.Dca
4022
4023 pc->vcmpgti16(yv, xv, dv); // (2.Dca < Da) ? -1 : 0
4024 pc->vsubi16(xv, xv, dv); // -(2.Dca - Da)
4025
4026 pc->vxor(xv, xv, yv);
4027 pc->vsubi16(xv, xv, yv); // 2.Dca < Da ? 2.Dca - Da : -(2.Dca - Da)
4028 pc->vandnot_a(yv, yv, xv); // 2.Dca < Da ? 0 : -(2.Dca - Da)
4029 pc->vaddi16(xv, xv, C_MEM(i128_00FF00FF00FF00FF));
4030
4031 pc->vmulu16(xv, xv, sv);
4032 pc->vdiv255u16(xv);
4033 pc->vsubi16(xv, xv, yv);
4034
4035 out.uc.init(xv);
4036 }
4037 else {
4038 // if (2.Dc < 1)
4039 // Dc' = 2.Dc.Sc
4040 // else
4041 // Dc' = 2.Dc + 2.Sc - 1 - 2.Dc.Sc
4042
4043 pc->vmulu16(xv, dv, sv); // Dc.Sc
4044 pc->vcmpgti16(yv, dv, C_MEM(i128_007F007F007F007F)); // !(2.Dc < 1)
4045 pc->vaddi16(dv, dv, sv); // Dc + Sc
4046 pc->vdiv255u16(xv);
4047
4048 pc->vslli16(dv, dv, 1); // 2.Dc + 2.Sc
4049 pc->vslli16(xv, xv, 1); // 2.Dc.Sc
4050 pc->vsubi16(dv, dv, C_MEM(i128_00FF00FF00FF00FF)); // 2.Dc + 2.Sc - 1
4051
4052 pc->vxor(xv, xv, yv);
4053 pc->vand(dv, dv, yv); // 2.Dc < 1 ? 0 : 2.Dc + 2.Sc - 1
4054 pc->vsubi16(xv, xv, yv); // 2.Dc < 1 ? 2.Dc.Sc : -2.Dc.Sc
4055 pc->vaddi16(dv, dv, xv); // 2.Dc < 1 ? 2.Dc.Sc : 2.Dc + 2.Sc - 1 - 2.Dc.Sc
4056
4057 out.uc.init(dv);
4058 }
4059
4060 pc->xSatisfyPixel(out, flags);
4061 return;
4062 }
4063
4064 // --------------------------------------------------------------------------
4065 // [VMaskProc - RGBA32 - Screen]
4066 // --------------------------------------------------------------------------
4067
4068 if (compOp() == BL_COMP_OP_SCREEN) {
4069 // Dca' = Sca + Dca.(1 - Sca)
4070 // Da' = Sa + Da .(1 - Sa)
4071 srcFetch(s, Pixel::kUC | (hasMask ? uint32_t(0) : Pixel::kImmutable), n);
4072 dstFetch(d, Pixel::kUC, n);
4073
4074 VecArray& sv = s.uc;
4075 VecArray& dv = d.uc;
4076
4077 if (hasMask) {
4078 pc->vmulu16(sv, sv, vm);
4079 pc->vdiv255u16(sv);
4080 }
4081
4082 pc->vinv255u16(xv, sv);
4083 pc->vmulu16(dv, dv, xv);
4084 pc->vdiv255u16(dv);
4085 pc->vaddi16(dv, dv, sv);
4086
4087 out.uc.init(dv);
4088 pc->xSatisfyPixel(out, flags);
4089 return;
4090 }
4091
4092 // --------------------------------------------------------------------------
4093 // [VMaskProc - RGBA32 - Darken / Lighten]
4094 // --------------------------------------------------------------------------
4095
4096 if (compOp() == BL_COMP_OP_DARKEN || compOp() == BL_COMP_OP_LIGHTEN) {
4097 srcFetch(s, Pixel::kUC, n);
4098 dstFetch(d, Pixel::kUC, n);
4099
4100 VecArray& sv = s.uc;
4101 VecArray& dv = d.uc;
4102
4103 bool minMaxPredicate = compOp() == BL_COMP_OP_DARKEN;
4104
4105 if (hasMask) {
4106 pc->vmulu16(sv, sv, vm);
4107 pc->vdiv255u16(sv);
4108 useSa = true;
4109 }
4110
4111 if (useSa && useDa) {
4112 // Dca' = minmax(Dca + Sca.(1 - Da), Sca + Dca.(1 - Sa))
4113 // Da' = Sa + Da.(1 - Sa)
4114 for (unsigned int i = 0; i < kSplit; i++) {
4115 VecArray sh = sv.even_odd(i);
4116 VecArray dh = dv.even_odd(i);
4117 VecArray xh = xv.even_odd(i);
4118 VecArray yh = yv.even_odd(i);
4119
4120 pc->vExpandAlpha16(xh, dh, kUseHi);
4121 pc->vExpandAlpha16(yh, sh, kUseHi);
4122
4123 pc->vinv255u16(xh, xh);
4124 pc->vinv255u16(yh, yh);
4125
4126 pc->vmulu16(xh, xh, sh);
4127 pc->vmulu16(yh, yh, dh);
4128 pc->vdiv255u16_2x(xh, yh);
4129
4130 pc->vaddi16(dh, dh, xh);
4131 pc->vaddi16(sh, sh, yh);
4132
4133 pc->vminmaxu8(dh, dh, sh, minMaxPredicate);
4134 }
4135
4136 out.uc.init(dv);
4137 }
4138 else if (useDa) {
4139 // Dca' = minmax(Dca + Sc.(1 - Da), Sc)
4140 // Da' = 1
4141 pc->vExpandAlpha16(xv, dv, kUseHi);
4142 pc->vinv255u16(xv, xv);
4143 pc->vmulu16(xv, xv, sv);
4144 pc->vdiv255u16(xv);
4145 pc->vaddi16(dv, dv, xv);
4146 pc->vminmaxu8(dv, dv, sv, minMaxPredicate);
4147
4148 out.uc.init(dv);
4149 }
4150 else if (useSa) {
4151 // Dc' = minmax(Dc, Sca + Dc.(1 - Sa))
4152 pc->vExpandAlpha16(xv, sv, kUseHi);
4153 pc->vinv255u16(xv, xv);
4154 pc->vmulu16(xv, xv, dv);
4155 pc->vdiv255u16(xv);
4156 pc->vaddi16(xv, xv, sv);
4157 pc->vminmaxu8(dv, dv, xv, minMaxPredicate);
4158
4159 out.uc.init(dv);
4160 }
4161 else {
4162 // Dc' = minmax(Dc, Sc)
4163 pc->vminmaxu8(dv, dv, sv, minMaxPredicate);
4164
4165 out.uc.init(dv);
4166 }
4167
4168 pc->xSatisfyPixel(out, flags);
4169 return;
4170 }
4171
4172 // --------------------------------------------------------------------------
4173 // [VMaskProc - RGBA32 - ColorDodge (SCALAR)]
4174 // --------------------------------------------------------------------------
4175
4176 if (compOp() == BL_COMP_OP_COLOR_DODGE && n == 1) {
4177 // Dca' = min(Dca.Sa.Sa / max(Sa - Sca, 0.001), Sa.Da) + Sca.(1 - Da) + Dca.(1 - Sa);
4178 // Da' = min(Da .Sa.Sa / max(Sa - Sa , 0.001), Sa.Da) + Sa .(1 - Da) + Da .(1 - Sa);
4179
4180 srcFetch(s, Pixel::kUC, n);
4181 dstFetch(d, Pixel::kPC, n);
4182
4183 x86::Vec& s0 = s.uc[0];
4184 x86::Vec& d0 = d.pc[0];
4185 x86::Vec& x0 = xv[0];
4186 x86::Vec& y0 = yv[0];
4187 x86::Vec& z0 = zv[0];
4188
4189 if (hasMask) {
4190 pc->vmulu16(s0, s0, vm[0]);
4191 pc->vdiv255u16(s0);
4192 }
4193
4194 pc->vmovu8u32(d0, d0);
4195 pc->vmovu16u32(s0, s0);
4196
4197 pc->vcvti32ps(y0, s0);
4198 pc->vcvti32ps(z0, d0);
4199 pc->vpacki32i16(d0, d0, s0);
4200
4201 pc->vExpandAlphaPS(x0, y0);
4202 pc->vxorps(y0, y0, C_MEM(f128_sgn));
4203 pc->vmulps(z0, z0, x0);
4204 pc->vandps(y0, y0, C_MEM(i128_FFFFFFFF_FFFFFFFF_FFFFFFFF_0));
4205 pc->vaddps(y0, y0, x0);
4206
4207 pc->vmaxps(y0, y0, C_MEM(f128_1e_m3));
4208 pc->vdivps(z0, z0, y0);
4209
4210 pc->vswizi32(s0, d0, x86::Predicate::shuf(1, 1, 3, 3));
4211 pc->vExpandAlphaHi16(s0, s0);
4212 pc->vExpandAlphaLo16(s0, s0);
4213 pc->vinv255u16(s0, s0);
4214 pc->vmulu16(d0, d0, s0);
4215 pc->vswizi32(s0, d0, x86::Predicate::shuf(1, 0, 3, 2));
4216 pc->vaddi16(d0, d0, s0);
4217
4218 pc->vmulps(z0, z0, x0);
4219 pc->vExpandAlphaPS(x0, z0);
4220 pc->vminps(z0, z0, x0);
4221
4222 pc->vcvttpsi32(z0, z0);
4223 pc->xPackU32ToU16Lo(z0, z0);
4224 pc->vaddi16(d0, d0, z0);
4225
4226 pc->vdiv255u16(d0);
4227 out.uc.init(d0);
4228
4229 pc->xSatisfyPixel(out, flags);
4230 return;
4231 }
4232
4233 // --------------------------------------------------------------------------
4234 // [VMaskProc - RGBA32 - ColorBurn (SCALAR)]
4235 // --------------------------------------------------------------------------
4236
4237 if (compOp() == BL_COMP_OP_COLOR_BURN && n == 1) {
4238 // Dca' = Sa.Da - min(Sa.Da, (Da - Dca).Sa.Sa / max(Sca, 0.001)) + Sca.(1 - Da) + Dca.(1 - Sa)
4239 // Da' = Sa.Da - min(Sa.Da, (Da - Da ).Sa.Sa / max(Sa , 0.001)) + Sa .(1 - Da) + Da .(1 - Sa)
4240 srcFetch(s, Pixel::kUC, n);
4241 dstFetch(d, Pixel::kPC, n);
4242
4243 x86::Vec& s0 = s.uc[0];
4244 x86::Vec& d0 = d.pc[0];
4245 x86::Vec& x0 = xv[0];
4246 x86::Vec& y0 = yv[0];
4247 x86::Vec& z0 = zv[0];
4248
4249 if (hasMask) {
4250 pc->vmulu16(s0, s0, vm[0]);
4251 pc->vdiv255u16(s0);
4252 }
4253
4254 pc->vmovu8u32(d0, d0);
4255 pc->vmovu16u32(s0, s0);
4256
4257 pc->vcvti32ps(y0, s0);
4258 pc->vcvti32ps(z0, d0);
4259 pc->vpacki32i16(d0, d0, s0);
4260
4261 pc->vExpandAlphaPS(x0, y0);
4262 pc->vmaxps(y0, y0, C_MEM(f128_1e_m3));
4263 pc->vmulps(z0, z0, x0); // Dca.Sa
4264
4265 pc->vExpandAlphaPS(x0, z0); // Sa.Da
4266 pc->vxorps(z0, z0, C_MEM(f128_sgn));
4267
4268 pc->vandps(z0, z0, C_MEM(i128_FFFFFFFF_FFFFFFFF_FFFFFFFF_0));
4269 pc->vaddps(z0, z0, x0); // (Da - Dxa).Sa
4270 pc->vdivps(z0, z0, y0);
4271
4272 pc->vswizi32(s0, d0, x86::Predicate::shuf(1, 1, 3, 3));
4273 pc->vExpandAlphaHi16(s0, s0);
4274 pc->vExpandAlphaLo16(s0, s0);
4275 pc->vinv255u16(s0, s0);
4276 pc->vmulu16(d0, d0, s0);
4277 pc->vswizi32(s0, d0, x86::Predicate::shuf(1, 0, 3, 2));
4278 pc->vaddi16(d0, d0, s0);
4279
4280 pc->vExpandAlphaPS(x0, y0); // Sa
4281 pc->vmulps(z0, z0, x0);
4282 pc->vExpandAlphaPS(x0, z0); // Sa.Da
4283 pc->vminps(z0, z0, x0);
4284 pc->vandps(z0, z0, C_MEM(i128_FFFFFFFF_FFFFFFFF_FFFFFFFF_0));
4285 pc->vsubps(x0, x0, z0);
4286
4287 pc->vcvttpsi32(x0, x0);
4288 pc->xPackU32ToU16Lo(x0, x0);
4289 pc->vaddi16(d0, d0, x0);
4290
4291 pc->vdiv255u16(d0);
4292 out.uc.init(d0);
4293
4294 pc->xSatisfyPixel(out, flags);
4295 return;
4296 }
4297
4298 // --------------------------------------------------------------------------
4299 // [VMaskProc - RGBA32 - LinearBurn]
4300 // --------------------------------------------------------------------------
4301
4302 if (compOp() == BL_COMP_OP_LINEAR_BURN) {
4303 srcFetch(s, Pixel::kUC | (hasMask ? uint32_t(0) : Pixel::kImmutable), n);
4304 dstFetch(d, Pixel::kUC, n);
4305
4306 VecArray& sv = s.uc;
4307 VecArray& dv = d.uc;
4308
4309 if (hasMask) {
4310 pc->vmulu16(sv, sv, vm);
4311 pc->vdiv255u16(sv);
4312 }
4313
4314 if (useDa && useSa) {
4315 // Dca' = Dca + Sca - Sa.Da
4316 // Da' = Da + Sa - Sa.Da
4317 pc->vExpandAlpha16(xv, sv, kUseHi);
4318 pc->vExpandAlpha16(yv, dv, kUseHi);
4319 pc->vmulu16(xv, xv, yv);
4320 pc->vdiv255u16(xv);
4321 pc->vaddi16(dv, dv, sv);
4322 pc->vsubsu16(dv, dv, xv);
4323 }
4324 else if (useDa || useSa) {
4325 pc->vExpandAlpha16(xv, useDa ? dv : sv, kUseHi);
4326 pc->vaddi16(dv, dv, sv);
4327 pc->vsubsu16(dv, dv, xv);
4328 }
4329 else {
4330 // Dca' = Dc + Sc - 1
4331 pc->vaddi16(dv, dv, sv);
4332 pc->vsubsu16(dv, dv, C_MEM(i128_000000FF00FF00FF));
4333 }
4334
4335 out.uc.init(dv);
4336 pc->xSatisfyPixel(out, flags);
4337 return;
4338 }
4339
4340 // --------------------------------------------------------------------------
4341 // [VMaskProc - RGBA32 - LinearLight]
4342 // --------------------------------------------------------------------------
4343
4344 if (compOp() == BL_COMP_OP_LINEAR_LIGHT && n == 1) {
4345 srcFetch(s, Pixel::kUC, 1);
4346 dstFetch(d, Pixel::kUC, 1);
4347
4348 VecArray& sv = s.uc;
4349 VecArray& dv = d.uc;
4350
4351 if (hasMask) {
4352 pc->vmulu16(sv, sv, vm);
4353 pc->vdiv255u16(sv);
4354 useSa = 1;
4355 }
4356
4357 if (useSa || useDa) {
4358 // Dca' = min(max((Dca.Sa + 2.Sca.Da - Sa.Da), 0), Sa.Da) + Sca.(1 - Da) + Dca.(1 - Sa)
4359 // Da' = min(max((Da .Sa + 2.Sa .Da - Sa.Da), 0), Sa.Da) + Sa .(1 - Da) + Da .(1 - Sa)
4360
4361 x86::Vec& d0 = dv[0];
4362 x86::Vec& s0 = sv[0];
4363 x86::Vec& x0 = xv[0];
4364 x86::Vec& y0 = yv[0];
4365
4366 pc->vExpandAlphaLo16(y0, d0);
4367 pc->vExpandAlphaLo16(x0, s0);
4368
4369 pc->vunpackli64(d0, d0, s0);
4370 pc->vunpackli64(x0, x0, y0);
4371
4372 pc->vmov(s0, d0);
4373 pc->vmulu16(d0, d0, x0);
4374 pc->vinv255u16(x0, x0);
4375 pc->vdiv255u16(d0);
4376
4377 pc->vmulu16(s0, s0, x0);
4378 pc->vswapi64(x0, s0);
4379 pc->vswapi64(y0, d0);
4380 pc->vaddi16(s0, s0, x0);
4381 pc->vaddi16(d0, d0, y0);
4382 pc->vExpandAlphaLo16(x0, y0);
4383 pc->vaddi16(d0, d0, y0);
4384 pc->vdiv255u16(s0);
4385
4386 pc->vsubsu16(d0, d0, x0);
4387 pc->vmini16(d0, d0, x0);
4388
4389 pc->vaddi16(d0, d0, s0);
4390 out.uc.init(d0);
4391 }
4392 else {
4393 // Dc' = min(max((Dc + 2.Sc - 1), 0), 1)
4394 pc->vslli16(sv, sv, 1);
4395 pc->vaddi16(dv, dv, sv);
4396 pc->vsubsu16(dv, dv, C_MEM(i128_000000FF00FF00FF));
4397 pc->vmini16(dv, dv, C_MEM(i128_00FF00FF00FF00FF));
4398
4399 out.uc.init(dv);
4400 }
4401
4402 pc->xSatisfyPixel(out, flags);
4403 return;
4404 }
4405
4406 // --------------------------------------------------------------------------
4407 // [VMaskProc - RGBA32 - PinLight]
4408 // --------------------------------------------------------------------------
4409
4410 if (compOp() == BL_COMP_OP_PIN_LIGHT) {
4411 srcFetch(s, Pixel::kUC, n);
4412 dstFetch(d, Pixel::kUC, n);
4413
4414 VecArray& sv = s.uc;
4415 VecArray& dv = d.uc;
4416
4417 if (hasMask) {
4418 pc->vmulu16(sv, sv, vm);
4419 pc->vdiv255u16(sv);
4420
4421 useSa = true;
4422 }
4423
4424 if (useSa && useDa) {
4425 // if 2.Sca <= Sa
4426 // Dca' = min(Dca + Sca - Sca.Da, Dca + Sca + Sca.Da - Dca.Sa)
4427 // Da' = min(Da + Sa - Sa .Da, Da + Sa + Sa .Da - Da .Sa) = Da + Sa.(1 - Da)
4428 // else
4429 // Dca' = max(Dca + Sca - Sca.Da, Dca + Sca + Sca.Da - Dca.Sa - Da.Sa)
4430 // Da' = max(Da + Sa - Sa .Da, Da + Sa + Sa .Da - Da .Sa - Da.Sa) = Da + Sa.(1 - Da)
4431
4432 pc->vExpandAlpha16(yv, sv, kUseHi); // Sa
4433 pc->vExpandAlpha16(xv, dv, kUseHi); // Da
4434
4435 pc->vmulu16(yv, yv, dv); // Dca.Sa
4436 pc->vmulu16(xv, xv, sv); // Sca.Da
4437 pc->vaddi16(dv, dv, sv); // Dca + Sca
4438 pc->vdiv255u16_2x(yv, xv);
4439
4440 pc->vsubi16(yv, yv, dv); // Dca.Sa - Dca - Sca
4441 pc->vsubi16(dv, dv, xv); // Dca + Sca - Sca.Da
4442 pc->vsubi16(xv, xv, yv); // Dca + Sca + Sca.Da - Dca.Sa
4443
4444 pc->vExpandAlpha16(yv, sv, kUseHi); // Sa
4445 pc->vslli16(sv, sv, 1); // 2.Sca
4446 pc->vcmpgti16(sv, sv, yv); // !(2.Sca <= Sa)
4447
4448 pc->vsubi16(zv, dv, xv);
4449 pc->vExpandAlpha16(zv, zv, kUseHi); // -Da.Sa
4450 pc->vand(zv, zv, sv); // 2.Sca <= Sa ? 0 : -Da.Sa
4451 pc->vaddi16(xv, xv, zv); // 2.Sca <= Sa ? Dca + Sca + Sca.Da - Dca.Sa : Dca + Sca + Sca.Da - Dca.Sa - Da.Sa
4452
4453 // if 2.Sca <= Sa:
4454 // min(dv, xv)
4455 // else
4456 // max(dv, xv) <- ~min(~dv, ~xv)
4457 pc->vxor(dv, dv, sv);
4458 pc->vxor(xv, xv, sv);
4459 pc->vmini16(dv, dv, xv);
4460 pc->vxor(dv, dv, sv);
4461
4462 out.uc.init(dv);
4463 }
4464 else if (useDa) {
4465 // if 2.Sc <= 1
4466 // Dca' = min(Dca + Sc - Sc.Da, Sc + Sc.Da)
4467 // Da' = min(Da + 1 - 1 .Da, 1 + 1 .Da) = 1
4468 // else
4469 // Dca' = max(Dca + Sc - Sc.Da, Sc + Sc.Da - Da)
4470 // Da' = max(Da + 1 - 1 .Da, 1 + 1 .Da - Da) = 1
4471
4472 pc->vExpandAlpha16(xv, dv, kUseHi); // Da
4473 pc->vmulu16(xv, xv, sv); // Sc.Da
4474 pc->vaddi16(dv, dv, sv); // Dca + Sc
4475 pc->vdiv255u16(xv);
4476
4477 pc->vcmpgti16(yv, sv, C_MEM(i128_007F007F007F007F)); // !(2.Sc <= 1)
4478 pc->vaddi16(sv, sv, xv); // Sc + Sc.Da
4479 pc->vsubi16(dv, dv, xv); // Dca + Sc - Sc.Da
4480 pc->vExpandAlpha16(xv, xv); // Da
4481 pc->vand(xv, xv, yv); // 2.Sc <= 1 ? 0 : Da
4482 pc->vsubi16(sv, sv, xv); // 2.Sc <= 1 ? Sc + Sc.Da : Sc + Sc.Da - Da
4483
4484 // if 2.Sc <= 1:
4485 // min(dv, sv)
4486 // else
4487 // max(dv, sv) <- ~min(~dv, ~sv)
4488 pc->vxor(dv, dv, yv);
4489 pc->vxor(sv, sv, yv);
4490 pc->vmini16(dv, dv, sv);
4491 pc->vxor(dv, dv, yv);
4492
4493 out.uc.init(dv);
4494 }
4495 else if (useSa) {
4496 // if 2.Sca <= Sa
4497 // Dc' = min(Dc, Dc + 2.Sca - Dc.Sa)
4498 // else
4499 // Dc' = max(Dc, Dc + 2.Sca - Dc.Sa - Sa)
4500
4501 pc->vExpandAlpha16(xv, sv, kUseHi); // Sa
4502 pc->vslli16(sv, sv, 1); // 2.Sca
4503 pc->vcmpgti16(yv, sv, xv); // !(2.Sca <= Sa)
4504 pc->vand(yv, yv, xv); // 2.Sca <= Sa ? 0 : Sa
4505 pc->vmulu16(xv, xv, dv); // Dc.Sa
4506 pc->vaddi16(sv, sv, dv); // Dc + 2.Sca
4507 pc->vdiv255u16(xv);
4508 pc->vsubi16(sv, sv, yv); // 2.Sca <= Sa ? Dc + 2.Sca : Dc + 2.Sca - Sa
4509 pc->vcmpeqi16(yv, yv, C_MEM(i128_0000000000000000)); // 2.Sc <= 1
4510 pc->vsubi16(sv, sv, xv); // 2.Sca <= Sa ? Dc + 2.Sca - Dc.Sa : Dc + 2.Sca - Dc.Sa - Sa
4511
4512 // if 2.Sca <= Sa:
4513 // min(dv, sv) <- ~max(~dv, ~sv)
4514 // else
4515 // max(dv, sv)
4516 pc->vxor(dv, dv, yv);
4517 pc->vxor(sv, sv, yv);
4518 pc->vmaxi16(dv, dv, sv);
4519 pc->vxor(dv, dv, yv);
4520
4521 out.uc.init(dv);
4522 }
4523 else {
4524 // if 2.Sc <= 1
4525 // Dc' = min(Dc, 2.Sc)
4526 // else
4527 // Dc' = max(Dc, 2.Sc - 1)
4528
4529 pc->vslli16(sv, sv, 1); // 2.Sc
4530 pc->vmini16(xv, sv, dv); // min(Dc, 2.Sc)
4531
4532 pc->vcmpgti16(yv, sv, C_MEM(i128_00FF00FF00FF00FF)); // !(2.Sc <= 1)
4533 pc->vsubi16(sv, sv, C_MEM(i128_00FF00FF00FF00FF)); // 2.Sc - 1
4534 pc->vmaxi16(dv, dv, sv); // max(Dc, 2.Sc - 1)
4535
4536 pc->vblendv8_destructive(xv, xv, dv, yv); // 2.Sc <= 1 ? min(Dc, 2.Sc) : max(Dc, 2.Sc - 1)
4537 out.uc.init(xv);
4538 }
4539
4540 pc->xSatisfyPixel(out, flags);
4541 return;
4542 }
4543
4544 // --------------------------------------------------------------------------
4545 // [VMaskProc - RGBA32 - HardLight]
4546 // --------------------------------------------------------------------------
4547
4548 if (compOp() == BL_COMP_OP_HARD_LIGHT) {
4549 // if (2.Sca < Sa)
4550 // Dca' = Dca + Sca - (Dca.Sa + Sca.Da - 2.Sca.Dca)
4551 // Da' = Da + Sa - Sa.Da
4552 // else
4553 // Dca' = Dca + Sca + (Dca.Sa + Sca.Da - 2.Sca.Dca) - Sa.Da
4554 // Da' = Da + Sa - Sa.Da
4555 srcFetch(s, Pixel::kUC, n);
4556 dstFetch(d, Pixel::kUC, n);
4557
4558 VecArray& sv = s.uc;
4559 VecArray& dv = d.uc;
4560
4561 if (hasMask) {
4562 pc->vmulu16(sv, sv, vm);
4563 pc->vdiv255u16(sv);
4564 }
4565
4566 // SPLIT.
4567 for (unsigned int i = 0; i < kSplit; i++) {
4568 VecArray sh = sv.even_odd(i);
4569 VecArray dh = dv.even_odd(i);
4570 VecArray xh = xv.even_odd(i);
4571 VecArray yh = yv.even_odd(i);
4572 VecArray zh = zv.even_odd(i);
4573
4574 pc->vExpandAlpha16(xh, dh, kUseHi);
4575 pc->vExpandAlpha16(yh, sh, kUseHi);
4576
4577 pc->vmulu16(xh, xh, sh); // Sca.Da
4578 pc->vmulu16(yh, yh, dh); // Dca.Sa
4579 pc->vmulu16(zh, dh, sh); // Dca.Sca
4580
4581 pc->vaddi16(dh, dh, sh);
4582 pc->vsubi16(xh, xh, zh);
4583 pc->vaddi16(xh, xh, yh);
4584 pc->vsubi16(xh, xh, zh);
4585
4586 pc->vExpandAlpha16(yh, yh, kUseHi);
4587 pc->vExpandAlpha16(zh, sh, kUseHi);
4588 pc->vdiv255u16_2x(xh, yh);
4589
4590 pc->vslli16(sh, sh, 1);
4591 pc->vcmpgti16(zh, zh, sh);
4592
4593 pc->vxor(xh, xh, zh);
4594 pc->vsubi16(xh, xh, zh);
4595 pc->vZeroAlphaW(zh, zh);
4596 pc->vandnot_a(zh, zh, yh);
4597 pc->vaddi16(dh, dh, xh);
4598 pc->vsubi16(dh, dh, zh);
4599 }
4600
4601 out.uc.init(dv);
4602 pc->xSatisfyPixel(out, flags);
4603 return;
4604 }
4605
4606 // --------------------------------------------------------------------------
4607 // [VMaskProc - RGBA32 - SoftLight (SCALAR)]
4608 // --------------------------------------------------------------------------
4609
4610 if (compOp() == BL_COMP_OP_SOFT_LIGHT && n == 1) {
4611 // Dc = Dca/Da
4612 //
4613 // Dca' =
4614 // if 2.Sca - Sa <= 0
4615 // Dca + Sca.(1 - Da) + (2.Sca - Sa).Da.[[ Dc.(1 - Dc) ]]
4616 // else if 2.Sca - Sa > 0 and 4.Dc <= 1
4617 // Dca + Sca.(1 - Da) + (2.Sca - Sa).Da.[[ 4.Dc.(4.Dc.Dc + Dc - 4.Dc + 1) - Dc]]
4618 // else
4619 // Dca + Sca.(1 - Da) + (2.Sca - Sa).Da.[[ sqrt(Dc) - Dc ]]
4620 // Da' = Da + Sa - Sa.Da
4621 srcFetch(s, Pixel::kUC, n);
4622 dstFetch(d, Pixel::kPC, n);
4623
4624 x86::Vec& s0 = s.uc[0];
4625 x86::Vec& d0 = d.pc[0];
4626
4627 x86::Vec a0 = cc->newXmm("a0");
4628 x86::Vec b0 = cc->newXmm("b0");
4629 x86::Vec& x0 = xv[0];
4630 x86::Vec& y0 = yv[0];
4631 x86::Vec& z0 = zv[0];
4632
4633 if (hasMask) {
4634 pc->vmulu16(s0, s0, vm[0]);
4635 pc->vdiv255u16(s0);
4636 }
4637
4638 pc->vmovu8u32(d0, d0);
4639 pc->vmovu16u32(s0, s0);
4640 pc->vloadps_128a(x0, C_MEM(f128_1div255));
4641
4642 pc->vcvti32ps(s0, s0);
4643 pc->vcvti32ps(d0, d0);
4644
4645 pc->vmulps(s0, s0, x0); // Sca (0..1)
4646 pc->vmulps(d0, d0, x0); // Dca (0..1)
4647
4648 pc->vExpandAlphaPS(b0, d0); // Da
4649 pc->vmulps(x0, s0, b0); // Sca.Da
4650 pc->vmaxps(b0, b0, C_MEM(f128_1e_m3)); // max(Da, 0.001)
4651
4652 pc->vdivps(a0, d0, b0); // Dc <- Dca/Da
4653 pc->vaddps(d0, d0, s0); // Dca + Sca
4654
4655 pc->vExpandAlphaPS(y0, s0); // Sa
4656 pc->vloadps_128a(z0, C_MEM(f128_4)); // 4
4657
4658 pc->vsubps(d0, d0, x0); // Dca + Sca.(1 - Da)
4659 pc->vaddps(s0, s0, s0); // 2.Sca
4660 pc->vmulps(z0, z0, a0); // 4.Dc
4661
4662 pc->vsqrtps(x0, a0); // sqrt(Dc)
4663 pc->vsubps(s0, s0, y0); // 2.Sca - Sa
4664
4665 pc->vmovaps(y0, z0); // 4.Dc
4666 pc->vmulps(z0, z0, a0); // 4.Dc.Dc
4667
4668 pc->vaddps(z0, z0, a0); // 4.Dc.Dc + Dc
4669 pc->vmulps(s0, s0, b0); // (2.Sca - Sa).Da
4670
4671 pc->vsubps(z0, z0, y0); // 4.Dc.Dc + Dc - 4.Dc
4672 pc->vloadps_128a(b0, C_MEM(f128_1)); // 1
4673
4674 pc->vaddps(z0, z0, b0); // 4.Dc.Dc + Dc - 4.Dc + 1
4675 pc->vmulps(z0, z0, y0); // 4.Dc(4.Dc.Dc + Dc - 4.Dc + 1)
4676 pc->vcmpps(y0, y0, b0, x86::Predicate::kCmpLE); // 4.Dc <= 1
4677
4678 pc->vandps(z0, z0, y0);
4679 pc->vandnot_aps(y0, y0, x0);
4680
4681 pc->vzerops(x0);
4682 pc->vorps(z0, z0, y0); // (4.Dc(4.Dc.Dc + Dc - 4.Dc + 1)) or sqrt(Dc)
4683
4684 pc->vcmpps(x0, x0, s0, x86::Predicate::kCmpLT); // 2.Sca - Sa > 0
4685 pc->vsubps(z0, z0, a0); // [[4.Dc(4.Dc.Dc + Dc - 4.Dc + 1) or sqrt(Dc)]] - Dc
4686
4687 pc->vsubps(b0, b0, a0); // 1 - Dc
4688 pc->vandps(z0, z0, x0);
4689
4690 pc->vmulps(b0, b0, a0); // Dc.(1 - Dc)
4691 pc->vandnot_aps(x0, x0, b0);
4692 pc->vandps(s0, s0, C_MEM(i128_FFFFFFFF_FFFFFFFF_FFFFFFFF_0)); // Zero alpha.
4693
4694 pc->vorps(z0, z0, x0);
4695 pc->vmulps(s0, s0, z0);
4696
4697 pc->vaddps(d0, d0, s0);
4698 pc->vmulps(d0, d0, C_MEM(f128_255));
4699
4700 pc->vcvtpsi32(d0, d0);
4701 pc->vpacki32i16(d0, d0, d0);
4702 pc->vpacki16u8(d0, d0, d0);
4703 out.pc.init(d0);
4704
4705 pc->xSatisfyPixel(out, flags);
4706 return;
4707 }
4708
4709 // --------------------------------------------------------------------------
4710 // [VMaskProc - RGBA32 - Difference]
4711 // --------------------------------------------------------------------------
4712
4713 if (compOp() == BL_COMP_OP_DIFFERENCE) {
4714 // Dca' = Dca + Sca - 2.min(Sca.Da, Dca.Sa)
4715 // Da' = Da + Sa - min(Sa .Da, Da .Sa)
4716 if (!hasMask) {
4717 srcFetch(s, Pixel::kUC | Pixel::kUA, n);
4718 dstFetch(d, Pixel::kUC, n);
4719
4720 VecArray& sv = s.uc;
4721 VecArray& uv = s.ua;
4722 VecArray& dv = d.uc;
4723
4724 // SPLIT.
4725 for (unsigned int i = 0; i < kSplit; i++) {
4726 VecArray sh = sv.even_odd(i);
4727 VecArray uh = uv.even_odd(i);
4728 VecArray dh = dv.even_odd(i);
4729 VecArray xh = xv.even_odd(i);
4730
4731 pc->vExpandAlpha16(xh, dh, kUseHi);
4732 pc->vmulu16(uh, uh, dh);
4733 pc->vmulu16(xh, xh, sh);
4734 pc->vaddi16(dh, dh, sh);
4735 pc->vminu16(uh, uh, xh);
4736 }
4737
4738 pc->vdiv255u16(uv);
4739 pc->vsubi16(dv, dv, uv);
4740
4741 pc->vZeroAlphaW(uv, uv);
4742 pc->vsubi16(dv, dv, uv);
4743 out.uc.init(dv);
4744 }
4745 // Dca' = Dca + Sca.m - 2.min(Sca.Da, Dca.Sa).m
4746 // Da' = Da + Sa .m - min(Sa .Da, Da .Sa).m
4747 else {
4748 srcFetch(s, Pixel::kUC, n);
4749 dstFetch(d, Pixel::kUC, n);
4750
4751 VecArray& sv = s.uc;
4752 VecArray& dv = d.uc;
4753
4754 pc->vmulu16(sv, sv, vm);
4755 pc->vdiv255u16(sv);
4756
4757 // SPLIT.
4758 for (unsigned int i = 0; i < kSplit; i++) {
4759 VecArray sh = sv.even_odd(i);
4760 VecArray dh = dv.even_odd(i);
4761 VecArray xh = xv.even_odd(i);
4762 VecArray yh = yv.even_odd(i);
4763
4764 pc->vExpandAlpha16(yh, sh, kUseHi);
4765 pc->vExpandAlpha16(xh, dh, kUseHi);
4766 pc->vmulu16(yh, yh, dh);
4767 pc->vmulu16(xh, xh, sh);
4768 pc->vaddi16(dh, dh, sh);
4769 pc->vminu16(yh, yh, xh);
4770 }
4771
4772 pc->vdiv255u16(yv);
4773 pc->vsubi16(dv, dv, yv);
4774
4775 pc->vZeroAlphaW(yv, yv);
4776 pc->vsubi16(dv, dv, yv);
4777 out.uc.init(dv);
4778 }
4779
4780 pc->xSatisfyPixel(out, flags);
4781 return;
4782 }
4783
4784 // --------------------------------------------------------------------------
4785 // [VMaskProc - RGBA32 - Exclusion]
4786 // --------------------------------------------------------------------------
4787
4788 if (compOp() == BL_COMP_OP_EXCLUSION) {
4789 // Dca' = Dca + Sca - 2.Sca.Dca
4790 // Da' = Da + Sa - Sa.Da
4791 srcFetch(s, Pixel::kUC | (hasMask ? uint32_t(0) : Pixel::kImmutable), n);
4792 dstFetch(d, Pixel::kUC, n);
4793
4794 VecArray& sv = s.uc;
4795 VecArray& dv = d.uc;
4796
4797 if (hasMask) {
4798 pc->vmulu16(sv, sv, vm);
4799 pc->vdiv255u16(sv);
4800 }
4801
4802 pc->vmulu16(xv, dv, sv);
4803 pc->vaddi16(dv, dv, sv);
4804 pc->vdiv255u16(xv);
4805 pc->vsubi16(dv, dv, xv);
4806
4807 pc->vZeroAlphaW(xv, xv);
4808 pc->vsubi16(dv, dv, xv);
4809
4810 out.uc.init(dv);
4811 pc->xSatisfyPixel(out, flags);
4812 return;
4813 }
4814
4815 // --------------------------------------------------------------------------
4816 // [VMaskProc - RGBA32 - Invalid]
4817 // --------------------------------------------------------------------------
4818
4819 BL_NOT_REACHED();
4820 }
4821
vMaskProcRGBA32InvertMask(VecArray & vn,VecArray & vm)4822 void CompOpPart::vMaskProcRGBA32InvertMask(VecArray& vn, VecArray& vm) noexcept {
4823 uint32_t i;
4824 uint32_t size = vm.size();
4825
4826 if (cMaskLoopType() == kCMaskLoopTypeMask) {
4827 if (_mask->vn.isValid()) {
4828 bool ok = true;
4829
4830 // TODO: [PIPEGEN] A leftover from a template-based code, I don't understand
4831 // it anymore and it seems it's unnecessary so verify this and all places
4832 // that hit `ok == false`.
4833 for (i = 0; i < blMin(vn.size(), size); i++)
4834 if (vn[i].id() != vm[i].id())
4835 ok = false;
4836
4837 if (ok) {
4838 vn.init(_mask->vn);
4839 return;
4840 }
4841 }
4842 }
4843
4844 if (vn.empty())
4845 pc->newXmmArray(vn, size, "vn");
4846
4847 if (vm.isScalar()) {
4848 // TODO: Seems wrong as well, the `vmov` code-path would never execute.
4849 pc->vinv255u16(vn[0], vm[0]);
4850 for (i = 1; i < size; i++)
4851 pc->vmov(vn[i], vn[0]);
4852 }
4853 else {
4854 pc->vinv255u16(vn, vm);
4855 }
4856 }
4857
vMaskProcRGBA32InvertDone(VecArray & vn,bool mImmutable)4858 void CompOpPart::vMaskProcRGBA32InvertDone(VecArray& vn, bool mImmutable) noexcept {
4859 BL_UNUSED(mImmutable);
4860
4861 if (cMaskLoopType() == kCMaskLoopTypeMask) {
4862 if (vn[0].id() == _mask->vm.id())
4863 pc->vinv255u16(vn, vn);
4864 }
4865 }
4866
4867 } // {BLPipeGen}
4868
4869 #endif
4870