// [Blend2D]
// 2D Vector Graphics Powered by a JIT Compiler.
//
// [License]
// Zlib - See LICENSE.md file in the package.

#include "../api-build_p.h"
#if BL_TARGET_ARCH_X86 && !defined(BL_BUILD_NO_JIT)

#include "../pipegen/compoppart_p.h"
#include "../pipegen/fetchpart_p.h"
#include "../pipegen/fetchgradientpart_p.h"
#include "../pipegen/fetchpixelptrpart_p.h"
#include "../pipegen/fetchsolidpart_p.h"
#include "../pipegen/fetchpatternpart_p.h"
#include "../pipegen/fillpart_p.h"
#include "../pipegen/pipecompiler_p.h"

namespace BLPipeGen {

// ============================================================================
// [BLPipeGen::PipeCompiler - Construction / Destruction]
// ============================================================================

PipeCompiler::PipeCompiler(x86::Compiler* cc, const asmjit::x86::Features& features) noexcept
  : cc(cc),
    _features(features) { reset(); }

PipeCompiler::~PipeCompiler() noexcept {}

// ============================================================================
// [BLPipeGen::PipeCompiler - Reset]
// ============================================================================

void PipeCompiler::reset() noexcept {
  _simdWidth = 0;
  _availableRegs.reset();
  _persistentRegs.reset();
  _temporaryRegs.reset();

  _funcNode = nullptr;
  _funcInit = nullptr;
  _funcEnd = nullptr;

  _commonTableOff = 128;
  _commonTablePtr.reset();
  JitUtils::resetVarStruct(_constantsXmm);

  // These are always overwritten by `compileFunc()`, reset for safety.
  _ctxData.reset();
  _fillData.reset();
  _fetchData.reset();

  _ctxDataOffset = 0;
  _fillDataOffset = 0;
  _fetchDataOffset = 0;

  initSimdWidth();
}

// ============================================================================
// [BLPipeGen::PipeCompiler - Optimization Level]
// ============================================================================

void PipeCompiler::initSimdWidth() noexcept {
  // NOTE: It doesn't matter what we set here. The compiler will only use the
  // maximum SIMD width that all parts support, so we can still be using 128-bit
  // SIMD even when we have AVX-512, for example.
  if (_features.hasAVX512_BW())
    _simdWidth = 64;
  else if (_features.hasAVX2())
    _simdWidth = 32;
  else
    _simdWidth = 16;
}
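
// Note: the three values above are SIMD register widths in bytes (16 = XMM,
// 32 = YMM, 64 = ZMM), not lane counts. For example, on an AVX2 machine the
// compiler starts at 32, but if any part reports a `_maxSimdWidthSupported`
// of 16, `onPreInitPart()` clamps the whole pipeline back to 16 via `blMin()`,
// exactly as the NOTE above describes.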

// ============================================================================
// [BLPipeGen::PipeCompiler - BeginFunction / EndFunction]
// ============================================================================

void PipeCompiler::beginFunction() noexcept {
  // Setup constants first.
  _availableRegs[x86::Reg::kGroupGp  ] = cc->gpCount() - kReservedGpRegs;
  _availableRegs[x86::Reg::kGroupMm  ] = 8 - kReservedMmRegs;
  _availableRegs[x86::Reg::kGroupVec ] = cc->gpCount() - kReservedVecRegs;
  _availableRegs[x86::Reg::kGroupKReg] = 8;

  // Function prototype and arguments.
  _funcNode = cc->addFunc(asmjit::FuncSignatureT<void, void*, void*, void*>(asmjit::CallConv::kIdHostCDecl));
  _funcInit = cc->cursor();
  _funcEnd = _funcNode->endNode()->prev();

  if (_features.hasAVX())
    _funcNode->frame().setAvxEnabled();

  _ctxData = cc->newIntPtr("ctxData");
  _fillData = cc->newIntPtr("fillData");
  _fetchData = cc->newIntPtr("fetchData");

  cc->setArg(0, _ctxData);
  cc->setArg(1, _fillData);
  cc->setArg(2, _fetchData);
}
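
// Conceptually, the function emitted above has this C-level entry point (a
// sketch for illustration; the exact typedef used by the pipeline runtime is
// defined elsewhere):
//
//   void pipeline(void* ctxData, void* fillData, void* fetchData);
//
// `setArg()` binds the three virtual registers to these arguments, and
// `initPipeline()` below may add fixed offsets on top of them.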

void PipeCompiler::endFunction() noexcept {
  // Finalize the pipeline function.
  cc->endFunc();
}

// ============================================================================
// [BLPipeGen::PipeCompiler - Parts]
// ============================================================================

FillPart* PipeCompiler::newFillPart(uint32_t fillType, FetchPart* dstPart, CompOpPart* compOpPart) noexcept {
  if (fillType == BL_PIPE_FILL_TYPE_BOX_AA)
    return newPartT<FillBoxAAPart>(fillType, dstPart->as<FetchPixelPtrPart>(), compOpPart);

  if (fillType == BL_PIPE_FILL_TYPE_BOX_AU)
    return newPartT<FillBoxAUPart>(fillType, dstPart->as<FetchPixelPtrPart>(), compOpPart);

  if (fillType == BL_PIPE_FILL_TYPE_ANALYTIC)
    return newPartT<FillAnalyticPart>(fillType, dstPart->as<FetchPixelPtrPart>(), compOpPart);

  return nullptr;
}

FetchPart* PipeCompiler::newFetchPart(uint32_t fetchType, uint32_t fetchPayload, uint32_t format) noexcept {
  if (fetchType == BL_PIPE_FETCH_TYPE_SOLID)
    return newPartT<FetchSolidPart>(fetchType, fetchPayload, format);

  if (fetchType >= BL_PIPE_FETCH_TYPE_GRADIENT_LINEAR_FIRST && fetchType <= BL_PIPE_FETCH_TYPE_GRADIENT_LINEAR_LAST)
    return newPartT<FetchLinearGradientPart>(fetchType, fetchPayload, format);

  if (fetchType >= BL_PIPE_FETCH_TYPE_GRADIENT_RADIAL_FIRST && fetchType <= BL_PIPE_FETCH_TYPE_GRADIENT_RADIAL_LAST)
    return newPartT<FetchRadialGradientPart>(fetchType, fetchPayload, format);

  if (fetchType >= BL_PIPE_FETCH_TYPE_GRADIENT_CONICAL_FIRST && fetchType <= BL_PIPE_FETCH_TYPE_GRADIENT_CONICAL_LAST)
    return newPartT<FetchConicalGradientPart>(fetchType, fetchPayload, format);

  if (fetchType >= BL_PIPE_FETCH_TYPE_PATTERN_SIMPLE_FIRST && fetchType <= BL_PIPE_FETCH_TYPE_PATTERN_SIMPLE_LAST)
    return newPartT<FetchSimplePatternPart>(fetchType, fetchPayload, format);

  if (fetchType >= BL_PIPE_FETCH_TYPE_PATTERN_AFFINE_FIRST && fetchType <= BL_PIPE_FETCH_TYPE_PATTERN_AFFINE_LAST)
    return newPartT<FetchAffinePatternPart>(fetchType, fetchPayload, format);

  if (fetchType == BL_PIPE_FETCH_TYPE_PIXEL_PTR)
    return newPartT<FetchPixelPtrPart>(fetchType, fetchPayload, format);

  return nullptr;
}

CompOpPart* PipeCompiler::newCompOpPart(uint32_t compOp, FetchPart* dstPart, FetchPart* srcPart) noexcept {
  return newPartT<CompOpPart>(compOp, dstPart, srcPart);
}

// ============================================================================
// [BLPipeGen::PipeCompiler - Init]
// ============================================================================

void PipeCompiler::initPipeline(PipePart* root) noexcept {
  if (_ctxDataOffset != 0) cc->add(_ctxData, _ctxDataOffset);
  if (_fillDataOffset != 0) cc->add(_fillData, _fillDataOffset);
  if (_fetchDataOffset != 0) cc->add(_fetchData, _fetchDataOffset);

  root->preparePart();
  onPreInitPart(root);
  onPostInitPart(root);
}

void PipeCompiler::onPreInitPart(PipePart* part) noexcept {
  PipePart** children = part->children();
  uint32_t count = part->childrenCount();

  // Mark so `onPreInitPart()` is called only once for this `part`.
  part->_flags |= PipePart::kFlagPreInitDone;

  // Restrict the compiler's SIMD width to the lowest supported by all parts.
  _simdWidth = blMin<uint32_t>(_simdWidth, part->_maxSimdWidthSupported);

  // Collect the register usage of the part.
  _persistentRegs.add(part->_persistentRegs);
  _persistentRegs.add(part->_spillableRegs);
  _temporaryRegs.max(part->_temporaryRegs);

  for (uint32_t i = 0; i < count; i++) {
    PipePart* child = children[i];
    if (!(child->flags() & PipePart::kFlagPreInitDone))
      onPreInitPart(child);
  }
}
void PipeCompiler::onPostInitPart(PipePart* part) noexcept {
  PipePart** children = part->children();
  uint32_t count = part->childrenCount();

  // Mark so `onPostInitPart()` is called only once for this `part`.
  part->_flags |= PipePart::kFlagPostInitDone;

  // Mark `hasLow` registers when the part's persistent register usage exceeds
  // the number of registers available. This is per-part only, not global.
  for (uint32_t i = 0; i < kNumVirtGroups; i++) {
    if (_persistentRegs[i] > _availableRegs[i]) {
      part->_hasLowRegs[i] = true;
      _persistentRegs[i] -= part->_spillableRegs[i];
    }
  }

  for (uint32_t i = 0; i < count; i++) {
    PipePart* child = children[i];
    if (!(child->flags() & PipePart::kFlagPostInitDone))
      onPostInitPart(child);
  }
}

// ============================================================================
// [BLPipeGen::PipeCompiler - Constants]
// ============================================================================

void PipeCompiler::_initCommonTablePtr() noexcept {
  const void* global = &blCommonTable;

  if (!_commonTablePtr.isValid()) {
    asmjit::BaseNode* prevNode = cc->setCursor(_funcInit);
    _commonTablePtr = cc->newIntPtr("commonTablePtr");

    cc->alloc(_commonTablePtr);
    cc->mov(_commonTablePtr, (int64_t)global + _commonTableOff);

    _funcInit = cc->setCursor(prevNode);
  }
}

x86::Mem PipeCompiler::constAsMem(const void* p) noexcept {
  // Make sure we are addressing a constant from the `blCommonTable` constant pool.
  const void* global = &blCommonTable;
  BL_ASSERT((uintptr_t)p >= (uintptr_t)global &&
            (uintptr_t)p <  (uintptr_t)global + sizeof(BLCommonTable));

  if (asmjit::ArchInfo::kIdHost == asmjit::ArchInfo::kIdX86) {
    // 32-bit mode - These constants will never move in memory so the absolute
    // addressing is a win/win as we can save one GP register that can be used
    // for something else.
    return x86::ptr((uint64_t)p);
  }
  else {
    // 64-bit mode - One GP register is sacrificed to hold the pointer to the
    // `blCommonTable`. This is probably the safest approach as relying on absolute
    // addressing or anything else could lead to problems or performance issues.
    _initCommonTablePtr();

    int32_t disp = int32_t((intptr_t)p - (intptr_t)global);
    return x86::ptr(_commonTablePtr, disp - _commonTableOff);
  }
}
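
// Worked example of the displacement math above: `_initCommonTablePtr()`
// materializes `_commonTablePtr = (intptr_t)&blCommonTable + 128`, so the
// returned operand `[_commonTablePtr + (disp - 128)]` resolves exactly to
// `p`. The 128-byte bias is presumably chosen so that constants near the
// start of the table land in the signed 8-bit displacement range [-128, 127],
// which encodes one byte shorter than a 32-bit displacement.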

x86::Xmm PipeCompiler::constAsXmm(const void* p) noexcept {
  static const char xmmNames[4][16] = {
    "xmm.zero",
    "xmm.u16_128",
    "xmm.u16_257",
    "xmm.alpha"
  };

  int constIndex = -1;

  if      (p == blCommonTable.i128_0000000000000000) constIndex = 0; // Required if the CPU doesn't have SSE4.1.
  else if (p == blCommonTable.i128_0080008000800080) constIndex = 1; // Required by `xDiv255()` and friends.
  else if (p == blCommonTable.i128_0101010101010101) constIndex = 2; // Required by `xDiv255()` and friends.
  else if (p == blCommonTable.i128_FF000000FF000000) constIndex = 3; // Required by fetching XRGB32 pixels as PRGB32 pixels.

  if (constIndex == -1) {
    // TODO: [PIPEGEN] This works, but it's really nasty!
    x86::Mem m = constAsMem(p);
    return reinterpret_cast<x86::Xmm&>(m);
  }

  x86::Xmm& xmm = _constantsXmm[constIndex];
  if (!xmm.isValid()) {
    xmm = cc->newXmm(xmmNames[constIndex]);

    if (constIndex == 0) {
      asmjit::BaseNode* prevNode = cc->setCursor(_funcInit);
      vzerops(xmm);
      _funcInit = cc->setCursor(prevNode);
    }
    else {
      // `constAsMem()` may call `_initCommonTablePtr()` for the very first time.
      // We cannot inject any code before `constAsMem()` returns.
      x86::Mem m = constAsMem(p);

      asmjit::BaseNode* prevNode = cc->setCursor(_funcInit);
      vloadps_128a(xmm, m);
      _funcInit = cc->setCursor(prevNode);
    }

    _persistentRegs[x86::Reg::kGroupVec]++;
  }

  return xmm;
}

// ============================================================================
// [BLPipeGen::PipeCompiler - Stack]
// ============================================================================

x86::Mem PipeCompiler::tmpStack(uint32_t size) noexcept {
  BL_ASSERT(blIsPowerOf2(size));
  BL_ASSERT(size <= 32);

  if (!_tmpStack.baseId())
    _tmpStack = cc->newStack(32, 16, "tmpStack");
  return _tmpStack;
}

// ============================================================================
// [BLPipeGen::PipeCompiler - Emit]
// ============================================================================

static constexpr uint32_t signatureOfXmmYmmZmm[] = {
  x86::Xmm::kSignature,
  x86::Ymm::kSignature,
  x86::Zmm::kSignature
};

static inline uint32_t shuf32ToShuf64(uint32_t imm) noexcept {
  uint32_t imm0 = uint32_t(imm     ) & 1u;
  uint32_t imm1 = uint32_t(imm >> 1) & 1u;
  return x86::Predicate::shuf(imm1 * 2u + 1u, imm1 * 2u, imm0 * 2u + 1u, imm0 * 2u);
}
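
// Example: the 2-bit `shufpd` immediate 0b01 selects qword 1 into the low
// half and qword 0 into the high half. `shuf32ToShuf64()` rewrites it as
// shuf(1, 0, 3, 2) - the `pshufd` immediate that keeps the two 32-bit halves
// of each selected qword together - so `vswizi32` can emulate the
// double-precision shuffle non-destructively when AVX is not available.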

static inline void fixVecSignature(Operand_& op, uint32_t signature) noexcept {
  if (x86::Reg::isVec(op) && op.signature() > signature)
    op.setSignature(signature);
}

static inline bool isSameReg(const Operand_& a, const Operand_& b) noexcept {
  return a.id() == b.id() && a.id() && b.id();
}

void PipeCompiler::iemit2(uint32_t instId, const Operand_& op1, int imm) noexcept {
  cc->emit(instId, op1, imm);
}

void PipeCompiler::iemit2(uint32_t instId, const Operand_& op1, const Operand_& op2) noexcept {
  cc->emit(instId, op1, op2);
}

void PipeCompiler::iemit3(uint32_t instId, const Operand_& op1, const Operand_& op2, int imm) noexcept {
  cc->emit(instId, op1, op2, imm);
}

void PipeCompiler::vemit_xmov(const Operand_& dst, const Operand_& src, uint32_t width) noexcept {
  if (src.isMem() || !isSameReg(dst, src)) {
    uint32_t instId = x86::Inst::kIdMovaps;

    if (src.isMem()) {
      switch (width) {
        case 4: instId = x86::Inst::kIdMovd; break;
        case 8: instId = x86::Inst::kIdMovq; break;
      }
    }

    cc->emit(instId, dst, src);
  }
}

void PipeCompiler::vemit_xmov(const OpArray& dst, const Operand_& src, uint32_t width) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst.size();

  while (dstIndex < dstCount) {
    vemit_xmov(dst[dstIndex], src, width);
    dstIndex++;
  }
}

void PipeCompiler::vemit_xmov(const OpArray& dst, const OpArray& src, uint32_t width) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst.size();

  uint32_t srcIndex = 0;
  uint32_t srcCount = src.size();

  while (dstIndex < dstCount) {
    vemit_xmov(dst[dstIndex], src[srcIndex], width);

    if (++srcIndex >= srcCount) srcIndex = 0;
    dstIndex++;
  }
}
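
// Note on the OpArray overloads in this file: when `src` has fewer operands
// than `dst`, the source index wraps around, so a shorter source array is
// effectively repeated across the destination. For example, with
// dst = {d0, d1, d2, d3} and src = {s0, s1}, the emitted pairs are
// (d0, s0), (d1, s1), (d2, s0), (d3, s1).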

void PipeCompiler::vemit_vv_vv(uint32_t packedId, const Operand_& dst_, const Operand_& src_) noexcept {
  Operand dst(dst_);
  Operand src(src_);

  if (PackedInst::width(packedId) < PackedInst::kWidthZ) {
    uint32_t signature = signatureOfXmmYmmZmm[PackedInst::width(packedId)];
    fixVecSignature(dst, signature);
    fixVecSignature(src, signature);
  }

  // Intrinsics support.
  if (PackedInst::isIntrin(packedId)) {
    switch (PackedInst::intrinId(packedId)) {
      case kIntrin2Vloadi128uRO: {
        if (hasSSE3())
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVlddqu, x86::Inst::kIdLddqu);
        else
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVmovdqu, x86::Inst::kIdMovdqu);
        break;
      }

      case kIntrin2Vmovu8u16: {
        if (hasSSE4_1()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpmovzxbw, x86::Inst::kIdPmovzxbw);
          break;
        }

        vemit_xmov(dst, src, 8);
        vunpackli8(dst, dst, constAsXmm(blCommonTable.i128_0000000000000000));
        return;
      }

      case kIntrin2Vmovu8u32: {
        if (hasSSE4_1()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpmovzxbd, x86::Inst::kIdPmovzxbd);
          break;
        }

        vemit_xmov(dst, src, 4);
        vunpackli8(dst, dst, constAsXmm(blCommonTable.i128_0000000000000000));
        vunpackli16(dst, dst, constAsXmm(blCommonTable.i128_0000000000000000));
        return;
      }

      case kIntrin2Vmovu16u32: {
        if (hasSSE4_1()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpmovzxwd, x86::Inst::kIdPmovzxwd);
          break;
        }

        vemit_xmov(dst, src, 8);
        vunpackli16(dst, dst, constAsXmm(blCommonTable.i128_0000000000000000));
        return;
      }

      case kIntrin2Vabsi8: {
        if (hasSSSE3()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpabsb, x86::Inst::kIdPabsb);
          break;
        }

        if (isSameReg(dst, src)) {
          x86::Vec tmp = cc->newSimilarReg(dst.as<x86::Vec>(), "@tmp");
          vzeropi(tmp);
          vsubi8(tmp, tmp, dst);
          vminu8(dst, dst, tmp);
        }
        else {
          vzeropi(dst);
          vsubi8(dst, dst, src);
          vminu8(dst, dst, src);
        }
        return;
      }

      case kIntrin2Vabsi16: {
        if (hasSSSE3()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpabsw, x86::Inst::kIdPabsw);
          break;
        }

        if (isSameReg(dst, src)) {
          x86::Vec tmp = cc->newSimilarReg(dst.as<x86::Vec>(), "@tmp");
          vzeropi(tmp);
          vsubi16(tmp, tmp, dst);
          vmaxi16(dst, dst, tmp);
        }
        else {
          vzeropi(dst);
          vsubi16(dst, dst, src);
          vmaxi16(dst, dst, src);
        }
        return;
      }

      case kIntrin2Vabsi32: {
        if (hasSSSE3()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpabsd, x86::Inst::kIdPabsd);
          break;
        }

        x86::Vec tmp = cc->newSimilarReg(dst.as<x86::Vec>(), "@tmp");

        vmov(tmp, src);
        vsrai32(tmp, tmp, 31);
        vxor(dst, src, tmp);
        vsubi32(dst, dst, tmp);
        return;
      }

      case kIntrin2Vabsi64: {
        x86::Vec tmp = cc->newSimilarReg(dst.as<x86::Vec>(), "@tmp");

        vduphi32(tmp, src);
        vsrai32(tmp, tmp, 31);
        vxor(dst, src, tmp);
        vsubi64(dst, dst, tmp);
        return;
      }
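
      // The fallbacks above use two's complement identities: for i8,
      // abs(x) == min_u8(x, 0 - x) (e.g. x = -3: min_u8(0xFD, 0x03) = 3);
      // for i16, abs(x) == max_i16(x, 0 - x); and for i32/i64,
      // abs(x) == (x ^ m) - m with m = x >> 31 (arithmetic shift), which is
      // 0 for non-negative x and all ones otherwise (x = -5: m = -1,
      // x ^ m = 4, 4 - (-1) = 5). SSE2 has no 64-bit arithmetic shift, so
      // the i64 variant first replicates each lane's high dword with
      // `vduphi32` to build the per-qword sign mask.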

      case kIntrin2Vinv255u16: {
        Operand u16_255 = constAsXmm(blCommonTable.i128_00FF00FF00FF00FF);

        if (hasAVX() || isSameReg(dst, src)) {
          vxor(dst, src, u16_255);
        }
        else {
          vmov(dst, u16_255);
          vxor(dst, dst, src);
        }
        return;
      }
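
      // Since the input words are assumed to be in [0, 255], 255 - x never
      // borrows past bit 7 and reduces to a bitwise complement of the low
      // byte: 255 - x == x ^ 0x00FF. E.g. x = 0x00C0: 0x00C0 ^ 0x00FF =
      // 0x003F = 63 = 255 - 192. Hence a single XOR per word suffices.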

      case kIntrin2Vinv256u16: {
        x86::Vec u16_0100 = constAsXmm(blCommonTable.i128_0100010001000100);

        if (!isSameReg(dst, src)) {
          vmov(dst, u16_0100);
          vsubi16(dst, dst, src);
        }
        else if (hasSSSE3()) {
          vsubi16(dst, dst, u16_0100);
          vabsi16(dst, dst);
        }
        else {
          vxor(dst, dst, constAsXmm(blCommonTable.i128_FFFFFFFFFFFFFFFF));
          vaddi16(dst, dst, constAsXmm(blCommonTable.i128_0101010101010101));
        }
        return;
      }
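
      // Three ways of computing 256 - x for x in [0, 256], matching the
      // branches above:
      //   1. Load 0x0100 and subtract x (needs dst not aliasing src).
      //   2. x - 256 lies in [-256, 0], so abs(x - 256) == 256 - x (SSSE3).
      //   3. In 16-bit two's complement, ~x + 257 == 256 - x, hence the XOR
      //      with all ones followed by adding 0x0101 per word.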

      case kIntrin2Vinv255u32: {
        Operand u32_255 = constAsXmm(blCommonTable.i128_000000FF000000FF);

        if (hasAVX() || isSameReg(dst, src)) {
          vxor(dst, src, u32_255);
        }
        else {
          vmov(dst, u32_255);
          vxor(dst, dst, src);
        }
        return;
      }

      case kIntrin2Vinv256u32: {
        BL_ASSERT(!"Implemented");
        // TODO: [PIPEGEN]
        return;
      }

      case kIntrin2Vduplpd: {
        if (hasSSE3())
          vmovduplpd_(dst, src);
        else if (hasAVX())
          vunpacklpd(dst, src, src);
        else if (isSameReg(dst, src))
          vunpacklpd(dst, dst, src);
        else
          vdupli64(dst, src);
        return;
      }

      case kIntrin2Vduphpd: {
        if (hasAVX())
          vunpackhpd(dst, src, src);
        else if (isSameReg(dst, src))
          vunpackhpd(dst, dst, src);
        else
          vduphi64(dst, src);
        return;
      }

      case kIntrin2VBroadcastU16: {
        BL_ASSERT(src.isReg() || src.isMem());

        if (src.isReg()) {
          Operand x(src);
          // Reg <- BroadcastW(Reg).
          if (src.as<x86::Reg>().isGp()) {
            vmovsi32(dst.as<x86::Vec>().xmm(), src.as<x86::Gp>().r32());
            x = dst;
          }

          if (hasAVX2()) {
            cc->emit(x86::Inst::kIdVpbroadcastw, dst, x);
          }
          else {
            vswizli16(dst, x, x86::Predicate::shuf(0, 0, 0, 0));
            vswizi32(dst, dst, x86::Predicate::shuf(1, 0, 1, 0));
          }
        }
        else {
          // Reg <- BroadcastW(Mem).
          x86::Mem m(src.as<x86::Mem>());

          if (hasAVX2()) {
            m.setSize(2);
            cc->emit(x86::Inst::kIdVpbroadcastw, dst, m);
          }
          else {
            if (m.size() >= 4) {
              m.setSize(4);
              vloadi32(dst, m);
            }
            else {
              m.setSize(2);
              vzeropi(dst);
              vinsertu16(dst, dst, m, 0);
            }

            vswizli16(dst, dst, x86::Predicate::shuf(0, 0, 0, 0));
            vswizi32(dst, dst, x86::Predicate::shuf(1, 0, 1, 0));
          }
        }

        return;
      }

      case kIntrin2VBroadcastU32: {
        BL_ASSERT(src.isReg() || src.isMem());

        if (src.isReg()) {
          Operand x(src);
          // Reg <- BroadcastD(Reg).
          if (src.as<x86::Reg>().isGp()) {
            vmovsi32(dst.as<x86::Vec>().xmm(), src.as<x86::Gp>().r32());
            x = dst;
          }

          if (hasAVX2()) {
            cc->emit(x86::Inst::kIdVpbroadcastd, dst, x);
          }
          else {
            vswizi32(dst, x, x86::Predicate::shuf(0, 0, 0, 0));
          }
        }
        else {
          // Reg <- BroadcastD(Mem).
          x86::Mem m(src.as<x86::Mem>());
          m.setSize(4);

          if (hasAVX2()) {
            cc->emit(x86::Inst::kIdVpbroadcastd, dst, m);
          }
          else {
            vloadi32(dst.as<x86::Vec>(), m);
            vswizi32(dst, dst, x86::Predicate::shuf(0, 0, 0, 0));
          }
        }

        return;
      }

      case kIntrin2VBroadcastU64: {
        BL_ASSERT(src.isReg() || src.isMem());

        if (src.isReg()) {
          Operand x(src);
          // Reg <- BroadcastQ(Reg).
          if (src.as<x86::Reg>().isGp()) {
            vmovsi64(dst.as<x86::Vec>().xmm(), src.as<x86::Gp>().r64());
            x = dst;
          }

          vswizi32(dst, x, x86::Predicate::shuf(1, 0, 1, 0));
        }
        else {
          // Reg <- BroadcastQ(Mem).
          x86::Mem m(src.as<x86::Mem>());
          m.setSize(8);

          if (hasAVX2()) {
            cc->emit(x86::Inst::kIdVpbroadcastq, dst, m);
          }
          else {
            vloadi64(dst.as<x86::Vec>(), m);
            vswizi32(dst, dst, x86::Predicate::shuf(1, 0, 1, 0));
          }
        }

        return;
      }

      default:
        BL_NOT_REACHED();
    }
  }

  // Single instruction.
  uint32_t instId = hasAVX() ? PackedInst::avxId(packedId)
                             : PackedInst::sseId(packedId);
  cc->emit(instId, dst, src);
}

void PipeCompiler::vemit_vv_vv(uint32_t packedId, const OpArray& dst_, const Operand_& src_) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst_.size();

  while (dstIndex < dstCount) {
    vemit_vv_vv(packedId, dst_[dstIndex], src_);
    dstIndex++;
  }
}

void PipeCompiler::vemit_vv_vv(uint32_t packedId, const OpArray& dst_, const OpArray& src_) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst_.size();

  uint32_t srcIndex = 0;
  uint32_t srcCount = src_.size();

  while (dstIndex < dstCount) {
    vemit_vv_vv(packedId, dst_[dstIndex], src_[srcIndex]);

    if (++srcIndex >= srcCount) srcIndex = 0;
    dstIndex++;
  }
}

void PipeCompiler::vemit_vvi_vi(uint32_t packedId, const Operand_& dst_, const Operand_& src_, uint32_t imm) noexcept {
  // Intrinsics support.
  if (PackedInst::isIntrin(packedId)) {
    switch (PackedInst::intrinId(packedId)) {
      case kIntrin2iVswizps:
        if (isSameReg(dst_, src_) || hasAVX())
          vshufps(dst_, src_, src_, imm);
        else
          vswizi32(dst_, src_, imm);
        return;

      case kIntrin2iVswizpd:
        if (isSameReg(dst_, src_) || hasAVX())
          vshufpd(dst_, src_, src_, imm);
        else
          vswizi32(dst_, src_, shuf32ToShuf64(imm));
        return;

      default:
        BL_NOT_REACHED();
    }
  }

  // Instruction support.
  Operand dst(dst_);
  Operand src(src_);

  if (PackedInst::width(packedId) < PackedInst::kWidthZ) {
    uint32_t signature = signatureOfXmmYmmZmm[PackedInst::width(packedId)];
    fixVecSignature(dst, signature);
    fixVecSignature(src, signature);
  }

  if (hasAVX()) {
    uint32_t instId = PackedInst::avxId(packedId);
    cc->emit(instId, dst, src, imm);
  }
  else {
    uint32_t instId = PackedInst::sseId(packedId);
    if (!isSameReg(dst, src))
      cc->emit(x86::Inst::kIdMovaps, dst, src);
    cc->emit(instId, dst, imm);
  }
}

void PipeCompiler::vemit_vvi_vi(uint32_t packedId, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst_.size();

  while (dstIndex < dstCount) {
    vemit_vvi_vi(packedId, dst_[dstIndex], src_, imm);
    dstIndex++;
  }
}

void PipeCompiler::vemit_vvi_vi(uint32_t packedId, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst_.size();

  uint32_t srcIndex = 0;
  uint32_t srcCount = src_.size();

  while (dstIndex < dstCount) {
    vemit_vvi_vi(packedId, dst_[dstIndex], src_[srcIndex], imm);

    if (++srcIndex >= srcCount) srcIndex = 0;
    dstIndex++;
  }
}

void PipeCompiler::vemit_vvi_vvi(uint32_t packedId, const Operand_& dst_, const Operand_& src_, uint32_t imm) noexcept {
  Operand dst(dst_);
  Operand src(src_);

  if (PackedInst::width(packedId) < PackedInst::kWidthZ) {
    uint32_t signature = signatureOfXmmYmmZmm[PackedInst::width(packedId)];
    fixVecSignature(dst, signature);
    fixVecSignature(src, signature);
  }

  uint32_t instId = hasAVX() ? PackedInst::avxId(packedId)
                             : PackedInst::sseId(packedId);
  cc->emit(instId, dst, src, imm);
}

void PipeCompiler::vemit_vvi_vvi(uint32_t packedId, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst_.size();

  while (dstIndex < dstCount) {
    vemit_vvi_vvi(packedId, dst_[dstIndex], src_, imm);
    dstIndex++;
  }
}

void PipeCompiler::vemit_vvi_vvi(uint32_t packedId, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst_.size();

  uint32_t srcIndex = 0;
  uint32_t srcCount = src_.size();

  while (dstIndex < dstCount) {
    vemit_vvi_vvi(packedId, dst_[dstIndex], src_[srcIndex], imm);

    if (++srcIndex >= srcCount) srcIndex = 0;
    dstIndex++;
  }
}

void PipeCompiler::vemit_vvv_vv(uint32_t packedId, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_) noexcept {
  Operand dst(dst_);
  Operand src1(src1_);
  Operand src2(src2_);

  if (PackedInst::width(packedId) < PackedInst::kWidthZ) {
    uint32_t signature = signatureOfXmmYmmZmm[PackedInst::width(packedId)];
    fixVecSignature(dst , signature);
    fixVecSignature(src1, signature);
    fixVecSignature(src2, signature);
  }

  // Intrinsics support.
  if (PackedInst::isIntrin(packedId)) {
    switch (PackedInst::intrinId(packedId)) {
      case kIntrin3Vcombhli64: {
        // Swap Case:
        // dst'.u64[0] = src_.u64[1];
        // dst'.u64[1] = src_.u64[0];
        if (isSameReg(src1_, src2_)) {
          vswapi64(dst_, src1_);
          return;
        }

        // Dst is Src2 Case:
        // dst'.u64[0] = src1.u64[1];
        // dst'.u64[1] = dst_.u64[0];
        if (isSameReg(dst_, src2_) && !hasAVX()) {
          if (hasSSSE3()) {
            valignr8_(dst_, dst_, src1_, 8);
          }
          else {
            vshufpd(dst_, dst_, src1_, x86::Predicate::shuf(1, 0));
            vswapi64(dst_, dst_);
          }
          return;
        }

        // Common Case:
        // dst'.u64[0] = src1.u64[1];
        // dst'.u64[1] = src2.u64[0];
        vshufpd(dst_, src1_, src2_, x86::Predicate::shuf(0, 1));
        return;
      }

      case kIntrin3Vcombhld64: {
        // Swap Case:
        // dst'.d64[0] = src_.d64[1];
        // dst'.d64[1] = src_.d64[0];
        if (isSameReg(src1_, src2_)) {
          vswappd(dst_, src1_);
          return;
        }

        // Dst is Src2 Case:
        // dst'.d64[0] = src1.d64[1];
        // dst'.d64[1] = dst_.d64[0];
        if (isSameReg(dst_, src2_) && !hasAVX()) {
          vshufpd(dst_, dst_, src1_, x86::Predicate::shuf(1, 0));
          vswappd(dst_, dst_);
          return;
        }

        // Common Case:
        // dst'.d64[0] = src1.d64[1];
        // dst'.d64[1] = src2.d64[0];
        vshufpd(dst_, src1_, src2_, x86::Predicate::shuf(0, 1));
        return;
      }

      case kIntrin3Vminu16: {
        if (hasSSE4_1()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpminuw, x86::Inst::kIdPminuw);
          break;
        }

        if (isSameReg(src1, src2)) {
          vmov(dst, src1);
          return;
        }

        if (isSameReg(dst, src2))
          std::swap(src1, src2);

        x86::Xmm tmp = cc->newXmm("@tmp");
        vsubsu16(tmp, src1, src2);
        vsubi16(dst, src1, tmp);
        return;
      }

      case kIntrin3Vmaxu16: {
        if (hasSSE4_1()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpmaxuw, x86::Inst::kIdPmaxuw);
          break;
        }

        if (isSameReg(src1, src2)) {
          vmov(dst, src1);
          return;
        }

        if (isSameReg(dst, src2))
          std::swap(src1, src2);

        vsubsu16(dst, src1, src2);
        vaddi16(dst, dst, src2);
        return;
      }
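
      // Without SSE4.1 there is no unsigned 16-bit min/max, but saturating
      // subtraction (`psubusw`) provides both: sat(a - b) is a - b if a > b
      // and 0 otherwise, so min(a, b) == a - sat(a - b) and
      // max(a, b) == sat(a - b) + b. E.g. a = 9, b = 5: sat(9 - 5) = 4,
      // min = 9 - 4 = 5, max = 4 + 5 = 9.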

      case kIntrin3Vmulu64x32: {
        if (isSameReg(dst, src1)) {
          x86::Vec tmp = cc->newSimilarReg(dst.as<x86::Vec>(), "@tmp");

          vswizi32(tmp, dst, x86::Predicate::shuf(2, 3, 0, 1));
          vmulxllu32(dst, dst, src2);
          vmulxllu32(tmp, tmp, src2);
          vslli64(tmp, tmp, 32);
          vaddi64(dst, dst, tmp);
        }
        else if (isSameReg(dst, src2)) {
          x86::Vec tmp = cc->newSimilarReg(dst.as<x86::Vec>(), "@tmp");

          vswizi32(tmp, src1, x86::Predicate::shuf(2, 3, 0, 1));
          vmulxllu32(tmp, tmp, dst);
          vmulxllu32(dst, dst, src1);
          vslli64(tmp, tmp, 32);
          vaddi64(dst, dst, tmp);
        }
        else {
          vswizi32(dst, src1, x86::Predicate::shuf(2, 3, 0, 1));
          vmulxllu32(dst, dst, src2);
          vmulxllu32(src1, src1, src2);
          vslli64(dst, dst, 32);
          vaddi64(dst, dst, src1);
        }
        return;
      }
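
      // The 64x32 multiply above splits each 64-bit lane into hi:lo halves
      // and uses `pmuludq` (32 x 32 -> 64): (hi * 2^32 + lo) * m ==
      // lo * m + ((hi * m) << 32) (mod 2^64). The {2, 3, 0, 1} dword swizzle
      // moves the high halves into the even positions `pmuludq` reads from.
      // Note that the last branch deliberately clobbers `src1` instead of
      // allocating a temporary.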

      case kIntrin3Vhaddpd: {
        if (hasSSE3()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVhaddpd, x86::Inst::kIdHaddpd);
          break;
        }

        if (isSameReg(src1, src2)) {
          if (isSameReg(dst, src1)) {
            // dst = haddpd(dst, dst);
            x86::Xmm tmp = cc->newXmmPd("@tmp");
            vswappd(tmp, dst);
            vaddpd(dst, dst, tmp);
          }
          else {
            // dst = haddpd(src1, src1);
            vswappd(dst, src1);
            vaddpd(dst, dst, src1);
          }
        }
        else {
          x86::Xmm tmp = cc->newXmmPd("@tmp");
          // dst = haddpd(src1, src2);
          vunpackhpd(tmp, src1, src2);
          vunpacklpd(dst, src1, src2);
          vaddpd(dst, dst, tmp);
        }
        return;
      }

      default:
        BL_NOT_REACHED();
    }
  }

  // Single instruction.
  if (hasAVX()) {
    uint32_t instId = PackedInst::avxId(packedId);
    cc->emit(instId, dst, src1, src2);
  }
  else {
    uint32_t instId = PackedInst::sseId(packedId);
    if (!isSameReg(dst, src1))
      cc->emit(x86::Inst::kIdMovaps, dst, src1);
    cc->emit(instId, dst, src2);
  }
}

void PipeCompiler::vemit_vvv_vv(uint32_t packedId, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept {
  vemit_vvv_vv(packedId, dst_, OpArray(src1_), src2_);
}

void PipeCompiler::vemit_vvv_vv(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept {
  vemit_vvv_vv(packedId, dst_, src1_, OpArray(src2_));
}

void PipeCompiler::vemit_vvv_vv(uint32_t packedId, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst_.size();

  uint32_t src1Index = 0;
  uint32_t src1Count = src1_.size();

  uint32_t src2Index = 0;
  uint32_t src2Count = src2_.size();

  while (dstIndex < dstCount) {
    vemit_vvv_vv(packedId, dst_[dstIndex], src1_[src1Index], src2_[src2Index]);

    if (++src1Index >= src1Count) src1Index = 0;
    if (++src2Index >= src2Count) src2Index = 0;
    dstIndex++;
  }
}

void PipeCompiler::vemit_vvvi_vvi(uint32_t packedId, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, uint32_t imm) noexcept {
  Operand dst(dst_);
  Operand src1(src1_);
  Operand src2(src2_);

  if (PackedInst::width(packedId) < PackedInst::kWidthZ) {
    uint32_t signature = signatureOfXmmYmmZmm[PackedInst::width(packedId)];
    fixVecSignature(dst , signature);
    fixVecSignature(src1, signature);
    fixVecSignature(src2, signature);
  }

  if (hasAVX()) {
    uint32_t instId = PackedInst::avxId(packedId);
    cc->emit(instId, dst, src1, src2, imm);
  }
  else {
    uint32_t instId = PackedInst::sseId(packedId);
    if (!isSameReg(dst, src1))
      cc->emit(x86::Inst::kIdMovaps, dst, src1);
    cc->emit(instId, dst, src2, imm);
  }
}

void PipeCompiler::vemit_vvvi_vvi(uint32_t packedId, const OpArray& dst, const Operand_& src1, const OpArray& src2, uint32_t imm) noexcept {
  vemit_vvvi_vvi(packedId, dst, OpArray(src1), src2, imm);
}

void PipeCompiler::vemit_vvvi_vvi(uint32_t packedId, const OpArray& dst, const OpArray& src1, const Operand_& src2, uint32_t imm) noexcept {
  vemit_vvvi_vvi(packedId, dst, src1, OpArray(src2), imm);
}

void PipeCompiler::vemit_vvvi_vvi(uint32_t packedId, const OpArray& dst, const OpArray& src1, const OpArray& src2, uint32_t imm) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst.size();

  uint32_t src1Index = 0;
  uint32_t src1Count = src1.size();

  uint32_t src2Index = 0;
  uint32_t src2Count = src2.size();

  while (dstIndex < dstCount) {
    vemit_vvvi_vvi(packedId, dst[dstIndex], src1[src1Index], src2[src2Index], imm);

    if (++src1Index >= src1Count) src1Index = 0;
    if (++src2Index >= src2Count) src2Index = 0;
    dstIndex++;
  }
}

void PipeCompiler::vemit_vvvv_vvv(uint32_t packedId, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, const Operand_& src3_) noexcept {
  Operand dst(dst_);
  Operand src1(src1_);
  Operand src2(src2_);
  Operand src3(src3_);

  if (PackedInst::width(packedId) < PackedInst::kWidthZ) {
    uint32_t signature = signatureOfXmmYmmZmm[PackedInst::width(packedId)];
    fixVecSignature(dst , signature);
    fixVecSignature(src1, signature);
    fixVecSignature(src2, signature);
    fixVecSignature(src3, signature);
  }

  // Intrinsics support.
  if (PackedInst::isIntrin(packedId)) {
    switch (PackedInst::intrinId(packedId)) {
      case kIntrin4Vpblendvb: {
        // Blend(a, b, cond) == (a & ~cond) | (b & cond)
        if (hasSSE4_1()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpblendvb, x86::Inst::kIdPblendvb);
          break;
        }

        // Blend(a, b, cond) == a ^ ((a ^ b) &  cond)
        //                   == b ^ ((a ^ b) & ~cond)
        if (dst.id() == src1.id()) {
          x86::Xmm tmp = cc->newXmm("@tmp");
          vxor(tmp, dst, src2);
          vand(tmp, tmp, src3);
          vxor(dst, dst, tmp);
        }
        else if (dst.id() == src3.id()) {
          x86::Xmm tmp = cc->newXmm("@tmp");
          vxor(tmp, src1, src2);
          vandnot_a(dst, dst, tmp);
          vxor(dst, dst, src2);
        }
        else {
          vxor(dst, src2, src1);
          vand(dst, dst, src3);
          vxor(dst, dst, src1);
        }
        return;
      }
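
      // The SSE2 fallback is the classic XOR blend: a ^ ((a ^ b) & cond)
      // yields b wherever cond bits are 1 and a wherever they are 0. The
      // second form, b ^ ((a ^ b) & ~cond), is used when `dst` aliases the
      // mask so the mask can be consumed by `pandn` rather than preserved.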

      case kIntrin4VpblendvbDestructive: {
        // Blend(a, b, cond) == (a & ~cond) | (b & cond)
        if (hasSSE4_1()) {
          packedId = PackedInst::packAvxSse(x86::Inst::kIdVpblendvb, x86::Inst::kIdPblendvb);
          break;
        }

        // Blend(a, b, cond) == a ^ ((a ^ b) &  cond)
        //                   == b ^ ((a ^ b) & ~cond)
        if (dst.id() == src3.id()) {
          vand(src2, src2, src3);
          vandnot_a(src3, src3, src1);
          vor(dst, src3, src2);
        }
        else {
          vand(src2, src2, src3);
          vandnot_a(src3, src3, src1);
          vor(dst, src2, src3);
        }
        return;
      }

      default:
        BL_NOT_REACHED();
    }
  }

  if (hasAVX()) {
    uint32_t instId = PackedInst::avxId(packedId);
    cc->emit(instId, dst, src1, src2, src3);
  }
  else {
    uint32_t instId = PackedInst::sseId(packedId);
    if (dst.id() != src1.id())
      cc->emit(x86::Inst::kIdMovaps, dst, src1);
    cc->emit(instId, dst, src2, src3);
  }
}

void PipeCompiler::vemit_vvvv_vvv(uint32_t packedId, const OpArray& dst, const OpArray& src1, const OpArray& src2, const Operand_& src3) noexcept {
  vemit_vvvv_vvv(packedId, dst, src1, src2, OpArray(src3));
}

void PipeCompiler::vemit_vvvv_vvv(uint32_t packedId, const OpArray& dst, const OpArray& src1, const OpArray& src2, const OpArray& src3) noexcept {
  uint32_t dstIndex = 0;
  uint32_t dstCount = dst.size();

  uint32_t src1Index = 0;
  uint32_t src1Count = src1.size();

  uint32_t src2Index = 0;
  uint32_t src2Count = src2.size();

  uint32_t src3Index = 0;
  uint32_t src3Count = src3.size();

  while (dstIndex < dstCount) {
    vemit_vvvv_vvv(packedId, dst[dstIndex], src1[src1Index], src2[src2Index], src3[src3Index]);

    if (++src1Index >= src1Count) src1Index = 0;
    if (++src2Index >= src2Count) src2Index = 0;
    if (++src3Index >= src3Count) src3Index = 0;
    dstIndex++;
  }
}

// ============================================================================
// [BLPipeGen::PipeCompiler - Fetch Helpers]
// ============================================================================

void PipeCompiler::xFetchPixel_1x(Pixel& p, uint32_t flags, uint32_t sFormat, const x86::Mem& sMem, uint32_t sAlignment) noexcept {
  BL_UNUSED(sAlignment);
  BL_ASSERT(p.type() != Pixel::kTypeNone);

  p.setCount(1);
  x86::Mem sAdj(sMem);

  if (p.isRGBA()) {
    switch (sFormat) {
      case BL_FORMAT_PRGB32: {
        if (flags & Pixel::kAny) {
          newXmmArray(p.pc, 1, "c");
          vloadi32(p.pc[0], sAdj);
        }
        break;
      }

      case BL_FORMAT_XRGB32: {
        if (flags & Pixel::kAny) {
          newXmmArray(p.pc, 1, "c");
          vloadi32(p.pc[0], sAdj);
          vFillAlpha255B(p.pc[0], p.pc[0]);
        }
        break;
      }

      case BL_FORMAT_A8: {
        if (flags & Pixel::kAny) {
          if (hasAVX2()) {
            newXmmArray(p.pc, 1, "c");
            cc->vpbroadcastb(p.pc[0].as<x86::Xmm>(), sAdj);
          }
          else if (hasSSE4_1()) {
            newXmmArray(p.uc, 1, "c");
            vzeropi(p.uc[0]);
            vinsertu8_(p.uc[0], p.uc[0], sAdj, 0);
            vswizli16(p.uc[0], p.uc[0], x86::Predicate::shuf(0, 0, 0, 0));
          }
          else {
            newXmmArray(p.uc, 1, "c");
            x86::Gp scalar = cc->newUInt32();
            load8(scalar, sAdj);
            vmovsi32(p.uc[0], scalar);
            vswizli16(p.uc[0], p.uc[0], x86::Predicate::shuf(0, 0, 0, 0));
          }
        }

        break;
      }

      default:
        BL_NOT_REACHED();
    }
  }
  else if (p.isAlpha()) {
    p.sa = cc->newUInt32("a");

    switch (sFormat) {
      case BL_FORMAT_PRGB32: {
        sAdj.addOffset(3);
        load8(p.sa, sAdj);
        break;
      }

      case BL_FORMAT_XRGB32: {
        cc->mov(p.sa, 255);
        break;
      }

      case BL_FORMAT_A8: {
        load8(p.sa, sAdj);
        break;
      }

      default:
        BL_NOT_REACHED();
    }
  }

  xSatisfyPixel(p, flags);
}

void PipeCompiler::xFetchPixel_4x(Pixel& p, uint32_t flags, uint32_t sFormat, const x86::Mem& sMem, uint32_t sAlignment) noexcept {
  BL_ASSERT(p.type() != Pixel::kTypeNone);

  p.setCount(4);
  x86::Mem sAdj(sMem);

  if (p.isRGBA()) {
    switch (sFormat) {
      case BL_FORMAT_PRGB32: {
        if (flags & Pixel::kPC) {
          newXmmArray(p.pc, 1, "c");

          sAdj.setSize(16);
          if (sAlignment == 16)
            vloadi128a(p.pc[0], sAdj);
          else
            vloadi128u(p.pc[0], sAdj);
        }
        else {
          newXmmArray(p.uc, 2, "c");

          sAdj.setSize(8);
          vmovu8u16(p.uc[0], sAdj); sAdj.addOffsetLo32(8);
          vmovu8u16(p.uc[1], sAdj);
        }
        break;
      }

      case BL_FORMAT_XRGB32: {
        if (flags & Pixel::kAny) {
          newXmmArray(p.pc, 1, "c");
          sAdj.setSize(16);

          if (sAlignment == 16)
            vloadi128a(p.pc[0], sAdj);
          else
            vloadi128u(p.pc[0], sAdj);

          vFillAlpha255B(p.pc[0], p.pc[0]);
        }
        break;
      }

      case BL_FORMAT_A8: {
        sAdj.setSize(4);
        if (flags & Pixel::kPC) {
          newXmmArray(p.pc, 1, "c");

          vloadi32(p.pc[0], sAdj);
          vunpackli8(p.pc[0], p.pc[0], p.pc[0]);
          vunpackli16(p.pc[0], p.pc[0], p.pc[0]);
        }
        else {
          newXmmArray(p.uc, 2, "c");

          vloadi32(p.uc[0], sAdj);
          vunpackli8(p.uc[0], p.uc[0], p.uc[0]);
          vmovu8u16(p.uc[0], p.uc[0]);

          vswizi32(p.uc[1], p.uc[0], x86::Predicate::shuf(3, 3, 2, 2));
          vswizi32(p.uc[0], p.uc[0], x86::Predicate::shuf(1, 1, 0, 0));
        }
        break;
      }

      default:
        BL_NOT_REACHED();
    }
  }
  else if (p.isAlpha()) {
    // Cannot use scalar pixel in SIMD mode.
    BL_ASSERT((flags & Pixel::kSA) == 0);

    switch (sFormat) {
      case BL_FORMAT_PRGB32: {
        x86::Xmm a = cc->newXmm("a");
        sAdj.setSize(16);

        if (sAlignment == 16)
          vloadi128a(a, sAdj);
        else
          vloadi128u(a, sAdj);

        vsrli32(a, a, 24);
        vpacki32i16(a, a, a);

        if (flags & Pixel::kPA) {
          vpacki16u8(a, a, a);
          p.pa.init(a);
        }
        else {
          p.ua.init(a);
        }
        break;
      }

      case BL_FORMAT_A8: {
        x86::Xmm a = cc->newXmm("a");
        sAdj.setSize(4);

        vloadi32(a, sAdj);

        if (flags & Pixel::kPA) {
          p.pa.init(a);
        }
        else {
          vmovu8u16(a, a);
          p.ua.init(a);
        }
        break;
      }

      default:
        BL_NOT_REACHED();
    }
  }

  xSatisfyPixel(p, flags);
}

void PipeCompiler::xFetchPixel_8x(Pixel& p, uint32_t flags, uint32_t sFormat, const x86::Mem& sMem, uint32_t sAlignment) noexcept {
  BL_ASSERT(p.type() != Pixel::kTypeNone);

  p.setCount(8);
  x86::Mem sAdj(sMem);

  if (p.isRGBA()) {
    switch (sFormat) {
      case BL_FORMAT_PRGB32: {
        if ((flags & Pixel::kPC) || !hasSSE4_1()) {
          newXmmArray(p.pc, 2, "c");
          sAdj.setSize(16);

          if (sAlignment == 16) {
            vloadi128a(p.pc[0], sAdj); sAdj.addOffsetLo32(16);
            vloadi128a(p.pc[1], sAdj);
          }
          else {
            vloadi128u(p.pc[0], sAdj); sAdj.addOffsetLo32(16);
            vloadi128u(p.pc[1], sAdj);
          }
        }
        else {
          newXmmArray(p.uc, 4, "c");
          sAdj.setSize(8);

          vmovu8u16(p.uc[0], sAdj); sAdj.addOffsetLo32(8);
          vmovu8u16(p.uc[1], sAdj); sAdj.addOffsetLo32(8);
          vmovu8u16(p.uc[2], sAdj); sAdj.addOffsetLo32(8);
          vmovu8u16(p.uc[3], sAdj);
        }
        break;
      }

      case BL_FORMAT_XRGB32: {
        if (flags & Pixel::kAny) {
          newXmmArray(p.pc, 2, "c");
          sAdj.setSize(16);

          if (sAlignment == 16) {
            vloadi128a(p.pc[0], sAdj); sAdj.addOffsetLo32(16);
            vloadi128a(p.pc[1], sAdj);
          }
          else {
            vloadi128u(p.pc[0], sAdj); sAdj.addOffsetLo32(16);
            vloadi128u(p.pc[1], sAdj);
          }

          vFillAlpha255B(p.pc[0], p.pc[0]);
          vFillAlpha255B(p.pc[1], p.pc[1]);
        }
        break;
      }

      case BL_FORMAT_A8: {
        sAdj.setSize(4);
        if (flags & Pixel::kPC) {
          newXmmArray(p.pc, 2, "c");

          vloadi32(p.pc[0], sAdj); sAdj.addOffsetLo32(4);
          vloadi32(p.pc[1], sAdj);

          vunpackli8(p.pc[0], p.pc[0], p.pc[0]);
          vunpackli8(p.pc[1], p.pc[1], p.pc[1]);

          vunpackli16(p.pc[0], p.pc[0], p.pc[0]);
          vunpackli16(p.pc[1], p.pc[1], p.pc[1]);
        }
        else {
          newXmmArray(p.uc, 4, "c");

          vloadi32(p.uc[0], sAdj); sAdj.addOffsetLo32(4);
          vloadi32(p.uc[2], sAdj);

          vunpackli8(p.uc[0], p.uc[0], p.uc[0]);
          vunpackli8(p.uc[2], p.uc[2], p.uc[2]);

          vmovu8u16(p.uc[0], p.uc[0]);
          vmovu8u16(p.uc[2], p.uc[2]);

          vswizi32(p.uc[1], p.uc[0], x86::Predicate::shuf(3, 3, 2, 2));
          vswizi32(p.uc[3], p.uc[2], x86::Predicate::shuf(3, 3, 2, 2));
          vswizi32(p.uc[0], p.uc[0], x86::Predicate::shuf(1, 1, 0, 0));
          vswizi32(p.uc[2], p.uc[2], x86::Predicate::shuf(1, 1, 0, 0));
        }
        break;
      }
    }
  }
  else if (p.isAlpha()) {
    // Cannot use scalar pixel in SIMD mode.
    BL_ASSERT((flags & Pixel::kSA) == 0);

    switch (sFormat) {
      case BL_FORMAT_PRGB32: {
        x86::Xmm a0 = cc->newXmm("a");
        x86::Xmm a1 = cc->newXmm("aHi");
        sAdj.setSize(16);

        if (sAlignment >= 16) {
          vloadi128a(a0, sAdj);
          sAdj.addOffset(16);
          vloadi128a(a1, sAdj);
        }
        else {
          vloadi128u(a0, sAdj);
          sAdj.addOffset(16);
          vloadi128u(a1, sAdj);
        }

        vsrli32(a0, a0, 24);
        vsrli32(a1, a1, 24);
        vpacki32i16(a0, a0, a1);

        if (flags & Pixel::kPA) {
          vpacki16u8(a0, a0, a0);
          p.pa.init(a0);
        }
        else {
          p.ua.init(a0);
        }
        break;
      }

      case BL_FORMAT_A8: {
        x86::Xmm a = cc->newXmm("a");
        sAdj.setSize(8);

        if (flags & Pixel::kPA) {
          vloadi64(a, sAdj);
          p.pa.init(a);
        }
        else {
          if (hasSSE4_1()) {
            vloadi64_u8u16_(a, sAdj);
          }
          else {
            vloadi64(a, sAdj);
            vmovu8u16(a, a);
          }
          p.ua.init(a);
        }
        break;
      }

      default:
        BL_NOT_REACHED();
    }
  }

  xSatisfyPixel(p, flags);
}

void PipeCompiler::xSatisfyPixel(Pixel& p, uint32_t flags) noexcept {
  BL_ASSERT(p.type() != Pixel::kTypeNone);
  BL_ASSERT(p.count() != 0);

  switch (p.type()) {
    case Pixel::kTypeRGBA:
      _xSatisfyPixelRGBA(p, flags);
      break;
    case Pixel::kTypeAlpha:
      _xSatisfyPixelAlpha(p, flags);
      break;
    default:
      BL_NOT_REACHED();
  }
}

void PipeCompiler::_xSatisfyPixelRGBA(Pixel& p, uint32_t flags) noexcept {
  BL_ASSERT(p.type() == Pixel::kTypeRGBA);
  BL_ASSERT(p.count() != 0);

  uint32_t i;

  // Quick reject if all flags were satisfied already or no flags were given.
  if ((!(flags & Pixel::kPC ) || !p.pc .empty()) &&
      (!(flags & Pixel::kUC ) || !p.uc .empty()) &&
      (!(flags & Pixel::kUA ) || !p.ua .empty()) &&
      (!(flags & Pixel::kUIA) || !p.uia.empty()) )
    return;

  // Only fetch if we have already unpacked pixels. Wait otherwise as fetch
  // flags may contain `Pixel::kUC`, which is handled below. This is an
  // optimization for cases where the caller wants packed RGBA and unpacked
  // alpha.
  if ((flags & (Pixel::kUA | Pixel::kUIA)) && p.ua.empty() && !p.uc.empty()) {
    // Emit pshuflw/pshufhw sequence for every unpacked pixel.
    newXmmArray(p.ua, p.uc.size(), "a");

    vswizli16(p.ua, p.uc, x86::Predicate::shuf(3, 3, 3, 3));
    vswizhi16(p.ua, p.ua, x86::Predicate::shuf(3, 3, 3, 3));
  }

  if ((flags & Pixel::kPC) && p.pc.empty()) {
    // Either PC or UC, but never both.
    BL_ASSERT(!p.uc.empty());
    BL_ASSERT((flags & Pixel::kUC) == 0);

    // Emit pack sequence.
    p.pc.init(p.uc.even());
    rename(p.pc, "pc");
    vpacki16u8(p.pc, p.uc.even(), p.uc.odd());
    p.uc.reset();
  }
  else if ((flags & Pixel::kUC) && p.uc.empty()) {
    // Emit unpack sequence.
    if (p.count() == 1) {
      cc->rename(p.pc[0], "c0");
      vmovu8u16(p.pc[0], p.pc[0]);

      p.uc.init(p.pc[0]);
      p.pc.reset();
    }
    else {
      p.uc._size = p.pc.size() * 2;
      for (i = 0; i < p.pc.size(); i++) {
        cc->rename(p.pc[i], "c%u", i * 2);

        p.uc[i * 2 + 0] = p.pc[i];
        p.uc[i * 2 + 1] = cc->newXmm("c%u", i * 2 + 1);

        xMovzxBW_LoHi(p.uc[i * 2 + 0], p.uc[i * 2 + 1], p.uc[i * 2 + 0]);
      }
      p.pc.reset();
    }
  }

  if ((flags & (Pixel::kUA | Pixel::kUIA)) && p.ua.empty()) {
    // This time we have to really fetch A8/IA8, if we haven't before.
    if (!p.uc.empty()) {
      newXmmArray(p.ua, p.uc.size(), "ua");
      vswizli16(p.ua, p.uc, x86::Predicate::shuf(3, 3, 3, 3));
      if (p.count() > 1)
        vswizhi16(p.ua, p.ua, x86::Predicate::shuf(3, 3, 3, 3));
    }
    else {
      BL_ASSERT(!p.pc.empty());
      if (p.count() <= 2) {
        newXmmArray(p.ua, 1, "ua");
        vswizli16(p.ua[0], p.pc[0], x86::Predicate::shuf(1, 1, 1, 1));
        vsrli16(p.ua[0], p.ua[0], 8);
      }
      else {
        newXmmArray(p.ua, p.pc.size() * 2, "ua");
        for (i = 0; i < p.pc.size(); i++)
          xExtractUnpackedAFromPackedARGB32_4(p.ua[i * 2], p.ua[i * 2 + 1], p.pc[i]);
      }
    }
  }

  if ((flags & Pixel::kUIA) && p.uia.empty()) {
    p.uia.init(p.ua);
    p.ua.reset();

    rename(p.uia, "uia");
    vinv255u16(p.uia, p.uia);
  }
}

void PipeCompiler::_xSatisfyPixelAlpha(Pixel& p, uint32_t flags) noexcept {
  BL_ASSERT(p.type() == Pixel::kTypeAlpha);
  BL_ASSERT(p.count() != 0);

  // Scalar mode uses only SA.
  if (p.count() == 1) {
    BL_ASSERT((flags & (Pixel::kSA)) != 0);
    BL_ASSERT((flags & (Pixel::kPA | Pixel::kUA)) == 0);

    return;
  }

  if ((flags & Pixel::kPA) && p.pa.empty()) {
    // Either PA or UA, but never both.
    BL_ASSERT(!p.ua.empty());
    BL_ASSERT((flags & Pixel::kUA) == 0);

    // Emit pack sequence.
    p.pa.init(p.ua.even());
    rename(p.pa, "pa");
    vpacki16u8(p.pa, p.ua.even(), p.ua.odd());
    p.ua.reset();
  }
  else if ((flags & Pixel::kUA) && p.ua.empty()) {
    // Either PA or UA, but never both.
    BL_ASSERT(!p.pa.empty());
    BL_ASSERT((flags & Pixel::kPA) == 0);

    // Emit unpack sequence.
    if (p.count() <= 8) {
      cc->rename(p.pa[0], "a0");
      vmovu8u16(p.pa[0], p.pa[0]);

      p.ua.init(p.pa[0]);
      p.pa.reset();
    }
    else {
      p.ua._size = p.pa.size() * 2;
      for (uint32_t i = 0; i < p.pa.size(); i++) {
        cc->rename(p.pa[i], "a%u", i * 2);

        p.ua[i * 2 + 0] = p.pa[i];
        p.ua[i * 2 + 1] = cc->newXmm("a%u", i * 2 + 1);

        xMovzxBW_LoHi(p.ua[i * 2 + 0], p.ua[i * 2 + 1], p.ua[i * 2 + 0]);
      }
      p.pa.reset();
    }
  }

  if (flags & (Pixel::kUA | Pixel::kUIA)) {
    if (p.ua.empty()) {
      // TODO: A8 pipeline - finalize satisfy-pixel.
      BL_ASSERT(!true);
    }
  }
}

void PipeCompiler::xSatisfySolid(Pixel& p, uint32_t flags) noexcept {
  BL_ASSERT(p.type() != Pixel::kTypeNone);
  BL_ASSERT(p.count() != 0);

  switch (p.type()) {
    case Pixel::kTypeRGBA:
      _xSatisfySolidRGBA(p, flags);
      break;
    case Pixel::kTypeAlpha:
      _xSatisfySolidAlpha(p, flags);
      break;
    default:
      BL_NOT_REACHED();
  }
}

void PipeCompiler::_xSatisfySolidRGBA(Pixel& p, uint32_t flags) noexcept {
  BL_ASSERT(p.type() == Pixel::kTypeRGBA);
  BL_ASSERT(p.count() != 0);

  if ((flags & Pixel::kPC) && p.pc.empty()) {
    BL_ASSERT(!p.uc.empty());
    newXmmArray(p.pc, 1, "pixel.pc");

    vmov(p.pc[0], p.uc[0]);
    vpacki16u8(p.pc[0], p.pc[0], p.pc[0]);
  }

  if ((flags & Pixel::kUC) && p.uc.empty()) {
    BL_ASSERT(!p.pc.empty());
    newXmmArray(p.uc, 1, "pixel.uc");

    vmovu8u16(p.uc[0], p.pc[0]);
  }

  if ((flags & Pixel::kUA) && p.ua.empty()) {
    newXmmArray(p.ua, 1, "pixel.ua");

    if (!p.uc.empty()) {
      vswizli16(p.ua[0], p.uc[0], x86::Predicate::shuf(3, 3, 3, 3));
      vswizi32(p.ua[0], p.ua[0], x86::Predicate::shuf(1, 0, 1, 0));
    }
    else {
      vswizli16(p.ua[0], p.pc[0], x86::Predicate::shuf(1, 1, 1, 1));
      vswizi32(p.ua[0], p.ua[0], x86::Predicate::shuf(1, 0, 1, 0));
      vsrli16(p.ua[0], p.ua[0], 8);
    }
  }

  if ((flags & Pixel::kUIA) && p.uia.empty()) {
    newXmmArray(p.uia, 1, "pixel.uia");

    if (!p.ua.empty()) {
      vmov(p.uia[0], p.ua[0]);
    }
    else if (!p.uc.empty()) {
      vswizli16(p.uia[0], p.uc[0], x86::Predicate::shuf(3, 3, 3, 3));
      vswizi32(p.uia[0], p.uia[0], x86::Predicate::shuf(1, 0, 1, 0));
    }
    else {
      vswizli16(p.uia[0], p.pc[0], x86::Predicate::shuf(1, 1, 1, 1));
      vswizi32(p.uia[0], p.uia[0], x86::Predicate::shuf(1, 0, 1, 0));
      vsrli16(p.uia[0], p.uia[0], 8);
    }
    vinv255u16(p.uia[0], p.uia[0]);
  }
}

void PipeCompiler::_xSatisfySolidAlpha(Pixel& p, uint32_t flags) noexcept {
  BL_ASSERT(p.type() == Pixel::kTypeAlpha);
  BL_ASSERT(p.count() != 0);

  if ((flags & Pixel::kPA) && p.pa.empty()) {
    BL_ASSERT(!p.ua.empty());
    newXmmArray(p.pa, 1, "pixel.pa");
    vpacki16u8(p.pa[0], p.ua[0], p.ua[0]);
  }

  // TODO: A8 pipeline - finalize solid-alpha.
}

void PipeCompiler::xFetchUnpackedA8_2x(const x86::Xmm& dst, uint32_t format, const x86::Mem& src1, const x86::Mem& src0) noexcept {
  x86::Mem m0 = src0;
  x86::Mem m1 = src1;

  m0.setSize(1);
  m1.setSize(1);

  if (format == BL_FORMAT_PRGB32) {
    m0.addOffset(3);
    m1.addOffset(3);
  }

  if (hasSSE4_1()) {
    vzeropi(dst);
    vinsertu8_(dst, dst, m0, 0);
    vinsertu8_(dst, dst, m1, 2);
  }
  else {
    x86::Gp aGp = cc->newUInt32("aGp");
    cc->movzx(aGp, m1);
    cc->shl(aGp, 16);
    cc->mov(aGp.r8(), m0);
    vmovsi32(dst, aGp);
  }
}
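
// The scalar fallback above assembles both alpha bytes in a GP register
// before a single `movd`. E.g. with a0 = 0x40 and a1 = 0x80: `movzx` gives
// 0x00000080, the shift makes it 0x00800000, writing the low byte gives
// 0x00800040, and after `movd` the vector holds the two already-unpacked
// words {0x0040, 0x0080} as required.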

void PipeCompiler::xAssignUnpackedAlphaValues(Pixel& p, uint32_t flags, x86::Xmm& vec) noexcept {
  BL_ASSERT(p.type() != Pixel::kTypeNone);
  BL_ASSERT(p.count() != 0);

  x86::Xmm v0 = vec;

  if (p.isRGBA()) {
    switch (p.count()) {
      case 1: {
        vswizli16(v0, v0, x86::Predicate::shuf(0, 0, 0, 0));
        p.uc.init(v0);
        break;
      }

      case 2: {
        vunpackli16(v0, v0, v0);
        vswizi32(v0, v0, x86::Predicate::shuf(1, 1, 0, 0));
        p.uc.init(v0);
        break;
      }

      case 4: {
        x86::Xmm v1 = cc->newXmm();

        vunpackli16(v0, v0, v0);
        vswizi32(v1, v0, x86::Predicate::shuf(3, 3, 2, 2));
        vswizi32(v0, v0, x86::Predicate::shuf(1, 1, 0, 0));
        p.uc.init(v0, v1);
        break;
      }

      case 8: {
        x86::Xmm v1 = cc->newXmm();
        x86::Xmm v2 = cc->newXmm();
        x86::Xmm v3 = cc->newXmm();

        vunpackhi16(v2, v0, v0);
        vunpackli16(v0, v0, v0);

        vswizi32(v1, v0, x86::Predicate::shuf(3, 3, 2, 2));
        vswizi32(v0, v0, x86::Predicate::shuf(1, 1, 0, 0));
        vswizi32(v3, v2, x86::Predicate::shuf(3, 3, 2, 2));
        vswizi32(v2, v2, x86::Predicate::shuf(1, 1, 0, 0));

        p.uc.init(v0, v1, v2, v3);
        break;
      }

      default:
        BL_NOT_REACHED();
    }

    rename(p.uc, "uc");
  }
  else {
    switch (p.count()) {
      case 1: {
        BL_ASSERT(flags & Pixel::kSA);
        x86::Gp sa = cc->newUInt32("sa");
        vextractu16(sa, vec, 0);
        p.sa = sa;
        break;
      }

      default: {
        p.ua.init(vec);
        rename(p.ua, "ua");
        break;
      }
    }
  }
}
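
// The RGBA cases above broadcast each 16-bit alpha into the four channel
// lanes of its pixel. With count() == 2, for example, the input lanes
// [a0, a1, ...] become [a0, a0, a0, a0, a1, a1, a1, a1] after the
// unpack + swizzle pair, ready to act as per-channel multipliers. The
// alpha-only path just keeps the vector as `ua`, or extracts a scalar
// `sa` for a single pixel.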

void PipeCompiler::vFillAlpha(Pixel& p) noexcept {
  BL_ASSERT(p.type() != Pixel::kTypeNone);

  // Force fully-opaque alpha (255) in both the packed (byte) and unpacked
  // (word) representations, whichever exist.
  if (!p.pc.empty()) vFillAlpha255B(p.pc, p.pc);
  if (!p.uc.empty()) vFillAlpha255W(p.uc, p.uc);
}

// ============================================================================
// [BLPipeGen::PipeCompiler - PixelFill]
// ============================================================================

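// xInlinePixelFillLoop() emits an inlined memset-like fill: an unrolled
// main loop storing `mainLoopSize` bytes per iteration, a 16-byte tail
// loop, and a small finalizer, with an alignment prologue where the
// granularity allows it. A minimal usage sketch (hypothetical register
// names, for illustration only - the real callers are the pipeline fill
// parts):
//
//   x86::Gp dstPtr = cc->newIntPtr("dstPtr"); // Destination pointer.
//   x86::Gp count = cc->newIntPtr("count");   // Pixel count, assumed non-zero.
//   x86::Vec solid = ...;                     // Replicated solid pixel.
//   // Fill 32-bit pixels, 64 bytes per main-loop iteration, single-pixel
//   // granularity.
//   pipeCompiler->xInlinePixelFillLoop(dstPtr, solid, count, 64, 4, 1);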
void PipeCompiler::xInlinePixelFillLoop(x86::Gp& dst, x86::Vec& src, x86::Gp& i, uint32_t mainLoopSize, uint32_t itemSize, uint32_t itemGranularity) noexcept {
  BL_ASSERT(blIsPowerOf2(itemSize));
  BL_ASSERT(itemSize <= 16u);

  uint32_t granularityInBytes = itemSize * itemGranularity;
  uint32_t mainStepInItems = mainLoopSize / itemSize;

  BL_ASSERT(blIsPowerOf2(granularityInBytes));
  BL_ASSERT(mainStepInItems * itemSize == mainLoopSize);

  BL_ASSERT(mainLoopSize >= 16u);
  BL_ASSERT(mainLoopSize >= granularityInBytes);

  uint32_t j;

  // ==========================================================================
  // [Granularity >= 16 Bytes]
  // ==========================================================================

  if (granularityInBytes >= 16u) {
    Label L_End = cc->newLabel();

    // MainLoop
    // --------

    {
      Label L_MainLoop = cc->newLabel();
      Label L_MainSkip = cc->newLabel();

      cc->sub(i, mainStepInItems);
      cc->jc(L_MainSkip);

      cc->bind(L_MainLoop);
      cc->add(dst, mainLoopSize);
      cc->sub(i, mainStepInItems);
      for (j = 0; j < mainLoopSize; j += 16u)
        vstorei128u(x86::ptr(dst, int(j) - int(mainLoopSize)), src);
      cc->jnc(L_MainLoop);

      cc->bind(L_MainSkip);
      cc->add(i, mainStepInItems);
      cc->jz(L_End);
    }

    // TailLoop / TailSequence
    // -----------------------

    if (mainLoopSize * 2 > granularityInBytes) {
      Label L_TailLoop = cc->newLabel();
      cc->bind(L_TailLoop);
      for (j = 0; j < granularityInBytes; j += 16u)
        vstorei128u(x86::ptr(dst, int(j)), src);
      cc->add(dst, granularityInBytes);
      cc->sub(i, itemGranularity);
      cc->jnz(L_TailLoop);
    }
    else if (mainLoopSize * 2 == granularityInBytes) {
      for (j = 0; j < granularityInBytes; j += 16u)
        vstorei128u(x86::ptr(dst, int(j)), src);
      cc->add(dst, granularityInBytes);
    }

    cc->bind(L_End);
    return;
  }

  // ==========================================================================
  // [Granularity == 4 Bytes]
  // ==========================================================================

  if (granularityInBytes == 4u) {
    BL_ASSERT(itemSize <= 4u);
    uint32_t sizeShift = blBitCtz(itemSize);
    uint32_t alignPattern = (15u * itemSize) & 15u;

    uint32_t oneStepInItems = 4u >> sizeShift;
    uint32_t tailStepInItems = 16u >> sizeShift;

    Label L_Finalize = cc->newLabel();
    Label L_End = cc->newLabel();

    // Preparation / Alignment
    // -----------------------

    {
      cc->cmp(i, oneStepInItems * 4u);
      cc->jb(L_Finalize);

      // Compute the end pointer in `iptr`, store 16 bytes unaligned, and
      // round `dst` up to the alignment boundary; the remaining item count
      // is then recomputed from the distance between the end pointer and
      // the aligned `dst`.
      x86::Gp iptr = i.cloneAs(dst);
      if (sizeShift)
        cc->shl(iptr, sizeShift);
      cc->add(iptr, dst);

      vstorei128u(x86::ptr(dst), src);
      cc->add(dst, 16);
      cc->and_(dst, -1 ^ int(alignPattern));

      cc->sub(iptr, dst);
      if (sizeShift)
        cc->shr(iptr, sizeShift);
      cc->jz(L_End);
    }

    // MainLoop
    // --------

    {
      Label L_MainLoop = cc->newLabel();
      Label L_MainSkip = cc->newLabel();

      cc->sub(i, mainStepInItems);
      cc->jc(L_MainSkip);

      cc->bind(L_MainLoop);
      cc->add(dst, mainLoopSize);
      cc->sub(i, mainStepInItems);
      for (j = 0; j < mainLoopSize; j += 16u)
        vstorei128a(x86::ptr(dst, int(j) - int(mainLoopSize)), src);
      cc->jnc(L_MainLoop);

      cc->bind(L_MainSkip);
      cc->add(i, mainStepInItems);
      cc->jz(L_End);
    }

    // TailLoop / TailSequence
    // -----------------------

    if (mainLoopSize > 32) {
      Label L_TailLoop = cc->newLabel();
      Label L_TailSkip = cc->newLabel();

      cc->sub(i, tailStepInItems);
      cc->jc(L_TailSkip);

      cc->bind(L_TailLoop);
      cc->add(dst, 16);
      cc->sub(i, tailStepInItems);
      vstorei128a(x86::ptr(dst, -16), src);
      cc->jnc(L_TailLoop);

      cc->bind(L_TailSkip);
      cc->add(i, tailStepInItems);
      cc->jz(L_End);
    }
    else if (mainLoopSize >= 32) {
      cc->cmp(i, tailStepInItems);
      cc->jb(L_Finalize);

      vstorei128a(x86::ptr(dst), src);
      cc->add(dst, 16);
      cc->sub(i, tailStepInItems);
      cc->jz(L_End);
    }

    // Finalize
    // --------

    {
      Label L_Store1 = cc->newLabel();

      cc->bind(L_Finalize);
      cc->cmp(i, 8u / itemSize);
      cc->jb(L_Store1);

      vstorei64(x86::ptr(dst), src);
      cc->add(dst, 8);
      cc->sub(i, 8u / itemSize);
      cc->jz(L_End);

      cc->bind(L_Store1);
      vstorei32(x86::ptr(dst), src);
      cc->add(dst, 4);
    }

    cc->bind(L_End);
    return;
  }

  // ==========================================================================
  // [Granularity == 1 Byte]
  // ==========================================================================

  if (granularityInBytes == 1) {
    BL_ASSERT(itemSize == 1u);

    Label L_Finalize = cc->newLabel();
    Label L_End = cc->newLabel();

    // Preparation / Alignment
    // -----------------------

    {
      Label L_Small = cc->newLabel();
      Label L_Large = cc->newLabel();

      cc->cmp(i, 15);
      cc->ja(L_Large);

      // 15 bytes or less - fill byte-by-byte.
      x86::Gp srcGp = cc->newInt32("srcGp");
      vmovsi32(srcGp, src);

      cc->bind(L_Small);
      cc->mov(ptr_8(dst), srcGp.r8());
      cc->inc(dst);
      cc->dec(i);
      cc->jnz(L_Small);

      cc->jmp(L_End);

      // 16 bytes or more - store 16 bytes unaligned, align `dst`, and
      // recompute the remaining count from the end pointer kept in `iptr`.
      cc->bind(L_Large);
      x86::Gp iptr = i.cloneAs(dst);
      cc->add(iptr, dst);

      vstorei128u(x86::ptr(dst), src);
      cc->add(dst, 16);
      cc->and_(dst, -16);

      cc->sub(iptr, dst);
      cc->jz(L_End);
    }

    // MainLoop
    // --------

    {
      Label L_MainLoop = cc->newLabel();
      Label L_MainSkip = cc->newLabel();

      cc->sub(i, mainLoopSize);
      cc->jc(L_MainSkip);

      cc->bind(L_MainLoop);
      cc->add(dst, mainLoopSize);
      cc->sub(i, mainLoopSize);
      for (j = 0; j < mainLoopSize; j += 16u)
        vstorei128a(x86::ptr(dst, int(j) - int(mainLoopSize)), src);
      cc->jnc(L_MainLoop);

      cc->bind(L_MainSkip);
      cc->add(i, mainLoopSize);
      cc->jz(L_End);
    }

    // TailLoop / TailSequence
    // -----------------------

    if (mainLoopSize > 32) {
      Label L_TailLoop = cc->newLabel();
      Label L_TailSkip = cc->newLabel();

      cc->sub(i, 16);
      cc->jc(L_TailSkip);

      cc->bind(L_TailLoop);
      cc->add(dst, 16);
      cc->sub(i, 16);
      vstorei128a(x86::ptr(dst, -16), src);
      cc->jnc(L_TailLoop);

      cc->bind(L_TailSkip);
      cc->add(i, 16);
      cc->jz(L_End);
    }
    else if (mainLoopSize >= 32) {
      cc->cmp(i, 16);
      cc->jb(L_Finalize);

      vstorei128a(x86::ptr(dst), src);
      cc->add(dst, 16);
      cc->sub(i, 16);
      cc->jz(L_End);
    }

    // Finalize
    // --------

    {
      // Advance `dst` to the end and emit one unaligned store that covers
      // the remaining 1..15 bytes by overlapping previously stored data.
      cc->bind(L_Finalize);
      cc->add(dst, i.cloneAs(dst));
      vstorei128u(x86::ptr(dst, -16), src);
    }

    cc->bind(L_End);
    return;
  }

  BL_NOT_REACHED();
}

// ============================================================================
// [BLPipeGen::PipeCompiler - PixelCopy]
// ============================================================================

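// xInlinePixelCopyLoop() mirrors xInlinePixelFillLoop() above, loading from
// `src` as it stores to `dst`. Where an alignment prologue is emitted it
// aligns `dst` only; `src` is always read with unaligned loads. With
// BL_FORMAT_XRGB32 each copied pixel is OR-ed with 0xFF000000 to force an
// opaque alpha byte. A usage sketch (hypothetical register names, for
// illustration only):
//
//   x86::Gp dstPtr = cc->newIntPtr("dstPtr");
//   x86::Gp srcPtr = cc->newIntPtr("srcPtr");
//   x86::Gp count = cc->newIntPtr("count"); // Pixel count, assumed non-zero.
//   // Copy 32-bit pixels, 64 bytes per main-loop iteration.
//   pipeCompiler->xInlinePixelCopyLoop(dstPtr, srcPtr, count, 64, 4, 1, BL_FORMAT_XRGB32);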
void PipeCompiler::xInlinePixelCopyLoop(x86::Gp& dst, x86::Gp& src, x86::Gp& i, uint32_t mainLoopSize, uint32_t itemSize, uint32_t itemGranularity, uint32_t format) noexcept {
  BL_ASSERT(blIsPowerOf2(itemSize));
  BL_ASSERT(itemSize <= 16u);

  uint32_t granularityInBytes = itemSize * itemGranularity;
  uint32_t mainStepInItems = mainLoopSize / itemSize;

  BL_ASSERT(blIsPowerOf2(granularityInBytes));
  BL_ASSERT(mainStepInItems * itemSize == mainLoopSize);

  BL_ASSERT(mainLoopSize >= 16u);
  BL_ASSERT(mainLoopSize >= granularityInBytes);

  x86::Xmm t0 = cc->newXmm("t0");
  x86::Xmm fillMask;

  // The alpha byte is unused in XRGB32, so OR 0xFF into it during the copy
  // to keep the destination fully opaque.
  if (format == BL_FORMAT_XRGB32) {
    fillMask = constAsXmm(blCommonTable.i128_FF000000FF000000);
    cc->alloc(fillMask);
  }

  // ==========================================================================
  // [Granularity >= 16 Bytes]
  // ==========================================================================

  if (granularityInBytes >= 16u) {
    Label L_End = cc->newLabel();

    // MainLoop
    // --------

    {
      Label L_MainLoop = cc->newLabel();
      Label L_MainSkip = cc->newLabel();
      int ptrOffset = -int(mainLoopSize);

      cc->sub(i, mainStepInItems);
      cc->jc(L_MainSkip);

      cc->bind(L_MainLoop);
      cc->add(dst, mainLoopSize);
      cc->add(src, mainLoopSize);
      cc->sub(i, mainStepInItems);
      _xInlineMemCopySequenceXmm(x86::ptr(dst, ptrOffset), false, x86::ptr(src, ptrOffset), false, mainLoopSize, fillMask);
      cc->jnc(L_MainLoop);

      cc->bind(L_MainSkip);
      cc->add(i, mainStepInItems);
      cc->jz(L_End);
    }

    // TailLoop / TailSequence
    // -----------------------

    if (mainLoopSize * 2 > granularityInBytes) {
      Label L_TailLoop = cc->newLabel();
      cc->bind(L_TailLoop);
      _xInlineMemCopySequenceXmm(x86::ptr(dst), false, x86::ptr(src), false, granularityInBytes, fillMask);
      cc->add(dst, granularityInBytes);
      cc->add(src, granularityInBytes);
      cc->sub(i, itemGranularity);
      cc->jnz(L_TailLoop);
    }
    else if (mainLoopSize * 2 == granularityInBytes) {
      _xInlineMemCopySequenceXmm(x86::ptr(dst), false, x86::ptr(src), false, granularityInBytes, fillMask);
      cc->add(dst, granularityInBytes);
      cc->add(src, granularityInBytes);
    }

    cc->bind(L_End);
    return;
  }

  // ==========================================================================
  // [Granularity == 4 Bytes]
  // ==========================================================================

  if (granularityInBytes == 4u) {
    BL_ASSERT(itemSize <= 4u);
    uint32_t sizeShift = blBitCtz(itemSize);
    uint32_t alignPattern = (15u * itemSize) & 15u;

    uint32_t oneStepInItems = 4u >> sizeShift;
    uint32_t tailStepInItems = 16u >> sizeShift;

    Label L_Finalize = cc->newLabel();
    Label L_End = cc->newLabel();

    // Preparation / Alignment
    // -----------------------

    {
      cc->cmp(i, oneStepInItems * 4u);
      cc->jb(L_Finalize);

      x86::Gp iptr = i.cloneAs(dst);
      vloadi128u(t0, x86::ptr(src));
      if (sizeShift)
        cc->shl(iptr, sizeShift);

      // Keep `src` relative to `dst` while `dst` is being aligned so that
      // both pointers advance by the same amount.
      cc->add(iptr, dst);
      cc->sub(src, dst);
      vstorei128u(x86::ptr(dst), t0);
      cc->add(dst, 16);
      cc->and_(dst, -1 ^ int(alignPattern));

      cc->add(src, dst);
      cc->sub(iptr, dst);
      if (sizeShift)
        cc->shr(iptr, sizeShift);
      cc->jz(L_End);
    }

    // MainLoop
    // --------

    {
      Label L_MainLoop = cc->newLabel();
      Label L_MainSkip = cc->newLabel();

      cc->sub(i, mainStepInItems);
      cc->jc(L_MainSkip);

      cc->bind(L_MainLoop);
      cc->add(dst, mainLoopSize);
      cc->add(src, mainLoopSize);
      cc->sub(i, mainStepInItems);
      int ptrOffset = -int(mainLoopSize);
      _xInlineMemCopySequenceXmm(x86::ptr(dst, ptrOffset), true, x86::ptr(src, ptrOffset), false, mainLoopSize, fillMask);
      cc->jnc(L_MainLoop);

      cc->bind(L_MainSkip);
      cc->add(i, mainStepInItems);
      cc->jz(L_End);
    }

    // TailLoop / TailSequence
    // -----------------------

    if (mainLoopSize > 32) {
      Label L_TailLoop = cc->newLabel();
      Label L_TailSkip = cc->newLabel();

      cc->sub(i, tailStepInItems);
      cc->jc(L_TailSkip);

      cc->bind(L_TailLoop);
      cc->add(dst, 16);
      cc->add(src, 16);
      cc->sub(i, tailStepInItems);
      _xInlineMemCopySequenceXmm(x86::ptr(dst, -16), true, x86::ptr(src, -16), false, 16, fillMask);
      cc->jnc(L_TailLoop);

      cc->bind(L_TailSkip);
      cc->add(i, tailStepInItems);
      cc->jz(L_End);
    }
    else if (mainLoopSize >= 32) {
      cc->cmp(i, tailStepInItems);
      cc->jb(L_Finalize);

      _xInlineMemCopySequenceXmm(x86::ptr(dst), true, x86::ptr(src), false, 16, fillMask);
      cc->add(dst, 16);
      cc->add(src, 16);
      cc->sub(i, tailStepInItems);
      cc->jz(L_End);
    }

    // Finalize
    // --------

    {
      Label L_Store1 = cc->newLabel();

      cc->bind(L_Finalize);
      cc->cmp(i, 8u / itemSize);
      cc->jb(L_Store1);

      vloadi64(t0, x86::ptr(src));
      cc->add(src, 8);
      vstorei64(x86::ptr(dst), t0);
      cc->add(dst, 8);
      cc->sub(i, 8u / itemSize);
      cc->jz(L_End);

      cc->bind(L_Store1);
      vloadi32(t0, x86::ptr(src));
      cc->add(src, 4);
      vstorei32(x86::ptr(dst), t0);
      cc->add(dst, 4);
    }

    cc->bind(L_End);
    return;
  }

  // ==========================================================================
  // [Granularity == 1 Byte]
  // ==========================================================================

  if (granularityInBytes == 1) {
    BL_ASSERT(itemSize == 1u);

    Label L_Finalize = cc->newLabel();
    Label L_End = cc->newLabel();

    // Preparation / Alignment
    // -----------------------

    {
      Label L_Small = cc->newLabel();
      Label L_Large = cc->newLabel();

      x86::Gp iptr = i.cloneAs(dst);
      x86::Gp byte_val = cc->newInt32("byte_val");

      cc->cmp(i, 15);
      cc->ja(L_Large);

      // 15 bytes or less - copy byte-by-byte.
      cc->bind(L_Small);
      cc->movzx(byte_val, ptr_8(src));
      cc->inc(src);
      cc->mov(ptr_8(dst), byte_val.r8());
      cc->inc(dst);
      cc->dec(i);
      cc->jnz(L_Small);
      cc->jmp(L_End);

      // 16 bytes or more - copy 16 bytes unaligned, then align `dst` while
      // keeping `src` relative to it, and recompute the remaining count.
      cc->bind(L_Large);
      vloadi128u(t0, x86::ptr(src));
      cc->add(iptr, dst);
      cc->sub(src, dst);

      vstorei128u(x86::ptr(dst), t0);
      cc->add(dst, 16);
      cc->and_(dst, -16);

      cc->add(src, dst);
      cc->sub(iptr, dst);
      cc->jz(L_End);
    }

    // MainLoop
    // --------

    {
      Label L_MainLoop = cc->newLabel();
      Label L_MainSkip = cc->newLabel();

      cc->sub(i, mainLoopSize);
      cc->jc(L_MainSkip);

      cc->bind(L_MainLoop);
      _xInlineMemCopySequenceXmm(x86::ptr(dst), true, x86::ptr(src), false, mainLoopSize, fillMask);
      cc->add(dst, mainLoopSize);
      cc->add(src, mainLoopSize);
      cc->sub(i, mainLoopSize);
      cc->jnc(L_MainLoop);

      cc->bind(L_MainSkip);
      cc->add(i, mainLoopSize);
      cc->jz(L_End);
    }

    // TailLoop / TailSequence
    // -----------------------

    if (mainLoopSize > 32) {
      Label L_TailLoop = cc->newLabel();
      Label L_TailSkip = cc->newLabel();

      cc->sub(i, 16);
      cc->jc(L_TailSkip);

      cc->bind(L_TailLoop);
      _xInlineMemCopySequenceXmm(x86::ptr(dst), true, x86::ptr(src), false, 16, fillMask);
      cc->add(dst, 16);
      cc->add(src, 16);
      cc->sub(i, 16);
      cc->jnc(L_TailLoop);

      cc->bind(L_TailSkip);
      cc->add(i, 16);
      cc->jz(L_End);
    }
    else if (mainLoopSize >= 32) {
      cc->cmp(i, 16);
      cc->jb(L_Finalize);

      _xInlineMemCopySequenceXmm(x86::ptr(dst), true, x86::ptr(src), false, 16, fillMask);
      cc->add(dst, 16);
      cc->add(src, 16);
      cc->sub(i, 16);
      cc->jz(L_End);
    }

    // Finalize
    // --------

    {
      // Advance both pointers to the end and copy the remaining 1..15
      // bytes with one unaligned 16-byte transfer that overlaps data
      // copied previously.
      cc->bind(L_Finalize);
      cc->add(dst, i.cloneAs(dst));
      cc->add(src, i.cloneAs(src));
      _xInlineMemCopySequenceXmm(x86::ptr(dst, -16), false, x86::ptr(src, -16), false, 16, fillMask);
    }

    cc->bind(L_End);
    return;
  }

  BL_NOT_REACHED();
}

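// Emits an unrolled sequence copying `numBytes` (a multiple of 16) through
// at most two XMM temporaries, optionally OR-ing `fillMask` into the data
// (used by the XRGB32 copy above). Instruction selection depends on the
// alignment guarantees and the available ISA: MOVAPS/VMOVAPS for aligned
// access, LDDQU/VLDDQU (or MOVUPS without SSE3) for unaligned loads, and
// MOVUPS/VMOVUPS for unaligned stores.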
void PipeCompiler::_xInlineMemCopySequenceXmm(
  const x86::Mem& dPtr, bool dstAligned,
  const x86::Mem& sPtr, bool srcAligned, uint32_t numBytes, const x86::Vec& fillMask) noexcept {

  x86::Mem dAdj(dPtr);
  x86::Mem sAdj(sPtr);
  VecArray t;

  uint32_t fetchInst = hasAVX() ? x86::Inst::kIdVmovaps : x86::Inst::kIdMovaps;
  uint32_t storeInst = hasAVX() ? x86::Inst::kIdVmovaps : x86::Inst::kIdMovaps;

  if (!srcAligned) fetchInst = hasAVX()  ? x86::Inst::kIdVlddqu :
                               hasSSE3() ? x86::Inst::kIdLddqu : x86::Inst::kIdMovups;
  if (!dstAligned) storeInst = hasAVX() ? x86::Inst::kIdVmovups : x86::Inst::kIdMovups;

  uint32_t n = numBytes / 16;
  uint32_t limit = 2;
  newXmmArray(t, blMin(n, limit), "t");

  do {
    uint32_t a, b = blMin<uint32_t>(n, limit);

    if (hasAVX() && fillMask.isValid()) {
      // Shortest code for this use case. AVX allows unaligned memory
      // operands, so the fill mask can be OR-ed directly with the source,
      // skipping the separate load.
      for (a = 0; a < b; a++) {
        vor(t[a], fillMask, sAdj);
        sAdj.addOffsetLo32(16);
      }

      for (a = 0; a < b; a++) {
        cc->emit(storeInst, dAdj, t[a]);
        dAdj.addOffsetLo32(16);
      }
    }
    else {
      for (a = 0; a < b; a++) {
        cc->emit(fetchInst, t[a], sAdj);
        sAdj.addOffsetLo32(16);
      }

      for (a = 0; a < b; a++)
        if (fillMask.isValid())
          vor(t[a], t[a], fillMask);

      for (a = 0; a < b; a++) {
        cc->emit(storeInst, dAdj, t[a]);
        dAdj.addOffsetLo32(16);
      }
    }

    n -= b;
  } while (n > 0);
}
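
// NOTE: The `limit` of two temporaries interleaves a pair of independent
// loads before the matching stores, which is presumably a deliberate
// trade-off between instruction-level parallelism and register pressure.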

} // {BLPipeGen}

#endif