1 // [AsmJit]
2 // Machine Code Generation for C++.
3 //
4 // [License]
5 // Zlib - See LICENSE.md file in the package.
6
7 #ifndef _ASMJIT_TEST_MISC_H
8 #define _ASMJIT_TEST_MISC_H
9
10 #include "./asmjit.h"
11
12 namespace asmtest {
13
14 // Generate a typical alpha blend function using SSE2 instruction set. Used
15 // for benchmarking and also in test86. The generated code should be stable
16 // and fully functional.
generateAlphaBlend(asmjit::x86::Compiler & cc)17 static void generateAlphaBlend(asmjit::x86::Compiler& cc) {
18 using namespace asmjit;
19 using namespace asmjit::x86;
20
21 Gp dst = cc.newIntPtr("dst");
22 Gp src = cc.newIntPtr("src");
23
24 Gp i = cc.newIntPtr("i");
25 Gp j = cc.newIntPtr("j");
26 Gp t = cc.newIntPtr("t");
27
28 Xmm vzero = cc.newXmm("vzero");
29 Xmm v0080 = cc.newXmm("v0080");
30 Xmm v0101 = cc.newXmm("v0101");
31
32 Label L_SmallLoop = cc.newLabel();
33 Label L_SmallEnd = cc.newLabel();
34 Label L_LargeLoop = cc.newLabel();
35 Label L_LargeEnd = cc.newLabel();
36 Label L_DataPool = cc.newLabel();
37
38 cc.addFunc(FuncSignatureT<void, void*, const void*, size_t>(cc.codeInfo().cdeclCallConv()));
39
40 cc.setArg(0, dst);
41 cc.setArg(1, src);
42 cc.setArg(2, i);
43
44 // How many pixels have to be processed to make the loop aligned.
45 cc.lea(t, x86::ptr(L_DataPool));
46 cc.xorps(vzero, vzero);
47 cc.movaps(v0080, x86::ptr(t, 0));
48 cc.movaps(v0101, x86::ptr(t, 16));
49
50 cc.xor_(j, j);
51 cc.sub(j, dst);
52 cc.and_(j, 15);
53 cc.shr(j, 2);
54 cc.jz(L_SmallEnd);
55
56 cc.cmp(j, i);
57 cc.cmovg(j, i); // j = min(i, j).
58 cc.sub(i, j); // i -= j.
59
60 // Small loop.
61 cc.bind(L_SmallLoop);
62 {
63 Xmm x0 = cc.newXmm("x0");
64 Xmm y0 = cc.newXmm("y0");
65 Xmm a0 = cc.newXmm("a0");
66
67 cc.movd(y0, x86::ptr(src));
68 cc.movd(x0, x86::ptr(dst));
69
70 cc.pcmpeqb(a0, a0);
71 cc.pxor(a0, y0);
72 cc.psrlw(a0, 8);
73 cc.punpcklbw(x0, vzero);
74
75 cc.pshuflw(a0, a0, x86::Predicate::shuf(1, 1, 1, 1));
76 cc.punpcklbw(y0, vzero);
77
78 cc.pmullw(x0, a0);
79 cc.paddsw(x0, v0080);
80 cc.pmulhuw(x0, v0101);
81
82 cc.paddw(x0, y0);
83 cc.packuswb(x0, x0);
84
85 cc.movd(x86::ptr(dst), x0);
86
87 cc.add(dst, 4);
88 cc.add(src, 4);
89
90 cc.dec(j);
91 cc.jnz(L_SmallLoop);
92 }
93
94 // Second section, prepare for an aligned loop.
95 cc.bind(L_SmallEnd);
96
97 cc.test(i, i);
98 cc.mov(j, i);
99 cc.jz(cc.func()->exitLabel());
100
101 cc.and_(j, 3);
102 cc.shr(i, 2);
103 cc.jz(L_LargeEnd);
104
105 // Aligned loop.
106 cc.bind(L_LargeLoop);
107 {
108 Xmm x0 = cc.newXmm("x0");
109 Xmm x1 = cc.newXmm("x1");
110 Xmm y0 = cc.newXmm("y0");
111 Xmm a0 = cc.newXmm("a0");
112 Xmm a1 = cc.newXmm("a1");
113
114 cc.movups(y0, x86::ptr(src));
115 cc.movaps(x0, x86::ptr(dst));
116
117 cc.pcmpeqb(a0, a0);
118 cc.xorps(a0, y0);
119 cc.movaps(x1, x0);
120
121 cc.psrlw(a0, 8);
122 cc.punpcklbw(x0, vzero);
123
124 cc.movaps(a1, a0);
125 cc.punpcklwd(a0, a0);
126
127 cc.punpckhbw(x1, vzero);
128 cc.punpckhwd(a1, a1);
129
130 cc.pshufd(a0, a0, x86::Predicate::shuf(3, 3, 1, 1));
131 cc.pshufd(a1, a1, x86::Predicate::shuf(3, 3, 1, 1));
132
133 cc.pmullw(x0, a0);
134 cc.pmullw(x1, a1);
135
136 cc.paddsw(x0, v0080);
137 cc.paddsw(x1, v0080);
138
139 cc.pmulhuw(x0, v0101);
140 cc.pmulhuw(x1, v0101);
141
142 cc.add(src, 16);
143 cc.packuswb(x0, x1);
144
145 cc.paddw(x0, y0);
146 cc.movaps(x86::ptr(dst), x0);
147
148 cc.add(dst, 16);
149
150 cc.dec(i);
151 cc.jnz(L_LargeLoop);
152 }
153
154 cc.bind(L_LargeEnd);
155 cc.test(j, j);
156 cc.jnz(L_SmallLoop);
157
158 cc.endFunc();
159
160 // Data.
161 cc.align(kAlignData, 16);
162 cc.bind(L_DataPool);
163 cc.dxmm(Data128::fromI16(0x0080));
164 cc.dxmm(Data128::fromI16(0x0101));
165 }
166
167 } // {asmtest}
168
169 #endif // _ASMJIT_TEST_MISC_H
170