1 /////////////////////////////////////////////////////////////////////////
2 // $Id: simd_int.h 14133 2021-02-08 13:06:44Z sshwarts $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 // Copyright (c) 2011-2017 Stanislav Shwartsman
6 // Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 /////////////////////////////////////////////////////////////////////////
23
24 #ifndef BX_SIMD_INT_FUNCTIONS_H
25 #define BX_SIMD_INT_FUNCTIONS_H
26
27 // absolute value
28
xmm_pabsb(BxPackedXmmRegister * op)29 BX_CPP_INLINE void xmm_pabsb(BxPackedXmmRegister *op)
30 {
31 for(unsigned n=0; n<16; n++) {
32 if(op->xmmsbyte(n) < 0) op->xmmubyte(n) = -op->xmmsbyte(n);
33 }
34 }
35
xmm_pabsw(BxPackedXmmRegister * op)36 BX_CPP_INLINE void xmm_pabsw(BxPackedXmmRegister *op)
37 {
38 for(unsigned n=0; n<8; n++) {
39 if(op->xmm16s(n) < 0) op->xmm16u(n) = -op->xmm16s(n);
40 }
41 }
42
xmm_pabsd(BxPackedXmmRegister * op)43 BX_CPP_INLINE void xmm_pabsd(BxPackedXmmRegister *op)
44 {
45 for(unsigned n=0; n<4; n++) {
46 if(op->xmm32s(n) < 0) op->xmm32u(n) = -op->xmm32s(n);
47 }
48 }
49
xmm_pabsq(BxPackedXmmRegister * op)50 BX_CPP_INLINE void xmm_pabsq(BxPackedXmmRegister *op)
51 {
52 for(unsigned n=0; n<2; n++) {
53 if(op->xmm64s(n) < 0) op->xmm64u(n) = -op->xmm64s(n);
54 }
55 }
56
57 // min/max
58
xmm_pminsb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)59 BX_CPP_INLINE void xmm_pminsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
60 {
61 for(unsigned n=0; n<16; n++) {
62 if(op2->xmmsbyte(n) < op1->xmmsbyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
63 }
64 }
65
xmm_pminub(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)66 BX_CPP_INLINE void xmm_pminub(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
67 {
68 for(unsigned n=0; n<16; n++) {
69 if(op2->xmmubyte(n) < op1->xmmubyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
70 }
71 }
72
xmm_pminsw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)73 BX_CPP_INLINE void xmm_pminsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
74 {
75 for(unsigned n=0; n<8; n++) {
76 if(op2->xmm16s(n) < op1->xmm16s(n)) op1->xmm16s(n) = op2->xmm16s(n);
77 }
78 }
79
xmm_pminuw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)80 BX_CPP_INLINE void xmm_pminuw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
81 {
82 for(unsigned n=0; n<8; n++) {
83 if(op2->xmm16u(n) < op1->xmm16u(n)) op1->xmm16s(n) = op2->xmm16s(n);
84 }
85 }
86
xmm_pminsd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)87 BX_CPP_INLINE void xmm_pminsd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
88 {
89 for(unsigned n=0; n<4; n++) {
90 if(op2->xmm32s(n) < op1->xmm32s(n)) op1->xmm32u(n) = op2->xmm32u(n);
91 }
92 }
93
xmm_pminud(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)94 BX_CPP_INLINE void xmm_pminud(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
95 {
96 for(unsigned n=0; n<4; n++) {
97 if(op2->xmm32u(n) < op1->xmm32u(n)) op1->xmm32u(n) = op2->xmm32u(n);
98 }
99 }
100
xmm_pminsq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)101 BX_CPP_INLINE void xmm_pminsq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
102 {
103 for(unsigned n=0; n<2; n++) {
104 if(op2->xmm64s(n) < op1->xmm64s(n)) op1->xmm64u(n) = op2->xmm64u(n);
105 }
106 }
107
xmm_pminuq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)108 BX_CPP_INLINE void xmm_pminuq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
109 {
110 for(unsigned n=0; n<2; n++) {
111 if(op2->xmm64u(n) < op1->xmm64u(n)) op1->xmm64u(n) = op2->xmm64u(n);
112 }
113 }
114
xmm_pmaxsb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)115 BX_CPP_INLINE void xmm_pmaxsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
116 {
117 for(unsigned n=0; n<16; n++) {
118 if(op2->xmmsbyte(n) > op1->xmmsbyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
119 }
120 }
121
xmm_pmaxub(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)122 BX_CPP_INLINE void xmm_pmaxub(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
123 {
124 for(unsigned n=0; n<16; n++) {
125 if(op2->xmmubyte(n) > op1->xmmubyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
126 }
127 }
128
xmm_pmaxsw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)129 BX_CPP_INLINE void xmm_pmaxsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
130 {
131 for(unsigned n=0; n<8; n++) {
132 if(op2->xmm16s(n) > op1->xmm16s(n)) op1->xmm16s(n) = op2->xmm16s(n);
133 }
134 }
135
xmm_pmaxuw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)136 BX_CPP_INLINE void xmm_pmaxuw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
137 {
138 for(unsigned n=0; n<8; n++) {
139 if(op2->xmm16u(n) > op1->xmm16u(n)) op1->xmm16s(n) = op2->xmm16s(n);
140 }
141 }
142
xmm_pmaxsd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)143 BX_CPP_INLINE void xmm_pmaxsd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
144 {
145 for(unsigned n=0; n<4; n++) {
146 if(op2->xmm32s(n) > op1->xmm32s(n)) op1->xmm32u(n) = op2->xmm32u(n);
147 }
148 }
149
xmm_pmaxud(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)150 BX_CPP_INLINE void xmm_pmaxud(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
151 {
152 for(unsigned n=0; n<4; n++) {
153 if(op2->xmm32u(n) > op1->xmm32u(n)) op1->xmm32u(n) = op2->xmm32u(n);
154 }
155 }
156
xmm_pmaxsq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)157 BX_CPP_INLINE void xmm_pmaxsq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
158 {
159 for(unsigned n=0; n<2; n++) {
160 if(op2->xmm64s(n) > op1->xmm64s(n)) op1->xmm64u(n) = op2->xmm64u(n);
161 }
162 }
163
xmm_pmaxuq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)164 BX_CPP_INLINE void xmm_pmaxuq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
165 {
166 for(unsigned n=0; n<2; n++) {
167 if(op2->xmm64u(n) > op1->xmm64u(n)) op1->xmm64u(n) = op2->xmm64u(n);
168 }
169 }
170
171 // unpack
172
xmm_unpcklps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)173 BX_CPP_INLINE void xmm_unpcklps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
174 {
175 op1->xmm32u(3) = op2->xmm32u(1);
176 op1->xmm32u(2) = op1->xmm32u(1);
177 op1->xmm32u(1) = op2->xmm32u(0);
178 //op1->xmm32u(0) = op1->xmm32u(0);
179 }
180
xmm_unpckhps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)181 BX_CPP_INLINE void xmm_unpckhps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
182 {
183 op1->xmm32u(0) = op1->xmm32u(2);
184 op1->xmm32u(1) = op2->xmm32u(2);
185 op1->xmm32u(2) = op1->xmm32u(3);
186 op1->xmm32u(3) = op2->xmm32u(3);
187 }
188
xmm_unpcklpd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)189 BX_CPP_INLINE void xmm_unpcklpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
190 {
191 //op1->xmm64u(0) = op1->xmm64u(0);
192 op1->xmm64u(1) = op2->xmm64u(0);
193 }
194
xmm_unpckhpd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)195 BX_CPP_INLINE void xmm_unpckhpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
196 {
197 op1->xmm64u(0) = op1->xmm64u(1);
198 op1->xmm64u(1) = op2->xmm64u(1);
199 }
200
xmm_punpcklbw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)201 BX_CPP_INLINE void xmm_punpcklbw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
202 {
203 op1->xmmubyte(0xF) = op2->xmmubyte(7);
204 op1->xmmubyte(0xE) = op1->xmmubyte(7);
205 op1->xmmubyte(0xD) = op2->xmmubyte(6);
206 op1->xmmubyte(0xC) = op1->xmmubyte(6);
207 op1->xmmubyte(0xB) = op2->xmmubyte(5);
208 op1->xmmubyte(0xA) = op1->xmmubyte(5);
209 op1->xmmubyte(0x9) = op2->xmmubyte(4);
210 op1->xmmubyte(0x8) = op1->xmmubyte(4);
211 op1->xmmubyte(0x7) = op2->xmmubyte(3);
212 op1->xmmubyte(0x6) = op1->xmmubyte(3);
213 op1->xmmubyte(0x5) = op2->xmmubyte(2);
214 op1->xmmubyte(0x4) = op1->xmmubyte(2);
215 op1->xmmubyte(0x3) = op2->xmmubyte(1);
216 op1->xmmubyte(0x2) = op1->xmmubyte(1);
217 op1->xmmubyte(0x1) = op2->xmmubyte(0);
218 //op1->xmmubyte(0x0) = op1->xmmubyte(0);
219 }
220
xmm_punpckhbw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)221 BX_CPP_INLINE void xmm_punpckhbw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
222 {
223 op1->xmmubyte(0x0) = op1->xmmubyte(0x8);
224 op1->xmmubyte(0x1) = op2->xmmubyte(0x8);
225 op1->xmmubyte(0x2) = op1->xmmubyte(0x9);
226 op1->xmmubyte(0x3) = op2->xmmubyte(0x9);
227 op1->xmmubyte(0x4) = op1->xmmubyte(0xA);
228 op1->xmmubyte(0x5) = op2->xmmubyte(0xA);
229 op1->xmmubyte(0x6) = op1->xmmubyte(0xB);
230 op1->xmmubyte(0x7) = op2->xmmubyte(0xB);
231 op1->xmmubyte(0x8) = op1->xmmubyte(0xC);
232 op1->xmmubyte(0x9) = op2->xmmubyte(0xC);
233 op1->xmmubyte(0xA) = op1->xmmubyte(0xD);
234 op1->xmmubyte(0xB) = op2->xmmubyte(0xD);
235 op1->xmmubyte(0xC) = op1->xmmubyte(0xE);
236 op1->xmmubyte(0xD) = op2->xmmubyte(0xE);
237 op1->xmmubyte(0xE) = op1->xmmubyte(0xF);
238 op1->xmmubyte(0xF) = op2->xmmubyte(0xF);
239 }
240
xmm_punpcklwd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)241 BX_CPP_INLINE void xmm_punpcklwd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
242 {
243 op1->xmm16u(7) = op2->xmm16u(3);
244 op1->xmm16u(6) = op1->xmm16u(3);
245 op1->xmm16u(5) = op2->xmm16u(2);
246 op1->xmm16u(4) = op1->xmm16u(2);
247 op1->xmm16u(3) = op2->xmm16u(1);
248 op1->xmm16u(2) = op1->xmm16u(1);
249 op1->xmm16u(1) = op2->xmm16u(0);
250 //op1->xmm16u(0) = op1->xmm16u(0);
251 }
252
xmm_punpckhwd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)253 BX_CPP_INLINE void xmm_punpckhwd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
254 {
255 op1->xmm16u(0) = op1->xmm16u(4);
256 op1->xmm16u(1) = op2->xmm16u(4);
257 op1->xmm16u(2) = op1->xmm16u(5);
258 op1->xmm16u(3) = op2->xmm16u(5);
259 op1->xmm16u(4) = op1->xmm16u(6);
260 op1->xmm16u(5) = op2->xmm16u(6);
261 op1->xmm16u(6) = op1->xmm16u(7);
262 op1->xmm16u(7) = op2->xmm16u(7);
263 }
264
265 // pack
266
xmm_packuswb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)267 BX_CPP_INLINE void xmm_packuswb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
268 {
269 op1->xmmubyte(0x0) = SaturateWordSToByteU(op1->xmm16s(0));
270 op1->xmmubyte(0x1) = SaturateWordSToByteU(op1->xmm16s(1));
271 op1->xmmubyte(0x2) = SaturateWordSToByteU(op1->xmm16s(2));
272 op1->xmmubyte(0x3) = SaturateWordSToByteU(op1->xmm16s(3));
273 op1->xmmubyte(0x4) = SaturateWordSToByteU(op1->xmm16s(4));
274 op1->xmmubyte(0x5) = SaturateWordSToByteU(op1->xmm16s(5));
275 op1->xmmubyte(0x6) = SaturateWordSToByteU(op1->xmm16s(6));
276 op1->xmmubyte(0x7) = SaturateWordSToByteU(op1->xmm16s(7));
277
278 op1->xmmubyte(0x8) = SaturateWordSToByteU(op2->xmm16s(0));
279 op1->xmmubyte(0x9) = SaturateWordSToByteU(op2->xmm16s(1));
280 op1->xmmubyte(0xA) = SaturateWordSToByteU(op2->xmm16s(2));
281 op1->xmmubyte(0xB) = SaturateWordSToByteU(op2->xmm16s(3));
282 op1->xmmubyte(0xC) = SaturateWordSToByteU(op2->xmm16s(4));
283 op1->xmmubyte(0xD) = SaturateWordSToByteU(op2->xmm16s(5));
284 op1->xmmubyte(0xE) = SaturateWordSToByteU(op2->xmm16s(6));
285 op1->xmmubyte(0xF) = SaturateWordSToByteU(op2->xmm16s(7));
286 }
287
xmm_packsswb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)288 BX_CPP_INLINE void xmm_packsswb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
289 {
290 op1->xmmsbyte(0x0) = SaturateWordSToByteS(op1->xmm16s(0));
291 op1->xmmsbyte(0x1) = SaturateWordSToByteS(op1->xmm16s(1));
292 op1->xmmsbyte(0x2) = SaturateWordSToByteS(op1->xmm16s(2));
293 op1->xmmsbyte(0x3) = SaturateWordSToByteS(op1->xmm16s(3));
294 op1->xmmsbyte(0x4) = SaturateWordSToByteS(op1->xmm16s(4));
295 op1->xmmsbyte(0x5) = SaturateWordSToByteS(op1->xmm16s(5));
296 op1->xmmsbyte(0x6) = SaturateWordSToByteS(op1->xmm16s(6));
297 op1->xmmsbyte(0x7) = SaturateWordSToByteS(op1->xmm16s(7));
298
299 op1->xmmsbyte(0x8) = SaturateWordSToByteS(op2->xmm16s(0));
300 op1->xmmsbyte(0x9) = SaturateWordSToByteS(op2->xmm16s(1));
301 op1->xmmsbyte(0xA) = SaturateWordSToByteS(op2->xmm16s(2));
302 op1->xmmsbyte(0xB) = SaturateWordSToByteS(op2->xmm16s(3));
303 op1->xmmsbyte(0xC) = SaturateWordSToByteS(op2->xmm16s(4));
304 op1->xmmsbyte(0xD) = SaturateWordSToByteS(op2->xmm16s(5));
305 op1->xmmsbyte(0xE) = SaturateWordSToByteS(op2->xmm16s(6));
306 op1->xmmsbyte(0xF) = SaturateWordSToByteS(op2->xmm16s(7));
307 }
308
xmm_packusdw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)309 BX_CPP_INLINE void xmm_packusdw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
310 {
311 op1->xmm16u(0) = SaturateDwordSToWordU(op1->xmm32s(0));
312 op1->xmm16u(1) = SaturateDwordSToWordU(op1->xmm32s(1));
313 op1->xmm16u(2) = SaturateDwordSToWordU(op1->xmm32s(2));
314 op1->xmm16u(3) = SaturateDwordSToWordU(op1->xmm32s(3));
315
316 op1->xmm16u(4) = SaturateDwordSToWordU(op2->xmm32s(0));
317 op1->xmm16u(5) = SaturateDwordSToWordU(op2->xmm32s(1));
318 op1->xmm16u(6) = SaturateDwordSToWordU(op2->xmm32s(2));
319 op1->xmm16u(7) = SaturateDwordSToWordU(op2->xmm32s(3));
320 }
321
xmm_packssdw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)322 BX_CPP_INLINE void xmm_packssdw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
323 {
324 op1->xmm16s(0) = SaturateDwordSToWordS(op1->xmm32s(0));
325 op1->xmm16s(1) = SaturateDwordSToWordS(op1->xmm32s(1));
326 op1->xmm16s(2) = SaturateDwordSToWordS(op1->xmm32s(2));
327 op1->xmm16s(3) = SaturateDwordSToWordS(op1->xmm32s(3));
328
329 op1->xmm16s(4) = SaturateDwordSToWordS(op2->xmm32s(0));
330 op1->xmm16s(5) = SaturateDwordSToWordS(op2->xmm32s(1));
331 op1->xmm16s(6) = SaturateDwordSToWordS(op2->xmm32s(2));
332 op1->xmm16s(7) = SaturateDwordSToWordS(op2->xmm32s(3));
333 }
334
335 // shuffle
336
xmm_pshufb(BxPackedXmmRegister * r,const BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)337 BX_CPP_INLINE void xmm_pshufb(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
338 {
339 for(unsigned n=0; n<16; n++)
340 {
341 unsigned mask = op2->xmmubyte(n);
342 if (mask & 0x80)
343 r->xmmubyte(n) = 0;
344 else
345 r->xmmubyte(n) = op1->xmmubyte(mask & 0xf);
346 }
347 }
348
xmm_pshufhw(BxPackedXmmRegister * r,const BxPackedXmmRegister * op,Bit8u order)349 BX_CPP_INLINE void xmm_pshufhw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op, Bit8u order)
350 {
351 r->xmm64u(0) = op->xmm64u(0);
352 r->xmm16u(4) = op->xmm16u(4 + ((order >> 0) & 0x3));
353 r->xmm16u(5) = op->xmm16u(4 + ((order >> 2) & 0x3));
354 r->xmm16u(6) = op->xmm16u(4 + ((order >> 4) & 0x3));
355 r->xmm16u(7) = op->xmm16u(4 + ((order >> 6) & 0x3));
356 }
357
xmm_pshuflw(BxPackedXmmRegister * r,const BxPackedXmmRegister * op,Bit8u order)358 BX_CPP_INLINE void xmm_pshuflw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op, Bit8u order)
359 {
360 r->xmm16u(0) = op->xmm16u((order >> 0) & 0x3);
361 r->xmm16u(1) = op->xmm16u((order >> 2) & 0x3);
362 r->xmm16u(2) = op->xmm16u((order >> 4) & 0x3);
363 r->xmm16u(3) = op->xmm16u((order >> 6) & 0x3);
364 r->xmm64u(1) = op->xmm64u(1);
365 }
366
xmm_shufps(BxPackedXmmRegister * r,const BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,Bit8u order)367 BX_CPP_INLINE void xmm_shufps(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit8u order)
368 {
369 r->xmm32u(0) = op1->xmm32u((order >> 0) & 0x3);
370 r->xmm32u(1) = op1->xmm32u((order >> 2) & 0x3);
371 r->xmm32u(2) = op2->xmm32u((order >> 4) & 0x3);
372 r->xmm32u(3) = op2->xmm32u((order >> 6) & 0x3);
373 }
374
xmm_shufpd(BxPackedXmmRegister * r,const BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,Bit8u order)375 BX_CPP_INLINE void xmm_shufpd(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit8u order)
376 {
377 r->xmm64u(0) = op1->xmm64u((order >> 0) & 0x1);
378 r->xmm64u(1) = op2->xmm64u((order >> 1) & 0x1);
379 }
380
xmm_permilps(BxPackedXmmRegister * r,const BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)381 BX_CPP_INLINE void xmm_permilps(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
382 {
383 r->xmm32u(0) = op1->xmm32u(op2->xmm32u(0) & 0x3);
384 r->xmm32u(1) = op1->xmm32u(op2->xmm32u(1) & 0x3);
385 r->xmm32u(2) = op1->xmm32u(op2->xmm32u(2) & 0x3);
386 r->xmm32u(3) = op1->xmm32u(op2->xmm32u(3) & 0x3);
387 }
388
xmm_permilpd(BxPackedXmmRegister * r,const BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)389 BX_CPP_INLINE void xmm_permilpd(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
390 {
391 r->xmm64u(0) = op1->xmm64u((op2->xmm32u(0) >> 1) & 0x1);
392 r->xmm64u(1) = op1->xmm64u((op2->xmm32u(2) >> 1) & 0x1);
393 }
394
// VPERMIL2PS-style two-source permute (presumably the AMD XOP instruction —
// TODO confirm against the caller).  For each dword lane, a per-lane 32-bit
// selector from op3 picks a dword out of op1/op2, or the lane is forced to
// zero depending on the 2-bit immediate field 'm2z' and selector bit 3:
// (m2z ^ sel3) == 3 holds exactly when m2z==2 && sel3==1, or m2z==3 && sel3==0.
BX_CPP_INLINE void xmm_permil2ps(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *op3, unsigned m2z)
{
  for(unsigned n=0; n < 4; n++) {
    Bit32u ctrl = op3->xmm32u(n);
    // match-to-zero test against selector bit 3
    if ((m2z ^ ((ctrl >> 3) & 0x1)) == 0x3)
      r->xmm32u(n) = 0;
    else
      // ctrl bit 2 chooses the source register, bits 1:0 the source lane
      r->xmm32u(n) = (ctrl & 0x4) ? op1->xmm32u(ctrl & 0x3) : op2->xmm32u(ctrl & 0x3);
  }
}

// Qword-granular variant (VPERMIL2PD-style): the selector for result qword n
// is the even dword n*2 of op3; bit 1 picks the source lane, bit 2 the source
// register, and the same match-to-zero rule applies.
BX_CPP_INLINE void xmm_permil2pd(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *op3, unsigned m2z)
{
  for(unsigned n=0; n < 2; n++) {
    Bit32u ctrl = op3->xmm32u(n*2);
    if ((m2z ^ ((ctrl >> 3) & 0x1)) == 0x3)
      r->xmm64u(n) = 0;
    else
      r->xmm64u(n) = (ctrl & 0x4) ? op1->xmm64u((ctrl >> 1) & 0x1) : op2->xmm64u((ctrl >> 1) & 0x1);
  }
}
416
417 #if BX_SUPPORT_AVX
ymm_vpermq(BxPackedYmmRegister * r,const BxPackedYmmRegister * op,Bit8u control)418 BX_CPP_INLINE void ymm_vpermq(BxPackedYmmRegister *r, const BxPackedYmmRegister *op, Bit8u control)
419 {
420 r->ymm64u(0) = op->ymm64u((control) & 0x3);
421 r->ymm64u(1) = op->ymm64u((control >> 2) & 0x3);
422 r->ymm64u(2) = op->ymm64u((control >> 4) & 0x3);
423 r->ymm64u(3) = op->ymm64u((control >> 6) & 0x3);
424 }
425 #endif
426
427 // sign
428
xmm_psignb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)429 BX_CPP_INLINE void xmm_psignb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
430 {
431 for(unsigned n=0; n<16; n++) {
432 int sign = (op2->xmmsbyte(n) > 0) - (op2->xmmsbyte(n) < 0);
433 op1->xmmsbyte(n) *= sign;
434 }
435 }
436
xmm_psignw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)437 BX_CPP_INLINE void xmm_psignw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
438 {
439 for(unsigned n=0; n<8; n++) {
440 int sign = (op2->xmm16s(n) > 0) - (op2->xmm16s(n) < 0);
441 op1->xmm16s(n) *= sign;
442 }
443 }
444
xmm_psignd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)445 BX_CPP_INLINE void xmm_psignd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
446 {
447 for(unsigned n=0; n<4; n++) {
448 int sign = (op2->xmm32s(n) > 0) - (op2->xmm32s(n) < 0);
449 op1->xmm32s(n) *= sign;
450 }
451 }
452
453 // mask creation
454
xmm_pmovmskb(const BxPackedXmmRegister * op)455 BX_CPP_INLINE Bit32u xmm_pmovmskb(const BxPackedXmmRegister *op)
456 {
457 Bit32u mask = 0;
458
459 if(op->xmmsbyte(0x0) < 0) mask |= 0x0001;
460 if(op->xmmsbyte(0x1) < 0) mask |= 0x0002;
461 if(op->xmmsbyte(0x2) < 0) mask |= 0x0004;
462 if(op->xmmsbyte(0x3) < 0) mask |= 0x0008;
463 if(op->xmmsbyte(0x4) < 0) mask |= 0x0010;
464 if(op->xmmsbyte(0x5) < 0) mask |= 0x0020;
465 if(op->xmmsbyte(0x6) < 0) mask |= 0x0040;
466 if(op->xmmsbyte(0x7) < 0) mask |= 0x0080;
467 if(op->xmmsbyte(0x8) < 0) mask |= 0x0100;
468 if(op->xmmsbyte(0x9) < 0) mask |= 0x0200;
469 if(op->xmmsbyte(0xA) < 0) mask |= 0x0400;
470 if(op->xmmsbyte(0xB) < 0) mask |= 0x0800;
471 if(op->xmmsbyte(0xC) < 0) mask |= 0x1000;
472 if(op->xmmsbyte(0xD) < 0) mask |= 0x2000;
473 if(op->xmmsbyte(0xE) < 0) mask |= 0x4000;
474 if(op->xmmsbyte(0xF) < 0) mask |= 0x8000;
475
476 return mask;
477 }
478
xmm_pmovmskw(const BxPackedXmmRegister * op)479 BX_CPP_INLINE Bit32u xmm_pmovmskw(const BxPackedXmmRegister *op)
480 {
481 Bit32u mask = 0;
482
483 if(op->xmm16s(0) < 0) mask |= 0x01;
484 if(op->xmm16s(1) < 0) mask |= 0x02;
485 if(op->xmm16s(2) < 0) mask |= 0x04;
486 if(op->xmm16s(3) < 0) mask |= 0x08;
487 if(op->xmm16s(4) < 0) mask |= 0x10;
488 if(op->xmm16s(5) < 0) mask |= 0x20;
489 if(op->xmm16s(6) < 0) mask |= 0x40;
490 if(op->xmm16s(7) < 0) mask |= 0x80;
491
492 return mask;
493 }
494
xmm_pmovmskd(const BxPackedXmmRegister * op)495 BX_CPP_INLINE Bit32u xmm_pmovmskd(const BxPackedXmmRegister *op)
496 {
497 Bit32u mask = 0;
498
499 if(op->xmm32s(0) < 0) mask |= 0x1;
500 if(op->xmm32s(1) < 0) mask |= 0x2;
501 if(op->xmm32s(2) < 0) mask |= 0x4;
502 if(op->xmm32s(3) < 0) mask |= 0x8;
503
504 return mask;
505 }
506
xmm_pmovmskq(const BxPackedXmmRegister * op)507 BX_CPP_INLINE Bit32u xmm_pmovmskq(const BxPackedXmmRegister *op)
508 {
509 Bit32u mask = 0;
510
511 if(op->xmm32s(1) < 0) mask |= 0x1;
512 if(op->xmm32s(3) < 0) mask |= 0x2;
513
514 return mask;
515 }
516
xmm_pmovm2b(BxPackedXmmRegister * dst,Bit32u mask)517 BX_CPP_INLINE void xmm_pmovm2b(BxPackedXmmRegister *dst, Bit32u mask)
518 {
519 for (unsigned n=0; n < 16; n++, mask >>= 1) {
520 dst->xmmsbyte(n) = - Bit8s(mask & 0x1);
521 }
522 }
523
xmm_pmovm2w(BxPackedXmmRegister * dst,Bit32u mask)524 BX_CPP_INLINE void xmm_pmovm2w(BxPackedXmmRegister *dst, Bit32u mask)
525 {
526 for (unsigned n=0; n < 8; n++, mask >>= 1) {
527 dst->xmm16s(n) = - Bit16s(mask & 0x1);
528 }
529 }
530
xmm_pmovm2d(BxPackedXmmRegister * dst,Bit32u mask)531 BX_CPP_INLINE void xmm_pmovm2d(BxPackedXmmRegister *dst, Bit32u mask)
532 {
533 for (unsigned n=0; n < 4; n++, mask >>= 1) {
534 dst->xmm32s(n) = - Bit32s(mask & 0x1);
535 }
536 }
537
xmm_pmovm2q(BxPackedXmmRegister * dst,Bit32u mask)538 BX_CPP_INLINE void xmm_pmovm2q(BxPackedXmmRegister *dst, Bit32u mask)
539 {
540 dst->xmm64s(0) = (mask & 0x1) ? (Bit64s) -1 : 0;
541 dst->xmm64s(1) = (mask & 0x2) ? (Bit64s) -1 : 0;
542 }
543
544 // blend
545
xmm_pblendb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,Bit32u mask)546 BX_CPP_INLINE void xmm_pblendb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
547 {
548 for (unsigned n=0; n < 16; n++, mask >>= 1) {
549 if (mask & 0x1) op1->xmmubyte(n) = op2->xmmubyte(n);
550 }
551 }
552
xmm_zero_pblendb(BxPackedXmmRegister * dst,const BxPackedXmmRegister * op,Bit32u mask)553 BX_CPP_INLINE void xmm_zero_pblendb(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
554 {
555 for (unsigned n=0; n < 16; n++, mask >>= 1) {
556 dst->xmmubyte(n) = (mask & 0x1) ? op->xmmubyte(n) : 0;
557 }
558 }
559
560 #if BX_SUPPORT_EVEX
simd_pblendb(BxPackedAvxRegister * op1,const BxPackedAvxRegister * op2,Bit64u mask,unsigned len)561 BX_CPP_INLINE void simd_pblendb(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit64u mask, unsigned len)
562 {
563 for (unsigned n=0; n < len; n++) {
564 if (mask & 0x1) op1->vmmubyte(n) = op2->vmmubyte(n);
565 mask >>= 1;
566 }
567 }
568
simd_zero_pblendb(BxPackedAvxRegister * dst,const BxPackedAvxRegister * op,Bit64u mask,unsigned len)569 BX_CPP_INLINE void simd_zero_pblendb(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit64u mask, unsigned len)
570 {
571 for (unsigned n=0; n < len; n++) {
572 dst->vmmubyte(n) = (mask & 0x1) ? op->vmmubyte(n) : 0;
573 mask >>= 1;
574 }
575 }
576 #endif
577
xmm_pblendw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,Bit32u mask)578 BX_CPP_INLINE void xmm_pblendw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
579 {
580 for (unsigned n=0; n < 8; n++, mask >>= 1) {
581 if (mask & 0x1) op1->xmm16u(n) = op2->xmm16u(n);
582 }
583 }
584
xmm_zero_pblendw(BxPackedXmmRegister * dst,const BxPackedXmmRegister * op,Bit32u mask)585 BX_CPP_INLINE void xmm_zero_pblendw(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
586 {
587 for (unsigned n=0; n < 8; n++, mask >>= 1) {
588 dst->xmm16u(n) = (mask & 0x1) ? op->xmm16u(n) : 0;
589 }
590 }
591
592 #if BX_SUPPORT_EVEX
simd_pblendw(BxPackedAvxRegister * op1,const BxPackedAvxRegister * op2,Bit32u mask,unsigned len)593 BX_CPP_INLINE void simd_pblendw(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit32u mask, unsigned len)
594 {
595 for (unsigned n=0; n < len; n++) {
596 if (mask & 0x1) op1->vmm16u(n) = op2->vmm16u(n);
597 mask >>= 1;
598 }
599 }
600
simd_zero_pblendw(BxPackedAvxRegister * dst,const BxPackedAvxRegister * op,Bit32u mask,unsigned len)601 BX_CPP_INLINE void simd_zero_pblendw(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit32u mask, unsigned len)
602 {
603 for (unsigned n=0; n < len; n++) {
604 dst->vmm16u(n) = (mask & 0x1) ? op->vmm16u(n) : 0;
605 mask >>= 1;
606 }
607 }
608 #endif
609
xmm_blendps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,Bit32u mask)610 BX_CPP_INLINE void xmm_blendps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
611 {
612 for (unsigned n=0; n < 4; n++, mask >>= 1) {
613 if (mask & 0x1) op1->xmm32u(n) = op2->xmm32u(n);
614 }
615 }
616
xmm_zero_blendps(BxPackedXmmRegister * dst,const BxPackedXmmRegister * op,Bit32u mask)617 BX_CPP_INLINE void xmm_zero_blendps(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
618 {
619 for (unsigned n=0; n < 4; n++, mask >>= 1) {
620 dst->xmm32u(n) = (mask & 0x1) ? op->xmm32u(n) : 0;
621 }
622 }
623
624 #if BX_SUPPORT_EVEX
simd_blendps(BxPackedAvxRegister * op1,const BxPackedAvxRegister * op2,Bit32u mask,unsigned len)625 BX_CPP_INLINE void simd_blendps(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit32u mask, unsigned len)
626 {
627 for (unsigned n=0; n < len; n++) {
628 if (mask & 0x1) op1->vmm32u(n) = op2->vmm32u(n);
629 mask >>= 1;
630 }
631 }
632
simd_zero_blendps(BxPackedAvxRegister * dst,const BxPackedAvxRegister * op,Bit32u mask,unsigned len)633 BX_CPP_INLINE void simd_zero_blendps(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit32u mask, unsigned len)
634 {
635 for (unsigned n=0; n < len; n++) {
636 dst->vmm32u(n) = (mask & 0x1) ? op->vmm32u(n) : 0;
637 mask >>= 1;
638 }
639 }
640 #endif
641
xmm_blendpd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,Bit32u mask)642 BX_CPP_INLINE void xmm_blendpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
643 {
644 for (unsigned n=0; n < 2; n++, mask >>= 1) {
645 if (mask & 0x1) op1->xmm64u(n) = op2->xmm64u(n);
646 }
647 }
648
xmm_zero_blendpd(BxPackedXmmRegister * dst,const BxPackedXmmRegister * op,Bit32u mask)649 BX_CPP_INLINE void xmm_zero_blendpd(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
650 {
651 for (unsigned n=0; n < 2; n++, mask >>= 1) {
652 dst->xmm64u(n) = (mask & 0x1) ? op->xmm64u(n) : 0;
653 }
654 }
655
656 #if BX_SUPPORT_EVEX
simd_blendpd(BxPackedAvxRegister * op1,const BxPackedAvxRegister * op2,Bit32u mask,unsigned len)657 BX_CPP_INLINE void simd_blendpd(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit32u mask, unsigned len)
658 {
659 for (unsigned n=0; n < len; n++) {
660 if (mask & 0x1) op1->vmm64u(n) = op2->vmm64u(n);
661 mask >>= 1;
662 }
663 }
664
simd_zero_blendpd(BxPackedAvxRegister * dst,const BxPackedAvxRegister * op,Bit32u mask,unsigned len)665 BX_CPP_INLINE void simd_zero_blendpd(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit32u mask, unsigned len)
666 {
667 for (unsigned n=0; n < len; n++) {
668 dst->vmm64u(n) = (mask & 0x1) ? op->vmm64u(n) : 0;
669 mask >>= 1;
670 }
671 }
672 #endif
673
xmm_pblendvb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,const BxPackedXmmRegister * mask)674 BX_CPP_INLINE void xmm_pblendvb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
675 {
676 for(unsigned n=0; n<16; n++) {
677 if (mask->xmmsbyte(n) < 0) op1->xmmubyte(n) = op2->xmmubyte(n);
678 }
679 }
680
xmm_pblendvw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,const BxPackedXmmRegister * mask)681 BX_CPP_INLINE void xmm_pblendvw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
682 {
683 for(unsigned n=0; n<8; n++) {
684 if (mask->xmm16s(n) < 0) op1->xmm16u(n) = op2->xmm16u(n);
685 }
686 }
687
xmm_blendvps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,const BxPackedXmmRegister * mask)688 BX_CPP_INLINE void xmm_blendvps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
689 {
690 for(unsigned n=0; n<4; n++) {
691 if (mask->xmm32s(n) < 0) op1->xmm32u(n) = op2->xmm32u(n);
692 }
693 }
694
xmm_blendvpd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,const BxPackedXmmRegister * mask)695 BX_CPP_INLINE void xmm_blendvpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
696 {
697 if (mask->xmm32s(1) < 0) op1->xmm64u(0) = op2->xmm64u(0);
698 if (mask->xmm32s(3) < 0) op1->xmm64u(1) = op2->xmm64u(1);
699 }
700
701 // arithmetic (logic)
702
xmm_andps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)703 BX_CPP_INLINE void xmm_andps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
704 {
705 for (unsigned n=0; n < 2; n++)
706 op1->xmm64u(n) &= op2->xmm64u(n);
707 }
708
xmm_andnps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)709 BX_CPP_INLINE void xmm_andnps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
710 {
711 for (unsigned n=0; n < 2; n++)
712 op1->xmm64u(n) = ~(op1->xmm64u(n)) & op2->xmm64u(n);
713 }
714
xmm_orps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)715 BX_CPP_INLINE void xmm_orps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
716 {
717 for (unsigned n=0; n < 2; n++)
718 op1->xmm64u(n) |= op2->xmm64u(n);
719 }
720
xmm_xorps(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)721 BX_CPP_INLINE void xmm_xorps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
722 {
723 for (unsigned n=0; n < 2; n++)
724 op1->xmm64u(n) ^= op2->xmm64u(n);
725 }
726
727 // arithmetic (add/sub)
728
xmm_paddb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)729 BX_CPP_INLINE void xmm_paddb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
730 {
731 for(unsigned n=0; n<16; n++) {
732 op1->xmmubyte(n) += op2->xmmubyte(n);
733 }
734 }
735
xmm_paddw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)736 BX_CPP_INLINE void xmm_paddw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
737 {
738 for(unsigned n=0; n<8; n++) {
739 op1->xmm16u(n) += op2->xmm16u(n);
740 }
741 }
742
xmm_paddd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)743 BX_CPP_INLINE void xmm_paddd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
744 {
745 for(unsigned n=0; n<4; n++) {
746 op1->xmm32u(n) += op2->xmm32u(n);
747 }
748 }
749
xmm_paddq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)750 BX_CPP_INLINE void xmm_paddq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
751 {
752 for(unsigned n=0; n<2; n++) {
753 op1->xmm64u(n) += op2->xmm64u(n);
754 }
755 }
756
xmm_psubb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)757 BX_CPP_INLINE void xmm_psubb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
758 {
759 for(unsigned n=0; n<16; n++) {
760 op1->xmmubyte(n) -= op2->xmmubyte(n);
761 }
762 }
763
xmm_psubw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)764 BX_CPP_INLINE void xmm_psubw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
765 {
766 for(unsigned n=0; n<8; n++) {
767 op1->xmm16u(n) -= op2->xmm16u(n);
768 }
769 }
770
xmm_psubd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)771 BX_CPP_INLINE void xmm_psubd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
772 {
773 for(unsigned n=0; n<4; n++) {
774 op1->xmm32u(n) -= op2->xmm32u(n);
775 }
776 }
777
xmm_psubq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)778 BX_CPP_INLINE void xmm_psubq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
779 {
780 for(unsigned n=0; n<2; n++) {
781 op1->xmm64u(n) -= op2->xmm64u(n);
782 }
783 }
784
785 // arithmetic (add/sub with saturation)
786
// PADDSB: add packed signed bytes with signed saturation to [-128, 127]
BX_CPP_INLINE void xmm_paddsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    // widen to 16-bit first so the intermediate sum cannot wrap
    op1->xmmsbyte(n) = SaturateWordSToByteS(Bit16s(op1->xmmsbyte(n)) + Bit16s(op2->xmmsbyte(n)));
  }
}
793
// PADDSW: add packed signed words with signed saturation to [-32768, 32767]
BX_CPP_INLINE void xmm_paddsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    // widen to 32-bit first so the intermediate sum cannot wrap
    op1->xmm16s(n) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(n)) + Bit32s(op2->xmm16s(n)));
  }
}
800
// PADDUSB: add packed unsigned bytes with unsigned saturation to [0, 255]
BX_CPP_INLINE void xmm_paddusb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    // 16-bit signed intermediate is wide enough for 255+255
    op1->xmmubyte(n) = SaturateWordSToByteU(Bit16s(op1->xmmubyte(n)) + Bit16s(op2->xmmubyte(n)));
  }
}
807
// PADDUSW: add packed unsigned words with unsigned saturation to [0, 65535]
BX_CPP_INLINE void xmm_paddusw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    // 32-bit signed intermediate is wide enough for 65535+65535
    op1->xmm16u(n) = SaturateDwordSToWordU(Bit32s(op1->xmm16u(n)) + Bit32s(op2->xmm16u(n)));
  }
}
814
// PSUBSB: subtract packed signed bytes with signed saturation to [-128, 127]
BX_CPP_INLINE void xmm_psubsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    // widen to 16-bit first so the intermediate difference cannot wrap
    op1->xmmsbyte(n) = SaturateWordSToByteS(Bit16s(op1->xmmsbyte(n)) - Bit16s(op2->xmmsbyte(n)));
  }
}
821
// PSUBSW: subtract packed signed words with signed saturation to [-32768, 32767]
BX_CPP_INLINE void xmm_psubsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    // widen to 32-bit first so the intermediate difference cannot wrap
    op1->xmm16s(n) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(n)) - Bit32s(op2->xmm16s(n)));
  }
}
828
// PSUBUSB: subtract packed unsigned bytes, saturating the result at 0
BX_CPP_INLINE void xmm_psubusb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++)
  {
    // only subtract when the result stays positive; otherwise clamp to 0
    if(op1->xmmubyte(n) > op2->xmmubyte(n))
      op1->xmmubyte(n) -= op2->xmmubyte(n);
    else
      op1->xmmubyte(n) = 0;
  }
}
839
// PSUBUSW: subtract packed unsigned words, saturating the result at 0
BX_CPP_INLINE void xmm_psubusw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++)
  {
    // only subtract when the result stays positive; otherwise clamp to 0
    if(op1->xmm16u(n) > op2->xmm16u(n))
      op1->xmm16u(n) -= op2->xmm16u(n);
    else
      op1->xmm16u(n) = 0;
  }
}
850
851 // arithmetic (horizontal add/sub)
852
// PHADDW: horizontal add of adjacent word pairs; low half of the result comes
// from op1's pairs, high half from op2's pairs. Reads of op1 pairs complete
// before the corresponding destination lane is overwritten (ascending order).
BX_CPP_INLINE void xmm_phaddw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16u(0) = op1->xmm16u(0) + op1->xmm16u(1);
  op1->xmm16u(1) = op1->xmm16u(2) + op1->xmm16u(3);
  op1->xmm16u(2) = op1->xmm16u(4) + op1->xmm16u(5);
  op1->xmm16u(3) = op1->xmm16u(6) + op1->xmm16u(7);

  op1->xmm16u(4) = op2->xmm16u(0) + op2->xmm16u(1);
  op1->xmm16u(5) = op2->xmm16u(2) + op2->xmm16u(3);
  op1->xmm16u(6) = op2->xmm16u(4) + op2->xmm16u(5);
  op1->xmm16u(7) = op2->xmm16u(6) + op2->xmm16u(7);
}
865
// PHADDD: horizontal add of adjacent dword pairs (low half from op1, high from op2)
BX_CPP_INLINE void xmm_phaddd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm32u(0) = op1->xmm32u(0) + op1->xmm32u(1);
  op1->xmm32u(1) = op1->xmm32u(2) + op1->xmm32u(3);
  op1->xmm32u(2) = op2->xmm32u(0) + op2->xmm32u(1);
  op1->xmm32u(3) = op2->xmm32u(2) + op2->xmm32u(3);
}
873
// PHADDSW: horizontal add of adjacent signed word pairs with signed saturation
// (low half of result from op1's pairs, high half from op2's pairs)
BX_CPP_INLINE void xmm_phaddsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16s(0) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(0)) + Bit32s(op1->xmm16s(1)));
  op1->xmm16s(1) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(2)) + Bit32s(op1->xmm16s(3)));
  op1->xmm16s(2) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(4)) + Bit32s(op1->xmm16s(5)));
  op1->xmm16s(3) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(6)) + Bit32s(op1->xmm16s(7)));

  op1->xmm16s(4) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(0)) + Bit32s(op2->xmm16s(1)));
  op1->xmm16s(5) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(2)) + Bit32s(op2->xmm16s(3)));
  op1->xmm16s(6) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(4)) + Bit32s(op2->xmm16s(5)));
  op1->xmm16s(7) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(6)) + Bit32s(op2->xmm16s(7)));
}
886
// PHSUBW: horizontal subtract of adjacent word pairs (even lane minus odd lane);
// low half of result from op1's pairs, high half from op2's pairs
BX_CPP_INLINE void xmm_phsubw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16u(0) = op1->xmm16u(0) - op1->xmm16u(1);
  op1->xmm16u(1) = op1->xmm16u(2) - op1->xmm16u(3);
  op1->xmm16u(2) = op1->xmm16u(4) - op1->xmm16u(5);
  op1->xmm16u(3) = op1->xmm16u(6) - op1->xmm16u(7);

  op1->xmm16u(4) = op2->xmm16u(0) - op2->xmm16u(1);
  op1->xmm16u(5) = op2->xmm16u(2) - op2->xmm16u(3);
  op1->xmm16u(6) = op2->xmm16u(4) - op2->xmm16u(5);
  op1->xmm16u(7) = op2->xmm16u(6) - op2->xmm16u(7);
}
899
// PHSUBD: horizontal subtract of adjacent dword pairs (even lane minus odd lane)
BX_CPP_INLINE void xmm_phsubd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm32u(0) = op1->xmm32u(0) - op1->xmm32u(1);
  op1->xmm32u(1) = op1->xmm32u(2) - op1->xmm32u(3);
  op1->xmm32u(2) = op2->xmm32u(0) - op2->xmm32u(1);
  op1->xmm32u(3) = op2->xmm32u(2) - op2->xmm32u(3);
}
907
// PHSUBSW: horizontal subtract of adjacent signed word pairs with signed saturation
// (low half of result from op1's pairs, high half from op2's pairs)
BX_CPP_INLINE void xmm_phsubsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16s(0) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(0)) - Bit32s(op1->xmm16s(1)));
  op1->xmm16s(1) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(2)) - Bit32s(op1->xmm16s(3)));
  op1->xmm16s(2) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(4)) - Bit32s(op1->xmm16s(5)));
  op1->xmm16s(3) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(6)) - Bit32s(op1->xmm16s(7)));

  op1->xmm16s(4) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(0)) - Bit32s(op2->xmm16s(1)));
  op1->xmm16s(5) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(2)) - Bit32s(op2->xmm16s(3)));
  op1->xmm16s(6) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(4)) - Bit32s(op2->xmm16s(5)));
  op1->xmm16s(7) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(6)) - Bit32s(op2->xmm16s(7)));
}
920
921 // average
922
xmm_pavgb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)923 BX_CPP_INLINE void xmm_pavgb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
924 {
925 for(unsigned n=0; n<16; n++) {
926 op1->xmmubyte(n) = (op1->xmmubyte(n) + op2->xmmubyte(n) + 1) >> 1;
927 }
928 }
929
xmm_pavgw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)930 BX_CPP_INLINE void xmm_pavgw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
931 {
932 for(unsigned n=0; n<8; n++) {
933 op1->xmm16u(n) = (op1->xmm16u(n) + op2->xmm16u(n) + 1) >> 1;
934 }
935 }
936
937 // multiply
938
xmm_pmullw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)939 BX_CPP_INLINE void xmm_pmullw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
940 {
941 for(unsigned n=0; n<8; n++) {
942 op1->xmm16s(n) *= op2->xmm16s(n);
943 }
944 }
945
// PMULHW: high 16 bits of the signed 16x16->32 product in every word lane
BX_CPP_INLINE void xmm_pmulhw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    Bit32s product = Bit32s(op1->xmm16s(n)) * Bit32s(op2->xmm16s(n));
    op1->xmm16u(n) = (Bit16u)(product >> 16);
  }
}
953
// PMULHUW: high 16 bits of the unsigned 16x16->32 product in every word lane
BX_CPP_INLINE void xmm_pmulhuw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    Bit32u product = Bit32u(op1->xmm16u(n)) * Bit32u(op2->xmm16u(n));
    op1->xmm16u(n) = (Bit16u)(product >> 16);
  }
}
961
xmm_pmulld(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)962 BX_CPP_INLINE void xmm_pmulld(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
963 {
964 for(unsigned n=0; n<4; n++) {
965 op1->xmm32s(n) *= op2->xmm32s(n);
966 }
967 }
968
xmm_pmullq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)969 BX_CPP_INLINE void xmm_pmullq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
970 {
971 for(unsigned n=0; n<2; n++) {
972 op1->xmm64s(n) *= op2->xmm64s(n);
973 }
974 }
975
xmm_pmuldq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)976 BX_CPP_INLINE void xmm_pmuldq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
977 {
978 op1->xmm64s(0) = Bit64s(op1->xmm32s(0)) * Bit64s(op2->xmm32s(0));
979 op1->xmm64s(1) = Bit64s(op1->xmm32s(2)) * Bit64s(op2->xmm32s(2));
980 }
981
xmm_pmuludq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)982 BX_CPP_INLINE void xmm_pmuludq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
983 {
984 op1->xmm64u(0) = Bit64u(op1->xmm32u(0)) * Bit64u(op2->xmm32u(0));
985 op1->xmm64u(1) = Bit64u(op1->xmm32u(2)) * Bit64u(op2->xmm32u(2));
986 }
987
// PMULHRSW: signed 16x16->32 multiply, scale down by 14, round (+1) and
// take bits [16:1] — per the SSSE3 definition
BX_CPP_INLINE void xmm_pmulhrsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16u(n) = (((Bit32s(op1->xmm16s(n)) * Bit32s(op2->xmm16s(n))) >> 14) + 1) >> 1;
  }
}
994
995 // multiply/add
996
// PMADDUBSW: multiply unsigned bytes of op1 by signed bytes of op2,
// add adjacent pair products, and saturate each sum to a signed word
BX_CPP_INLINE void xmm_pmaddubsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++)
  {
    Bit32s temp = Bit32s(op1->xmmubyte(n*2))   * Bit32s(op2->xmmsbyte(n*2)) +
                  Bit32s(op1->xmmubyte(n*2+1)) * Bit32s(op2->xmmsbyte(n*2+1));

    op1->xmm16s(n) = SaturateDwordSToWordS(temp);
  }
}
1007
// PMADDWD: signed 16x16->32 multiplies of adjacent word pairs, pairwise sums
// stored as dwords (no saturation; only 0x8000*0x8000+0x8000*0x8000 can wrap,
// matching hardware behavior)
BX_CPP_INLINE void xmm_pmaddwd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++)
  {
    op1->xmm32u(n) = Bit32s(op1->xmm16s(n*2))   * Bit32s(op2->xmm16s(n*2)) +
                     Bit32s(op1->xmm16s(n*2+1)) * Bit32s(op2->xmm16s(n*2+1));
  }
}
1016
1017 // broadcast
1018
xmm_pbroadcastb(BxPackedXmmRegister * op,Bit8u val_8)1019 BX_CPP_INLINE void xmm_pbroadcastb(BxPackedXmmRegister *op, Bit8u val_8)
1020 {
1021 for(unsigned n=0; n<16; n++) {
1022 op->xmmubyte(n) = val_8;
1023 }
1024 }
1025
xmm_pbroadcastw(BxPackedXmmRegister * op,Bit16u val_16)1026 BX_CPP_INLINE void xmm_pbroadcastw(BxPackedXmmRegister *op, Bit16u val_16)
1027 {
1028 for(unsigned n=0; n<8; n++) {
1029 op->xmm16u(n) = val_16;
1030 }
1031 }
1032
xmm_pbroadcastd(BxPackedXmmRegister * op,Bit32u val_32)1033 BX_CPP_INLINE void xmm_pbroadcastd(BxPackedXmmRegister *op, Bit32u val_32)
1034 {
1035 for(unsigned n=0; n<4; n++) {
1036 op->xmm32u(n) = val_32;
1037 }
1038 }
1039
xmm_pbroadcastq(BxPackedXmmRegister * op,Bit64u val_64)1040 BX_CPP_INLINE void xmm_pbroadcastq(BxPackedXmmRegister *op, Bit64u val_64)
1041 {
1042 for(unsigned n=0; n<2; n++) {
1043 op->xmm64u(n) = val_64;
1044 }
1045 }
1046
1047 #if BX_SUPPORT_EVEX
simd_pbroadcastb(BxPackedAvxRegister * op,Bit8u val_8,unsigned len)1048 BX_CPP_INLINE void simd_pbroadcastb(BxPackedAvxRegister *op, Bit8u val_8, unsigned len)
1049 {
1050 for(unsigned n=0; n < len; n++) {
1051 op->vmmubyte(n) = val_8;
1052 }
1053 }
1054
simd_pbroadcastw(BxPackedAvxRegister * op,Bit16u val_16,unsigned len)1055 BX_CPP_INLINE void simd_pbroadcastw(BxPackedAvxRegister *op, Bit16u val_16, unsigned len)
1056 {
1057 for(unsigned n=0; n < len; n++) {
1058 op->vmm16u(n) = val_16;
1059 }
1060 }
1061
simd_pbroadcastd(BxPackedAvxRegister * op,Bit32u val_32,unsigned len)1062 BX_CPP_INLINE void simd_pbroadcastd(BxPackedAvxRegister *op, Bit32u val_32, unsigned len)
1063 {
1064 for(unsigned n=0; n < len; n++) {
1065 op->vmm32u(n) = val_32;
1066 }
1067 }
1068
simd_pbroadcastq(BxPackedAvxRegister * op,Bit64u val_64,unsigned len)1069 BX_CPP_INLINE void simd_pbroadcastq(BxPackedAvxRegister *op, Bit64u val_64, unsigned len)
1070 {
1071 for(unsigned n=0; n < len; n++) {
1072 op->vmm64u(n) = val_64;
1073 }
1074 }
1075 #endif
1076
1077 // sum of absolute differences (SAD)
1078
// PSADBW: sum of absolute byte differences; bytes 0-7 accumulate into qword 0,
// bytes 8-15 into qword 1 (max sum 8*255 fits easily in 'unsigned')
BX_CPP_INLINE void xmm_psadbw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  unsigned temp = 0;
  for (unsigned n=0; n < 8; n++)
    temp += abs(op1->xmmubyte(n) - op2->xmmubyte(n));

  op1->xmm64u(0) = Bit64u(temp);

  temp = 0;
  for (unsigned n=8; n < 16; n++)
    temp += abs(op1->xmmubyte(n) - op2->xmmubyte(n));

  op1->xmm64u(1) = Bit64u(temp);
}
1093
1094 // multiple sum of absolute differences (MSAD)
1095
// Helper for MPSADBW/DBPSADBW: sum of absolute differences of 4 consecutive
// bytes of op1 (starting at op1_offset) against 4 consecutive bytes of op2
// (starting at op2_offset); max sum 4*255 fits in the Bit16u return
BX_CPP_INLINE Bit16u sad_quadruple(const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, int op1_offset, int op2_offset)
{
  Bit32u r = 0;

  for (unsigned n=0; n < 4; n++) {
    Bit8u temp1 = op1->xmmubyte(n + op1_offset);
    Bit8u temp2 = op2->xmmubyte(n + op2_offset);

    r += abs(temp1 - temp2);
  }

  return r;
}
1109
// MPSADBW (SSE4.1): eight overlapping 4-byte SADs.
// control[1:0] selects the 4-byte source block in op2 (offset 0/4/8/12),
// control[2] selects which 8-byte half of op1 the sliding window starts in.
BX_CPP_INLINE void xmm_mpsadbw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit8u control)
{
  unsigned src_offset = (control & 0x3) * 4;
  unsigned dst_offset = ((control >> 2) & 0x1) * 4;

  for (unsigned j=0; j < 8; j++) {
    r->xmm16u(j) = sad_quadruple(op1, op2, dst_offset + j, src_offset);
  }
}
1119
// VDBPSADBW (AVX512-BW): double-block packed SAD on 4-byte groups; the
// required dword shuffle of op2 is assumed to have been applied by the caller
BX_CPP_INLINE void xmm_dbpsadbw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  // assuming shuffle of op2 was done outside
  r->xmm16u(0) = sad_quadruple(op1, op2, 0, 0);
  r->xmm16u(1) = sad_quadruple(op1, op2, 0, 1);
  r->xmm16u(2) = sad_quadruple(op1, op2, 4, 2);
  r->xmm16u(3) = sad_quadruple(op1, op2, 4, 3);
  r->xmm16u(4) = sad_quadruple(op1, op2, 8, 8);
  r->xmm16u(5) = sad_quadruple(op1, op2, 8, 9);
  r->xmm16u(6) = sad_quadruple(op1, op2, 12, 10);
  r->xmm16u(7) = sad_quadruple(op1, op2, 12, 11);
}
1132
1133 // conflict
1134
1135 #if BX_SUPPORT_EVEX
1136
simd_pconflictd(const BxPackedAvxRegister * op,int index)1137 BX_CPP_INLINE Bit32u simd_pconflictd(const BxPackedAvxRegister *op, int index)
1138 {
1139 Bit32u result = 0;
1140 // compare index element with all previous elements
1141 for (int i=0; i<index-1; i++) {
1142 if (op->vmm32u(index) == op->vmm32u(i)) result |= (1 << i);
1143 }
1144 return result;
1145 }
1146
simd_pconflictq(const BxPackedAvxRegister * op,int index)1147 BX_CPP_INLINE Bit32u simd_pconflictq(const BxPackedAvxRegister *op, int index)
1148 {
1149 Bit32u result = 0;
1150 // compare index element with all previous elements
1151 for (int i=0; i<index-1; i++) {
1152 if (op->vmm64u(index) == op->vmm64u(i)) result |= (1 << i);
1153 }
1154 return result;
1155 }
1156
1157 #endif
1158
1159 // bitwise select
1160
xmm_pselect(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2,const BxPackedXmmRegister * op3)1161 BX_CPP_INLINE void xmm_pselect(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *op3)
1162 {
1163 for(unsigned n=0;n < 2;n++) {
1164 op1->xmm64u(n) = (op3->xmm64u(n) & op1->xmm64u(n)) | (~op3->xmm64u(n) & op2->xmm64u(n));
1165 }
1166 }
1167
1168 // shift
1169
// VPSRAVW: per-lane arithmetic right shift of words by a per-lane count;
// a count above 15 fills the lane with the sign bit
BX_CPP_INLINE void xmm_psravw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 8; n++) {
    unsigned shift = op2->xmm16u(n);
    if(shift > 15)
      op1->xmm16u(n) = (op1->xmm16s(n) < 0) ? 0xffff : 0;
    else
      op1->xmm16u(n) = (Bit16u)(op1->xmm16s(n) >> shift);
  }
}
1180
// VPSRAVD: per-lane arithmetic right shift of dwords by a per-lane count;
// a count above 31 fills the lane with the sign bit
BX_CPP_INLINE void xmm_psravd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 4; n++) {
    Bit32u shift = op2->xmm32u(n);
    if(shift > 31)
      op1->xmm32u(n) = (op1->xmm32s(n) < 0) ? 0xffffffff : 0;
    else
      op1->xmm32u(n) = (Bit32u)(op1->xmm32s(n) >> shift);
  }
}
1191
xmm_psravq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1192 BX_CPP_INLINE void xmm_psravq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1193 {
1194 for (unsigned n=0; n < 2; n++) {
1195 Bit64u shift = op2->xmm64u(n);
1196 if(shift > 64)
1197 op1->xmm64u(n) = (op1->xmm64s(n) < 0) ? BX_CONST64(0xffffffffffffffff) : 0;
1198 else
1199 op1->xmm64u(n) = (Bit64u)(op1->xmm64s(n) >> shift);
1200 }
1201 }
1202
// VPSLLVW: per-lane logical left shift of words; a count above 15 zeroes the lane
BX_CPP_INLINE void xmm_psllvw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 8; n++) {
    unsigned shift = op2->xmm16u(n);
    if(shift > 15)
      op1->xmm16u(n) = 0;
    else
      op1->xmm16u(n) <<= shift;
  }
}
1213
// VPSLLVD: per-lane logical left shift of dwords; a count above 31 zeroes the lane
BX_CPP_INLINE void xmm_psllvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 4; n++) {
    Bit32u shift = op2->xmm32u(n);
    if(shift > 31)
      op1->xmm32u(n) = 0;
    else
      op1->xmm32u(n) <<= shift;
  }
}
1224
// VPSLLVQ: per-lane logical left shift of qwords; a count above 63 zeroes the lane
BX_CPP_INLINE void xmm_psllvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++) {
    Bit64u shift = op2->xmm64u(n);
    if(shift > 63)
      op1->xmm64u(n) = 0;
    else
      op1->xmm64u(n) <<= shift;
  }
}
1235
// VPSRLVW: per-lane logical right shift of words; a count above 15 zeroes the lane
BX_CPP_INLINE void xmm_psrlvw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 8; n++) {
    unsigned shift = op2->xmm16u(n);
    if(shift > 15)
      op1->xmm16u(n) = 0;
    else
      op1->xmm16u(n) >>= shift;
  }
}
1246
// VPSRLVD: per-lane logical right shift of dwords; a count above 31 zeroes the lane
BX_CPP_INLINE void xmm_psrlvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 4; n++) {
    Bit32u shift = op2->xmm32u(n);
    if(shift > 31)
      op1->xmm32u(n) = 0;
    else
      op1->xmm32u(n) >>= shift;
  }
}
1257
// VPSRLVQ: per-lane logical right shift of qwords; a count above 63 zeroes the lane
BX_CPP_INLINE void xmm_psrlvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++) {
    Bit64u shift = op2->xmm64u(n);
    if(shift > 63)
      op1->xmm64u(n) = 0;
    else
      op1->xmm64u(n) >>= shift;
  }
}
1268
// PSRAW: arithmetic right shift of all word lanes by one shared count;
// a count above 15 fills every lane with its sign bit
BX_CPP_INLINE void xmm_psraw(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 15) {
    for (unsigned n=0; n < 8; n++)
      op->xmm16u(n) = (op->xmm16s(n) < 0) ? 0xffff : 0;
  }
  else
  {
    // count <= 15 fits in a byte
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 8; n++)
      op->xmm16u(n) = (Bit16u)(op->xmm16s(n) >> shift);
  }
}
1283
// PSRAD: arithmetic right shift of all dword lanes by one shared count;
// a count above 31 fills every lane with its sign bit
BX_CPP_INLINE void xmm_psrad(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 31) {
    for (unsigned n=0; n < 4; n++)
      op->xmm32u(n) = (op->xmm32s(n) < 0) ? 0xffffffff : 0;
  }
  else
  {
    // count <= 31 fits in a byte
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 4; n++)
      op->xmm32u(n) = (Bit32u)(op->xmm32s(n) >> shift);
  }
}
1298
// VPSRAQ: arithmetic right shift of both qword lanes by one shared count;
// a count above 63 fills every lane with its sign bit
BX_CPP_INLINE void xmm_psraq(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 63) {
    for (unsigned n=0; n < 2; n++)
      op->xmm64u(n) = (op->xmm64s(n) < 0) ? BX_CONST64(0xffffffffffffffff) : 0;
  }
  else
  {
    // count <= 63 fits in a byte
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 2; n++)
      op->xmm64u(n) = (Bit64u)(op->xmm64s(n) >> shift);
  }
}
1313
// PSRLW: logical right shift of all word lanes by one shared count;
// a count above 15 zeroes the whole register
BX_CPP_INLINE void xmm_psrlw(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 15) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 8; n++)
      op->xmm16u(n) >>= shift;
  }
}
1325
// PSRLD: logical right shift of all dword lanes by one shared count;
// a count above 31 zeroes the whole register
BX_CPP_INLINE void xmm_psrld(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 31) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 4; n++)
      op->xmm32u(n) >>= shift;
  }
}
1337
xmm_psrlq(BxPackedXmmRegister * op,Bit64u shift_64)1338 BX_CPP_INLINE void xmm_psrlq(BxPackedXmmRegister *op, Bit64u shift_64)
1339 {
1340 if(shift_64 > 64) op->clear();
1341 else
1342 {
1343 Bit8u shift = (Bit8u) shift_64;
1344
1345 for (unsigned n=0; n < 2; n++)
1346 op->xmm64u(n) >>= shift;
1347 }
1348 }
1349
// PSLLW: logical left shift of all word lanes by one shared count;
// a count above 15 zeroes the whole register
BX_CPP_INLINE void xmm_psllw(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 15) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 8; n++)
      op->xmm16u(n) <<= shift;
  }
}
1361
// PSLLD: logical left shift of all dword lanes by one shared count;
// a count above 31 zeroes the whole register
BX_CPP_INLINE void xmm_pslld(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 31) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 4; n++)
      op->xmm32u(n) <<= shift;
  }
}
1373
// PSLLQ: logical left shift of both qword lanes by one shared count;
// a count above 63 zeroes the whole register
BX_CPP_INLINE void xmm_psllq(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 63) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 2; n++)
      op->xmm64u(n) <<= shift;
  }
}
1385
// PSRLDQ: shift the whole 128-bit register right by 'shift' BYTES;
// a count above 15 zeroes the register
BX_CPP_INLINE void xmm_psrldq(BxPackedXmmRegister *op, Bit8u shift)
{
  if(shift > 15) op->clear();
  else {
    // first handle a whole-qword move when shifting by 8..15 bytes
    if (shift > 7) {
      op->xmm64u(0) = op->xmm64u(1);
      op->xmm64u(1) = 0;
      shift -= 8;
    }

    // convert remaining byte count (0..7) to a bit count (0..56)
    shift <<= 3;

    // shift != 0 guard avoids the undefined '<< 64' for a zero count
    if (shift != 0) {
      op->xmm64u(0) = (op->xmm64u(0) >> shift) | (op->xmm64u(1) << (64-shift));
      op->xmm64u(1) = (op->xmm64u(1) >> shift);
    }
  }
}
1404
// PSLLDQ: shift the whole 128-bit register left by 'shift' BYTES;
// a count above 15 zeroes the register
BX_CPP_INLINE void xmm_pslldq(BxPackedXmmRegister *op, Bit8u shift)
{
  if(shift > 15) op->clear();
  else {
    // first handle a whole-qword move when shifting by 8..15 bytes
    if (shift > 7) {
      op->xmm64u(1) = op->xmm64u(0);
      op->xmm64u(0) = 0;
      shift -= 8;
    }

    // convert remaining byte count (0..7) to a bit count (0..56)
    shift <<= 3;

    // shift != 0 guard avoids the undefined '>> 64' for a zero count
    if (shift != 0) {
      op->xmm64u(1) = (op->xmm64u(1) << shift) | (op->xmm64u(0) >> (64-shift));
      op->xmm64u(0) = (op->xmm64u(0) << shift);
    }
  }
}
1423
// PALIGNR: treat [op1:op2] as one 256-bit value and shift it right by
// 'shift' bytes; the low 128 bits of the result land in op2
BX_CPP_INLINE void xmm_palignr(BxPackedXmmRegister *op2, const BxPackedXmmRegister *op1, Bit8u shift)
{
  // op2 = [op1:op2] >> shift

  // shifts of 16..31 bytes discard op2 entirely; reduces to a PSRLDQ of op1
  if (shift > 15) {
    *op2 = *op1;
    xmm_psrldq(op2, shift - 16);
    return;
  }

  // byte count (0..15) to bit count (0..120)
  shift <<= 3;

  if (shift > 64) {
    // 65..120 bits: result straddles op2's high qword and op1
    shift -= 64;
    op2->xmm64u(0) = (op2->xmm64u(1) >> shift) | (op1->xmm64u(0) << (64-shift));
    op2->xmm64u(1) = (op1->xmm64u(0) >> shift) | (op1->xmm64u(1) << (64-shift));
  }
  else if (shift == 64) {
    // exactly one qword: pure move, handled separately to avoid shifting by 0/64
    op2->xmm64u(0) = op2->xmm64u(1);
    op2->xmm64u(1) = op1->xmm64u(0);
  }
  else if (shift != 0) {
    // 1..63 bits: result straddles op2 and op1's low qword
    op2->xmm64u(0) = (op2->xmm64u(0) >> shift) | (op2->xmm64u(1) << (64-shift));
    op2->xmm64u(1) = (op2->xmm64u(1) >> shift) | (op1->xmm64u(0) << (64-shift));
  }
}
1450
1451 // rotate (right)
1452
// Rotate every byte lane right by (shift mod 8). Safe for shift==0: the
// bytes are promoted to int before shifting, so '<< 8' is well-defined and
// truncation back to a byte yields the original value.
BX_CPP_INLINE void xmm_prorb(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x7;

  for(unsigned n=0;n<16;n++) {
    op->xmmubyte(n) = (op->xmmubyte(n) >> shift) | (op->xmmubyte(n) << (8 - shift));
  }
}
1461
// Rotate every word lane right by (shift mod 16). Safe for shift==0: the
// words are promoted to int before shifting, so '<< 16' is well-defined and
// truncation back to a word yields the original value.
BX_CPP_INLINE void xmm_prorw(BxPackedXmmRegister *op, int shift)
{
  shift &= 0xf;

  for(unsigned n=0;n<8;n++) {
    op->xmm16u(n) = (op->xmm16u(n) >> shift) | (op->xmm16u(n) << (16 - shift));
  }
}
1470
xmm_prord(BxPackedXmmRegister * op,int shift)1471 BX_CPP_INLINE void xmm_prord(BxPackedXmmRegister *op, int shift)
1472 {
1473 shift &= 0x1f;
1474
1475 for(unsigned n=0;n<4;n++) {
1476 op->xmm32u(n) = (op->xmm32u(n) >> shift) | (op->xmm32u(n) << (32 - shift));
1477 }
1478 }
1479
xmm_prorq(BxPackedXmmRegister * op,int shift)1480 BX_CPP_INLINE void xmm_prorq(BxPackedXmmRegister *op, int shift)
1481 {
1482 shift &= 0x3f;
1483
1484 for(unsigned n=0;n<2;n++) {
1485 op->xmm64u(n) = (op->xmm64u(n) >> shift) | (op->xmm64u(n) << (64 - shift));
1486 }
1487 }
1488
xmm_prorvd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1489 BX_CPP_INLINE void xmm_prorvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1490 {
1491 for(unsigned n=0;n<4;n++) {
1492 int shift = op2->xmm32u(n) & 0x1f;
1493 op1->xmm32u(n) = (op1->xmm32u(n) >> shift) | (op1->xmm32u(n) << (32 - shift));
1494 }
1495 }
1496
xmm_prorvq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1497 BX_CPP_INLINE void xmm_prorvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1498 {
1499 for(unsigned n=0;n<2;n++) {
1500 int shift = op2->xmm64u(n) & 0x3f;
1501 op1->xmm64u(n) = (op1->xmm64u(n) >> shift) | (op1->xmm64u(n) << (64 - shift));
1502 }
1503 }
1504
1505 // rotate (left)
1506
// Rotate every byte lane left by (shift mod 8). Safe for shift==0: the
// bytes are promoted to int before shifting, so '>> 8' simply yields 0..high
// bits that truncate away when storing back to a byte.
BX_CPP_INLINE void xmm_prolb(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x7;

  for(unsigned n=0;n<16;n++) {
    op->xmmubyte(n) = (op->xmmubyte(n) << shift) | (op->xmmubyte(n) >> (8 - shift));
  }
}
1515
// Rotate every word lane left by (shift mod 16). Safe for shift==0 because
// the words are promoted to int before shifting.
BX_CPP_INLINE void xmm_prolw(BxPackedXmmRegister *op, int shift)
{
  shift &= 0xf;

  for(unsigned n=0;n<8;n++) {
    op->xmm16u(n) = (op->xmm16u(n) << shift) | (op->xmm16u(n) >> (16 - shift));
  }
}
1524
xmm_prold(BxPackedXmmRegister * op,int shift)1525 BX_CPP_INLINE void xmm_prold(BxPackedXmmRegister *op, int shift)
1526 {
1527 shift &= 0x1f;
1528
1529 for(unsigned n=0;n<4;n++) {
1530 op->xmm32u(n) = (op->xmm32u(n) << shift) | (op->xmm32u(n) >> (32 - shift));
1531 }
1532 }
1533
xmm_prolq(BxPackedXmmRegister * op,int shift)1534 BX_CPP_INLINE void xmm_prolq(BxPackedXmmRegister *op, int shift)
1535 {
1536 shift &= 0x3f;
1537
1538 for(unsigned n=0;n<2;n++) {
1539 op->xmm64u(n) = (op->xmm64u(n) << shift) | (op->xmm64u(n) >> (64 - shift));
1540 }
1541 }
1542
xmm_prolvd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1543 BX_CPP_INLINE void xmm_prolvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1544 {
1545 for(unsigned n=0;n<4;n++) {
1546 int shift = op2->xmm32u(n) & 0x1f;
1547 op1->xmm32u(n) = (op1->xmm32u(n) << shift) | (op1->xmm32u(n) >> (32 - shift));
1548 }
1549 }
1550
xmm_prolvq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1551 BX_CPP_INLINE void xmm_prolvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1552 {
1553 for(unsigned n=0;n<2;n++) {
1554 int shift = op2->xmm64u(n) & 0x3f;
1555 op1->xmm64u(n) = (op1->xmm64u(n) << shift) | (op1->xmm64u(n) >> (64 - shift));
1556 }
1557 }
1558
1559 // variable shift/rotate (XOP)
1560
// XOP VPROTB: per-lane byte rotate; a positive per-lane count rotates left,
// a negative count rotates right. Shift-by-8 is safe here because the bytes
// are promoted to int before shifting.
BX_CPP_INLINE void xmm_protb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 16;n++) {
    int shift = op2->xmmsbyte(n);
    if (shift > 0) {
      // rotate left
      shift &= 0x7;
      op1->xmmubyte(n) = (op1->xmmubyte(n) << shift) | (op1->xmmubyte(n) >> (8 - shift));
    }
    else if (shift < 0) {
      // rotate right
      shift = -shift & 0x7;
      op1->xmmubyte(n) = (op1->xmmubyte(n) >> shift) | (op1->xmmubyte(n) << (8 - shift));
    }
  }
}
1577
// XOP VPROTW: per-lane word rotate; the control byte is taken from the low
// byte of each word lane, positive = rotate left, negative = rotate right.
// Shift-by-16 is safe because the words are promoted to int before shifting.
BX_CPP_INLINE void xmm_protw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 8;n++) {
    int shift = op2->xmmsbyte(n*2);
    if (shift > 0) {
      // rotate left
      shift &= 0xf;
      op1->xmm16u(n) = (op1->xmm16u(n) << shift) | (op1->xmm16u(n) >> (16 - shift));
    }
    else if (shift < 0) {
      // rotate right
      shift = -shift & 0xf;
      op1->xmm16u(n) = (op1->xmm16u(n) >> shift) | (op1->xmm16u(n) << (16 - shift));
    }
  }
}
1594
xmm_protd(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1595 BX_CPP_INLINE void xmm_protd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1596 {
1597 for(unsigned n=0;n < 4;n++) {
1598 int shift = op2->xmmsbyte(n*4);
1599 if (shift > 0) {
1600 // rotate left
1601 shift &= 0x1f;
1602 op1->xmm32u(n) = (op1->xmm32u(n) << shift) | (op1->xmm32u(n) >> (32 - shift));
1603 }
1604 else if (shift < 0) {
1605 // rotate right
1606 shift = -shift & 0x1f;
1607 op1->xmm32u(n) = (op1->xmm32u(n) >> shift) | (op1->xmm32u(n) << (32 - shift));
1608 }
1609 }
1610 }
1611
xmm_protq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1612 BX_CPP_INLINE void xmm_protq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1613 {
1614 for(unsigned n=0;n < 2;n++) {
1615 int shift = op2->xmmsbyte(n*8);
1616 if (shift > 0) {
1617 // rotate left
1618 shift &= 0x3f;
1619 op1->xmm64u(n) = (op1->xmm64u(n) << shift) | (op1->xmm64u(n) >> (64 - shift));
1620 }
1621 else if (shift < 0) {
1622 // rotate right
1623 shift = -shift & 0x3f;
1624 op1->xmm64u(n) = (op1->xmm64u(n) >> shift) | (op1->xmm64u(n) << (64 - shift));
1625 }
1626 }
1627 }
1628
// XOP VPSHAB: per-lane byte ARITHMETIC shift; positive per-lane count shifts
// left, negative shifts right (sign-propagating); count magnitude taken mod 8
BX_CPP_INLINE void xmm_pshab(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 16;n++) {
    int shift = op2->xmmsbyte(n);
    if (shift > 0) {
      // shift left
      op1->xmmsbyte(n) <<= (shift & 0x7);
    }
    else if (shift < 0) {
      // shift right
      op1->xmmsbyte(n) >>= (-shift & 0x7);
    }
  }
}
1643
// XOP VPSHAW: per-lane word ARITHMETIC shift; control byte is the low byte
// of each word lane, positive = left, negative = right (sign-propagating)
BX_CPP_INLINE void xmm_pshaw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 8;n++) {
    int shift = op2->xmmsbyte(n*2);
    if (shift > 0) {
      // shift left
      op1->xmm16s(n) <<= (shift & 0xf);
    }
    else if (shift < 0) {
      // shift right
      op1->xmm16s(n) >>= (-shift & 0xf);
    }
  }
}
1658
xmm_pshad(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1659 BX_CPP_INLINE void xmm_pshad(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1660 {
1661 for(unsigned n=0;n < 4;n++) {
1662 int shift = op2->xmmsbyte(n*4);
1663 if (shift > 0) {
1664 // shift left
1665 op1->xmm32s(n) <<= (shift & 0x1f);
1666 }
1667 else if (shift < 0) {
1668 // shift right
1669 op1->xmm32s(n) >>= (-shift & 0x1f);
1670 }
1671 }
1672 }
1673
xmm_pshaq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1674 BX_CPP_INLINE void xmm_pshaq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1675 {
1676 for(unsigned n=0;n < 2;n++) {
1677 int shift = op2->xmmsbyte(n*8);
1678 if (shift > 0) {
1679 // shift left
1680 op1->xmm64s(n) <<= (shift & 0x3f);
1681 }
1682 else if (shift < 0) {
1683 // shift right
1684 op1->xmm64s(n) >>= (-shift & 0x3f);
1685 }
1686 }
1687 }
1688
xmm_pshlb(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1689 BX_CPP_INLINE void xmm_pshlb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1690 {
1691 for(unsigned n=0;n < 16;n++) {
1692 int shift = op2->xmmsbyte(n);
1693 if (shift > 0) {
1694 // shift left
1695 op1->xmmubyte(n) <<= (shift & 0x7);
1696 }
1697 else if (shift < 0) {
1698 // shift right
1699 op1->xmmubyte(n) >>= (-shift & 0x7);
1700 }
1701 }
1702 }
1703
xmm_pshlw(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1704 BX_CPP_INLINE void xmm_pshlw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1705 {
1706 for(unsigned n=0;n < 8;n++) {
1707 int shift = op2->xmmubyte(n*2);
1708 if (shift > 0) {
1709 // shift left
1710 op1->xmm16u(n) <<= (shift & 0xf);
1711 }
1712 else if (shift < 0) {
1713 // shift right
1714 op1->xmm16u(n) >>= (-shift & 0xf);
1715 }
1716 }
1717 }
1718
xmm_pshld(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1719 BX_CPP_INLINE void xmm_pshld(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1720 {
1721 for(unsigned n=0;n < 4;n++) {
1722 int shift = op2->xmmsbyte(n*4);
1723 if (shift > 0) {
1724 // shift left
1725 op1->xmm32u(n) <<= (shift & 0x1f);
1726 }
1727 else if (shift < 0) {
1728 // shift right
1729 op1->xmm32u(n) >>= (-shift & 0x1f);
1730 }
1731 }
1732 }
1733
xmm_pshlq(BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1734 BX_CPP_INLINE void xmm_pshlq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1735 {
1736 for(unsigned n=0;n < 2;n++) {
1737 int shift = op2->xmmsbyte(n*8);
1738 if (shift > 0) {
1739 // shift left
1740 op1->xmm64u(n) <<= (shift & 0x3f);
1741 }
1742 else if (shift < 0) {
1743 // shift right
1744 op1->xmm64u(n) >>= (-shift & 0x3f);
1745 }
1746 }
1747 }
1748
1749 // VNNI
1750
xmm_pdpbusd(BxPackedXmmRegister * dst,BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1751 BX_CPP_INLINE void xmm_pdpbusd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1752 {
1753 for(unsigned n=0; n<4; n++)
1754 {
1755 Bit32s p1word = (Bit32u) op1->xmmubyte(n*4) * (Bit32s) op2->xmmsbyte(n*4);
1756 Bit32s p2word = (Bit32u) op1->xmmubyte(n*4+1) * (Bit32s) op2->xmmsbyte(n*4+1);
1757 Bit32s p3word = (Bit32u) op1->xmmubyte(n*4+2) * (Bit32s) op2->xmmsbyte(n*4+2);
1758 Bit32s p4word = (Bit32u) op1->xmmubyte(n*4+3) * (Bit32s) op2->xmmsbyte(n*4+3);
1759
1760 dst->xmm32s(n) += (p1word + p2word + p3word + p4word);
1761 }
1762 }
1763
xmm_pdpbusds(BxPackedXmmRegister * dst,BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1764 BX_CPP_INLINE void xmm_pdpbusds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1765 {
1766 for(unsigned n=0; n<4; n++)
1767 {
1768 Bit32s p1word = (Bit32u) op1->xmmubyte(n*4) * (Bit32s) op2->xmmsbyte(n*4);
1769 Bit32s p2word = (Bit32u) op1->xmmubyte(n*4+1) * (Bit32s) op2->xmmsbyte(n*4+1);
1770 Bit32s p3word = (Bit32u) op1->xmmubyte(n*4+2) * (Bit32s) op2->xmmsbyte(n*4+2);
1771 Bit32s p4word = (Bit32u) op1->xmmubyte(n*4+3) * (Bit32s) op2->xmmsbyte(n*4+3);
1772
1773 Bit64s result = (Bit64s) dst->xmm32s(n) + (p1word + p2word + p3word + p4word);
1774 dst->xmm32s(n) = SaturateQwordSToDwordS(result);
1775 }
1776 }
1777
xmm_pdpwssd(BxPackedXmmRegister * dst,BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1778 BX_CPP_INLINE void xmm_pdpwssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1779 {
1780 for(unsigned n=0; n<4; n++)
1781 {
1782 Bit32s p1_dword = (Bit32s) op1->xmm16s(n*2) * (Bit32s) op2->xmm16s(n*2);
1783 Bit32s p2_dword = (Bit32s) op1->xmm16s(n*2+1) * (Bit32s) op2->xmm16s(n*2+1);
1784
1785 dst->xmm32s(n) += (p1_dword + p2_dword);
1786 }
1787 }
1788
xmm_pdpwssds(BxPackedXmmRegister * dst,BxPackedXmmRegister * op1,const BxPackedXmmRegister * op2)1789 BX_CPP_INLINE void xmm_pdpwssds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
1790 {
1791 for(unsigned n=0; n<4; n++)
1792 {
1793 Bit32s p1_dword = (Bit32s) op1->xmm16s(n*2) * (Bit32s) op2->xmm16s(n*2);
1794 Bit32s p2_dword = (Bit32s) op1->xmm16s(n*2+1) * (Bit32s) op2->xmm16s(n*2+1);
1795
1796 Bit64s result = (Bit64s) dst->xmm32s(n) + (p1_dword + p2_dword);
1797 dst->xmm32s(n) = SaturateQwordSToDwordS(result);
1798 }
1799 }
1800
1801 #endif
1802