/////////////////////////////////////////////////////////////////////////
// $Id: simd_int.h 14133 2021-02-08 13:06:44Z sshwarts $
/////////////////////////////////////////////////////////////////////////
//
//   Copyright (c) 2011-2017 Stanislav Shwartsman
//          Written by Stanislav Shwartsman [sshwarts at sourceforge net]
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
//
/////////////////////////////////////////////////////////////////////////

#ifndef BX_SIMD_INT_FUNCTIONS_H
#define BX_SIMD_INT_FUNCTIONS_H

// absolute value

BX_CPP_INLINE void xmm_pabsb(BxPackedXmmRegister *op)
{
  for(unsigned n=0; n<16; n++) {
    if(op->xmmsbyte(n) < 0) op->xmmubyte(n) = -op->xmmsbyte(n);
  }
}

BX_CPP_INLINE void xmm_pabsw(BxPackedXmmRegister *op)
{
  for(unsigned n=0; n<8; n++) {
    if(op->xmm16s(n) < 0) op->xmm16u(n) = -op->xmm16s(n);
  }
}

BX_CPP_INLINE void xmm_pabsd(BxPackedXmmRegister *op)
{
  for(unsigned n=0; n<4; n++) {
    if(op->xmm32s(n) < 0) op->xmm32u(n) = -op->xmm32s(n);
  }
}

BX_CPP_INLINE void xmm_pabsq(BxPackedXmmRegister *op)
{
  for(unsigned n=0; n<2; n++) {
    if(op->xmm64s(n) < 0) op->xmm64u(n) = -op->xmm64s(n);
  }
}

// min/max

BX_CPP_INLINE void xmm_pminsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    if(op2->xmmsbyte(n) < op1->xmmsbyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_pminub(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    if(op2->xmmubyte(n) < op1->xmmubyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_pminsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    if(op2->xmm16s(n) < op1->xmm16s(n)) op1->xmm16s(n) = op2->xmm16s(n);
  }
}

BX_CPP_INLINE void xmm_pminuw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    if(op2->xmm16u(n) < op1->xmm16u(n)) op1->xmm16s(n) = op2->xmm16s(n);
  }
}

BX_CPP_INLINE void xmm_pminsd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    if(op2->xmm32s(n) < op1->xmm32s(n)) op1->xmm32u(n) = op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_pminud(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    if(op2->xmm32u(n) < op1->xmm32u(n)) op1->xmm32u(n) = op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_pminsq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<2; n++) {
    if(op2->xmm64s(n) < op1->xmm64s(n)) op1->xmm64u(n) = op2->xmm64u(n);
  }
}

BX_CPP_INLINE void xmm_pminuq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<2; n++) {
    if(op2->xmm64u(n) < op1->xmm64u(n)) op1->xmm64u(n) = op2->xmm64u(n);
  }
}

BX_CPP_INLINE void xmm_pmaxsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    if(op2->xmmsbyte(n) > op1->xmmsbyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_pmaxub(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    if(op2->xmmubyte(n) > op1->xmmubyte(n)) op1->xmmubyte(n) = op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_pmaxsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    if(op2->xmm16s(n) > op1->xmm16s(n)) op1->xmm16s(n) = op2->xmm16s(n);
  }
}

BX_CPP_INLINE void xmm_pmaxuw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    if(op2->xmm16u(n) > op1->xmm16u(n)) op1->xmm16s(n) = op2->xmm16s(n);
  }
}

BX_CPP_INLINE void xmm_pmaxsd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    if(op2->xmm32s(n) > op1->xmm32s(n)) op1->xmm32u(n) = op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_pmaxud(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    if(op2->xmm32u(n) > op1->xmm32u(n)) op1->xmm32u(n) = op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_pmaxsq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<2; n++) {
    if(op2->xmm64s(n) > op1->xmm64s(n)) op1->xmm64u(n) = op2->xmm64u(n);
  }
}

BX_CPP_INLINE void xmm_pmaxuq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<2; n++) {
    if(op2->xmm64u(n) > op1->xmm64u(n)) op1->xmm64u(n) = op2->xmm64u(n);
  }
}

// unpack

BX_CPP_INLINE void xmm_unpcklps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm32u(3) = op2->xmm32u(1);
  op1->xmm32u(2) = op1->xmm32u(1);
  op1->xmm32u(1) = op2->xmm32u(0);
//op1->xmm32u(0) = op1->xmm32u(0);
}

BX_CPP_INLINE void xmm_unpckhps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm32u(0) = op1->xmm32u(2);
  op1->xmm32u(1) = op2->xmm32u(2);
  op1->xmm32u(2) = op1->xmm32u(3);
  op1->xmm32u(3) = op2->xmm32u(3);
}

BX_CPP_INLINE void xmm_unpcklpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
//op1->xmm64u(0) = op1->xmm64u(0);
  op1->xmm64u(1) = op2->xmm64u(0);
}

BX_CPP_INLINE void xmm_unpckhpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm64u(0) = op1->xmm64u(1);
  op1->xmm64u(1) = op2->xmm64u(1);
}

BX_CPP_INLINE void xmm_punpcklbw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmmubyte(0xF) = op2->xmmubyte(7);
  op1->xmmubyte(0xE) = op1->xmmubyte(7);
  op1->xmmubyte(0xD) = op2->xmmubyte(6);
  op1->xmmubyte(0xC) = op1->xmmubyte(6);
  op1->xmmubyte(0xB) = op2->xmmubyte(5);
  op1->xmmubyte(0xA) = op1->xmmubyte(5);
  op1->xmmubyte(0x9) = op2->xmmubyte(4);
  op1->xmmubyte(0x8) = op1->xmmubyte(4);
  op1->xmmubyte(0x7) = op2->xmmubyte(3);
  op1->xmmubyte(0x6) = op1->xmmubyte(3);
  op1->xmmubyte(0x5) = op2->xmmubyte(2);
  op1->xmmubyte(0x4) = op1->xmmubyte(2);
  op1->xmmubyte(0x3) = op2->xmmubyte(1);
  op1->xmmubyte(0x2) = op1->xmmubyte(1);
  op1->xmmubyte(0x1) = op2->xmmubyte(0);
//op1->xmmubyte(0x0) = op1->xmmubyte(0);
}

BX_CPP_INLINE void xmm_punpckhbw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmmubyte(0x0) = op1->xmmubyte(0x8);
  op1->xmmubyte(0x1) = op2->xmmubyte(0x8);
  op1->xmmubyte(0x2) = op1->xmmubyte(0x9);
  op1->xmmubyte(0x3) = op2->xmmubyte(0x9);
  op1->xmmubyte(0x4) = op1->xmmubyte(0xA);
  op1->xmmubyte(0x5) = op2->xmmubyte(0xA);
  op1->xmmubyte(0x6) = op1->xmmubyte(0xB);
  op1->xmmubyte(0x7) = op2->xmmubyte(0xB);
  op1->xmmubyte(0x8) = op1->xmmubyte(0xC);
  op1->xmmubyte(0x9) = op2->xmmubyte(0xC);
  op1->xmmubyte(0xA) = op1->xmmubyte(0xD);
  op1->xmmubyte(0xB) = op2->xmmubyte(0xD);
  op1->xmmubyte(0xC) = op1->xmmubyte(0xE);
  op1->xmmubyte(0xD) = op2->xmmubyte(0xE);
  op1->xmmubyte(0xE) = op1->xmmubyte(0xF);
  op1->xmmubyte(0xF) = op2->xmmubyte(0xF);
}

BX_CPP_INLINE void xmm_punpcklwd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16u(7) = op2->xmm16u(3);
  op1->xmm16u(6) = op1->xmm16u(3);
  op1->xmm16u(5) = op2->xmm16u(2);
  op1->xmm16u(4) = op1->xmm16u(2);
  op1->xmm16u(3) = op2->xmm16u(1);
  op1->xmm16u(2) = op1->xmm16u(1);
  op1->xmm16u(1) = op2->xmm16u(0);
//op1->xmm16u(0) = op1->xmm16u(0);
}

BX_CPP_INLINE void xmm_punpckhwd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16u(0) = op1->xmm16u(4);
  op1->xmm16u(1) = op2->xmm16u(4);
  op1->xmm16u(2) = op1->xmm16u(5);
  op1->xmm16u(3) = op2->xmm16u(5);
  op1->xmm16u(4) = op1->xmm16u(6);
  op1->xmm16u(5) = op2->xmm16u(6);
  op1->xmm16u(6) = op1->xmm16u(7);
  op1->xmm16u(7) = op2->xmm16u(7);
}

// pack

BX_CPP_INLINE void xmm_packuswb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmmubyte(0x0) = SaturateWordSToByteU(op1->xmm16s(0));
  op1->xmmubyte(0x1) = SaturateWordSToByteU(op1->xmm16s(1));
  op1->xmmubyte(0x2) = SaturateWordSToByteU(op1->xmm16s(2));
  op1->xmmubyte(0x3) = SaturateWordSToByteU(op1->xmm16s(3));
  op1->xmmubyte(0x4) = SaturateWordSToByteU(op1->xmm16s(4));
  op1->xmmubyte(0x5) = SaturateWordSToByteU(op1->xmm16s(5));
  op1->xmmubyte(0x6) = SaturateWordSToByteU(op1->xmm16s(6));
  op1->xmmubyte(0x7) = SaturateWordSToByteU(op1->xmm16s(7));

  op1->xmmubyte(0x8) = SaturateWordSToByteU(op2->xmm16s(0));
  op1->xmmubyte(0x9) = SaturateWordSToByteU(op2->xmm16s(1));
  op1->xmmubyte(0xA) = SaturateWordSToByteU(op2->xmm16s(2));
  op1->xmmubyte(0xB) = SaturateWordSToByteU(op2->xmm16s(3));
  op1->xmmubyte(0xC) = SaturateWordSToByteU(op2->xmm16s(4));
  op1->xmmubyte(0xD) = SaturateWordSToByteU(op2->xmm16s(5));
  op1->xmmubyte(0xE) = SaturateWordSToByteU(op2->xmm16s(6));
  op1->xmmubyte(0xF) = SaturateWordSToByteU(op2->xmm16s(7));
}

BX_CPP_INLINE void xmm_packsswb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmmsbyte(0x0) = SaturateWordSToByteS(op1->xmm16s(0));
  op1->xmmsbyte(0x1) = SaturateWordSToByteS(op1->xmm16s(1));
  op1->xmmsbyte(0x2) = SaturateWordSToByteS(op1->xmm16s(2));
  op1->xmmsbyte(0x3) = SaturateWordSToByteS(op1->xmm16s(3));
  op1->xmmsbyte(0x4) = SaturateWordSToByteS(op1->xmm16s(4));
  op1->xmmsbyte(0x5) = SaturateWordSToByteS(op1->xmm16s(5));
  op1->xmmsbyte(0x6) = SaturateWordSToByteS(op1->xmm16s(6));
  op1->xmmsbyte(0x7) = SaturateWordSToByteS(op1->xmm16s(7));

  op1->xmmsbyte(0x8) = SaturateWordSToByteS(op2->xmm16s(0));
  op1->xmmsbyte(0x9) = SaturateWordSToByteS(op2->xmm16s(1));
  op1->xmmsbyte(0xA) = SaturateWordSToByteS(op2->xmm16s(2));
  op1->xmmsbyte(0xB) = SaturateWordSToByteS(op2->xmm16s(3));
  op1->xmmsbyte(0xC) = SaturateWordSToByteS(op2->xmm16s(4));
  op1->xmmsbyte(0xD) = SaturateWordSToByteS(op2->xmm16s(5));
  op1->xmmsbyte(0xE) = SaturateWordSToByteS(op2->xmm16s(6));
  op1->xmmsbyte(0xF) = SaturateWordSToByteS(op2->xmm16s(7));
}

BX_CPP_INLINE void xmm_packusdw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16u(0) = SaturateDwordSToWordU(op1->xmm32s(0));
  op1->xmm16u(1) = SaturateDwordSToWordU(op1->xmm32s(1));
  op1->xmm16u(2) = SaturateDwordSToWordU(op1->xmm32s(2));
  op1->xmm16u(3) = SaturateDwordSToWordU(op1->xmm32s(3));

  op1->xmm16u(4) = SaturateDwordSToWordU(op2->xmm32s(0));
  op1->xmm16u(5) = SaturateDwordSToWordU(op2->xmm32s(1));
  op1->xmm16u(6) = SaturateDwordSToWordU(op2->xmm32s(2));
  op1->xmm16u(7) = SaturateDwordSToWordU(op2->xmm32s(3));
}

BX_CPP_INLINE void xmm_packssdw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16s(0) = SaturateDwordSToWordS(op1->xmm32s(0));
  op1->xmm16s(1) = SaturateDwordSToWordS(op1->xmm32s(1));
  op1->xmm16s(2) = SaturateDwordSToWordS(op1->xmm32s(2));
  op1->xmm16s(3) = SaturateDwordSToWordS(op1->xmm32s(3));

  op1->xmm16s(4) = SaturateDwordSToWordS(op2->xmm32s(0));
  op1->xmm16s(5) = SaturateDwordSToWordS(op2->xmm32s(1));
  op1->xmm16s(6) = SaturateDwordSToWordS(op2->xmm32s(2));
  op1->xmm16s(7) = SaturateDwordSToWordS(op2->xmm32s(3));
}

// shuffle

BX_CPP_INLINE void xmm_pshufb(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
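  // bit 7 of each selector byte zeroes the result byte; otherwise its low 4 bits index a byte of op1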
  for(unsigned n=0; n<16; n++)
  {
    unsigned mask = op2->xmmubyte(n);
    if (mask & 0x80)
      r->xmmubyte(n) = 0;
    else
      r->xmmubyte(n) = op1->xmmubyte(mask & 0xf);
  }
}

BX_CPP_INLINE void xmm_pshufhw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op, Bit8u order)
{
  r->xmm64u(0) = op->xmm64u(0);
  r->xmm16u(4) = op->xmm16u(4 + ((order >> 0) & 0x3));
  r->xmm16u(5) = op->xmm16u(4 + ((order >> 2) & 0x3));
  r->xmm16u(6) = op->xmm16u(4 + ((order >> 4) & 0x3));
  r->xmm16u(7) = op->xmm16u(4 + ((order >> 6) & 0x3));
}

BX_CPP_INLINE void xmm_pshuflw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op, Bit8u order)
{
  r->xmm16u(0) = op->xmm16u((order >> 0) & 0x3);
  r->xmm16u(1) = op->xmm16u((order >> 2) & 0x3);
  r->xmm16u(2) = op->xmm16u((order >> 4) & 0x3);
  r->xmm16u(3) = op->xmm16u((order >> 6) & 0x3);
  r->xmm64u(1) = op->xmm64u(1);
}

BX_CPP_INLINE void xmm_shufps(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit8u order)
{
  r->xmm32u(0) = op1->xmm32u((order >> 0) & 0x3);
  r->xmm32u(1) = op1->xmm32u((order >> 2) & 0x3);
  r->xmm32u(2) = op2->xmm32u((order >> 4) & 0x3);
  r->xmm32u(3) = op2->xmm32u((order >> 6) & 0x3);
}

BX_CPP_INLINE void xmm_shufpd(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit8u order)
{
  r->xmm64u(0) = op1->xmm64u((order >> 0) & 0x1);
  r->xmm64u(1) = op2->xmm64u((order >> 1) & 0x1);
}

BX_CPP_INLINE void xmm_permilps(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  r->xmm32u(0) = op1->xmm32u(op2->xmm32u(0) & 0x3);
  r->xmm32u(1) = op1->xmm32u(op2->xmm32u(1) & 0x3);
  r->xmm32u(2) = op1->xmm32u(op2->xmm32u(2) & 0x3);
  r->xmm32u(3) = op1->xmm32u(op2->xmm32u(3) & 0x3);
}

BX_CPP_INLINE void xmm_permilpd(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  r->xmm64u(0) = op1->xmm64u((op2->xmm32u(0) >> 1) & 0x1);
  r->xmm64u(1) = op1->xmm64u((op2->xmm32u(2) >> 1) & 0x1);
}

BX_CPP_INLINE void xmm_permil2ps(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *op3, unsigned m2z)
{
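  // per element: ctrl bits 1:0 select the source element, bit 2 chooses between op1 and op2, bit 3 combined with m2z may force the result to zero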
  for(unsigned n=0; n < 4; n++) {
    Bit32u ctrl = op3->xmm32u(n);
    if ((m2z ^ ((ctrl >> 3) & 0x1)) == 0x3)
      r->xmm32u(n) = 0;
    else
      r->xmm32u(n) = (ctrl & 0x4) ? op1->xmm32u(ctrl & 0x3) : op2->xmm32u(ctrl & 0x3);
  }
}

BX_CPP_INLINE void xmm_permil2pd(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *op3, unsigned m2z)
{
  for(unsigned n=0; n < 2; n++) {
    Bit32u ctrl = op3->xmm32u(n*2);
    if ((m2z ^ ((ctrl >> 3) & 0x1)) == 0x3)
      r->xmm64u(n) = 0;
    else
      r->xmm64u(n) = (ctrl & 0x4) ? op1->xmm64u((ctrl >> 1) & 0x1) : op2->xmm64u((ctrl >> 1) & 0x1);
  }
}

#if BX_SUPPORT_AVX
BX_CPP_INLINE void ymm_vpermq(BxPackedYmmRegister *r, const BxPackedYmmRegister *op, Bit8u control)
{
  r->ymm64u(0) = op->ymm64u((control)      & 0x3);
  r->ymm64u(1) = op->ymm64u((control >> 2) & 0x3);
  r->ymm64u(2) = op->ymm64u((control >> 4) & 0x3);
  r->ymm64u(3) = op->ymm64u((control >> 6) & 0x3);
}
#endif

// sign

BX_CPP_INLINE void xmm_psignb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    int sign = (op2->xmmsbyte(n) > 0) - (op2->xmmsbyte(n) < 0);
    op1->xmmsbyte(n) *= sign;
  }
}

BX_CPP_INLINE void xmm_psignw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    int sign = (op2->xmm16s(n) > 0) - (op2->xmm16s(n) < 0);
    op1->xmm16s(n) *= sign;
  }
}

BX_CPP_INLINE void xmm_psignd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    int sign = (op2->xmm32s(n) > 0) - (op2->xmm32s(n) < 0);
    op1->xmm32s(n) *= sign;
  }
}

// mask creation

BX_CPP_INLINE Bit32u xmm_pmovmskb(const BxPackedXmmRegister *op)
{
  Bit32u mask = 0;

  if(op->xmmsbyte(0x0) < 0) mask |= 0x0001;
  if(op->xmmsbyte(0x1) < 0) mask |= 0x0002;
  if(op->xmmsbyte(0x2) < 0) mask |= 0x0004;
  if(op->xmmsbyte(0x3) < 0) mask |= 0x0008;
  if(op->xmmsbyte(0x4) < 0) mask |= 0x0010;
  if(op->xmmsbyte(0x5) < 0) mask |= 0x0020;
  if(op->xmmsbyte(0x6) < 0) mask |= 0x0040;
  if(op->xmmsbyte(0x7) < 0) mask |= 0x0080;
  if(op->xmmsbyte(0x8) < 0) mask |= 0x0100;
  if(op->xmmsbyte(0x9) < 0) mask |= 0x0200;
  if(op->xmmsbyte(0xA) < 0) mask |= 0x0400;
  if(op->xmmsbyte(0xB) < 0) mask |= 0x0800;
  if(op->xmmsbyte(0xC) < 0) mask |= 0x1000;
  if(op->xmmsbyte(0xD) < 0) mask |= 0x2000;
  if(op->xmmsbyte(0xE) < 0) mask |= 0x4000;
  if(op->xmmsbyte(0xF) < 0) mask |= 0x8000;

  return mask;
}

BX_CPP_INLINE Bit32u xmm_pmovmskw(const BxPackedXmmRegister *op)
{
  Bit32u mask = 0;

  if(op->xmm16s(0) < 0) mask |= 0x01;
  if(op->xmm16s(1) < 0) mask |= 0x02;
  if(op->xmm16s(2) < 0) mask |= 0x04;
  if(op->xmm16s(3) < 0) mask |= 0x08;
  if(op->xmm16s(4) < 0) mask |= 0x10;
  if(op->xmm16s(5) < 0) mask |= 0x20;
  if(op->xmm16s(6) < 0) mask |= 0x40;
  if(op->xmm16s(7) < 0) mask |= 0x80;

  return mask;
}

BX_CPP_INLINE Bit32u xmm_pmovmskd(const BxPackedXmmRegister *op)
{
  Bit32u mask = 0;

  if(op->xmm32s(0) < 0) mask |= 0x1;
  if(op->xmm32s(1) < 0) mask |= 0x2;
  if(op->xmm32s(2) < 0) mask |= 0x4;
  if(op->xmm32s(3) < 0) mask |= 0x8;

  return mask;
}

BX_CPP_INLINE Bit32u xmm_pmovmskq(const BxPackedXmmRegister *op)
{
  Bit32u mask = 0;

  if(op->xmm32s(1) < 0) mask |= 0x1;
  if(op->xmm32s(3) < 0) mask |= 0x2;

  return mask;
}

BX_CPP_INLINE void xmm_pmovm2b(BxPackedXmmRegister *dst, Bit32u mask)
{
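  // expand each mask bit into an all-ones (0xFF) or all-zero byte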
  for (unsigned n=0; n < 16; n++, mask >>= 1) {
    dst->xmmsbyte(n) = - Bit8s(mask & 0x1);
  }
}

BX_CPP_INLINE void xmm_pmovm2w(BxPackedXmmRegister *dst, Bit32u mask)
{
  for (unsigned n=0; n < 8; n++, mask >>= 1) {
    dst->xmm16s(n) = - Bit16s(mask & 0x1);
  }
}

BX_CPP_INLINE void xmm_pmovm2d(BxPackedXmmRegister *dst, Bit32u mask)
{
  for (unsigned n=0; n < 4; n++, mask >>= 1) {
    dst->xmm32s(n) = - Bit32s(mask & 0x1);
  }
}

BX_CPP_INLINE void xmm_pmovm2q(BxPackedXmmRegister *dst, Bit32u mask)
{
  dst->xmm64s(0) = (mask & 0x1) ? (Bit64s) -1 : 0;
  dst->xmm64s(1) = (mask & 0x2) ? (Bit64s) -1 : 0;
}

// blend

BX_CPP_INLINE void xmm_pblendb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
{
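  // merge bytes of op2 into op1 wherever the corresponding mask bit is set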
  for (unsigned n=0; n < 16; n++, mask >>= 1) {
    if (mask & 0x1) op1->xmmubyte(n) = op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_zero_pblendb(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
{
  for (unsigned n=0; n < 16; n++, mask >>= 1) {
    dst->xmmubyte(n) = (mask & 0x1) ? op->xmmubyte(n) : 0;
  }
}

#if BX_SUPPORT_EVEX
BX_CPP_INLINE void simd_pblendb(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit64u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    if (mask & 0x1) op1->vmmubyte(n) = op2->vmmubyte(n);
    mask >>= 1;
  }
}

BX_CPP_INLINE void simd_zero_pblendb(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit64u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    dst->vmmubyte(n) = (mask & 0x1) ? op->vmmubyte(n) : 0;
    mask >>= 1;
  }
}
#endif

BX_CPP_INLINE void xmm_pblendw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
{
  for (unsigned n=0; n < 8; n++, mask >>= 1) {
    if (mask & 0x1) op1->xmm16u(n) = op2->xmm16u(n);
  }
}

BX_CPP_INLINE void xmm_zero_pblendw(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
{
  for (unsigned n=0; n < 8; n++, mask >>= 1) {
    dst->xmm16u(n) = (mask & 0x1) ? op->xmm16u(n) : 0;
  }
}

#if BX_SUPPORT_EVEX
BX_CPP_INLINE void simd_pblendw(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit32u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    if (mask & 0x1) op1->vmm16u(n) = op2->vmm16u(n);
    mask >>= 1;
  }
}

BX_CPP_INLINE void simd_zero_pblendw(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit32u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    dst->vmm16u(n) = (mask & 0x1) ? op->vmm16u(n) : 0;
    mask >>= 1;
  }
}
#endif

BX_CPP_INLINE void xmm_blendps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
{
  for (unsigned n=0; n < 4; n++, mask >>= 1) {
    if (mask & 0x1) op1->xmm32u(n) = op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_zero_blendps(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
{
  for (unsigned n=0; n < 4; n++, mask >>= 1) {
    dst->xmm32u(n) = (mask & 0x1) ? op->xmm32u(n) : 0;
  }
}

#if BX_SUPPORT_EVEX
BX_CPP_INLINE void simd_blendps(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit32u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    if (mask & 0x1) op1->vmm32u(n) = op2->vmm32u(n);
    mask >>= 1;
  }
}

BX_CPP_INLINE void simd_zero_blendps(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit32u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    dst->vmm32u(n) = (mask & 0x1) ? op->vmm32u(n) : 0;
    mask >>= 1;
  }
}
#endif

BX_CPP_INLINE void xmm_blendpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit32u mask)
{
  for (unsigned n=0; n < 2; n++, mask >>= 1) {
    if (mask & 0x1) op1->xmm64u(n) = op2->xmm64u(n);
  }
}

BX_CPP_INLINE void xmm_zero_blendpd(BxPackedXmmRegister *dst, const BxPackedXmmRegister *op, Bit32u mask)
{
  for (unsigned n=0; n < 2; n++, mask >>= 1) {
    dst->xmm64u(n) = (mask & 0x1) ? op->xmm64u(n) : 0;
  }
}

#if BX_SUPPORT_EVEX
BX_CPP_INLINE void simd_blendpd(BxPackedAvxRegister *op1, const BxPackedAvxRegister *op2, Bit32u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    if (mask & 0x1) op1->vmm64u(n) = op2->vmm64u(n);
    mask >>= 1;
  }
}

BX_CPP_INLINE void simd_zero_blendpd(BxPackedAvxRegister *dst, const BxPackedAvxRegister *op, Bit32u mask, unsigned len)
{
  for (unsigned n=0; n < len; n++) {
    dst->vmm64u(n) = (mask & 0x1) ? op->vmm64u(n) : 0;
    mask >>= 1;
  }
}
#endif

BX_CPP_INLINE void xmm_pblendvb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
{
  for(unsigned n=0; n<16; n++) {
    if (mask->xmmsbyte(n) < 0) op1->xmmubyte(n) = op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_pblendvw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
{
  for(unsigned n=0; n<8; n++) {
    if (mask->xmm16s(n) < 0) op1->xmm16u(n) = op2->xmm16u(n);
  }
}

BX_CPP_INLINE void xmm_blendvps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
{
  for(unsigned n=0; n<4; n++) {
    if (mask->xmm32s(n) < 0) op1->xmm32u(n) = op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_blendvpd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *mask)
{
  if (mask->xmm32s(1) < 0) op1->xmm64u(0) = op2->xmm64u(0);
  if (mask->xmm32s(3) < 0) op1->xmm64u(1) = op2->xmm64u(1);
}

// arithmetic (logic)

BX_CPP_INLINE void xmm_andps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++)
    op1->xmm64u(n) &= op2->xmm64u(n);
}

BX_CPP_INLINE void xmm_andnps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++)
    op1->xmm64u(n) = ~(op1->xmm64u(n)) & op2->xmm64u(n);
}

BX_CPP_INLINE void xmm_orps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++)
    op1->xmm64u(n) |= op2->xmm64u(n);
}

BX_CPP_INLINE void xmm_xorps(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++)
    op1->xmm64u(n) ^= op2->xmm64u(n);
}

// arithmetic (add/sub)

BX_CPP_INLINE void xmm_paddb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    op1->xmmubyte(n) += op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_paddw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16u(n) += op2->xmm16u(n);
  }
}

BX_CPP_INLINE void xmm_paddd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    op1->xmm32u(n) += op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_paddq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<2; n++) {
    op1->xmm64u(n) += op2->xmm64u(n);
  }
}

BX_CPP_INLINE void xmm_psubb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    op1->xmmubyte(n) -= op2->xmmubyte(n);
  }
}

BX_CPP_INLINE void xmm_psubw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16u(n) -= op2->xmm16u(n);
  }
}

BX_CPP_INLINE void xmm_psubd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    op1->xmm32u(n) -= op2->xmm32u(n);
  }
}

BX_CPP_INLINE void xmm_psubq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<2; n++) {
    op1->xmm64u(n) -= op2->xmm64u(n);
  }
}

// arithmetic (add/sub with saturation)

BX_CPP_INLINE void xmm_paddsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    op1->xmmsbyte(n) = SaturateWordSToByteS(Bit16s(op1->xmmsbyte(n)) + Bit16s(op2->xmmsbyte(n)));
  }
}

BX_CPP_INLINE void xmm_paddsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16s(n) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(n)) + Bit32s(op2->xmm16s(n)));
  }
}

BX_CPP_INLINE void xmm_paddusb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    op1->xmmubyte(n) = SaturateWordSToByteU(Bit16s(op1->xmmubyte(n)) + Bit16s(op2->xmmubyte(n)));
  }
}

BX_CPP_INLINE void xmm_paddusw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16u(n) = SaturateDwordSToWordU(Bit32s(op1->xmm16u(n)) + Bit32s(op2->xmm16u(n)));
  }
}

BX_CPP_INLINE void xmm_psubsb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    op1->xmmsbyte(n) = SaturateWordSToByteS(Bit16s(op1->xmmsbyte(n)) - Bit16s(op2->xmmsbyte(n)));
  }
}

BX_CPP_INLINE void xmm_psubsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16s(n) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(n)) - Bit32s(op2->xmm16s(n)));
  }
}

BX_CPP_INLINE void xmm_psubusb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++)
  {
    if(op1->xmmubyte(n) > op2->xmmubyte(n))
      op1->xmmubyte(n) -= op2->xmmubyte(n);
    else
      op1->xmmubyte(n) = 0;
  }
}

BX_CPP_INLINE void xmm_psubusw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++)
  {
    if(op1->xmm16u(n) > op2->xmm16u(n))
      op1->xmm16u(n) -= op2->xmm16u(n);
    else
      op1->xmm16u(n) = 0;
  }
}

// arithmetic (horizontal add/sub)

BX_CPP_INLINE void xmm_phaddw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16u(0) = op1->xmm16u(0) + op1->xmm16u(1);
  op1->xmm16u(1) = op1->xmm16u(2) + op1->xmm16u(3);
  op1->xmm16u(2) = op1->xmm16u(4) + op1->xmm16u(5);
  op1->xmm16u(3) = op1->xmm16u(6) + op1->xmm16u(7);

  op1->xmm16u(4) = op2->xmm16u(0) + op2->xmm16u(1);
  op1->xmm16u(5) = op2->xmm16u(2) + op2->xmm16u(3);
  op1->xmm16u(6) = op2->xmm16u(4) + op2->xmm16u(5);
  op1->xmm16u(7) = op2->xmm16u(6) + op2->xmm16u(7);
}

BX_CPP_INLINE void xmm_phaddd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm32u(0) = op1->xmm32u(0) + op1->xmm32u(1);
  op1->xmm32u(1) = op1->xmm32u(2) + op1->xmm32u(3);
  op1->xmm32u(2) = op2->xmm32u(0) + op2->xmm32u(1);
  op1->xmm32u(3) = op2->xmm32u(2) + op2->xmm32u(3);
}

BX_CPP_INLINE void xmm_phaddsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16s(0) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(0)) + Bit32s(op1->xmm16s(1)));
  op1->xmm16s(1) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(2)) + Bit32s(op1->xmm16s(3)));
  op1->xmm16s(2) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(4)) + Bit32s(op1->xmm16s(5)));
  op1->xmm16s(3) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(6)) + Bit32s(op1->xmm16s(7)));

  op1->xmm16s(4) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(0)) + Bit32s(op2->xmm16s(1)));
  op1->xmm16s(5) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(2)) + Bit32s(op2->xmm16s(3)));
  op1->xmm16s(6) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(4)) + Bit32s(op2->xmm16s(5)));
  op1->xmm16s(7) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(6)) + Bit32s(op2->xmm16s(7)));
}

BX_CPP_INLINE void xmm_phsubw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16u(0) = op1->xmm16u(0) - op1->xmm16u(1);
  op1->xmm16u(1) = op1->xmm16u(2) - op1->xmm16u(3);
  op1->xmm16u(2) = op1->xmm16u(4) - op1->xmm16u(5);
  op1->xmm16u(3) = op1->xmm16u(6) - op1->xmm16u(7);

  op1->xmm16u(4) = op2->xmm16u(0) - op2->xmm16u(1);
  op1->xmm16u(5) = op2->xmm16u(2) - op2->xmm16u(3);
  op1->xmm16u(6) = op2->xmm16u(4) - op2->xmm16u(5);
  op1->xmm16u(7) = op2->xmm16u(6) - op2->xmm16u(7);
}

BX_CPP_INLINE void xmm_phsubd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm32u(0) = op1->xmm32u(0) - op1->xmm32u(1);
  op1->xmm32u(1) = op1->xmm32u(2) - op1->xmm32u(3);
  op1->xmm32u(2) = op2->xmm32u(0) - op2->xmm32u(1);
  op1->xmm32u(3) = op2->xmm32u(2) - op2->xmm32u(3);
}

BX_CPP_INLINE void xmm_phsubsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm16s(0) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(0)) - Bit32s(op1->xmm16s(1)));
  op1->xmm16s(1) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(2)) - Bit32s(op1->xmm16s(3)));
  op1->xmm16s(2) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(4)) - Bit32s(op1->xmm16s(5)));
  op1->xmm16s(3) = SaturateDwordSToWordS(Bit32s(op1->xmm16s(6)) - Bit32s(op1->xmm16s(7)));

  op1->xmm16s(4) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(0)) - Bit32s(op2->xmm16s(1)));
  op1->xmm16s(5) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(2)) - Bit32s(op2->xmm16s(3)));
  op1->xmm16s(6) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(4)) - Bit32s(op2->xmm16s(5)));
  op1->xmm16s(7) = SaturateDwordSToWordS(Bit32s(op2->xmm16s(6)) - Bit32s(op2->xmm16s(7)));
}

// average

BX_CPP_INLINE void xmm_pavgb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<16; n++) {
    op1->xmmubyte(n) = (op1->xmmubyte(n) + op2->xmmubyte(n) + 1) >> 1;
  }
}

BX_CPP_INLINE void xmm_pavgw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16u(n) = (op1->xmm16u(n) + op2->xmm16u(n) + 1) >> 1;
  }
}

// multiply

BX_CPP_INLINE void xmm_pmullw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    op1->xmm16s(n) *= op2->xmm16s(n);
  }
}

BX_CPP_INLINE void xmm_pmulhw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    Bit32s product = Bit32s(op1->xmm16s(n)) * Bit32s(op2->xmm16s(n));
    op1->xmm16u(n) = (Bit16u)(product >> 16);
  }
}

BX_CPP_INLINE void xmm_pmulhuw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<8; n++) {
    Bit32u product = Bit32u(op1->xmm16u(n)) * Bit32u(op2->xmm16u(n));
    op1->xmm16u(n) = (Bit16u)(product >> 16);
  }
}

BX_CPP_INLINE void xmm_pmulld(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++) {
    op1->xmm32s(n) *= op2->xmm32s(n);
  }
}

BX_CPP_INLINE void xmm_pmullq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<2; n++) {
    op1->xmm64s(n) *= op2->xmm64s(n);
  }
}

BX_CPP_INLINE void xmm_pmuldq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm64s(0) = Bit64s(op1->xmm32s(0)) * Bit64s(op2->xmm32s(0));
  op1->xmm64s(1) = Bit64s(op1->xmm32s(2)) * Bit64s(op2->xmm32s(2));
}

BX_CPP_INLINE void xmm_pmuludq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  op1->xmm64u(0) = Bit64u(op1->xmm32u(0)) * Bit64u(op2->xmm32u(0));
  op1->xmm64u(1) = Bit64u(op1->xmm32u(2)) * Bit64u(op2->xmm32u(2));
}

BX_CPP_INLINE void xmm_pmulhrsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
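  // signed 16x16 -> 32 multiply; round and return bits 30..15 of the product: (((a*b) >> 14) + 1) >> 1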
  for(unsigned n=0; n<8; n++) {
    op1->xmm16u(n) = (((Bit32s(op1->xmm16s(n)) * Bit32s(op2->xmm16s(n))) >> 14) + 1) >> 1;
  }
}

// multiply/add

BX_CPP_INLINE void xmm_pmaddubsw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
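  // multiply unsigned bytes of op1 by signed bytes of op2, add adjacent products, saturate to a signed word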
  for(unsigned n=0; n<8; n++)
  {
    Bit32s temp = Bit32s(op1->xmmubyte(n*2))   * Bit32s(op2->xmmsbyte(n*2)) +
                  Bit32s(op1->xmmubyte(n*2+1)) * Bit32s(op2->xmmsbyte(n*2+1));

    op1->xmm16s(n) = SaturateDwordSToWordS(temp);
  }
}

BX_CPP_INLINE void xmm_pmaddwd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++)
  {
    op1->xmm32u(n) = Bit32s(op1->xmm16s(n*2))   * Bit32s(op2->xmm16s(n*2)) +
                     Bit32s(op1->xmm16s(n*2+1)) * Bit32s(op2->xmm16s(n*2+1));
  }
}

// broadcast

BX_CPP_INLINE void xmm_pbroadcastb(BxPackedXmmRegister *op, Bit8u val_8)
{
  for(unsigned n=0; n<16; n++) {
    op->xmmubyte(n) = val_8;
  }
}

BX_CPP_INLINE void xmm_pbroadcastw(BxPackedXmmRegister *op, Bit16u val_16)
{
  for(unsigned n=0; n<8; n++) {
    op->xmm16u(n) = val_16;
  }
}

BX_CPP_INLINE void xmm_pbroadcastd(BxPackedXmmRegister *op, Bit32u val_32)
{
  for(unsigned n=0; n<4; n++) {
    op->xmm32u(n) = val_32;
  }
}

BX_CPP_INLINE void xmm_pbroadcastq(BxPackedXmmRegister *op, Bit64u val_64)
{
  for(unsigned n=0; n<2; n++) {
    op->xmm64u(n) = val_64;
  }
}

#if BX_SUPPORT_EVEX
BX_CPP_INLINE void simd_pbroadcastb(BxPackedAvxRegister *op, Bit8u val_8, unsigned len)
{
  for(unsigned n=0; n < len; n++) {
    op->vmmubyte(n) = val_8;
  }
}

BX_CPP_INLINE void simd_pbroadcastw(BxPackedAvxRegister *op, Bit16u val_16, unsigned len)
{
  for(unsigned n=0; n < len; n++) {
    op->vmm16u(n) = val_16;
  }
}

BX_CPP_INLINE void simd_pbroadcastd(BxPackedAvxRegister *op, Bit32u val_32, unsigned len)
{
  for(unsigned n=0; n < len; n++) {
    op->vmm32u(n) = val_32;
  }
}

BX_CPP_INLINE void simd_pbroadcastq(BxPackedAvxRegister *op, Bit64u val_64, unsigned len)
{
  for(unsigned n=0; n < len; n++) {
    op->vmm64u(n) = val_64;
  }
}
#endif

// sum of absolute differences (SAD)

BX_CPP_INLINE void xmm_psadbw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
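  // sum of absolute byte differences for each 8-byte half, zero-extended into the corresponding 64-bit lane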
  unsigned temp = 0;
  for (unsigned n=0; n < 8; n++)
    temp += abs(op1->xmmubyte(n) - op2->xmmubyte(n));

  op1->xmm64u(0) = Bit64u(temp);

  temp = 0;
  for (unsigned n=8; n < 16; n++)
    temp += abs(op1->xmmubyte(n) - op2->xmmubyte(n));

  op1->xmm64u(1) = Bit64u(temp);
}

// multiple sum of absolute differences (MSAD)

BX_CPP_INLINE Bit16u sad_quadruple(const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, int op1_offset, int op2_offset)
{
  Bit32u r = 0;

  for (unsigned n=0; n < 4; n++) {
    Bit8u temp1 = op1->xmmubyte(n + op1_offset);
    Bit8u temp2 = op2->xmmubyte(n + op2_offset);

    r += abs(temp1 - temp2);
  }

  return r;
}

BX_CPP_INLINE void xmm_mpsadbw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, Bit8u control)
{
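  // control bits 1:0 select the 4-byte field of op2, bit 2 selects the op1 base; compute eight overlapping 4-byte SADs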
  unsigned src_offset = (control & 0x3) * 4;
  unsigned dst_offset = ((control >> 2) & 0x1) * 4;

  for (unsigned j=0; j < 8; j++) {
    r->xmm16u(j) = sad_quadruple(op1, op2, dst_offset + j, src_offset);
  }
}

BX_CPP_INLINE void xmm_dbpsadbw(BxPackedXmmRegister *r, const BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  // assuming shuffle of op2 was done outside
  r->xmm16u(0) = sad_quadruple(op1, op2,  0,  0);
  r->xmm16u(1) = sad_quadruple(op1, op2,  0,  1);
  r->xmm16u(2) = sad_quadruple(op1, op2,  4,  2);
  r->xmm16u(3) = sad_quadruple(op1, op2,  4,  3);
  r->xmm16u(4) = sad_quadruple(op1, op2,  8,  8);
  r->xmm16u(5) = sad_quadruple(op1, op2,  8,  9);
  r->xmm16u(6) = sad_quadruple(op1, op2, 12, 10);
  r->xmm16u(7) = sad_quadruple(op1, op2, 12, 11);
}

// conflict

#if BX_SUPPORT_EVEX

BX_CPP_INLINE Bit32u simd_pconflictd(const BxPackedAvxRegister *op, int index)
{
  Bit32u result = 0;
  // compare index element with all previous elements
  for (int i=0; i<index; i++) {
    if (op->vmm32u(index) == op->vmm32u(i)) result |= (1 << i);
  }
  return result;
}

BX_CPP_INLINE Bit32u simd_pconflictq(const BxPackedAvxRegister *op, int index)
{
  Bit32u result = 0;
  // compare index element with all previous elements
  for (int i=0; i<index; i++) {
    if (op->vmm64u(index) == op->vmm64u(i)) result |= (1 << i);
  }
  return result;
}

#endif

// bitwise select

BX_CPP_INLINE void xmm_pselect(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2, const BxPackedXmmRegister *op3)
{
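  // bitwise select: take op1 bits where op3 is set, op2 bits where op3 is clear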
  for(unsigned n=0;n < 2;n++) {
    op1->xmm64u(n) = (op3->xmm64u(n) & op1->xmm64u(n)) | (~op3->xmm64u(n) & op2->xmm64u(n));
  }
}

// shift

BX_CPP_INLINE void xmm_psravw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 8; n++) {
    unsigned shift = op2->xmm16u(n);
    if(shift > 15)
      op1->xmm16u(n) = (op1->xmm16s(n) < 0) ? 0xffff : 0;
    else
      op1->xmm16u(n) = (Bit16u)(op1->xmm16s(n) >> shift);
  }
}

BX_CPP_INLINE void xmm_psravd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 4; n++) {
    Bit32u shift = op2->xmm32u(n);
    if(shift > 31)
      op1->xmm32u(n) = (op1->xmm32s(n) < 0) ? 0xffffffff : 0;
    else
      op1->xmm32u(n) = (Bit32u)(op1->xmm32s(n) >> shift);
  }
}

BX_CPP_INLINE void xmm_psravq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++) {
    Bit64u shift = op2->xmm64u(n);
    if(shift > 63)
      op1->xmm64u(n) = (op1->xmm64s(n) < 0) ? BX_CONST64(0xffffffffffffffff) : 0;
    else
      op1->xmm64u(n) = (Bit64u)(op1->xmm64s(n) >> shift);
  }
}

BX_CPP_INLINE void xmm_psllvw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 8; n++) {
    unsigned shift = op2->xmm16u(n);
    if(shift > 15)
      op1->xmm16u(n) = 0;
    else
      op1->xmm16u(n) <<= shift;
  }
}

BX_CPP_INLINE void xmm_psllvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 4; n++) {
    Bit32u shift = op2->xmm32u(n);
    if(shift > 31)
      op1->xmm32u(n) = 0;
    else
      op1->xmm32u(n) <<= shift;
  }
}

BX_CPP_INLINE void xmm_psllvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++) {
    Bit64u shift = op2->xmm64u(n);
    if(shift > 63)
      op1->xmm64u(n) = 0;
    else
      op1->xmm64u(n) <<= shift;
  }
}

BX_CPP_INLINE void xmm_psrlvw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 8; n++) {
    unsigned shift = op2->xmm16u(n);
    if(shift > 15)
      op1->xmm16u(n) = 0;
    else
      op1->xmm16u(n) >>= shift;
  }
}

BX_CPP_INLINE void xmm_psrlvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 4; n++) {
    Bit32u shift = op2->xmm32u(n);
    if(shift > 31)
      op1->xmm32u(n) = 0;
    else
      op1->xmm32u(n) >>= shift;
  }
}

BX_CPP_INLINE void xmm_psrlvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned n=0; n < 2; n++) {
    Bit64u shift = op2->xmm64u(n);
    if(shift > 63)
      op1->xmm64u(n) = 0;
    else
      op1->xmm64u(n) >>= shift;
  }
}

BX_CPP_INLINE void xmm_psraw(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 15) {
    for (unsigned n=0; n < 8; n++)
      op->xmm16u(n) = (op->xmm16s(n) < 0) ? 0xffff : 0;
  }
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 8; n++)
      op->xmm16u(n) = (Bit16u)(op->xmm16s(n) >> shift);
  }
}

BX_CPP_INLINE void xmm_psrad(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 31) {
    for (unsigned n=0; n < 4; n++)
      op->xmm32u(n) = (op->xmm32s(n) < 0) ? 0xffffffff : 0;
  }
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 4; n++)
      op->xmm32u(n) = (Bit32u)(op->xmm32s(n) >> shift);
  }
}

BX_CPP_INLINE void xmm_psraq(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 63) {
    for (unsigned n=0; n < 2; n++)
      op->xmm64u(n) = (op->xmm64s(n) < 0) ? BX_CONST64(0xffffffffffffffff) : 0;
  }
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 2; n++)
      op->xmm64u(n) = (Bit64u)(op->xmm64s(n) >> shift);
  }
}

BX_CPP_INLINE void xmm_psrlw(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 15) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 8; n++)
      op->xmm16u(n) >>= shift;
  }
}

BX_CPP_INLINE void xmm_psrld(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 31) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 4; n++)
      op->xmm32u(n) >>= shift;
  }
}

BX_CPP_INLINE void xmm_psrlq(BxPackedXmmRegister *op, Bit64u shift_64)
{
  if(shift_64 > 63) op->clear();
  else
  {
    Bit8u shift = (Bit8u) shift_64;

    for (unsigned n=0; n < 2; n++)
      op->xmm64u(n) >>= shift;
  }
}

BX_CPP_INLINE void xmm_psllw(BxPackedXmmRegister *op, Bit64u shift_64)
1351 {
1352   if(shift_64 > 15) op->clear();
1353   else
1354   {
1355     Bit8u shift = (Bit8u) shift_64;
1356 
1357     for (unsigned n=0; n < 8; n++)
1358       op->xmm16u(n) <<= shift;
1359   }
1360 }
1361 
xmm_pslld(BxPackedXmmRegister * op,Bit64u shift_64)1362 BX_CPP_INLINE void xmm_pslld(BxPackedXmmRegister *op, Bit64u shift_64)
1363 {
1364   if(shift_64 > 31) op->clear();
1365   else
1366   {
1367     Bit8u shift = (Bit8u) shift_64;
1368 
1369     for (unsigned n=0; n < 4; n++)
1370       op->xmm32u(n) <<= shift;
1371   }
1372 }
1373 
xmm_psllq(BxPackedXmmRegister * op,Bit64u shift_64)1374 BX_CPP_INLINE void xmm_psllq(BxPackedXmmRegister *op, Bit64u shift_64)
1375 {
1376   if(shift_64 > 63) op->clear();
1377   else
1378   {
1379     Bit8u shift = (Bit8u) shift_64;
1380 
1381     for (unsigned n=0; n < 2; n++)
1382       op->xmm64u(n) <<= shift;
1383   }
1384 }
1385 
// shift the whole 128-bit register right by 'shift' bytes
BX_CPP_INLINE void xmm_psrldq(BxPackedXmmRegister *op, Bit8u shift)
{
  if(shift > 15) op->clear();
  else {
    if (shift > 7) {
      op->xmm64u(0) = op->xmm64u(1);
      op->xmm64u(1) = 0;
      shift -= 8;
    }

    shift <<= 3; // remaining byte count -> bit count

    if (shift != 0) {
      op->xmm64u(0) = (op->xmm64u(0) >> shift) | (op->xmm64u(1) << (64-shift));
      op->xmm64u(1) = (op->xmm64u(1) >> shift);
    }
  }
}

// shift the whole 128-bit register left by 'shift' bytes
BX_CPP_INLINE void xmm_pslldq(BxPackedXmmRegister *op, Bit8u shift)
{
  if(shift > 15) op->clear();
  else {
    if (shift > 7) {
      op->xmm64u(1) = op->xmm64u(0);
      op->xmm64u(0) = 0;
      shift -= 8;
    }

    shift <<= 3; // remaining byte count -> bit count

    if (shift != 0) {
      op->xmm64u(1) = (op->xmm64u(1) << shift) | (op->xmm64u(0) >> (64-shift));
      op->xmm64u(0) = (op->xmm64u(0) << shift);
    }
  }
}

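// Illustrative sketch (not part of the original header, name is hypothetical):
// the whole-register byte shifts above split the byte count into a 64-bit
// half swap plus a bit shift across the two halves.
BX_CPP_INLINE bool xmm_psrldq_selfcheck_example(void)
{
  BxPackedXmmRegister r;
  for (unsigned n=0; n < 16; n++) r.xmmubyte(n) = (Bit8u) n;
  xmm_psrldq(&r, 3);                       // drop the three lowest bytes
  for (unsigned n=0; n < 13; n++)
    if (r.xmmubyte(n) != (Bit8u)(n + 3)) return false;
  return r.xmmubyte(13) == 0 && r.xmmubyte(14) == 0 && r.xmmubyte(15) == 0;
}
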
BX_CPP_INLINE void xmm_palignr(BxPackedXmmRegister *op2, const BxPackedXmmRegister *op1, Bit8u shift)
{
  // op2 = [op1:op2] >> shift

  if (shift > 15) {
    *op2 = *op1;
    xmm_psrldq(op2, shift - 16);
    return;
  }

  shift <<= 3;

  if (shift > 64) {
    shift -= 64;
    op2->xmm64u(0) = (op2->xmm64u(1) >> shift) | (op1->xmm64u(0) << (64-shift));
    op2->xmm64u(1) = (op1->xmm64u(0) >> shift) | (op1->xmm64u(1) << (64-shift));
  }
  else if (shift == 64) {
    op2->xmm64u(0) = op2->xmm64u(1);
    op2->xmm64u(1) = op1->xmm64u(0);
  }
  else if (shift != 0) {
    op2->xmm64u(0) = (op2->xmm64u(0) >> shift) | (op2->xmm64u(1) << (64-shift));
    op2->xmm64u(1) = (op2->xmm64u(1) >> shift) | (op1->xmm64u(0) << (64-shift));
  }
}

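// Illustrative sketch (not part of the original header, name is hypothetical):
// PALIGNR forms the 32-byte value [op1:op2], shifts it right by 'shift' bytes
// and keeps the low 16 bytes in op2.
BX_CPP_INLINE bool xmm_palignr_selfcheck_example(void)
{
  BxPackedXmmRegister lo, hi;
  for (unsigned n=0; n < 16; n++) {
    lo.xmmubyte(n) = (Bit8u) n;            // bytes  0..15 of the concatenation
    hi.xmmubyte(n) = (Bit8u)(16 + n);      // bytes 16..31 of the concatenation
  }
  xmm_palignr(&lo, &hi, 5);                // lo = [hi:lo] >> 5 bytes
  for (unsigned n=0; n < 16; n++)
    if (lo.xmmubyte(n) != (Bit8u)(n + 5)) return false;
  return true;
}
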
// rotate (right)

BX_CPP_INLINE void xmm_prorb(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x7;

  for(unsigned n=0;n<16;n++) {
    op->xmmubyte(n) = (op->xmmubyte(n) >> shift) | (op->xmmubyte(n) << (8 - shift));
  }
}

BX_CPP_INLINE void xmm_prorw(BxPackedXmmRegister *op, int shift)
{
  shift &= 0xf;

  for(unsigned n=0;n<8;n++) {
    op->xmm16u(n) = (op->xmm16u(n) >> shift) | (op->xmm16u(n) << (16 - shift));
  }
}

BX_CPP_INLINE void xmm_prord(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x1f;

  for(unsigned n=0;n<4;n++) {
    op->xmm32u(n) = (op->xmm32u(n) >> shift) | (op->xmm32u(n) << (32 - shift));
  }
}

BX_CPP_INLINE void xmm_prorq(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x3f;

  for(unsigned n=0;n<2;n++) {
    op->xmm64u(n) = (op->xmm64u(n) >> shift) | (op->xmm64u(n) << (64 - shift));
  }
}

BX_CPP_INLINE void xmm_prorvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n<4;n++) {
    int shift = op2->xmm32u(n) & 0x1f;
    op1->xmm32u(n) = (op1->xmm32u(n) >> shift) | (op1->xmm32u(n) << (32 - shift));
  }
}

BX_CPP_INLINE void xmm_prorvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n<2;n++) {
    int shift = op2->xmm64u(n) & 0x3f;
    op1->xmm64u(n) = (op1->xmm64u(n) >> shift) | (op1->xmm64u(n) << (64 - shift));
  }
}

// rotate (left)

BX_CPP_INLINE void xmm_prolb(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x7;

  for(unsigned n=0;n<16;n++) {
    op->xmmubyte(n) = (op->xmmubyte(n) << shift) | (op->xmmubyte(n) >> (8 - shift));
  }
}

BX_CPP_INLINE void xmm_prolw(BxPackedXmmRegister *op, int shift)
{
  shift &= 0xf;

  for(unsigned n=0;n<8;n++) {
    op->xmm16u(n) = (op->xmm16u(n) << shift) | (op->xmm16u(n) >> (16 - shift));
  }
}

BX_CPP_INLINE void xmm_prold(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x1f;

  for(unsigned n=0;n<4;n++) {
    op->xmm32u(n) = (op->xmm32u(n) << shift) | (op->xmm32u(n) >> (32 - shift));
  }
}

BX_CPP_INLINE void xmm_prolq(BxPackedXmmRegister *op, int shift)
{
  shift &= 0x3f;

  for(unsigned n=0;n<2;n++) {
    op->xmm64u(n) = (op->xmm64u(n) << shift) | (op->xmm64u(n) >> (64 - shift));
  }
}

BX_CPP_INLINE void xmm_prolvd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n<4;n++) {
    int shift = op2->xmm32u(n) & 0x1f;
    op1->xmm32u(n) = (op1->xmm32u(n) << shift) | (op1->xmm32u(n) >> (32 - shift));
  }
}

BX_CPP_INLINE void xmm_prolvq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n<2;n++) {
    int shift = op2->xmm64u(n) & 0x3f;
    op1->xmm64u(n) = (op1->xmm64u(n) << shift) | (op1->xmm64u(n) >> (64 - shift));
  }
}

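// Illustrative sketch (not part of the original header, name is hypothetical):
// rotating right by k is the same as rotating left by (width - k), so the two
// families above are mirror images of each other.
BX_CPP_INLINE bool xmm_prold_prord_example(void)
{
  BxPackedXmmRegister a, b;
  for (unsigned n=0; n < 4; n++) a.xmm32u(n) = b.xmm32u(n) = 0x12345678 + n;
  xmm_prold(&a, 8);                        // rotate left  by 8
  xmm_prord(&b, 24);                       // rotate right by 24 == left by 8
  for (unsigned n=0; n < 4; n++)
    if (a.xmm32u(n) != b.xmm32u(n)) return false;
  return a.xmm32u(0) == 0x34567812;        // 0x12345678 rotated left by 8
}
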
// variable shift/rotate (XOP)

BX_CPP_INLINE void xmm_protb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 16;n++) {
    int shift = op2->xmmsbyte(n);
    if (shift > 0) {
      // rotate left
      shift &= 0x7;
      op1->xmmubyte(n) = (op1->xmmubyte(n) << shift) | (op1->xmmubyte(n) >> (8 - shift));
    }
    else if (shift < 0) {
      // rotate right
      shift = -shift & 0x7;
      op1->xmmubyte(n) = (op1->xmmubyte(n) >> shift) | (op1->xmmubyte(n) << (8 - shift));
    }
  }
}

BX_CPP_INLINE void xmm_protw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 8;n++) {
    int shift = op2->xmmsbyte(n*2);
    if (shift > 0) {
      // rotate left
      shift &= 0xf;
      op1->xmm16u(n) = (op1->xmm16u(n) << shift) | (op1->xmm16u(n) >> (16 - shift));
    }
    else if (shift < 0) {
      // rotate right
      shift = -shift & 0xf;
      op1->xmm16u(n) = (op1->xmm16u(n) >> shift) | (op1->xmm16u(n) << (16 - shift));
    }
  }
}

BX_CPP_INLINE void xmm_protd(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 4;n++) {
    int shift = op2->xmmsbyte(n*4);
    if (shift > 0) {
      // rotate left
      shift &= 0x1f;
      op1->xmm32u(n) = (op1->xmm32u(n) << shift) | (op1->xmm32u(n) >> (32 - shift));
    }
    else if (shift < 0) {
      // rotate right
      shift = -shift & 0x1f;
      op1->xmm32u(n) = (op1->xmm32u(n) >> shift) | (op1->xmm32u(n) << (32 - shift));
    }
  }
}

BX_CPP_INLINE void xmm_protq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 2;n++) {
    int shift = op2->xmmsbyte(n*8);
    if (shift > 0) {
      // rotate left
      shift &= 0x3f;
      op1->xmm64u(n) = (op1->xmm64u(n) << shift) | (op1->xmm64u(n) >> (64 - shift));
    }
    else if (shift < 0) {
      // rotate right
      shift = -shift & 0x3f;
      op1->xmm64u(n) = (op1->xmm64u(n) >> shift) | (op1->xmm64u(n) << (64 - shift));
    }
  }
}

BX_CPP_INLINE void xmm_pshab(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 16;n++) {
    int shift = op2->xmmsbyte(n);
    if (shift > 0) {
      // shift left
      op1->xmmsbyte(n) <<= (shift & 0x7);
    }
    else if (shift < 0) {
      // shift right
      op1->xmmsbyte(n) >>= (-shift & 0x7);
    }
  }
}

BX_CPP_INLINE void xmm_pshaw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 8;n++) {
    int shift = op2->xmmsbyte(n*2);
    if (shift > 0) {
      // shift left
      op1->xmm16s(n) <<= (shift & 0xf);
    }
    else if (shift < 0) {
      // shift right
      op1->xmm16s(n) >>= (-shift & 0xf);
    }
  }
}

BX_CPP_INLINE void xmm_pshad(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 4;n++) {
    int shift = op2->xmmsbyte(n*4);
    if (shift > 0) {
      // shift left
      op1->xmm32s(n) <<= (shift & 0x1f);
    }
    else if (shift < 0) {
      // shift right
      op1->xmm32s(n) >>= (-shift & 0x1f);
    }
  }
}

BX_CPP_INLINE void xmm_pshaq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 2;n++) {
    int shift = op2->xmmsbyte(n*8);
    if (shift > 0) {
      // shift left
      op1->xmm64s(n) <<= (shift & 0x3f);
    }
    else if (shift < 0) {
      // shift right
      op1->xmm64s(n) >>= (-shift & 0x3f);
    }
  }
}

BX_CPP_INLINE void xmm_pshlb(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 16;n++) {
    int shift = op2->xmmsbyte(n);
    if (shift > 0) {
      // shift left
      op1->xmmubyte(n) <<= (shift & 0x7);
    }
    else if (shift < 0) {
      // shift right
      op1->xmmubyte(n) >>= (-shift & 0x7);
    }
  }
}

BX_CPP_INLINE void xmm_pshlw(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 8;n++) {
    int shift = op2->xmmsbyte(n*2); // signed control byte, as in the other xmm_pshl* helpers
    if (shift > 0) {
      // shift left
      op1->xmm16u(n) <<= (shift & 0xf);
    }
    else if (shift < 0) {
      // shift right
      op1->xmm16u(n) >>= (-shift & 0xf);
    }
  }
}

BX_CPP_INLINE void xmm_pshld(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 4;n++) {
    int shift = op2->xmmsbyte(n*4);
    if (shift > 0) {
      // shift left
      op1->xmm32u(n) <<= (shift & 0x1f);
    }
    else if (shift < 0) {
      // shift right
      op1->xmm32u(n) >>= (-shift & 0x1f);
    }
  }
}

BX_CPP_INLINE void xmm_pshlq(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0;n < 2;n++) {
    int shift = op2->xmmsbyte(n*8);
    if (shift > 0) {
      // shift left
      op1->xmm64u(n) <<= (shift & 0x3f);
    }
    else if (shift < 0) {
      // shift right
      op1->xmm64u(n) >>= (-shift & 0x3f);
    }
  }
}

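// Illustrative sketch (not part of the original header, name is hypothetical):
// in the XOP variable shifts above the sign of each control byte selects the
// direction -- positive shifts left, negative shifts right (logical for the
// xmm_pshl* helpers, arithmetic for the xmm_psha* helpers).
BX_CPP_INLINE bool xmm_pshld_selfcheck_example(void)
{
  BxPackedXmmRegister val, ctl;
  for (unsigned n=0; n < 4; n++) val.xmm32u(n) = 0x00000100;
  ctl.clear();
  ctl.xmmsbyte(0)  =  4;                   // dword 0: shift left  by 4
  ctl.xmmsbyte(4)  = -4;                   // dword 1: shift right by 4
  ctl.xmmsbyte(8)  =  0;                   // dword 2: unchanged
  ctl.xmmsbyte(12) = -9;                   // dword 3: shift right by 9
  xmm_pshld(&val, &ctl);
  return val.xmm32u(0) == 0x00001000 &&
         val.xmm32u(1) == 0x00000010 &&
         val.xmm32u(2) == 0x00000100 &&
         val.xmm32u(3) == 0;
}
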
// VNNI

BX_CPP_INLINE void xmm_pdpbusd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++)
  {
    Bit32s p1word = (Bit32u) op1->xmmubyte(n*4)   * (Bit32s) op2->xmmsbyte(n*4);
    Bit32s p2word = (Bit32u) op1->xmmubyte(n*4+1) * (Bit32s) op2->xmmsbyte(n*4+1);
    Bit32s p3word = (Bit32u) op1->xmmubyte(n*4+2) * (Bit32s) op2->xmmsbyte(n*4+2);
    Bit32s p4word = (Bit32u) op1->xmmubyte(n*4+3) * (Bit32s) op2->xmmsbyte(n*4+3);

    dst->xmm32s(n) += (p1word + p2word + p3word + p4word);
  }
}

BX_CPP_INLINE void xmm_pdpbusds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++)
  {
    Bit32s p1word = (Bit32u) op1->xmmubyte(n*4)   * (Bit32s) op2->xmmsbyte(n*4);
    Bit32s p2word = (Bit32u) op1->xmmubyte(n*4+1) * (Bit32s) op2->xmmsbyte(n*4+1);
    Bit32s p3word = (Bit32u) op1->xmmubyte(n*4+2) * (Bit32s) op2->xmmsbyte(n*4+2);
    Bit32s p4word = (Bit32u) op1->xmmubyte(n*4+3) * (Bit32s) op2->xmmsbyte(n*4+3);

    Bit64s result = (Bit64s) dst->xmm32s(n) + (p1word + p2word + p3word + p4word);
    dst->xmm32s(n) = SaturateQwordSToDwordS(result);
  }
}

BX_CPP_INLINE void xmm_pdpwssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++)
  {
    Bit32s p1_dword = (Bit32s) op1->xmm16s(n*2)   * (Bit32s) op2->xmm16s(n*2);
    Bit32s p2_dword = (Bit32s) op1->xmm16s(n*2+1) * (Bit32s) op2->xmm16s(n*2+1);

    dst->xmm32s(n) += (p1_dword + p2_dword);
  }
}

BX_CPP_INLINE void xmm_pdpwssds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for(unsigned n=0; n<4; n++)
  {
    Bit32s p1_dword = (Bit32s) op1->xmm16s(n*2)   * (Bit32s) op2->xmm16s(n*2);
    Bit32s p2_dword = (Bit32s) op1->xmm16s(n*2+1) * (Bit32s) op2->xmm16s(n*2+1);

    Bit64s result = (Bit64s) dst->xmm32s(n) + (p1_dword + p2_dword);
    dst->xmm32s(n) = SaturateQwordSToDwordS(result);
  }
}
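
// Illustrative sketch (not part of the original header, name is hypothetical):
// VPDPBUSD accumulates, per dword lane, the dot product of four unsigned bytes
// of op1 with the four corresponding signed bytes of op2.
BX_CPP_INLINE bool xmm_pdpbusd_example(void)
{
  BxPackedXmmRegister acc, u, s;
  acc.clear(); u.clear(); s.clear();
  acc.xmm32s(0) = 1000;
  u.xmmubyte(0) = 10;  s.xmmsbyte(0) = -2;
  u.xmmubyte(1) = 20;  s.xmmsbyte(1) = -1;
  u.xmmubyte(2) = 30;  s.xmmsbyte(2) =  1;
  u.xmmubyte(3) = 40;  s.xmmsbyte(3) =  2;
  xmm_pdpbusd(&acc, &u, &s);
  // 1000 + (10*-2 + 20*-1 + 30*1 + 40*2) = 1000 + 70
  return acc.xmm32s(0) == 1070 && acc.xmm32s(1) == 0;
}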

#endif