1 /////////////////////////////////////////////////////////////////////////
2 // $Id: sse_string.cc 14086 2021-01-30 08:35:35Z sshwarts $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 // Copyright (c) 2007-2018 Stanislav Shwartsman
6 // Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 /////////////////////////////////////////////////////////////////////////
23
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
28
29 #if BX_CPU_LEVEL >= 6
30
31 // Compare all pairs of Ai, Bj according to imm8 control
compare_strings(Bit8u BoolRes[16][16],const BxPackedXmmRegister & op1,const BxPackedXmmRegister & op2,Bit8u imm)32 static void compare_strings(Bit8u BoolRes[16][16], const BxPackedXmmRegister &op1, const BxPackedXmmRegister &op2, Bit8u imm)
33 {
34 unsigned i, j;
35 unsigned aggregation_operation = (imm >> 2) & 3;
36
37 // All possible comparisons are performed, the individual boolean
38 // results of those comparisons are referred by
39 // BoolRes[op2 element index, op1 element index]
40
41 switch (imm & 3) {
42 case 0: /* unsigned bytes compare */
43 for (i=0;i<16;i++) {
44 for (j=0;j<16;j++) {
45 switch (aggregation_operation) {
46 case 0: /* 'equal' comparison */
47 case 2:
48 case 3:
49 BoolRes[j][i] = (op1.xmmubyte(i) == op2.xmmubyte(j));
50 break;
51 case 1: /* 'ranges' comparison */
52 if ((i % 2) == 0)
53 BoolRes[j][i] = (op1.xmmubyte(i) <= op2.xmmubyte(j));
54 else
55 BoolRes[j][i] = (op1.xmmubyte(i) >= op2.xmmubyte(j));
56 break;
57 }
58 }
59 }
60 break;
61
62 case 1: /* unsigned words compare */
63 for (i=0;i<8;i++) {
64 for (j=0;j<8;j++) {
65 switch (aggregation_operation) {
66 case 0: /* 'equal' comparison */
67 case 2:
68 case 3:
69 BoolRes[j][i] = (op1.xmm16u(i) == op2.xmm16u(j));
70 break;
71 case 1: /* 'ranges' comparison */
72 if ((i % 2) == 0)
73 BoolRes[j][i] = (op1.xmm16u(i) <= op2.xmm16u(j));
74 else
75 BoolRes[j][i] = (op1.xmm16u(i) >= op2.xmm16u(j));
76 break;
77 }
78 }
79 }
80 break;
81
82 case 2: /* signed bytes compare */
83 for (i=0;i<16;i++) {
84 for (j=0;j<16;j++) {
85 switch (aggregation_operation) {
86 case 0: /* 'equal' comparison */
87 case 2:
88 case 3:
89 BoolRes[j][i] = (op1.xmmsbyte(i) == op2.xmmsbyte(j));
90 break;
91 case 1: /* 'ranges' comparison */
92 if ((i % 2) == 0)
93 BoolRes[j][i] = (op1.xmmsbyte(i) <= op2.xmmsbyte(j));
94 else
95 BoolRes[j][i] = (op1.xmmsbyte(i) >= op2.xmmsbyte(j));
96 break;
97 }
98 }
99 }
100 break;
101
102 case 3: /* signed words compare */
103 for (i=0;i<8;i++) {
104 for (j=0;j<8;j++) {
105 switch (aggregation_operation) {
106 case 0: /* 'equal' comparison */
107 case 2:
108 case 3:
109 BoolRes[j][i] = (op1.xmm16s(i) == op2.xmm16s(j));
110 break;
111 case 1: /* 'ranges' comparison */
112 if ((i % 2) == 0)
113 BoolRes[j][i] = (op1.xmm16s(i) <= op2.xmm16s(j));
114 else
115 BoolRes[j][i] = (op1.xmm16s(i) >= op2.xmm16s(j));
116 break;
117 }
118 }
119 }
120 break;
121 }
122 }
123
find_eos32(Bit32s reg32,Bit8u imm)124 static unsigned find_eos32(Bit32s reg32, Bit8u imm)
125 {
126 if (imm & 0x1) { // 8 elements
127 if (reg32 > 8 || reg32 < -8) return 8;
128 else return abs(reg32);
129 }
130 else { // 16 elements
131 if (reg32 > 16 || reg32 < -16) return 16;
132 else return abs(reg32);
133 }
134 }
135
136 #if BX_SUPPORT_X86_64
find_eos64(Bit64s reg64,Bit8u imm)137 static unsigned find_eos64(Bit64s reg64, Bit8u imm)
138 {
139 if (imm & 0x1) { // 8 elements
140 if (reg64 > 8 || reg64 < -8) return 8;
141 else return (unsigned) abs(reg64);
142 }
143 else { // 16 elements
144 if (reg64 > 16 || reg64 < -16) return 16;
145 else return (unsigned) abs(reg64);
146 }
147 }
148 #endif
149
find_eos(const BxPackedXmmRegister & op,Bit8u imm)150 static unsigned find_eos(const BxPackedXmmRegister &op, Bit8u imm)
151 {
152 unsigned i = 0;
153
154 if (imm & 0x1) { // 8 elements
155 for(i=0;i<8;i++)
156 if (op.xmm16u(i) == 0) break;
157 }
158 else { // 16 elements
159 for(i=0;i<16;i++)
160 if (op.xmmubyte(i) == 0) break;
161 }
162
163 return i;
164 }
165
// Adjust a single comparison result for elements that lie past the end
// of their string.
//
//   val     - raw comparison result for the element pair
//   i_valid - the first-operand element is inside its string
//   j_valid - the second-operand element is inside its string
//   imm     - control byte; imm[3:2] is the aggregation operation
//
// When both elements are valid 'val' is returned untouched. Otherwise:
//   'equal any' (0), 'ranges' (1) : result is forced to 0
//   'equal each' (2)              : 1 if both are invalid, 0 if only one is
//   'equal ordered' (3)           : 1 if element i is invalid,
//                                   0 if only element j is invalid
static bool override_if_data_invalid(bool val, bool i_valid, bool j_valid, Bit8u imm)
{
  if (i_valid && j_valid)
    return val; // nothing to override

  // from here on at least one of the two elements is invalid
  switch ((imm >> 2) & 3) {
    case 2: // 'equal each'
      return (! i_valid && ! j_valid);

    case 3: // 'equal ordered'
      return (! i_valid);

    default: // 'equal any' and 'ranges'
      return 0;
  }
}
201
aggregate(Bit8u BoolRes[16][16],unsigned len1,unsigned len2,Bit8u imm)202 static Bit16u aggregate(Bit8u BoolRes[16][16], unsigned len1, unsigned len2, Bit8u imm)
203 {
204 unsigned aggregation_operation = (imm >> 2) & 3;
205 unsigned num_elements = (imm & 0x1) ? 8 : 16;
206 unsigned polarity = (imm >> 4) & 3;
207 unsigned i,j,k;
208
209 Bit16u result = 0;
210
211 switch(aggregation_operation) {
212 case 0: // 'equal any'
213 for(j=0; j<num_elements; j++) {
214 bool res = 0;
215 for(i=0; i<num_elements; i++) {
216 if (override_if_data_invalid(BoolRes[j][i], (i < len1), (j < len2), imm)) {
217 res = 1;
218 break;
219 }
220 }
221
222 if (res) result |= (1<<j);
223 }
224 break;
225
226 case 1: // 'ranges'
227 for(j=0; j<num_elements; j++) {
228 bool res = 0;
229 for(i=0; i<num_elements; i+=2) {
230 if (override_if_data_invalid(BoolRes[j][i], (i < len1), (j < len2), imm) &&
231 override_if_data_invalid(BoolRes[j][i+1], (i+1 < len1), (j < len2), imm)) {
232 res = 1;
233 break;
234 }
235 }
236
237 if (res) result |= (1<<j);
238 }
239 break;
240
241 case 2: // 'equal each'
242 for(j=0; j<num_elements; j++) {
243 if (override_if_data_invalid(BoolRes[j][j], (j < len1), (j < len2), imm))
244 result |= (1<<j);
245 }
246 break;
247
248 case 3: // 'equal ordered'
249 for(j=0; j<num_elements; j++) {
250 bool res = 1;
251 for (i=0, k=j; (i < num_elements-j) && (k < num_elements); i++, k++) {
252 if (! override_if_data_invalid(BoolRes[k][i], (i < len1), (k < len2), imm)) {
253 res = 0;
254 break;
255 }
256 }
257
258 if (res) result |= (1<<j);
259 }
260 break;
261 }
262
263 switch(polarity) {
264 case 0:
265 case 2:
266 break; // do nothing
267
268 case 1:
269 result ^= (num_elements == 8) ? 0xFF : 0xFFFF;
270 break;
271
272 case 3:
273 for (j=0;j<num_elements;j++)
274 if (j < len2) result ^= (1<<j); // flip the bit
275 break;
276 }
277
278 return result;
279 }
280
281 /* 66 0F 3A 60 */
PCMPESTRM_VdqWdqIbR(bxInstruction_c * i)282 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPESTRM_VdqWdqIbR(bxInstruction_c *i)
283 {
284 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst());
285 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src()), result;
286 Bit8u imm8 = i->Ib();
287
288 // compare all pairs of Ai, Bj
289 Bit8u BoolRes[16][16];
290 compare_strings(BoolRes, op1, op2, imm8);
291 unsigned len1, len2, num_elements = (imm8 & 0x1) ? 8 : 16;
292
293 #if BX_SUPPORT_X86_64
294 if (i->os64L()) {
295 len1 = find_eos64(RAX, imm8);
296 len2 = find_eos64(RDX, imm8);
297 }
298 else
299 #endif
300 {
301 len1 = find_eos32(EAX, imm8);
302 len2 = find_eos32(EDX, imm8);
303 }
304 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
305
306 // As defined by imm8[6], result2 is then either stored to the least
307 // significant bits of XMM0 (zero extended to 128 bits) or expanded
308 // into a byte/word-mask and then stored to XMM0
309 if (imm8 & 0x40) {
310 if (num_elements == 8) {
311 for (int index = 0; index < 8; index++)
312 result.xmm16u(index) = (result2 & (1<<index)) ? 0xffff : 0;
313 }
314 else { // num_elements = 16
315 for (int index = 0; index < 16; index++)
316 result.xmmubyte(index) = (result2 & (1<<index)) ? 0xff : 0;
317 }
318 }
319 else {
320 result.xmm64u(1) = 0;
321 result.xmm64u(0) = (Bit64u) result2;
322 }
323
324 Bit32u flags = 0;
325 if (result2 != 0) flags |= EFlagsCFMask;
326 if (len1 < num_elements) flags |= EFlagsSFMask;
327 if (len2 < num_elements) flags |= EFlagsZFMask;
328 if (result2 & 0x1)
329 flags |= EFlagsOFMask;
330 setEFlagsOSZAPC(flags);
331
332 BX_WRITE_XMM_REGZ(0, result, i->getVL()); /* store result XMM0 */
333
334 BX_NEXT_INSTR(i);
335 }
336
337 /* 66 0F 3A 61 */
PCMPESTRI_VdqWdqIbR(bxInstruction_c * i)338 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPESTRI_VdqWdqIbR(bxInstruction_c *i)
339 {
340 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());
341 Bit8u imm8 = i->Ib();
342
343 // compare all pairs of Ai, Bj
344 Bit8u BoolRes[16][16];
345 compare_strings(BoolRes, op1, op2, imm8);
346 unsigned len1, len2, num_elements = (imm8 & 0x1) ? 8 : 16;
347 int index;
348
349 #if BX_SUPPORT_X86_64
350 if (i->os64L()) {
351 len1 = find_eos64(RAX, imm8);
352 len2 = find_eos64(RDX, imm8);
353 }
354 else
355 #endif
356 {
357 len1 = find_eos32(EAX, imm8);
358 len2 = find_eos32(EDX, imm8);
359 }
360 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
361
362 // The index of the first (or last, according to imm8[6]) set bit of result2
363 // is returned to ECX. If no bits are set in IntRes2, ECX is set to 16 (8)
364 if (imm8 & 0x40) {
365 // The index returned to ECX is of the MSB in result2
366 for (index=num_elements-1; index>=0; index--)
367 if (result2 & (1<<index)) break;
368 if (index < 0) index = num_elements;
369 }
370 else {
371 // The index returned to ECX is of the LSB in result2
372 for (index=0; index<(int)num_elements; index++)
373 if (result2 & (1<<index)) break;
374 }
375 RCX = index;
376
377 Bit32u flags = 0;
378 if (result2 != 0) flags |= EFlagsCFMask;
379 if (len1 < num_elements) flags |= EFlagsSFMask;
380 if (len2 < num_elements) flags |= EFlagsZFMask;
381 if (result2 & 0x1)
382 flags |= EFlagsOFMask;
383 setEFlagsOSZAPC(flags);
384
385 BX_NEXT_INSTR(i);
386 }
387
388 /* 66 0F 3A 62 */
PCMPISTRM_VdqWdqIbR(bxInstruction_c * i)389 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPISTRM_VdqWdqIbR(bxInstruction_c *i)
390 {
391 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst());
392 BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src()), result;
393 Bit8u imm8 = i->Ib();
394
395 // compare all pairs of Ai, Bj
396 Bit8u BoolRes[16][16];
397 compare_strings(BoolRes, op1, op2, imm8);
398
399 unsigned num_elements = (imm8 & 0x1) ? 8 : 16;
400 unsigned len1 = find_eos(op1, imm8);
401 unsigned len2 = find_eos(op2, imm8);
402 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
403
404 // As defined by imm8[6], result2 is then either stored to the least
405 // significant bits of XMM0 (zero extended to 128 bits) or expanded
406 // into a byte/word-mask and then stored to XMM0
407 if (imm8 & 0x40) {
408 if (num_elements == 8) {
409 for (int index = 0; index < 8; index++)
410 result.xmm16u(index) = (result2 & (1<<index)) ? 0xffff : 0;
411 }
412 else { // num_elements = 16
413 for (int index = 0; index < 16; index++)
414 result.xmmubyte(index) = (result2 & (1<<index)) ? 0xff : 0;
415 }
416 }
417 else {
418 result.xmm64u(1) = 0;
419 result.xmm64u(0) = (Bit64u) result2;
420 }
421
422 Bit32u flags = 0;
423 if (result2 != 0) flags |= EFlagsCFMask;
424 if (len1 < num_elements) flags |= EFlagsSFMask;
425 if (len2 < num_elements) flags |= EFlagsZFMask;
426 if (result2 & 0x1)
427 flags |= EFlagsOFMask;
428 setEFlagsOSZAPC(flags);
429
430 BX_WRITE_XMM_REGZ(0, result, i->getVL()); /* store result XMM0 */
431
432 BX_NEXT_INSTR(i);
433 }
434
435 /* 66 0F 3A 63 */
PCMPISTRI_VdqWdqIbR(bxInstruction_c * i)436 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCMPISTRI_VdqWdqIbR(bxInstruction_c *i)
437 {
438 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());
439 Bit8u imm8 = i->Ib();
440
441 // compare all pairs of Ai, Bj
442 Bit8u BoolRes[16][16];
443 compare_strings(BoolRes, op1, op2, imm8);
444 unsigned num_elements = (imm8 & 0x1) ? 8 : 16;
445 int index;
446
447 unsigned len1 = find_eos(op1, imm8);
448 unsigned len2 = find_eos(op2, imm8);
449 Bit16u result2 = aggregate(BoolRes, len1, len2, imm8);
450
451 // The index of the first (or last, according to imm8[6]) set bit of result2
452 // is returned to ECX. If no bits are set in IntRes2, ECX is set to 16 (8)
453 if (imm8 & 0x40) {
454 // The index returned to ECX is of the MSB in result2
455 for (index=num_elements-1; index>=0; index--)
456 if (result2 & (1<<index)) break;
457 if (index < 0) index = num_elements;
458 }
459 else {
460 // The index returned to ECX is of the LSB in result2
461 for (index=0; index<(int)num_elements; index++)
462 if (result2 & (1<<index)) break;
463 }
464 RCX = index;
465
466 Bit32u flags = 0;
467 if (result2 != 0) flags |= EFlagsCFMask;
468 if (len1 < num_elements) flags |= EFlagsSFMask;
469 if (len2 < num_elements) flags |= EFlagsZFMask;
470 if (result2 & 0x1)
471 flags |= EFlagsOFMask;
472 setEFlagsOSZAPC(flags);
473
474 BX_NEXT_INSTR(i);
475 }
476
477 #endif // BX_CPU_LEVEL >= 6
478