1 /////////////////////////////////////////////////////////////////////////
2 // $Id: aes.cc 13520 2018-05-27 19:09:59Z sshwarts $
3 /////////////////////////////////////////////////////////////////////////
4 //
5 // Copyright (c) 2008-2018 Stanislav Shwartsman
6 // Written by Stanislav Shwartsman [sshwarts at sourceforge net]
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 //
22 /////////////////////////////////////////////////////////////////////////
23
24 #define NEED_CPU_REG_SHORTCUTS 1
25 #include "bochs.h"
26 #include "cpu.h"
27 #define LOG_THIS BX_CPU_THIS_PTR
28
29 #if BX_CPU_LEVEL >= 6
30
31 #include "simd_int.h"
32
33 //
34 // XMM - Byte Representation of a 128-bit AES State
35 //
36 // F E D C B A
37 // 1 1 1 1 1 1
38 // 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
39 // --+-+-+-+-+-+-+-+-+-+-+-+-+-+-+--
40 // P O N M L K J I H G F E D C B A
41 //
42 //
43 // XMM - Matrix Representation of a 128-bit AES State
44 //
45 // | A E I M | | S(0,0) S(0,1) S(0,2) S(0,3) | | S(0) S(4) S(8) S(C) |
46 // | B F J N | = | S(1,0) S(1,1) S(1,2) S(1,3) | = | S(1) S(5) S(9) S(D) |
47 // | C G K O | | S(2,0) S(2,1) S(2,2) S(2,3) | | S(2) S(6) S(A) S(E) |
//  | D H L P |   | S(3,0) S(3,1) S(3,2) S(3,3) |   | S(3) S(7) S(B) S(F) |
49 //
50
51 //
52 // AES ShiftRows transformation
53 //
54 // | A E I M | | A E I M |
55 // | B F J N | => | F J N B |
56 // | C G K O | | K O C G |
57 // | D H L P | | P D H L |
58 //
59
AES_ShiftRows(BxPackedXmmRegister & state)60 BX_CPP_INLINE void AES_ShiftRows(BxPackedXmmRegister &state)
61 {
62 BxPackedXmmRegister tmp = state;
63
64 state.xmmubyte(0x0) = tmp.xmmubyte(0x0); // A => A
65 state.xmmubyte(0x1) = tmp.xmmubyte(0x5);
66 state.xmmubyte(0x2) = tmp.xmmubyte(0xA);
67 state.xmmubyte(0x3) = tmp.xmmubyte(0xF);
68 state.xmmubyte(0x4) = tmp.xmmubyte(0x4); // E => E
69 state.xmmubyte(0x5) = tmp.xmmubyte(0x9);
70 state.xmmubyte(0x6) = tmp.xmmubyte(0xE);
71 state.xmmubyte(0x7) = tmp.xmmubyte(0x3);
72 state.xmmubyte(0x8) = tmp.xmmubyte(0x8); // I => I
73 state.xmmubyte(0x9) = tmp.xmmubyte(0xD);
74 state.xmmubyte(0xA) = tmp.xmmubyte(0x2);
75 state.xmmubyte(0xB) = tmp.xmmubyte(0x7);
76 state.xmmubyte(0xC) = tmp.xmmubyte(0xC); // M => M
77 state.xmmubyte(0xD) = tmp.xmmubyte(0x1);
78 state.xmmubyte(0xE) = tmp.xmmubyte(0x6);
79 state.xmmubyte(0xF) = tmp.xmmubyte(0xB);
80 }
81
82 //
83 // AES InverseShiftRows transformation
84 //
85 // | A E I M | | A E I M |
86 // | B F J N | => | N B F J |
87 // | C G K O | | K O C G |
88 // | D H L P | | H L P D |
89 //
90
AES_InverseShiftRows(BxPackedXmmRegister & state)91 BX_CPP_INLINE void AES_InverseShiftRows(BxPackedXmmRegister &state)
92 {
93 BxPackedXmmRegister tmp = state;
94
95 state.xmmubyte(0x0) = tmp.xmmubyte(0x0); // A => A
96 state.xmmubyte(0x1) = tmp.xmmubyte(0xD);
97 state.xmmubyte(0x2) = tmp.xmmubyte(0xA);
98 state.xmmubyte(0x3) = tmp.xmmubyte(0x7);
99 state.xmmubyte(0x4) = tmp.xmmubyte(0x4); // E => E
100 state.xmmubyte(0x5) = tmp.xmmubyte(0x1);
101 state.xmmubyte(0x6) = tmp.xmmubyte(0xE);
102 state.xmmubyte(0x7) = tmp.xmmubyte(0xB);
103 state.xmmubyte(0x8) = tmp.xmmubyte(0x8); // I => I
104 state.xmmubyte(0x9) = tmp.xmmubyte(0x5);
105 state.xmmubyte(0xA) = tmp.xmmubyte(0x2);
106 state.xmmubyte(0xB) = tmp.xmmubyte(0xF);
107 state.xmmubyte(0xC) = tmp.xmmubyte(0xC); // M => M
108 state.xmmubyte(0xD) = tmp.xmmubyte(0x9);
109 state.xmmubyte(0xE) = tmp.xmmubyte(0x6);
110 state.xmmubyte(0xF) = tmp.xmmubyte(0x3);
111 }
112
// AES forward S-box (FIPS-197 SubBytes lookup), indexed by input byte value.
static const Bit8u sbox_transformation[256] = {
  0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
  0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
  0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
  0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
  0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
  0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
  0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
  0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
  0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
  0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
  0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
  0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
  0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
  0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
  0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
  0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
  0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
  0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
  0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
  0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
  0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
  0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
  0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
  0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
  0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
  0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
  0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
  0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
  0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
  0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
  0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
  0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
147
// AES inverse S-box (FIPS-197 InvSubBytes lookup), indexed by input byte value.
static const Bit8u inverse_sbox_transformation[256] = {
  0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
  0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
  0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
  0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
  0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
  0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
  0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
  0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
  0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
  0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
  0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
  0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
  0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
  0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
  0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
  0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
  0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
  0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
  0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
  0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
  0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
  0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
  0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
  0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
  0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
  0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
  0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
  0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
  0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
  0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
  0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
  0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
182
AES_SubstituteBytes(BxPackedXmmRegister & state)183 BX_CPP_INLINE void AES_SubstituteBytes(BxPackedXmmRegister &state)
184 {
185 for (int i=0; i<16; i++)
186 state.xmmubyte(i) = sbox_transformation[state.xmmubyte(i)];
187 }
188
AES_InverseSubstituteBytes(BxPackedXmmRegister & state)189 BX_CPP_INLINE void AES_InverseSubstituteBytes(BxPackedXmmRegister &state)
190 {
191 for (int i=0; i<16; i++)
192 state.xmmubyte(i) = inverse_sbox_transformation[state.xmmubyte(i)];
193 }
194
195 /*
196 * Galois Field multiplication of a by b, modulo m.
197 * Just like arithmetic multiplication, except that additions and
198 * subtractions are replaced by XOR.
199 * The code was taken from: http://www.darkside.com.au/ice/index.html
200 */
201
gf_mul(unsigned a,unsigned b)202 BX_CPP_INLINE unsigned gf_mul(unsigned a, unsigned b)
203 {
204 unsigned res = 0, m = 0x11b;
205
206 while (b) {
207 if (b & 1)
208 res ^= a;
209
210 a <<= 1;
211 b >>= 1;
212
213 if (a >= 256)
214 a ^= m;
215 }
216
217 return res;
218 }
219
// Byte S(a,b) of the AES state matrix: row a (0..3), column b (0..3).
// Columns are stored consecutively in the XMM register, 4 bytes per column.
#define AES_STATE(s,a,b) (s.xmmubyte((b)*4+(a)))
221
AES_MixColumns(BxPackedXmmRegister & state)222 static void AES_MixColumns(BxPackedXmmRegister &state)
223 {
224 BxPackedXmmRegister tmp = state;
225
226 for(int j=0; j<4; j++) {
227 AES_STATE(state, 0, j) = gf_mul(0x2, AES_STATE(tmp, 0, j)) ^
228 gf_mul(0x3, AES_STATE(tmp, 1, j)) ^
229 AES_STATE(tmp, 2, j) ^
230 AES_STATE(tmp, 3, j);
231
232 AES_STATE(state, 1, j) = AES_STATE(tmp, 0, j) ^
233 gf_mul(0x2, AES_STATE(tmp, 1, j)) ^
234 gf_mul(0x3, AES_STATE(tmp, 2, j)) ^
235 AES_STATE(tmp, 3, j);
236
237 AES_STATE(state, 2, j) = AES_STATE(tmp, 0, j) ^
238 AES_STATE(tmp, 1, j) ^
239 gf_mul(0x2, AES_STATE(tmp, 2, j)) ^
240 gf_mul(0x3, AES_STATE(tmp, 3, j));
241
242 AES_STATE(state, 3, j) = gf_mul(0x3, AES_STATE(tmp, 0, j)) ^
243 AES_STATE(tmp, 1, j) ^
244 AES_STATE(tmp, 2, j) ^
245 gf_mul(0x2, AES_STATE(tmp, 3, j));
246 }
247 }
248
AES_InverseMixColumns(BxPackedXmmRegister & state)249 static void AES_InverseMixColumns(BxPackedXmmRegister &state)
250 {
251 BxPackedXmmRegister tmp = state;
252
253 for(int j=0; j<4; j++) {
254 AES_STATE(state, 0, j) = gf_mul(0xE, AES_STATE(tmp, 0, j)) ^
255 gf_mul(0xB, AES_STATE(tmp, 1, j)) ^
256 gf_mul(0xD, AES_STATE(tmp, 2, j)) ^
257 gf_mul(0x9, AES_STATE(tmp, 3, j));
258
259 AES_STATE(state, 1, j) = gf_mul(0x9, AES_STATE(tmp, 0, j)) ^
260 gf_mul(0xE, AES_STATE(tmp, 1, j)) ^
261 gf_mul(0xB, AES_STATE(tmp, 2, j)) ^
262 gf_mul(0xD, AES_STATE(tmp, 3, j));
263
264 AES_STATE(state, 2, j) = gf_mul(0xD, AES_STATE(tmp, 0, j)) ^
265 gf_mul(0x9, AES_STATE(tmp, 1, j)) ^
266 gf_mul(0xE, AES_STATE(tmp, 2, j)) ^
267 gf_mul(0xB, AES_STATE(tmp, 3, j));
268
269 AES_STATE(state, 3, j) = gf_mul(0xB, AES_STATE(tmp, 0, j)) ^
270 gf_mul(0xD, AES_STATE(tmp, 1, j)) ^
271 gf_mul(0x9, AES_STATE(tmp, 2, j)) ^
272 gf_mul(0xE, AES_STATE(tmp, 3, j));
273 }
274 }
275
AES_SubWord(Bit32u x)276 BX_CPP_INLINE Bit32u AES_SubWord(Bit32u x)
277 {
278 Bit8u b0 = sbox_transformation[(x) & 0xff];
279 Bit8u b1 = sbox_transformation[(x>>8) & 0xff];
280 Bit8u b2 = sbox_transformation[(x>>16) & 0xff];
281 Bit8u b3 = sbox_transformation[(x>>24) & 0xff];
282
283 return b0 | ((Bit32u)(b1) << 8) |
284 ((Bit32u)(b2) << 16) | ((Bit32u)(b3) << 24);
285 }
286
AES_RotWord(Bit32u x)287 BX_CPP_INLINE Bit32u AES_RotWord(Bit32u x)
288 {
289 return (x >> 8) | (x << 24);
290 }
291
292 /* 66 0F 38 DB */
AESIMC_VdqWdqR(bxInstruction_c * i)293 void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESIMC_VdqWdqR(bxInstruction_c *i)
294 {
295 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
296
297 AES_InverseMixColumns(op);
298
299 BX_WRITE_XMM_REGZ(i->dst(), op, i->getVL());
300
301 BX_NEXT_INSTR(i);
302 }
303
304 /* 66 0F 38 DC */
AESENC_VdqWdqR(bxInstruction_c * i)305 void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESENC_VdqWdqR(bxInstruction_c *i)
306 {
307 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());
308
309 AES_ShiftRows(op1);
310 AES_SubstituteBytes(op1);
311 AES_MixColumns(op1);
312
313 xmm_xorps(&op1, &op2);
314
315 BX_WRITE_XMM_REG(i->dst(), op1);
316
317 BX_NEXT_INSTR(i);
318 }
319
320 #if BX_SUPPORT_AVX
VAESENC_VdqHdqWdqR(bxInstruction_c * i)321 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESENC_VdqHdqWdqR(bxInstruction_c *i)
322 {
323 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
324 unsigned len = i->getVL();
325
326 for (unsigned n=0; n < len; n++) {
327 AES_ShiftRows(op1.vmm128(n));
328 AES_SubstituteBytes(op1.vmm128(n));
329 AES_MixColumns(op1.vmm128(n));
330
331 xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
332 }
333
334 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
335
336 BX_NEXT_INSTR(i);
337 }
338 #endif
339
340 /* 66 0F 38 DD */
AESENCLAST_VdqWdqR(bxInstruction_c * i)341 void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESENCLAST_VdqWdqR(bxInstruction_c *i)
342 {
343 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());
344
345 AES_ShiftRows(op1);
346 AES_SubstituteBytes(op1);
347
348 xmm_xorps(&op1, &op2);
349
350 BX_WRITE_XMM_REG(i->dst(), op1);
351
352 BX_NEXT_INSTR(i);
353 }
354
355 #if BX_SUPPORT_AVX
VAESENCLAST_VdqHdqWdqR(bxInstruction_c * i)356 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESENCLAST_VdqHdqWdqR(bxInstruction_c *i)
357 {
358 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
359 unsigned len = i->getVL();
360
361 for (unsigned n=0; n < len; n++) {
362 AES_ShiftRows(op1.vmm128(n));
363 AES_SubstituteBytes(op1.vmm128(n));
364
365 xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
366 }
367
368 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
369
370 BX_NEXT_INSTR(i);
371 }
372 #endif
373
374 /* 66 0F 38 DE */
AESDEC_VdqWdqR(bxInstruction_c * i)375 void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESDEC_VdqWdqR(bxInstruction_c *i)
376 {
377 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());
378
379 AES_InverseShiftRows(op1);
380 AES_InverseSubstituteBytes(op1);
381 AES_InverseMixColumns(op1);
382
383 xmm_xorps(&op1, &op2);
384
385 BX_WRITE_XMM_REG(i->dst(), op1);
386
387 BX_NEXT_INSTR(i);
388 }
389
390 #if BX_SUPPORT_AVX
VAESDEC_VdqHdqWdqR(bxInstruction_c * i)391 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESDEC_VdqHdqWdqR(bxInstruction_c *i)
392 {
393 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
394 unsigned len = i->getVL();
395
396 for (unsigned n=0; n < len; n++) {
397 AES_InverseShiftRows(op1.vmm128(n));
398 AES_InverseSubstituteBytes(op1.vmm128(n));
399 AES_InverseMixColumns(op1.vmm128(n));
400
401 xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
402 }
403
404 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
405
406 BX_NEXT_INSTR(i);
407 }
408 #endif
409
410 /* 66 0F 38 DF */
AESDECLAST_VdqWdqR(bxInstruction_c * i)411 void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESDECLAST_VdqWdqR(bxInstruction_c *i)
412 {
413 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());
414
415 AES_InverseShiftRows(op1);
416 AES_InverseSubstituteBytes(op1);
417
418 xmm_xorps(&op1, &op2);
419
420 BX_WRITE_XMM_REG(i->dst(), op1);
421
422 BX_NEXT_INSTR(i);
423 }
424
425 #if BX_SUPPORT_AVX
VAESDECLAST_VdqHdqWdqR(bxInstruction_c * i)426 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESDECLAST_VdqHdqWdqR(bxInstruction_c *i)
427 {
428 BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
429 unsigned len = i->getVL();
430
431 for (unsigned n=0; n < len; n++) {
432 AES_InverseShiftRows(op1.vmm128(n));
433 AES_InverseSubstituteBytes(op1.vmm128(n));
434
435 xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
436 }
437
438 BX_WRITE_AVX_REGZ(i->dst(), op1, len);
439
440 BX_NEXT_INSTR(i);
441 }
442 #endif
443
444 /* 66 0F 3A DF */
AESKEYGENASSIST_VdqWdqIbR(bxInstruction_c * i)445 void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESKEYGENASSIST_VdqWdqIbR(bxInstruction_c *i)
446 {
447 BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()), result;
448
449 Bit32u rcon32 = i->Ib();
450
451 result.xmm32u(0) = AES_SubWord(op.xmm32u(1));
452 result.xmm32u(1) = AES_RotWord(result.xmm32u(0)) ^ rcon32;
453 result.xmm32u(2) = AES_SubWord(op.xmm32u(3));
454 result.xmm32u(3) = AES_RotWord(result.xmm32u(2)) ^ rcon32;
455
456 BX_WRITE_XMM_REGZ(i->dst(), result, i->getVL());
457
458 BX_NEXT_INSTR(i);
459 }
460
xmm_pclmulqdq(BxPackedXmmRegister * r,Bit64u a,Bit64u b)461 BX_CPP_INLINE void xmm_pclmulqdq(BxPackedXmmRegister *r, Bit64u a, Bit64u b)
462 {
463 BxPackedXmmRegister tmp;
464
465 tmp.xmm64u(0) = a;
466 tmp.xmm64u(1) = 0;
467
468 r->clear();
469
470 for (unsigned n = 0; b && n < 64; n++) {
471 if (b & 1) {
472 xmm_xorps(r, &tmp);
473 }
474 tmp.xmm64u(1) = (tmp.xmm64u(1) << 1) | (tmp.xmm64u(0) >> 63);
475 tmp.xmm64u(0) <<= 1;
476 b >>= 1;
477 }
478 }
479
480 /* 66 0F 3A 44 */
PCLMULQDQ_VdqWdqIbR(bxInstruction_c * i)481 void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCLMULQDQ_VdqWdqIbR(bxInstruction_c *i)
482 {
483 BxPackedXmmRegister r;
484 Bit8u imm8 = i->Ib();
485
486 BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());
487
488 // Perform Carry Less Multiplication [R = A CLMUL B]
489 // A determined by op1[imm8[0]]
490 // B determined by op2[imm8[4]]
491 xmm_pclmulqdq(&r, op1.xmm64u(imm8 & 1), op2.xmm64u((imm8 >> 4) & 1));
492
493 BX_WRITE_XMM_REG(i->dst(), r);
494
495 BX_NEXT_INSTR(i);
496 }
497
498 #if BX_SUPPORT_AVX
VPCLMULQDQ_VdqHdqWdqIbR(bxInstruction_c * i)499 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCLMULQDQ_VdqHdqWdqIbR(bxInstruction_c *i)
500 {
501 BxPackedAvxRegister r;
502 unsigned len = i->getVL();
503 Bit8u imm8 = i->Ib();
504
505 r.clear();
506
507 for (unsigned n=0; n < len; n++) {
508 BxPackedXmmRegister op1 = BX_READ_AVX_REG_LANE(i->src1(), n), op2 = BX_READ_AVX_REG_LANE(i->src2(), n);
509
510 // Perform Carry Less Multiplication [R = A CLMUL B]
511 // A determined by op1[imm8[0]]
512 // B determined by op2[imm8[4]]
513 xmm_pclmulqdq(&r.vmm128(n), op1.xmm64u(imm8 & 1), op2.xmm64u((imm8 >> 4) & 1));
514 }
515
516 BX_WRITE_AVX_REG(i->dst(), r);
517
518 BX_NEXT_INSTR(i);
519 }
520 #endif
521
522 #endif
523