/////////////////////////////////////////////////////////////////////////
// $Id: aes.cc 13520 2018-05-27 19:09:59Z sshwarts $
/////////////////////////////////////////////////////////////////////////
//
//   Copyright (c) 2008-2018 Stanislav Shwartsman
//          Written by Stanislav Shwartsman [sshwarts at sourceforge net]
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
//
/////////////////////////////////////////////////////////////////////////

#define NEED_CPU_REG_SHORTCUTS 1
#include "bochs.h"
#include "cpu.h"
#define LOG_THIS BX_CPU_THIS_PTR

#if BX_CPU_LEVEL >= 6

#include "simd_int.h"

//
// XMM - Byte Representation of a 128-bit AES State
//
//      F E D C B A
//      1 1 1 1 1 1
//      5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
//     --+-+-+-+-+-+-+-+-+-+-+-+-+-+-+--
//      P O N M L K J I H G F E D C B A
//
//
// XMM - Matrix Representation of a 128-bit AES State
//
// | A E I M |   | S(0,0) S(0,1) S(0,2) S(0,3) |   | S(0) S(4) S(8) S(C) |
// | B F J N | = | S(1,0) S(1,1) S(1,2) S(1,3) | = | S(1) S(5) S(9) S(D) |
// | C G K O |   | S(2,0) S(2,1) S(2,2) S(2,3) |   | S(2) S(6) S(A) S(E) |
// | D H L P |   | S(3,0) S(3,1) S(3,2) S(3,3) |   | S(3) S(7) S(B) S(F) |
//
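// The state is therefore stored column-major: matrix element S(row,col) maps
// to XMM byte (col*4 + row), which is what the AES_STATE() macro below
// encodes.  For example, S(2,1) = 'G' is xmmubyte(0x6).
//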

//
// AES ShiftRows transformation
//
// | A E I M |    | A E I M |
// | B F J N | => | F J N B |
// | C G K O |    | K O C G |
// | D H L P |    | P D H L |
//

BX_CPP_INLINE void AES_ShiftRows(BxPackedXmmRegister &state)
{
  BxPackedXmmRegister tmp = state;

  state.xmmubyte(0x0) = tmp.xmmubyte(0x0); // A => A
  state.xmmubyte(0x1) = tmp.xmmubyte(0x5);
  state.xmmubyte(0x2) = tmp.xmmubyte(0xA);
  state.xmmubyte(0x3) = tmp.xmmubyte(0xF);
  state.xmmubyte(0x4) = tmp.xmmubyte(0x4); // E => E
  state.xmmubyte(0x5) = tmp.xmmubyte(0x9);
  state.xmmubyte(0x6) = tmp.xmmubyte(0xE);
  state.xmmubyte(0x7) = tmp.xmmubyte(0x3);
  state.xmmubyte(0x8) = tmp.xmmubyte(0x8); // I => I
  state.xmmubyte(0x9) = tmp.xmmubyte(0xD);
  state.xmmubyte(0xA) = tmp.xmmubyte(0x2);
  state.xmmubyte(0xB) = tmp.xmmubyte(0x7);
  state.xmmubyte(0xC) = tmp.xmmubyte(0xC); // M => M
  state.xmmubyte(0xD) = tmp.xmmubyte(0x1);
  state.xmmubyte(0xE) = tmp.xmmubyte(0x6);
  state.xmmubyte(0xF) = tmp.xmmubyte(0xB);
}

//
// AES InverseShiftRows transformation
//
// | A E I M |    | A E I M |
// | B F J N | => | N B F J |
// | C G K O |    | K O C G |
// | D H L P |    | H L P D |
//

BX_CPP_INLINE void AES_InverseShiftRows(BxPackedXmmRegister &state)
{
  BxPackedXmmRegister tmp = state;

  state.xmmubyte(0x0) = tmp.xmmubyte(0x0); // A => A
  state.xmmubyte(0x1) = tmp.xmmubyte(0xD);
  state.xmmubyte(0x2) = tmp.xmmubyte(0xA);
  state.xmmubyte(0x3) = tmp.xmmubyte(0x7);
  state.xmmubyte(0x4) = tmp.xmmubyte(0x4); // E => E
  state.xmmubyte(0x5) = tmp.xmmubyte(0x1);
  state.xmmubyte(0x6) = tmp.xmmubyte(0xE);
  state.xmmubyte(0x7) = tmp.xmmubyte(0xB);
  state.xmmubyte(0x8) = tmp.xmmubyte(0x8); // I => I
  state.xmmubyte(0x9) = tmp.xmmubyte(0x5);
  state.xmmubyte(0xA) = tmp.xmmubyte(0x2);
  state.xmmubyte(0xB) = tmp.xmmubyte(0xF);
  state.xmmubyte(0xC) = tmp.xmmubyte(0xC); // M => M
  state.xmmubyte(0xD) = tmp.xmmubyte(0x9);
  state.xmmubyte(0xE) = tmp.xmmubyte(0x6);
  state.xmmubyte(0xF) = tmp.xmmubyte(0x3);
}

static const Bit8u sbox_transformation[256] = {
  0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
  0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
  0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
  0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
  0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
  0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
  0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
  0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
  0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
  0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
  0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
  0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
  0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
  0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
  0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
  0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
  0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
  0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
  0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
  0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
  0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
  0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
  0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
  0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
  0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
  0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
  0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
  0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
  0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
  0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
  0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
  0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};

static const Bit8u inverse_sbox_transformation[256] = {
  0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
  0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
  0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
  0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
  0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
  0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
  0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
  0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
  0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
  0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
  0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
  0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
  0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
  0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
  0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
  0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
  0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
  0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
  0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
  0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
  0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
  0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
  0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
  0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
  0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
  0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
  0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
  0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
  0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
  0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
  0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
  0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};

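// Note (illustration only): the two tables are mutual inverses, i.e.
// inverse_sbox_transformation[sbox_transformation[x]] == x for every byte x.
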
BX_CPP_INLINE void AES_SubstituteBytes(BxPackedXmmRegister &state)
{
  for (int i=0; i<16; i++)
    state.xmmubyte(i) = sbox_transformation[state.xmmubyte(i)];
}

BX_CPP_INLINE void AES_InverseSubstituteBytes(BxPackedXmmRegister &state)
{
  for (int i=0; i<16; i++)
    state.xmmubyte(i) = inverse_sbox_transformation[state.xmmubyte(i)];
}

/*
 * Galois Field multiplication of a by b, modulo m.
 * Just like arithmetic multiplication, except that additions and
 * subtractions are replaced by XOR.
 * The code was taken from: http://www.darkside.com.au/ice/index.html
 */

BX_CPP_INLINE unsigned gf_mul(unsigned a, unsigned b)
{
  unsigned res = 0, m = 0x11b;

  while (b) {
    if (b & 1)
      res ^= a;

    a <<= 1;
    b >>= 1;

    if (a >= 256)
      a ^= m;
  }

  return res;
}
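
// Sanity check (illustration only, not used by the emulation code):
//   gf_mul(0x57, 0x13) == 0xfe   (the worked example from FIPS-197 section 4.2)
//   gf_mul(0x02, 0x87) == 0x15   (doubling 0x87, reduced by the AES polynomial 0x11b)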

#define AES_STATE(s,a,b) (s.xmmubyte((b)*4+(a)))

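//
// AES MixColumns transformation: each column of the state is multiplied,
// over GF(2^8), by the circulant matrix
//
// | 2 3 1 1 |
// | 1 2 3 1 |
// | 1 1 2 3 |
// | 3 1 1 2 |
//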
static void AES_MixColumns(BxPackedXmmRegister &state)
{
  BxPackedXmmRegister tmp = state;

  for(int j=0; j<4; j++) {
    AES_STATE(state, 0, j) = gf_mul(0x2, AES_STATE(tmp, 0, j)) ^
                             gf_mul(0x3, AES_STATE(tmp, 1, j)) ^
                             AES_STATE(tmp, 2, j) ^
                             AES_STATE(tmp, 3, j);

    AES_STATE(state, 1, j) = AES_STATE(tmp, 0, j) ^
                             gf_mul(0x2, AES_STATE(tmp, 1, j)) ^
                             gf_mul(0x3, AES_STATE(tmp, 2, j)) ^
                             AES_STATE(tmp, 3, j);

    AES_STATE(state, 2, j) = AES_STATE(tmp, 0, j) ^
                             AES_STATE(tmp, 1, j) ^
                             gf_mul(0x2, AES_STATE(tmp, 2, j)) ^
                             gf_mul(0x3, AES_STATE(tmp, 3, j));

    AES_STATE(state, 3, j) = gf_mul(0x3, AES_STATE(tmp, 0, j)) ^
                             AES_STATE(tmp, 1, j) ^
                             AES_STATE(tmp, 2, j) ^
                             gf_mul(0x2, AES_STATE(tmp, 3, j));
  }
}

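//
// AES InverseMixColumns transformation: each column of the state is multiplied,
// over GF(2^8), by the inverse matrix
//
// | E B D 9 |
// | 9 E B D |
// | D 9 E B |
// | B D 9 E |
//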
static void AES_InverseMixColumns(BxPackedXmmRegister &state)
{
  BxPackedXmmRegister tmp = state;

  for(int j=0; j<4; j++) {
    AES_STATE(state, 0, j) = gf_mul(0xE, AES_STATE(tmp, 0, j)) ^
                             gf_mul(0xB, AES_STATE(tmp, 1, j)) ^
                             gf_mul(0xD, AES_STATE(tmp, 2, j)) ^
                             gf_mul(0x9, AES_STATE(tmp, 3, j));

    AES_STATE(state, 1, j) = gf_mul(0x9, AES_STATE(tmp, 0, j)) ^
                             gf_mul(0xE, AES_STATE(tmp, 1, j)) ^
                             gf_mul(0xB, AES_STATE(tmp, 2, j)) ^
                             gf_mul(0xD, AES_STATE(tmp, 3, j));

    AES_STATE(state, 2, j) = gf_mul(0xD, AES_STATE(tmp, 0, j)) ^
                             gf_mul(0x9, AES_STATE(tmp, 1, j)) ^
                             gf_mul(0xE, AES_STATE(tmp, 2, j)) ^
                             gf_mul(0xB, AES_STATE(tmp, 3, j));

    AES_STATE(state, 3, j) = gf_mul(0xB, AES_STATE(tmp, 0, j)) ^
                             gf_mul(0xD, AES_STATE(tmp, 1, j)) ^
                             gf_mul(0x9, AES_STATE(tmp, 2, j)) ^
                             gf_mul(0xE, AES_STATE(tmp, 3, j));
  }
}

BX_CPP_INLINE Bit32u AES_SubWord(Bit32u x)
{
  Bit8u b0 = sbox_transformation[(x)     & 0xff];
  Bit8u b1 = sbox_transformation[(x>>8)  & 0xff];
  Bit8u b2 = sbox_transformation[(x>>16) & 0xff];
  Bit8u b3 = sbox_transformation[(x>>24) & 0xff];

  return b0 | ((Bit32u)(b1) <<  8) |
              ((Bit32u)(b2) << 16) | ((Bit32u)(b3) << 24);
}

BX_CPP_INLINE Bit32u AES_RotWord(Bit32u x)
{
  return (x >> 8) | (x << 24);
}

/* 66 0F 38 DB */
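// AESIMC: apply InverseMixColumns to the source operand, converting an
// encryption round key for use with the Equivalent Inverse Cipher.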
void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESIMC_VdqWdqR(bxInstruction_c *i)
{
  BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());

  AES_InverseMixColumns(op);

  BX_WRITE_XMM_REGZ(i->dst(), op, i->getVL());

  BX_NEXT_INSTR(i);
}

/* 66 0F 38 DC */
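// AESENC: perform one full AES encryption round on the destination state
// (ShiftRows, SubBytes, MixColumns), then XOR with the round key in the source.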
void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESENC_VdqWdqR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());

  AES_ShiftRows(op1);
  AES_SubstituteBytes(op1);
  AES_MixColumns(op1);

  xmm_xorps(&op1, &op2);

  BX_WRITE_XMM_REG(i->dst(), op1);

  BX_NEXT_INSTR(i);
}

#if BX_SUPPORT_AVX
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESENC_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  for (unsigned n=0; n < len; n++) {
    AES_ShiftRows(op1.vmm128(n));
    AES_SubstituteBytes(op1.vmm128(n));
    AES_MixColumns(op1.vmm128(n));

    xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);

  BX_NEXT_INSTR(i);
}
#endif

/* 66 0F 38 DD */
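// AESENCLAST: perform the final AES encryption round, identical to AESENC
// except that the MixColumns step is omitted.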
void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESENCLAST_VdqWdqR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());

  AES_ShiftRows(op1);
  AES_SubstituteBytes(op1);

  xmm_xorps(&op1, &op2);

  BX_WRITE_XMM_REG(i->dst(), op1);

  BX_NEXT_INSTR(i);
}

#if BX_SUPPORT_AVX
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESENCLAST_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  for (unsigned n=0; n < len; n++) {
    AES_ShiftRows(op1.vmm128(n));
    AES_SubstituteBytes(op1.vmm128(n));

    xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);

  BX_NEXT_INSTR(i);
}
#endif

/* 66 0F 38 DE */
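// AESDEC: perform one AES decryption round of the Equivalent Inverse Cipher
// (InvShiftRows, InvSubBytes, InvMixColumns), then XOR with the round key.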
void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESDEC_VdqWdqR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());

  AES_InverseShiftRows(op1);
  AES_InverseSubstituteBytes(op1);
  AES_InverseMixColumns(op1);

  xmm_xorps(&op1, &op2);

  BX_WRITE_XMM_REG(i->dst(), op1);

  BX_NEXT_INSTR(i);
}

#if BX_SUPPORT_AVX
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESDEC_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  for (unsigned n=0; n < len; n++) {
    AES_InverseShiftRows(op1.vmm128(n));
    AES_InverseSubstituteBytes(op1.vmm128(n));
    AES_InverseMixColumns(op1.vmm128(n));

    xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);

  BX_NEXT_INSTR(i);
}
#endif

/* 66 0F 38 DF */
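// AESDECLAST: perform the final AES decryption round, which omits InvMixColumns.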
void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESDECLAST_VdqWdqR(bxInstruction_c *i)
{
  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());

  AES_InverseShiftRows(op1);
  AES_InverseSubstituteBytes(op1);

  xmm_xorps(&op1, &op2);

  BX_WRITE_XMM_REG(i->dst(), op1);

  BX_NEXT_INSTR(i);
}

#if BX_SUPPORT_AVX
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VAESDECLAST_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
  unsigned len = i->getVL();

  for (unsigned n=0; n < len; n++) {
    AES_InverseShiftRows(op1.vmm128(n));
    AES_InverseSubstituteBytes(op1.vmm128(n));

    xmm_xorps(&op1.vmm128(n), &op2.vmm128(n));
  }

  BX_WRITE_AVX_REGZ(i->dst(), op1, len);

  BX_NEXT_INSTR(i);
}
#endif

/* 66 0F 3A DF */
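// AESKEYGENASSIST: helper for AES round-key expansion.  With X1 and X3 the
// second and fourth dwords of the source and RCON the immediate:
//   dst[0] = SubWord(X1)
//   dst[1] = RotWord(SubWord(X1)) ^ RCON
//   dst[2] = SubWord(X3)
//   dst[3] = RotWord(SubWord(X3)) ^ RCON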
void BX_CPP_AttrRegparmN(1) BX_CPU_C::AESKEYGENASSIST_VdqWdqIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()), result;

  Bit32u rcon32 = i->Ib();

  result.xmm32u(0) = AES_SubWord(op.xmm32u(1));
  result.xmm32u(1) = AES_RotWord(result.xmm32u(0)) ^ rcon32;
  result.xmm32u(2) = AES_SubWord(op.xmm32u(3));
  result.xmm32u(3) = AES_RotWord(result.xmm32u(2)) ^ rcon32;

  BX_WRITE_XMM_REGZ(i->dst(), result, i->getVL());

  BX_NEXT_INSTR(i);
}

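// Carry-less (GF(2) polynomial) multiplication of two 64-bit operands into a
// 128-bit product, implemented as a simple shift-and-XOR loop.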
BX_CPP_INLINE void xmm_pclmulqdq(BxPackedXmmRegister *r, Bit64u a, Bit64u b)
{
  BxPackedXmmRegister tmp;

  tmp.xmm64u(0) = a;
  tmp.xmm64u(1) = 0;

  r->clear();

  for (unsigned n = 0; b && n < 64; n++) {
      if (b & 1) {
          xmm_xorps(r, &tmp);
      }
      tmp.xmm64u(1) = (tmp.xmm64u(1) << 1) | (tmp.xmm64u(0) >> 63);
      tmp.xmm64u(0) <<= 1;
      b >>= 1;
  }
}

/* 66 0F 3A 44 */
void BX_CPP_AttrRegparmN(1) BX_CPU_C::PCLMULQDQ_VdqWdqIbR(bxInstruction_c *i)
{
  BxPackedXmmRegister r;
  Bit8u imm8 = i->Ib();

  BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->dst()), op2 = BX_READ_XMM_REG(i->src());

  // Perform Carry Less Multiplication [R = A CLMUL B]
  // A determined by op1[imm8[0]]
  // B determined by op2[imm8[4]]
  xmm_pclmulqdq(&r, op1.xmm64u(imm8 & 1), op2.xmm64u((imm8 >> 4) & 1));

  BX_WRITE_XMM_REG(i->dst(), r);

  BX_NEXT_INSTR(i);
}

#if BX_SUPPORT_AVX
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCLMULQDQ_VdqHdqWdqIbR(bxInstruction_c *i)
{
  BxPackedAvxRegister r;
  unsigned len = i->getVL();
  Bit8u imm8 = i->Ib();

  r.clear();

  for (unsigned n=0; n < len; n++) {
    BxPackedXmmRegister op1 = BX_READ_AVX_REG_LANE(i->src1(), n), op2 = BX_READ_AVX_REG_LANE(i->src2(), n);

    // Perform Carry Less Multiplication [R = A CLMUL B]
    // A determined by op1[imm8[0]]
    // B determined by op2[imm8[4]]
    xmm_pclmulqdq(&r.vmm128(n), op1.xmm64u(imm8 & 1), op2.xmm64u((imm8 >> 4) & 1));
  }

  BX_WRITE_AVX_REG(i->dst(), r);

  BX_NEXT_INSTR(i);
}
#endif

#endif