/*
 * Copyright (c) 2018, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

#ifdef _LP64
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
    XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
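    // Informal sketch: this accumulates the partial products of data * H^i
    // into running sums: tmp0 ^= (a0 * b0), tmp1 ^= (a1 * b1), and
    // tmp2 ^= (a0 * b1) ^ (a1 * b0); tmp3 is scratch. The caller combines and
    // reduces the sums once per 8 blocks (aggregated reduction) instead of
    // reducing after every multiply.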
    movdqu(xmm15, Address(htbl, i * 16));
    vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
    vpclmulldq(tmp3, data, xmm15); // 0x00
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
    vpclmulhdq(tmp3, data, xmm15); // 0x11
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
    vpclmullqhqdq(tmp3, data, xmm15); // 0x10
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}

// Multiply two 128-bit numbers, producing a 256-bit intermediate value.
// The result of the multiplication, followed by reduction, is stored in state.
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
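    // Informal sketch: writing a = a1:a0 and b = b1:b0 (64-bit halves),
    //   a * b = (a1 * b1) << 128  ^  (a1 * b0 ^ a0 * b1) << 64  ^  (a0 * b0)
    // where '*' is carry-less multiplication, so additions are XORs. The
    // middle term is split and folded into the low and high halves before the
    // 256-bit product is reduced modulo the GHASH polynomial
    // g(x) = x^128 + x^7 + x^2 + x + 1 (in the bit-reflected representation).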
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    const XMMRegister tmp4 = xmm7;

    vpclmulldq(tmp1, state, tmp0);    // 0x00 (a0 * b0)
    vpclmulhdq(tmp4, state, tmp0);    // 0x11 (a1 * b1)
    vpclmullqhqdq(tmp2, state, tmp0); // 0x10 (a1 * b0)
    vpclmulhqlqdq(tmp3, state, tmp0); // 0x01 (a0 * b1)

    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
    vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of the carry-less multiplication
    // What follows is the shift-XOR reduction technique described in
    // Gueron-Kounavis, May 2010.
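    // Informal sketch: in the reflected representation, reducing the low
    // 128 bits modulo g(x) = x^128 + x^7 + x^2 + x + 1 amounts to XOR-ing in
    // copies of tmp1 shifted by the exponents of g(x). The 31/30/25-bit left
    // shifts and the 1/2/7-bit right shifts below, together with the byte-wise
    // vpslldq/vpsrldq fixups, realize those shifts across the 32-bit lanes.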
    //
    // First phase of the reduction
    //
    vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift, shifting << 31
    vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift, shifting << 30
    vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift, shifting << 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
    //
    // Second phase of the reduction
    //
    vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);  // packed right shift, shifting >> 1
    vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit); // packed right shift, shifting >> 2
    vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit); // packed right shift, shifting >> 7
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
    vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
    vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
    ret(0);
}

// This method takes the expanded subkey as input and generates the first power
// of the subkey H (stored at htbl + 1 * 16).
// This power of H is used in the reduction step of one-block GHASH.
void MacroAssembler::generateHtbl_one_block(Register htbl) {
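    // Rough sketch of what follows: in the bit-reflected representation,
    // H * 2 (multiplication by x) is a 1-bit left shift of H, with the
    // reduction polynomial XOR-ed in if the bit shifted out was set. The
    // shift is done per 32-bit lane with a byte-shift fixup to carry bits
    // across lanes, and the conditional XOR is done branchlessly by turning
    // the carried-out bit into a byte mask via vpshufb.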
    const XMMRegister t = xmm13;

    // load the original subkey hash
    movdqu(t, Address(htbl, 0));
    // shuffle using long swap mask
    movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    vpshufb(t, t, xmm10, Assembler::AVX_128bit);

    // Compute H' = GFMUL(H, 2)
    vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
    movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
    vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
    movl(rax, 0xff00);
    movdl(xmm4, rax);
    vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
    movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
    vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
    vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
    vpslld(xmm4, t, 1, Assembler::AVX_128bit);
    vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
    vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds p(x) << 1, i.e. H * 2

    // Add the (conditionally selected) reduction polynomial in xmm5 to p(x) << 1
    vpxor(t, t, xmm5, Assembler::AVX_128bit);
    movdqu(Address(htbl, 1 * 16), t); // H * 2

    ret(0);
}

// This method takes the expanded subkey as input and generates the remaining
// powers of the subkey H (stored at htbl + 2 * 16 through htbl + 8 * 16).
// These powers of H are used in the reduction step of eight-block GHASH.
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
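    // Sketch: GFMUL is a local label bound to gfmul(), which ends in ret(0),
    // so each call below leaves t = t * H (both values in the x2-scaled,
    // bit-reflected representation held by the table), and the successive
    // stores fill htbl with H^2 * 2 through H^8 * 2.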
    const XMMRegister t = xmm13;
    const XMMRegister tmp0 = xmm1;
    Label GFMUL;

    movdqu(t, Address(htbl, 1 * 16));
    movdqu(tmp0, t);

    // tmp0 and t hold H. Now we compute the powers of H by using GFMUL(H, H)
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 2 * 16), t); // H^2 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 3 * 16), t); // H^3 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 4 * 16), t); // H^4 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 5 * 16), t); // H^5 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 6 * 16), t); // H^6 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 7 * 16), t); // H^7 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 8 * 16), t); // H^8 * 2
    ret(0);

    bind(GFMUL);
    gfmul(tmp0, t);
}

// Multi-block and single-block GHASH computation using the shift-XOR reduction technique
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
    Register input_data, Register blocks) {
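    // Overview: GHASH updates the state as state = (state ^ block) * H over
    // GF(2^128) for each 16-byte block of input. This routine consumes eight
    // blocks per iteration using the precomputed powers H^1..H^8 (aggregated
    // reduction: eight carry-less multiplies followed by a single reduction),
    // then falls back to one multiply-and-reduce per remaining block.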

    // temporary variables to hold input data and input state
    const XMMRegister data = xmm1;
    const XMMRegister state = xmm0;
    // temporary variables to hold intermediate results
    const XMMRegister tmp0 = xmm3;
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    // temporary variables to hold byte and long swap masks
    const XMMRegister bswap_mask = xmm2;
    const XMMRegister lswap_mask = xmm14;

    Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
          ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

    testptr(blocks, blocks);
    jcc(Assembler::zero, EXIT_GHASH);

    // Check if the hashtable entry (htbl + 1 * 16) has already been generated.
    // For anything less than 8 blocks, we generate only the first power of H.
    movdqu(tmp2, Address(htbl, 1 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, BEGIN_PROCESS);
    call(GENERATE_HTBL_1_BLK, relocInfo::none);

    // Shuffle the input state
    bind(BEGIN_PROCESS);
    movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    movdqu(state, Address(input_state, 0));
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    // If we have 8 blocks or more of data, generate the remaining powers of H
    movdqu(tmp2, Address(htbl, 8 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, PROCESS_8_BLOCKS);
    call(GENERATE_HTBL_8_BLKS, relocInfo::none);

    // Do 8 multiplies followed by one reduction, processing 8 blocks of data
    // at a time. Each block = 16 bytes.
    bind(PROCESS_8_BLOCKS);
    subl(blocks, 8);
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
    movdqu(data, Address(input_data, 16 * 7));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Load from htbl + 1 * 16, where the precomputed powers of H begin.
    movdqu(xmm15, Address(htbl, 1 * 16));
    // Perform carry-less multiplication of (H * 2, data block #7)
    vpclmulhqlqdq(tmp2, data, xmm15); // a0 * b1
    vpclmulldq(tmp0, data, xmm15);    // a0 * b0
    vpclmulhdq(tmp1, data, xmm15);    // a1 * b1
    vpclmullqhqdq(tmp3, data, xmm15); // a1 * b0
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    movdqu(data, Address(input_data, 16 * 6));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^2 * 2, data block #6)
    schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

    movdqu(data, Address(input_data, 16 * 5));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^3 * 2, data block #5)
    schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 4));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^4 * 2, data block #4)
    schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 3));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^5 * 2, data block #3)
    schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 2));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^6 * 2, data block #2)
    schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 1));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^7 * 2, data block #1)
    schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 0));
    // xor data block #0 with the input state before performing carry-less multiplication
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(data, data, state, Assembler::AVX_128bit);
    // Perform carry-less multiplication of (H^8 * 2, data block #0)
    schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0, tmp1 contain the aggregated results of
    vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // the multiplication operation

    // The accumulated 256-bit product is now in tmp1:tmp0, with the high
    // 128 bits in tmp1 and the low 128 bits in tmp0. What follows is the same
    // shift-XOR reduction (Gueron-Kounavis, May 2010) used in gfmul above.
    bind(BLOCK8_REDUCTION);
    // First phase of the reduction
    vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift, shifting << 31
    vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift, shifting << 30
    vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift, shifting << 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

    vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
    // Second phase of the reduction
    vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);  // packed right shift, shifting >> 1
    vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift, shifting >> 2
    vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);  // packed right shift, shifting >> 7
    // xor the shifted versions
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
    // The final result is in state
    vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

    lea(input_data, Address(input_data, 16 * 8));
    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    jmp(PROCESS_8_BLOCKS);

    // Since this is a one-block operation, we use only H * 2, i.e. the first power of H
    bind(ONE_BLK_INIT);
    movdqu(tmp0, Address(htbl, 1 * 16));
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    // Do one (128 bit x 128 bit) carry-less multiplication at a time, followed by a reduction.
    bind(PROCESS_1_BLOCK);
    cmpl(blocks, 0);
    jcc(Assembler::equal, SAVE_STATE);
    subl(blocks, 1);
    movdqu(data, Address(input_data, 0));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(state, state, data, Assembler::AVX_128bit);
    // gfmul(H * 2, state)
    call(GFMUL, relocInfo::none);
    addptr(input_data, 16);
    jmp(PROCESS_1_BLOCK);

    bind(SAVE_STATE);
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
    movdqu(Address(input_state, 0), state);
    jmp(EXIT_GHASH);

    bind(GFMUL);
    gfmul(tmp0, state);

    bind(GENERATE_HTBL_1_BLK);
    generateHtbl_one_block(htbl);

    bind(GENERATE_HTBL_8_BLKS);
    generateHtbl_eight_blocks(htbl);

    bind(EXIT_GHASH);
    // zero out the xmm registers that held the state, data, and powers of H
    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
    vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
    vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
#endif // _LP64