1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
25 */
26/*
27 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
28 * Use is subject to license terms.
29 */
30
31/*
32 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 * instructions.  This file contains an accelerated
34 * Galois Field Multiplication implementation.
35 *
36 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 * carry-less multiplication. More information about PCLMULQDQ can be
38 * found at:
39 * http://software.intel.com/en-us/articles/
40 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41 *
42 */
43
44/*
45 * ====================================================================
46 * OpenSolaris OS modifications
47 *
48 * This source originates as file galois_hash_asm.c from
49 * Intel Corporation dated September 21, 2009.
50 *
51 * This OpenSolaris version has these major changes from the original source:
52 *
53 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55 * definition for lint.
56 *
57 * 2. Formatted code, added comments, and added #includes and #defines.
58 *
59 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
60 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
64 *
65 * 4. Removed code to perform hashing.  This is already done with C macro
66 * GHASH in gcm.c.  For better performance, this removed code should be
67 * reintegrated in the future to replace the C GHASH macro.
68 *
69 * 5. Added code to byte swap 16-byte input and output.
70 *
71 * 6. Folded in comments from the original C source with embedded assembly
72 * (SB_w_shift_xor.c)
73 *
74 * 7. Renamed function and reordered parameters to match OpenSolaris:
75 * Intel interface:
76 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
77 *		unsigned char *d, int length)
78 * OpenSolaris OS interface:
79 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80 * ====================================================================
81 */
82
83
84#if defined(lint) || defined(__lint)	/* lint */
85
86#include <sys/types.h>
87
/*
 * Lint-only stand-in for the assembly routine of the same name.  It gives
 * lint a C prototype to check callers against and intentionally does
 * nothing: none of the three buffers is read or written.
 */
/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res)
{
}
92
93#elif defined(HAVE_PCLMULQDQ)	/* guard by instruction set */
94
95#define _ASM
96#include <sys/asm_linkage.h>
97
/*
 * Shuffle-control mask for pshufb that reverses the byte order of a
 * 16-byte value: control byte i holds (15 - i), so result byte i is taken
 * from source byte (15 - i).
 *
 * C equivalent:
 *	static uint8_t byte_swap16_mask[] = {
 *		15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
 *
 * XMM_ALIGN is not defined in this file; presumably it comes from
 * <sys/asm_linkage.h> -- confirm its value there.
 */
.data
.align XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10,  9,  8
	.byte	 7,  6,  5,  4,  3,  2,  1,  0
108
109
110/*
111 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
112 *
113 * Perform a carry-less multiplication (that is, use XOR instead of the
114 * multiply operator) on P1 and P2 and place the result in P3.
115 *
116 * Byte swap the input and the output.
117 *
 * Note: x_in, y, and res each point to a 16-byte number
 * (an array of two 64-bit integers).
120 *
 * Note2: For kernel code, caller is responsible for ensuring
 * kpreempt_disable() has been called.  This is because %xmm registers are
 * not saved/restored: as written, this routine clobbers %xmm0-%xmm10 (and
 * %rax), does not touch CR0.TS, and does not spill %xmm registers to the
 * stack.  Any such FPU state handling is therefore presumably expected to
 * be done by the caller -- confirm against the call sites.
126 *
127 * Note3: Original Intel definition:
128 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
129 *	unsigned char *d, int length)
130 *
131 * Note4: Register/parameter mapping:
132 * Intel:
133 *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
134 *	Parameter 2: %rdx (copied to %xmm1)	s or y
135 *	Parameter 3: %rdi (result)		d or res
136 * OpenSolaris:
137 *	Parameter 1: %rdi (copied to %xmm0)	x_in
138 *	Parameter 2: %rsi (copied to %xmm1)	y
139 *	Parameter 3: %rdx (result)		res
140 */
141
ENTRY_NP(gcm_mul_pclmulqdq)
	//
	// In:    %rdi = x_in, %rsi = y, %rdx = res (16 bytes each)
	// Out:   16 bytes stored at (%rdx)
	// Uses:  %rax, %xmm0-%xmm10 (none of them preserved)
	//
	// All memory accesses use unaligned moves, so no alignment of the
	// three buffers is assumed.
	//

	//
	// Load the two operands.
	//
	movdqu	(%rdi), %xmm0		// xmm0 = *x_in = <a1:a0>
	movdqu	(%rsi), %xmm1		// xmm1 = *y    = <b1:b0>

	//
	// Reverse the byte order of both operands.  The shuffle mask is
	// kept in %xmm10 so it can be reused on the result below.
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movdqu	(%rax), %xmm10
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1

	//
	// 128 x 128 -> 256 bit carry-less multiply, built from the four
	// 64 x 64 partial products of the operand halves.
	//
	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	movdqa	%xmm0, %xmm5
	movdqa	%xmm0, %xmm6
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 = a0 * b0
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 = a0 * b1
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 = a1 * b0
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 = a1 * b1

	pxor	%xmm5, %xmm4		// xmm4 = a0*b1 + a1*b0 (middle term)
	movdqa	%xmm4, %xmm5
	pslldq	$8, %xmm5		// low  64 bits of middle term, moved up
	psrldq	$8, %xmm4		// high 64 bits of middle term, moved down
	pxor	%xmm5, %xmm3		// fold into the low 128 bits
	pxor	%xmm4, %xmm6		// fold into the high 128 bits
					// <xmm6:xmm3> = xmm0 * xmm1 (carry-less)

	//
	// Shift the 256-bit product <xmm6:xmm3> left by one bit to cope
	// with the fact that the bits are reversed.  pslld/psrld work on
	// 32-bit lanes, so the bit shifted out of each lane is carried into
	// the next lane by hand.
	//
	movdqa	%xmm3, %xmm7
	movdqa	%xmm6, %xmm8
	psrld	$31, %xmm7		// top bit of every lane of xmm3
	psrld	$31, %xmm8		// top bit of every lane of xmm6
	movdqa	%xmm7, %xmm9
	psrldq	$12, %xmm9		// carry out of xmm3 -> lane 0 (for xmm6)
	pslldq	$4, %xmm7		// move each carry up one lane
	pslldq	$4, %xmm8
	pslld	$1, %xmm3		// per-lane << 1
	pslld	$1, %xmm6
	por	%xmm7, %xmm3		// merge the intra-register carries
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6		// merge the carry from xmm3 into xmm6

	//
	// Reduction, phase one: combine three left-shifted copies of the
	// low half (per 32-bit lane: << 31, << 30, << 25).
	//
	movdqa	%xmm3, %xmm7
	movdqa	%xmm3, %xmm8
	movdqa	%xmm3, %xmm9
	pslld	$31, %xmm7		// packed left shift, << 31
	pslld	$30, %xmm8		// packed left shift, << 30
	pslld	$25, %xmm9		// packed left shift, << 25
	pxor	%xmm8, %xmm7		// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqa	%xmm7, %xmm8
	psrldq	$4, %xmm8		// upper 96 bits, saved for phase two
	pslldq	$12, %xmm7		// lowest lane moved to the top lane
	pxor	%xmm7, %xmm3		// phase one complete

	//
	// Reduction, phase two: combine three right-shifted copies of xmm3
	// (per 32-bit lane: >> 1, >> 2, >> 7) with the value saved in xmm8,
	// then fold everything into the high half.
	//
	movdqa	%xmm3, %xmm2
	movdqa	%xmm3, %xmm4
	movdqa	%xmm3, %xmm5
	psrld	$1, %xmm2		// packed right shift, >> 1
	psrld	$2, %xmm4		// packed right shift, >> 2
	psrld	$7, %xmm5		// packed right shift, >> 7
	pxor	%xmm4, %xmm2		// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2		// add the remainder from phase one
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6		// xmm6 = reduced 128-bit result

	//
	// Reverse the byte order of the result and store it.
	//
	pshufb	%xmm10, %xmm6		// %xmm10 still holds the swap mask
	movdqu	%xmm6, (%rdx)		// *res = result

	ret
	SET_SIZE(gcm_mul_pclmulqdq)
249
250#endif	/* lint || __lint */
251
252#ifdef __ELF__
253.section .note.GNU-stack,"",%progbits
254#endif
255