1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 #ifdef FREEBL_NO_DEPEND
6 #include "stubs.h"
7 #endif
8 #include "gcm.h"
9 #include "secerr.h"
10 
11 #include <wmmintrin.h> /* clmul */
12 
/*
 * WRITE64(x, bytes): store the 64-bit value |x| into bytes[0..7] in
 * big-endian order (most significant byte first).
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and can be used safely in an unbraced if/else. Both arguments are
 * evaluated exactly once, so expressions with side effects are safe to pass.
 * |bytes| must be convertible to unsigned char * and reference at least 8
 * writable bytes.
 */
#define WRITE64(x, bytes)                                        \
    do {                                                         \
        const uint64_t write64_val_ = (x);                       \
        unsigned char *const write64_out_ = (bytes);             \
        write64_out_[0] = (unsigned char)(write64_val_ >> 56);   \
        write64_out_[1] = (unsigned char)(write64_val_ >> 48);   \
        write64_out_[2] = (unsigned char)(write64_val_ >> 40);   \
        write64_out_[3] = (unsigned char)(write64_val_ >> 32);   \
        write64_out_[4] = (unsigned char)(write64_val_ >> 24);   \
        write64_out_[5] = (unsigned char)(write64_val_ >> 16);   \
        write64_out_[6] = (unsigned char)(write64_val_ >> 8);    \
        write64_out_[7] = (unsigned char)(write64_val_);         \
    } while (0)
22 
23 SECStatus
gcm_HashWrite_hw(gcmHashContext * ghash,unsigned char * outbuf)24 gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
25 {
26     uint64_t tmp_out[2];
27     _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
28     /* maxout must be larger than 16 byte (checked by the caller). */
29     WRITE64(tmp_out[0], outbuf + 8);
30     WRITE64(tmp_out[1], outbuf);
31     return SECSuccess;
32 }
33 
34 SECStatus
gcm_HashMult_hw(gcmHashContext * ghash,const unsigned char * buf,unsigned int count)35 gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
36                 unsigned int count)
37 {
38     size_t i;
39     pre_align __m128i z_high post_align;
40     pre_align __m128i z_low post_align;
41     pre_align __m128i C post_align;
42     pre_align __m128i D post_align;
43     pre_align __m128i E post_align;
44     pre_align __m128i F post_align;
45     pre_align __m128i bin post_align;
46     pre_align __m128i Ci post_align;
47     pre_align __m128i tmp post_align;
48 
49     for (i = 0; i < count; i++, buf += 16) {
50         bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
51                             ((uint16_t)buf[2] << 8) | buf[3],
52                             ((uint16_t)buf[4] << 8) | buf[5],
53                             ((uint16_t)buf[6] << 8) | buf[7],
54                             ((uint16_t)buf[8] << 8) | buf[9],
55                             ((uint16_t)buf[10] << 8) | buf[11],
56                             ((uint16_t)buf[12] << 8) | buf[13],
57                             ((uint16_t)buf[14] << 8) | buf[15]);
58         Ci = _mm_xor_si128(bin, ghash->x);
59 
60         /* Do binary mult ghash->X = Ci * ghash->H. */
61         C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
62         D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
63         E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
64         F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
65         tmp = _mm_xor_si128(E, F);
66         z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
67         z_high = _mm_unpackhi_epi64(z_high, D);
68         z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
69         z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
70 
71         /* Shift one to the left (multiply by x) as gcm spec is stupid. */
72         C = _mm_slli_si128(z_low, 8);
73         E = _mm_srli_epi64(C, 63);
74         D = _mm_slli_si128(z_high, 8);
75         F = _mm_srli_epi64(D, 63);
76         /* Carry over */
77         C = _mm_srli_si128(z_low, 8);
78         D = _mm_srli_epi64(C, 63);
79         z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
80         z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
81 
82         /* Reduce */
83         C = _mm_slli_si128(z_low, 8);
84         /* D = z_low << 127 */
85         D = _mm_slli_epi64(C, 63);
86         /* E = z_low << 126 */
87         E = _mm_slli_epi64(C, 62);
88         /* F = z_low << 121 */
89         F = _mm_slli_epi64(C, 57);
90         /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
91         z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
92         C = _mm_srli_si128(z_low, 8);
93         /* D = z_low >> 1 */
94         D = _mm_slli_epi64(C, 63);
95         D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
96         /* E = z_low >> 2 */
97         E = _mm_slli_epi64(C, 62);
98         E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
99         /* F = z_low >> 7 */
100         F = _mm_slli_epi64(C, 57);
101         F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
102         /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
103         ghash->x = _mm_xor_si128(_mm_xor_si128(
104                                      _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
105                                  F);
106     }
107     return SECSuccess;
108 }
109 
110 SECStatus
gcm_HashInit_hw(gcmHashContext * ghash)111 gcm_HashInit_hw(gcmHashContext *ghash)
112 {
113     ghash->ghash_mul = gcm_HashMult_hw;
114     ghash->x = _mm_setzero_si128();
115     /* MSVC requires __m64 to load epi64. */
116     ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
117                              ghash->h_low >> 32, (uint32_t)ghash->h_low);
118     ghash->hw = PR_TRUE;
119     return SECSuccess;
120 }
121 
122 SECStatus
gcm_HashZeroX_hw(gcmHashContext * ghash)123 gcm_HashZeroX_hw(gcmHashContext *ghash)
124 {
125     ghash->x = _mm_setzero_si128();
126     return SECSuccess;
127 }
128