1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 #ifdef FREEBL_NO_DEPEND
6 #include "stubs.h"
7 #endif
8 #include "blapii.h"
9 #include "blapit.h"
10 #include "gcm.h"
11 #include "secerr.h"
12 #include "prtypes.h"
13 
14 #if defined(IS_LITTLE_ENDIAN)
15 
16 #include <arm_neon.h>
17 
18 SECStatus
gcm_HashWrite_hw(gcmHashContext * ghash,unsigned char * outbuf)19 gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
20 {
21     vst1_u8(outbuf, vrev64_u8(vcreate_u8(ghash->x_high)));
22     vst1_u8(outbuf + 8, vrev64_u8(vcreate_u8(ghash->x_low)));
23     return SECSuccess;
24 }
25 
26 /* Carry-less multiplication. a * b = ret. */
27 static inline uint8x16_t
clmul(const uint8x8_t a,const uint8x8_t b)28 clmul(const uint8x8_t a, const uint8x8_t b)
29 {
30     uint8x16_t d, e, f, g, h, i, j, k, l, m, n;
31     uint8x8_t t_high, t_low;
32     uint8x16_t t0, t1, t2, t3;
33     const uint8x8_t k16 = vcreate_u8(0xffff);
34     const uint8x8_t k32 = vcreate_u8(0xffffffff);
35     const uint8x8_t k48 = vcreate_u8(0xffffffffffff);
36 
37     // D = A * B
38     d = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
39                                       vreinterpret_p8_u8(b)));
40     // E = A * B1
41     e = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
42                                       vreinterpret_p8_u8(vext_u8(b, b, 1))));
43     // F = A1 * B
44     f = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 1)),
45                                       vreinterpret_p8_u8(b)));
46     // G = A * B2
47     g = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
48                                       vreinterpret_p8_u8(vext_u8(b, b, 2))));
49     // H = A2 * B
50     h = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 2)),
51                                       vreinterpret_p8_u8(b)));
52     // I = A * B3
53     i = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
54                                       vreinterpret_p8_u8(vext_u8(b, b, 3))));
55     // J = A3 * B
56     j = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(vext_u8(a, a, 3)),
57                                       vreinterpret_p8_u8(b)));
58     // K = A * B4
59     k = vreinterpretq_u8_p16(vmull_p8(vreinterpret_p8_u8(a),
60                                       vreinterpret_p8_u8(vext_u8(b, b, 4))));
61     // L = E + F
62     l = veorq_u8(e, f);
63     // M = G + H
64     m = veorq_u8(g, h);
65     // N = I + J
66     n = veorq_u8(i, j);
67 
68     // t0 = (L) (P0 + P1) << 8
69     t_high = vget_high_u8(l);
70     t_low = vget_low_u8(l);
71     t_low = veor_u8(t_low, t_high);
72     t_high = vand_u8(t_high, k48);
73     t_low = veor_u8(t_low, t_high);
74     t0 = vcombine_u8(t_low, t_high);
75     t0 = vextq_u8(t0, t0, 15);
76 
77     // t1 = (M) (P2 + P3) << 16
78     t_high = vget_high_u8(m);
79     t_low = vget_low_u8(m);
80     t_low = veor_u8(t_low, t_high);
81     t_high = vand_u8(t_high, k32);
82     t_low = veor_u8(t_low, t_high);
83     t1 = vcombine_u8(t_low, t_high);
84     t1 = vextq_u8(t1, t1, 14);
85 
86     // t2 = (N) (P4 + P5) << 24
87     t_high = vget_high_u8(n);
88     t_low = vget_low_u8(n);
89     t_low = veor_u8(t_low, t_high);
90     t_high = vand_u8(t_high, k16);
91     t_low = veor_u8(t_low, t_high);
92     t2 = vcombine_u8(t_low, t_high);
93     t2 = vextq_u8(t2, t2, 13);
94 
95     // t3 = (K) (P6 + P7) << 32
96     t_high = vget_high_u8(k);
97     t_low = vget_low_u8(k);
98     t_low = veor_u8(t_low, t_high);
99     t_high = vdup_n_u8(0);
100     t3 = vcombine_u8(t_low, t_high);
101     t3 = vextq_u8(t3, t3, 12);
102 
103     t0 = veorq_u8(t0, t1);
104     t2 = veorq_u8(t2, t3);
105     return veorq_u8(veorq_u8(d, t0), t2);
106 }
107 
108 SECStatus
gcm_HashMult_hw(gcmHashContext * ghash,const unsigned char * buf,unsigned int count)109 gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
110                 unsigned int count)
111 {
112     const uint8x8_t h_low = vcreate_u8(ghash->h_low);
113     const uint8x8_t h_high = vcreate_u8(ghash->h_high);
114     uint8x16_t ci;
115     uint8x8_t ci_low;
116     uint8x8_t ci_high;
117     uint8x16_t z0, z2, z1a;
118     uint8x16_t z_high, z_low;
119     uint8x16_t t;
120     int64x2_t t1, t2, t3;
121     uint64x2_t z_low_l, z_low_r, z_high_l, z_high_r;
122     size_t i;
123 
124     ci = vcombine_u8(vcreate_u8(ghash->x_low), vcreate_u8(ghash->x_high));
125 
126     for (i = 0; i < count; i++, buf += 16) {
127         ci = veorq_u8(ci, vcombine_u8(vrev64_u8(vld1_u8(buf + 8)),
128                                       vrev64_u8(vld1_u8(buf))));
129         ci_high = vget_high_u8(ci);
130         ci_low = vget_low_u8(ci);
131 
132         /* Do binary mult ghash->X = C * ghash->H (Karatsuba). */
133         z0 = clmul(ci_low, h_low);
134         z2 = clmul(ci_high, h_high);
135         z1a = clmul(veor_u8(ci_high, ci_low), veor_u8(h_high, h_low));
136         z1a = veorq_u8(z0, z1a);
137         z1a = veorq_u8(z2, z1a);
138         z_high = vcombine_u8(veor_u8(vget_low_u8(z2), vget_high_u8(z1a)),
139                              vget_high_u8(z2));
140         z_low = vcombine_u8(vget_low_u8(z0),
141                             veor_u8(vget_high_u8(z0), vget_low_u8(z1a)));
142 
143         /* Shift one (multiply by x) as gcm spec is stupid. */
144         z_low_l = vshlq_n_u64(vreinterpretq_u64_u8(z_low), 1);
145         z_low_r = vshrq_n_u64(vreinterpretq_u64_u8(z_low), 63);
146         z_high_l = vshlq_n_u64(vreinterpretq_u64_u8(z_high), 1);
147         z_high_r = vshrq_n_u64(vreinterpretq_u64_u8(z_high), 63);
148         z_low = vreinterpretq_u8_u64(
149             vcombine_u64(vget_low_u64(z_low_l),
150                          vorr_u64(vget_high_u64(z_low_l),
151                                   vget_low_u64(z_low_r))));
152         z_high = vreinterpretq_u8_u64(
153             vcombine_u64(vorr_u64(vget_low_u64(z_high_l),
154                                   vget_high_u64(z_low_r)),
155                          vorr_u64(vget_high_u64(z_high_l),
156                                   vget_low_u64(z_high_r))));
157 
158         /* Reduce */
159         t1 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 57);
160         t2 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 62);
161         t3 = vshlq_n_s64(vreinterpretq_s64_u8(z_low), 63);
162         t = vreinterpretq_u8_s64(veorq_s64(t1, veorq_s64(t2, t3)));
163 
164         z_low = vcombine_u8(vget_low_u8(z_low),
165                             veor_u8(vget_high_u8(z_low), vget_low_u8(t)));
166         z_high = vcombine_u8(veor_u8(vget_low_u8(z_high), vget_high_u8(t)),
167                              vget_high_u8(z_high));
168 
169         t = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(z_low), 1));
170         z_high = veorq_u8(z_high, z_low);
171         z_low = veorq_u8(z_low, t);
172         t = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(t), 6));
173         z_low = vreinterpretq_u8_u64(
174             vshrq_n_u64(vreinterpretq_u64_u8(z_low), 1));
175         z_low = veorq_u8(z_low, z_high);
176         ci = veorq_u8(z_low, t);
177     }
178 
179     vst1_u8((uint8_t *)&ghash->x_high, vget_high_u8(ci));
180     vst1_u8((uint8_t *)&ghash->x_low, vget_low_u8(ci));
181     return SECSuccess;
182 }
183 
184 SECStatus
gcm_HashInit_hw(gcmHashContext * ghash)185 gcm_HashInit_hw(gcmHashContext *ghash)
186 {
187     ghash->ghash_mul = gcm_HashMult_hw;
188     ghash->x_low = 0;
189     ghash->x_high = 0;
190     ghash->hw = PR_TRUE;
191     return SECSuccess;
192 }
193 
194 SECStatus
gcm_HashZeroX_hw(gcmHashContext * ghash)195 gcm_HashZeroX_hw(gcmHashContext *ghash)
196 {
197     ghash->x_low = 0;
198     ghash->x_high = 0;
199     return SECSuccess;
200 }
201 
202 #endif /* IS_LITTLE_ENDIAN */
203