1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5 #ifdef USE_HW_SHA2
6
7 #ifndef __ARM_FEATURE_CRYPTO
8 #error "Compiler option is invalid"
9 #endif
10
11 #ifdef FREEBL_NO_DEPEND
12 #include "stubs.h"
13 #endif
14
15 #include "prcpucfg.h"
16 #include "prtypes.h" /* for PRUintXX */
17 #include "prlong.h"
18 #include "blapi.h"
19 #include "sha256.h"
20
21 #include <arm_neon.h>
22
/*
 * SHA-256 round constants, K256 (FIPS 180-4 sec. 4.2.2: the first 32 bits
 * of the fractional parts of the cube roots of the first 64 primes).
 * 16-byte aligned so the compression code can load them four at a time
 * with vld1q_u32.
 */
static const PRUint32 __attribute__((aligned(16))) K256[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
42
/*
 * ROUND(n, a, b, c, d) -- perform four SHA-256 rounds with the ARMv8
 * Crypto Extension intrinsics.
 *
 *   n        literal group index 0..15; selects k##n, the preloaded
 *            vector of four K256 round constants (token-pasted, so the
 *            k0..k15 locals must be in scope at every call site).
 *   a,b,c,d  rotating aliases over the four message-schedule vectors
 *            (16 words of schedule state); `a` holds the four message
 *            words consumed by this group.
 *
 * w0 (state words a..d) and w1 (state words e..h) must be in scope and
 * are updated in place: vsha256hq_u32/vsha256h2q_u32 advance the hash
 * state by four rounds.  For n < 12 the schedule is extended four words
 * ahead with vsha256su0q/vsha256su1q; the last four groups (rounds
 * 48-63) need no further expansion.  Since `n` is a literal the guard
 * folds away at compile time.  Call sites supply no trailing semicolon,
 * matching the bare-brace expansion below.
 */
#define ROUND(n, a, b, c, d) \
    { \
        uint32x4_t t = vaddq_u32(a, k##n); \
        uint32x4_t wt = w0; \
        w0 = vsha256hq_u32(w0, w1, t); \
        w1 = vsha256h2q_u32(w1, wt, t); \
        if (n < 12) { \
            a = vsha256su0q_u32(a, b); \
            a = vsha256su1q_u32(a, c, d); \
        } \
    }
54
/*
 * Compress the single 64-byte block buffered in ctx->u.b into the hash
 * state ctx->h, using the ARMv8 SHA-256 Crypto Extension instructions.
 * 16 ROUND groups x 4 rounds = the full 64 rounds of FIPS 180-4.
 */
void
SHA256_Compress_Native(SHA256Context *ctx)
{
    /* Preload the 64 round constants as 16 vectors of four words; the
     * ROUND macro picks them up by name via token pasting (k##n). */
    const uint32x4_t k0 = vld1q_u32(K256);
    const uint32x4_t k1 = vld1q_u32(K256 + 4);
    const uint32x4_t k2 = vld1q_u32(K256 + 8);
    const uint32x4_t k3 = vld1q_u32(K256 + 12);
    const uint32x4_t k4 = vld1q_u32(K256 + 16);
    const uint32x4_t k5 = vld1q_u32(K256 + 20);
    const uint32x4_t k6 = vld1q_u32(K256 + 24);
    const uint32x4_t k7 = vld1q_u32(K256 + 28);
    const uint32x4_t k8 = vld1q_u32(K256 + 32);
    const uint32x4_t k9 = vld1q_u32(K256 + 36);
    const uint32x4_t k10 = vld1q_u32(K256 + 40);
    const uint32x4_t k11 = vld1q_u32(K256 + 44);
    const uint32x4_t k12 = vld1q_u32(K256 + 48);
    const uint32x4_t k13 = vld1q_u32(K256 + 52);
    const uint32x4_t k14 = vld1q_u32(K256 + 56);
    const uint32x4_t k15 = vld1q_u32(K256 + 60);

    /* h0 = state words a..d, h1 = state words e..h. */
    uint32x4_t h0 = vld1q_u32(ctx->h);
    uint32x4_t h1 = vld1q_u32(ctx->h + 4);

    unsigned char *input = ctx->u.b;

    /* Load the 64-byte message block; vrev32q_u8 byte-swaps each 32-bit
     * lane to convert the big-endian message words to host order. */
    uint32x4_t a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input)));
    uint32x4_t b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
    uint32x4_t c = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
    uint32x4_t d = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));

    /* Working state; ROUND updates w0/w1 in place. */
    uint32x4_t w0 = h0;
    uint32x4_t w1 = h1;

    ROUND(0, a, b, c, d)
    ROUND(1, b, c, d, a)
    ROUND(2, c, d, a, b)
    ROUND(3, d, a, b, c)
    ROUND(4, a, b, c, d)
    ROUND(5, b, c, d, a)
    ROUND(6, c, d, a, b)
    ROUND(7, d, a, b, c)
    ROUND(8, a, b, c, d)
    ROUND(9, b, c, d, a)
    ROUND(10, c, d, a, b)
    ROUND(11, d, a, b, c)
    ROUND(12, a, b, c, d)
    ROUND(13, b, c, d, a)
    ROUND(14, c, d, a, b)
    ROUND(15, d, a, b, c)

    /* Davies-Meyer feed-forward: add the compressed working state back
     * into the chaining value. */
    h0 = vaddq_u32(h0, w0);
    h1 = vaddq_u32(h1, w1);

    vst1q_u32(ctx->h, h0);
    vst1q_u32(ctx->h + 4, h1);
}
111
/*
 * Absorb inputLen bytes of message data into ctx.  Completes any
 * partially filled buffer first, then compresses whole 64-byte blocks
 * straight from `input` with the Crypto Extension intrinsics -- keeping
 * the hash state in NEON registers across blocks instead of bouncing
 * through ctx->u.b -- and finally stashes any trailing partial block in
 * the buffer for the next call.
 */
void
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
                     unsigned int inputLen)
{
    /* Round constants preloaded for the inline block loop below; the
     * ROUND macro references them by name (k0..k15). */
    const uint32x4_t k0 = vld1q_u32(K256);
    const uint32x4_t k1 = vld1q_u32(K256 + 4);
    const uint32x4_t k2 = vld1q_u32(K256 + 8);
    const uint32x4_t k3 = vld1q_u32(K256 + 12);
    const uint32x4_t k4 = vld1q_u32(K256 + 16);
    const uint32x4_t k5 = vld1q_u32(K256 + 20);
    const uint32x4_t k6 = vld1q_u32(K256 + 24);
    const uint32x4_t k7 = vld1q_u32(K256 + 28);
    const uint32x4_t k8 = vld1q_u32(K256 + 32);
    const uint32x4_t k9 = vld1q_u32(K256 + 36);
    const uint32x4_t k10 = vld1q_u32(K256 + 40);
    const uint32x4_t k11 = vld1q_u32(K256 + 44);
    const uint32x4_t k12 = vld1q_u32(K256 + 48);
    const uint32x4_t k13 = vld1q_u32(K256 + 52);
    const uint32x4_t k14 = vld1q_u32(K256 + 56);
    const uint32x4_t k15 = vld1q_u32(K256 + 60);

    /* Bytes already sitting in the 64-byte block buffer. */
    unsigned int inBuf = ctx->sizeLo & 0x3f;
    if (!inputLen) {
        return;
    }

    /* Add inputLen into the count of bytes processed, before processing;
     * carry into sizeHi on 32-bit wraparound. */
    if ((ctx->sizeLo += inputLen) < inputLen) {
        ctx->sizeHi++;
    }

    /* if data already in buffer, attempt to fill rest of buffer */
    if (inBuf) {
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
        if (inputLen < todo) {
            todo = inputLen;
        }
        memcpy(ctx->u.b + inBuf, input, todo);
        input += todo;
        inputLen -= todo;
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
            SHA256_Compress_Native(ctx);
        }
    }

    /* Load the chaining value AFTER the possible compress above, so the
     * register copy (h0 = a..d, h1 = e..h) is current. */
    uint32x4_t h0 = vld1q_u32(ctx->h);
    uint32x4_t h1 = vld1q_u32(ctx->h + 4);

    /* if enough data to fill one or more whole buffers, process them. */
    while (inputLen >= SHA256_BLOCK_LENGTH) {
        /* Big-endian message words -> host order via per-lane byte swap. */
        uint32x4_t a, b, c, d;
        a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input)));
        b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
        c = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
        d = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
        input += SHA256_BLOCK_LENGTH;
        inputLen -= SHA256_BLOCK_LENGTH;

        /* Working state for this block; ROUND updates w0/w1 in place. */
        uint32x4_t w0 = h0;
        uint32x4_t w1 = h1;

        ROUND(0, a, b, c, d)
        ROUND(1, b, c, d, a)
        ROUND(2, c, d, a, b)
        ROUND(3, d, a, b, c)
        ROUND(4, a, b, c, d)
        ROUND(5, b, c, d, a)
        ROUND(6, c, d, a, b)
        ROUND(7, d, a, b, c)
        ROUND(8, a, b, c, d)
        ROUND(9, b, c, d, a)
        ROUND(10, c, d, a, b)
        ROUND(11, d, a, b, c)
        ROUND(12, a, b, c, d)
        ROUND(13, b, c, d, a)
        ROUND(14, c, d, a, b)
        ROUND(15, d, a, b, c)

        /* Davies-Meyer feed-forward into the chaining value. */
        h0 = vaddq_u32(h0, w0);
        h1 = vaddq_u32(h1, w1);
    }

    vst1q_u32(ctx->h, h0);
    vst1q_u32(ctx->h + 4, h1);

    /* if data left over, fill it into buffer */
    if (inputLen) {
        memcpy(ctx->u.b, input, inputLen);
    }
}
202
203 #endif /* USE_HW_SHA2 */
204