1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 #ifdef USE_HW_SHA2
6 
7 #ifndef __ARM_FEATURE_CRYPTO
8 #error "Compiler option is invalid"
9 #endif
10 
11 #ifdef FREEBL_NO_DEPEND
12 #include "stubs.h"
13 #endif
14 
15 #include "prcpucfg.h"
16 #include "prtypes.h" /* for PRUintXX */
17 #include "prlong.h"
18 #include "blapi.h"
19 #include "sha256.h"
20 
21 #include <arm_neon.h>
22 
23 /* SHA-256 constants, K256. */
24 static const PRUint32 __attribute__((aligned(16))) K256[64] = {
25     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
26     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
27     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
28     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
29     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
30     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
31     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
32     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
33     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
34     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
35     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
36     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
37     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
38     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
39     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
40     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
41 };
42 
/*
 * ROUND(n, a, b, c, d) - one step of the unrolled hash update.
 *
 * Relies on three names being in scope at the expansion site: the constant
 * vector k<n>, and the working state vectors w0 and w1, which are updated
 * in place. `a` is the message vector consumed by this step; for n < 12 it
 * is then overwritten with the schedule update computed from a, b, c and d
 * so that it can be consumed again four steps later.
 *
 * Expands to a braced block (not do/while), so call sites are written
 * without a trailing semicolon.
 */
#define ROUND(n, a, b, c, d)                                  \
    {                                                         \
        const uint32x4_t wk = vaddq_u32(a, k##n);             \
        const uint32x4_t w0_prev = w0;                        \
        w0 = vsha256hq_u32(w0_prev, w1, wk);                  \
        w1 = vsha256h2q_u32(w1, w0_prev, wk);                 \
        if (n < 12) {                                         \
            a = vsha256su1q_u32(vsha256su0q_u32(a, b), c, d); \
        }                                                     \
    }
54 
/*
 * Compress the block currently buffered in ctx->u.b into the chaining
 * state ctx->h[0..7], using the ARMv8 SHA-256 intrinsics.
 *
 * Reads 64 bytes from ctx->u.b and reads/writes eight 32-bit words of
 * ctx->h; no other field of the context is touched.
 */
void
SHA256_Compress_Native(SHA256Context *ctx)
{
    const unsigned char *block = ctx->u.b;

    /* Load the four 16-byte quarters of the block; vrev32q_u8 reverses the
     * bytes inside every 32-bit lane. */
    uint32x4_t m0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block)));
    uint32x4_t m1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block + 16)));
    uint32x4_t m2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block + 32)));
    uint32x4_t m3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block + 48)));

    /* Constant vectors. The names k0..k15 are mandatory: ROUND() pastes
     * its first argument onto `k` to select one of them. */
    const uint32x4_t k0 = vld1q_u32(&K256[0]);
    const uint32x4_t k1 = vld1q_u32(&K256[4]);
    const uint32x4_t k2 = vld1q_u32(&K256[8]);
    const uint32x4_t k3 = vld1q_u32(&K256[12]);
    const uint32x4_t k4 = vld1q_u32(&K256[16]);
    const uint32x4_t k5 = vld1q_u32(&K256[20]);
    const uint32x4_t k6 = vld1q_u32(&K256[24]);
    const uint32x4_t k7 = vld1q_u32(&K256[28]);
    const uint32x4_t k8 = vld1q_u32(&K256[32]);
    const uint32x4_t k9 = vld1q_u32(&K256[36]);
    const uint32x4_t k10 = vld1q_u32(&K256[40]);
    const uint32x4_t k11 = vld1q_u32(&K256[44]);
    const uint32x4_t k12 = vld1q_u32(&K256[48]);
    const uint32x4_t k13 = vld1q_u32(&K256[52]);
    const uint32x4_t k14 = vld1q_u32(&K256[56]);
    const uint32x4_t k15 = vld1q_u32(&K256[60]);

    /* Chaining state on entry, kept for the final addition. */
    const uint32x4_t state_lo = vld1q_u32(ctx->h);
    const uint32x4_t state_hi = vld1q_u32(ctx->h + 4);

    /* Working copies; ROUND() updates these two by name. */
    uint32x4_t w0 = state_lo;
    uint32x4_t w1 = state_hi;

    /* Sixteen steps, rotating which message vector is consumed. */
    ROUND(0, m0, m1, m2, m3)
    ROUND(1, m1, m2, m3, m0)
    ROUND(2, m2, m3, m0, m1)
    ROUND(3, m3, m0, m1, m2)
    ROUND(4, m0, m1, m2, m3)
    ROUND(5, m1, m2, m3, m0)
    ROUND(6, m2, m3, m0, m1)
    ROUND(7, m3, m0, m1, m2)
    ROUND(8, m0, m1, m2, m3)
    ROUND(9, m1, m2, m3, m0)
    ROUND(10, m2, m3, m0, m1)
    ROUND(11, m3, m0, m1, m2)
    ROUND(12, m0, m1, m2, m3)
    ROUND(13, m1, m2, m3, m0)
    ROUND(14, m2, m3, m0, m1)
    ROUND(15, m3, m0, m1, m2)

    /* New chaining state = state on entry + working state. */
    vst1q_u32(ctx->h, vaddq_u32(state_lo, w0));
    vst1q_u32(ctx->h + 4, vaddq_u32(state_hi, w1));
}
111 
/*
 * Absorb inputLen bytes from input into the hash context, using the ARMv8
 * SHA-256 intrinsics for every complete block.
 *
 * ctx      - hashing context. Updated fields: sizeLo/sizeHi (running byte
 *            count), u.b (buffer for a partial block) and h (chaining
 *            state).
 * input    - bytes to hash; not accessed when inputLen is 0.
 * inputLen - number of bytes available at input.
 *
 * Whole blocks are hashed directly from the caller's memory; only a
 * leading fragment that completes a previously buffered block and a
 * trailing fragment shorter than SHA256_BLOCK_LENGTH go through ctx->u.b.
 */
void
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
                     unsigned int inputLen)
{
    unsigned int inBuf;

    /* Empty input changes nothing; return before doing any work. */
    if (!inputLen) {
        return;
    }

    /* Number of bytes already pending in ctx->u.b from earlier calls. */
    inBuf = ctx->sizeLo & 0x3f;

    /* Add inputLen into the count of bytes processed, before processing.
     * If the low word wraps around, carry into the high word. */
    if ((ctx->sizeLo += inputLen) < inputLen) {
        ctx->sizeHi++;
    }

    /* If data is already in the buffer, attempt to fill the rest of it. */
    if (inBuf) {
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
        if (inputLen < todo) {
            todo = inputLen;
        }
        memcpy(ctx->u.b + inBuf, input, todo);
        input += todo;
        inputLen -= todo;
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
            SHA256_Compress_Native(ctx);
        }
    }

    /* If there is enough data for one or more whole blocks, process them.
     * The constant and state vectors are only set up on this path, so
     * short updates that merely top up the buffer skip that work. */
    if (inputLen >= SHA256_BLOCK_LENGTH) {
        /* The names k0..k15 are mandatory: ROUND() pastes its first
         * argument onto `k` to select one of them. */
        const uint32x4_t k0 = vld1q_u32(K256);
        const uint32x4_t k1 = vld1q_u32(K256 + 4);
        const uint32x4_t k2 = vld1q_u32(K256 + 8);
        const uint32x4_t k3 = vld1q_u32(K256 + 12);
        const uint32x4_t k4 = vld1q_u32(K256 + 16);
        const uint32x4_t k5 = vld1q_u32(K256 + 20);
        const uint32x4_t k6 = vld1q_u32(K256 + 24);
        const uint32x4_t k7 = vld1q_u32(K256 + 28);
        const uint32x4_t k8 = vld1q_u32(K256 + 32);
        const uint32x4_t k9 = vld1q_u32(K256 + 36);
        const uint32x4_t k10 = vld1q_u32(K256 + 40);
        const uint32x4_t k11 = vld1q_u32(K256 + 44);
        const uint32x4_t k12 = vld1q_u32(K256 + 48);
        const uint32x4_t k13 = vld1q_u32(K256 + 52);
        const uint32x4_t k14 = vld1q_u32(K256 + 56);
        const uint32x4_t k15 = vld1q_u32(K256 + 60);

        /* Loaded after the optional SHA256_Compress_Native() call above,
         * so any state change it made is picked up here. The state stays
         * in vectors across all blocks and is stored back once. */
        uint32x4_t h0 = vld1q_u32(ctx->h);
        uint32x4_t h1 = vld1q_u32(ctx->h + 4);

        do {
            uint32x4_t a, b, c, d;

            /* vrev32q_u8 reverses the bytes inside every 32-bit lane. */
            a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input)));
            b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
            c = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
            d = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
            input += SHA256_BLOCK_LENGTH;
            inputLen -= SHA256_BLOCK_LENGTH;

            /* Working copies; ROUND() updates these two by name. */
            uint32x4_t w0 = h0;
            uint32x4_t w1 = h1;

            ROUND(0, a, b, c, d)
            ROUND(1, b, c, d, a)
            ROUND(2, c, d, a, b)
            ROUND(3, d, a, b, c)
            ROUND(4, a, b, c, d)
            ROUND(5, b, c, d, a)
            ROUND(6, c, d, a, b)
            ROUND(7, d, a, b, c)
            ROUND(8, a, b, c, d)
            ROUND(9, b, c, d, a)
            ROUND(10, c, d, a, b)
            ROUND(11, d, a, b, c)
            ROUND(12, a, b, c, d)
            ROUND(13, b, c, d, a)
            ROUND(14, c, d, a, b)
            ROUND(15, d, a, b, c)

            h0 = vaddq_u32(h0, w0);
            h1 = vaddq_u32(h1, w1);
        } while (inputLen >= SHA256_BLOCK_LENGTH);

        vst1q_u32(ctx->h, h0);
        vst1q_u32(ctx->h + 4, h1);
    }

    /* If data is left over, stash it at the start of the buffer. */
    if (inputLen) {
        memcpy(ctx->u.b, input, inputLen);
    }
}
202 
203 #endif /* USE_HW_SHA2 */
204