/*
   poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication
   and 64 bit addition
*/
5 
6 #if defined(_MSC_VER)
7 # define POLY1305_NOINLINE __declspec(noinline)
8 #elif defined(__clang__) || defined(__GNUC__)
9 # define POLY1305_NOINLINE __attribute__((noinline))
10 #else
11 # define POLY1305_NOINLINE
12 #endif
13 
14 #include "private/common.h"
15 
16 #define poly1305_block_size 16
17 
18 /* 17 + sizeof(unsigned long long) + 14*sizeof(unsigned long) */
19 typedef struct poly1305_state_internal_t {
20     unsigned long      r[5];
21     unsigned long      h[5];
22     unsigned long      pad[4];
23     unsigned long long leftover;
24     unsigned char      buffer[poly1305_block_size];
25     unsigned char      final;
26 } poly1305_state_internal_t;
27 
28 static void
29 poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32])
30 {
31     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff - wiped after finalization */
32     st->r[0] = (LOAD32_LE(&key[0])) & 0x3ffffff;
33     st->r[1] = (LOAD32_LE(&key[3]) >> 2) & 0x3ffff03;
34     st->r[2] = (LOAD32_LE(&key[6]) >> 4) & 0x3ffc0ff;
35     st->r[3] = (LOAD32_LE(&key[9]) >> 6) & 0x3f03fff;
36     st->r[4] = (LOAD32_LE(&key[12]) >> 8) & 0x00fffff;
37 
38     /* h = 0 */
39     st->h[0] = 0;
40     st->h[1] = 0;
41     st->h[2] = 0;
42     st->h[3] = 0;
43     st->h[4] = 0;
44 
45     /* save pad for later */
46     st->pad[0] = LOAD32_LE(&key[16]);
47     st->pad[1] = LOAD32_LE(&key[20]);
48     st->pad[2] = LOAD32_LE(&key[24]);
49     st->pad[3] = LOAD32_LE(&key[28]);
50 
51     st->leftover = 0;
52     st->final    = 0;
53 }
54 
55 static void
56 poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
57                 unsigned long long bytes)
58 {
59     const unsigned long hibit = (st->final) ? 0UL : (1UL << 24); /* 1 << 128 */
60     unsigned long       r0, r1, r2, r3, r4;
61     unsigned long       s1, s2, s3, s4;
62     unsigned long       h0, h1, h2, h3, h4;
63     unsigned long long  d0, d1, d2, d3, d4;
64     unsigned long       c;
65 
66     r0 = st->r[0];
67     r1 = st->r[1];
68     r2 = st->r[2];
69     r3 = st->r[3];
70     r4 = st->r[4];
71 
72     s1 = r1 * 5;
73     s2 = r2 * 5;
74     s3 = r3 * 5;
75     s4 = r4 * 5;
76 
77     h0 = st->h[0];
78     h1 = st->h[1];
79     h2 = st->h[2];
80     h3 = st->h[3];
81     h4 = st->h[4];
82 
83     while (bytes >= poly1305_block_size) {
84         /* h += m[i] */
85         h0 += (LOAD32_LE(m + 0)) & 0x3ffffff;
86         h1 += (LOAD32_LE(m + 3) >> 2) & 0x3ffffff;
87         h2 += (LOAD32_LE(m + 6) >> 4) & 0x3ffffff;
88         h3 += (LOAD32_LE(m + 9) >> 6) & 0x3ffffff;
89         h4 += (LOAD32_LE(m + 12) >> 8) | hibit;
90 
91         /* h *= r */
92         d0 = ((unsigned long long) h0 * r0) + ((unsigned long long) h1 * s4) +
93              ((unsigned long long) h2 * s3) + ((unsigned long long) h3 * s2) +
94              ((unsigned long long) h4 * s1);
95         d1 = ((unsigned long long) h0 * r1) + ((unsigned long long) h1 * r0) +
96              ((unsigned long long) h2 * s4) + ((unsigned long long) h3 * s3) +
97              ((unsigned long long) h4 * s2);
98         d2 = ((unsigned long long) h0 * r2) + ((unsigned long long) h1 * r1) +
99              ((unsigned long long) h2 * r0) + ((unsigned long long) h3 * s4) +
100              ((unsigned long long) h4 * s3);
101         d3 = ((unsigned long long) h0 * r3) + ((unsigned long long) h1 * r2) +
102              ((unsigned long long) h2 * r1) + ((unsigned long long) h3 * r0) +
103              ((unsigned long long) h4 * s4);
104         d4 = ((unsigned long long) h0 * r4) + ((unsigned long long) h1 * r3) +
105              ((unsigned long long) h2 * r2) + ((unsigned long long) h3 * r1) +
106              ((unsigned long long) h4 * r0);
107 
108         /* (partial) h %= p */
109         c  = (unsigned long) (d0 >> 26);
110         h0 = (unsigned long) d0 & 0x3ffffff;
111         d1 += c;
112         c  = (unsigned long) (d1 >> 26);
113         h1 = (unsigned long) d1 & 0x3ffffff;
114         d2 += c;
115         c  = (unsigned long) (d2 >> 26);
116         h2 = (unsigned long) d2 & 0x3ffffff;
117         d3 += c;
118         c  = (unsigned long) (d3 >> 26);
119         h3 = (unsigned long) d3 & 0x3ffffff;
120         d4 += c;
121         c  = (unsigned long) (d4 >> 26);
122         h4 = (unsigned long) d4 & 0x3ffffff;
123         h0 += c * 5;
124         c  = (h0 >> 26);
125         h0 = h0 & 0x3ffffff;
126         h1 += c;
127 
128         m += poly1305_block_size;
129         bytes -= poly1305_block_size;
130     }
131 
132     st->h[0] = h0;
133     st->h[1] = h1;
134     st->h[2] = h2;
135     st->h[3] = h3;
136     st->h[4] = h4;
137 }
138 
139 static POLY1305_NOINLINE void
140 poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
141 {
142     unsigned long      h0, h1, h2, h3, h4, c;
143     unsigned long      g0, g1, g2, g3, g4;
144     unsigned long long f;
145     unsigned long      mask;
146 
147     /* process the remaining block */
148     if (st->leftover) {
149         unsigned long long i = st->leftover;
150 
151         st->buffer[i++] = 1;
152         for (; i < poly1305_block_size; i++) {
153             st->buffer[i] = 0;
154         }
155         st->final = 1;
156         poly1305_blocks(st, st->buffer, poly1305_block_size);
157     }
158 
159     /* fully carry h */
160     h0 = st->h[0];
161     h1 = st->h[1];
162     h2 = st->h[2];
163     h3 = st->h[3];
164     h4 = st->h[4];
165 
166     c  = h1 >> 26;
167     h1 = h1 & 0x3ffffff;
168     h2 += c;
169     c  = h2 >> 26;
170     h2 = h2 & 0x3ffffff;
171     h3 += c;
172     c  = h3 >> 26;
173     h3 = h3 & 0x3ffffff;
174     h4 += c;
175     c  = h4 >> 26;
176     h4 = h4 & 0x3ffffff;
177     h0 += c * 5;
178     c  = h0 >> 26;
179     h0 = h0 & 0x3ffffff;
180     h1 += c;
181 
182     /* compute h + -p */
183     g0 = h0 + 5;
184     c  = g0 >> 26;
185     g0 &= 0x3ffffff;
186     g1 = h1 + c;
187     c  = g1 >> 26;
188     g1 &= 0x3ffffff;
189     g2 = h2 + c;
190     c  = g2 >> 26;
191     g2 &= 0x3ffffff;
192     g3 = h3 + c;
193     c  = g3 >> 26;
194     g3 &= 0x3ffffff;
195     g4 = h4 + c - (1UL << 26);
196 
197     /* select h if h < p, or h + -p if h >= p */
198     mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1;
199     g0 &= mask;
200     g1 &= mask;
201     g2 &= mask;
202     g3 &= mask;
203     g4 &= mask;
204     mask = ~mask;
205 
206     h0 = (h0 & mask) | g0;
207     h1 = (h1 & mask) | g1;
208     h2 = (h2 & mask) | g2;
209     h3 = (h3 & mask) | g3;
210     h4 = (h4 & mask) | g4;
211 
212     /* h = h % (2^128) */
213     h0 = ((h0) | (h1 << 26)) & 0xffffffff;
214     h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
215     h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
216     h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
217 
218     /* mac = (h + pad) % (2^128) */
219     f  = (unsigned long long) h0 + st->pad[0];
220     h0 = (unsigned long) f;
221     f  = (unsigned long long) h1 + st->pad[1] + (f >> 32);
222     h1 = (unsigned long) f;
223     f  = (unsigned long long) h2 + st->pad[2] + (f >> 32);
224     h2 = (unsigned long) f;
225     f  = (unsigned long long) h3 + st->pad[3] + (f >> 32);
226     h3 = (unsigned long) f;
227 
228     STORE32_LE(mac + 0, (uint32_t) h0);
229     STORE32_LE(mac + 4, (uint32_t) h1);
230     STORE32_LE(mac + 8, (uint32_t) h2);
231     STORE32_LE(mac + 12, (uint32_t) h3);
232 
233     /* zero out the state */
234     sodium_memzero((void *) st, sizeof *st);
235 }
236