1 /*
2    poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication
3    and 128 bit addition
4 */
5 
6 #include "private/common.h"
7 
/*
 * 128-bit arithmetic helpers. They rely on a `uint128_t` type being in scope
 * at the point of use.
 *
 *   MUL(out, x, y)   out = x * y, computed as a 128-bit product
 *   ADD(out, in)     out += in
 *   ADDLO(out, in)   out += in (used here to add a 64-bit carry)
 *   SHR(in, shift)   (in >> shift), truncated to unsigned long long
 *   LO(in)           in, truncated to unsigned long long
 *
 * Every parameter and every expansion is parenthesized so that compound
 * arguments such as `a + b` or `x | y` are evaluated as a unit.
 */
#define MUL(out, x, y) ((out) = ((uint128_t) (x) * (y)))
#define ADD(out, in) ((out) += (in))
#define ADDLO(out, in) ((out) += (in))
#define SHR(in, shift) ((unsigned long long) ((in) >> (shift)))
#define LO(in) ((unsigned long long) (in))
13 
/*
 * POLY1305_NOINLINE: function attribute asking the compiler not to inline
 * the function it is applied to. MSVC gets __declspec(noinline), clang and
 * GCC get __attribute__((noinline)); on any other compiler the macro expands
 * to nothing. The MSVC test comes first, so it wins when several of these
 * compiler macros are defined at once.
 */
#if defined(_MSC_VER)
# define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__clang__) || defined(__GNUC__)
# define POLY1305_NOINLINE __attribute__((noinline))
#else
# define POLY1305_NOINLINE
#endif
21 
/* Size, in bytes, of one message block and of the state's staging buffer. */
#define poly1305_block_size 16
23 
/*
 * Internal Poly1305 state.
 *
 * Size of the members, padding excluded:
 * 17 + sizeof(unsigned long long) + 8*sizeof(unsigned long long)
 * (16-byte buffer + 1-byte flag, `leftover`, and the 3 + 3 + 2 limb words).
 */
typedef struct poly1305_state_internal_t {
    /* clamped multiplier r from key[0..15], split into three limbs */
    unsigned long long r[3];
    /* accumulator h, as 44-, 44- and 42-bit limbs */
    unsigned long long h[3];
    /* key[16..31] as two little-endian 64-bit words, added to h at the end */
    unsigned long long pad[2];
    /* index in `buffer` where poly1305_finish() appends the 1 byte;
       presumably the number of buffered bytes - the code that fills
       `buffer` is not part of this section */
    unsigned long long leftover;
    /* staging area for a partial final block */
    unsigned char      buffer[poly1305_block_size];
    /* when non-zero, poly1305_blocks() does not add the 2^128 bit */
    unsigned char      final;
} poly1305_state_internal_t;
33 
34 static void
35 poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32])
36 {
37     unsigned long long t0, t1;
38 
39     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
40     t0 = LOAD64_LE(&key[0]);
41     t1 = LOAD64_LE(&key[8]);
42 
43     /* wiped after finalization */
44     st->r[0] = (t0) &0xffc0fffffff;
45     st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
46     st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f;
47 
48     /* h = 0 */
49     st->h[0] = 0;
50     st->h[1] = 0;
51     st->h[2] = 0;
52 
53     /* save pad for later */
54     st->pad[0] = LOAD64_LE(&key[16]);
55     st->pad[1] = LOAD64_LE(&key[24]);
56 
57     st->leftover = 0;
58     st->final    = 0;
59 }
60 
61 static void
62 poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
63                 unsigned long long bytes)
64 {
65     const unsigned long long hibit =
66         (st->final) ? 0ULL : (1ULL << 40); /* 1 << 128 */
67     unsigned long long r0, r1, r2;
68     unsigned long long s1, s2;
69     unsigned long long h0, h1, h2;
70     unsigned long long c;
71     uint128_t          d0, d1, d2, d;
72 
73     r0 = st->r[0];
74     r1 = st->r[1];
75     r2 = st->r[2];
76 
77     h0 = st->h[0];
78     h1 = st->h[1];
79     h2 = st->h[2];
80 
81     s1 = r1 * (5 << 2);
82     s2 = r2 * (5 << 2);
83 
84     while (bytes >= poly1305_block_size) {
85         unsigned long long t0, t1;
86 
87         /* h += m[i] */
88         t0 = LOAD64_LE(&m[0]);
89         t1 = LOAD64_LE(&m[8]);
90 
91         h0 += ((t0) &0xfffffffffff);
92         h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
93         h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
94 
95         /* h *= r */
96         MUL(d0, h0, r0);
97         MUL(d, h1, s2);
98         ADD(d0, d);
99         MUL(d, h2, s1);
100         ADD(d0, d);
101         MUL(d1, h0, r1);
102         MUL(d, h1, r0);
103         ADD(d1, d);
104         MUL(d, h2, s2);
105         ADD(d1, d);
106         MUL(d2, h0, r2);
107         MUL(d, h1, r1);
108         ADD(d2, d);
109         MUL(d, h2, r0);
110         ADD(d2, d);
111 
112         /* (partial) h %= p */
113         c  = SHR(d0, 44);
114         h0 = LO(d0) & 0xfffffffffff;
115         ADDLO(d1, c);
116         c  = SHR(d1, 44);
117         h1 = LO(d1) & 0xfffffffffff;
118         ADDLO(d2, c);
119         c  = SHR(d2, 42);
120         h2 = LO(d2) & 0x3ffffffffff;
121         h0 += c * 5;
122         c  = (h0 >> 44);
123         h0 = h0 & 0xfffffffffff;
124         h1 += c;
125 
126         m += poly1305_block_size;
127         bytes -= poly1305_block_size;
128     }
129 
130     st->h[0] = h0;
131     st->h[1] = h1;
132     st->h[2] = h2;
133 }
134 
135 static POLY1305_NOINLINE void
136 poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
137 {
138     unsigned long long h0, h1, h2, c;
139     unsigned long long g0, g1, g2;
140     unsigned long long t0, t1;
141 
142     /* process the remaining block */
143     if (st->leftover) {
144         unsigned long long i = st->leftover;
145 
146         st->buffer[i] = 1;
147 
148         for (i = i + 1; i < poly1305_block_size; i++) {
149             st->buffer[i] = 0;
150         }
151         st->final = 1;
152         poly1305_blocks(st, st->buffer, poly1305_block_size);
153     }
154 
155     /* fully carry h */
156     h0 = st->h[0];
157     h1 = st->h[1];
158     h2 = st->h[2];
159 
160     c = (h1 >> 44);
161     h1 &= 0xfffffffffff;
162     h2 += c;
163     c = (h2 >> 42);
164     h2 &= 0x3ffffffffff;
165     h0 += c * 5;
166     c = (h0 >> 44);
167     h0 &= 0xfffffffffff;
168     h1 += c;
169     c = (h1 >> 44);
170     h1 &= 0xfffffffffff;
171     h2 += c;
172     c = (h2 >> 42);
173     h2 &= 0x3ffffffffff;
174     h0 += c * 5;
175     c = (h0 >> 44);
176     h0 &= 0xfffffffffff;
177     h1 += c;
178 
179     /* compute h + -p */
180     g0 = h0 + 5;
181     c  = (g0 >> 44);
182     g0 &= 0xfffffffffff;
183     g1 = h1 + c;
184     c  = (g1 >> 44);
185     g1 &= 0xfffffffffff;
186     g2 = h2 + c - (1ULL << 42);
187 
188     /* select h if h < p, or h + -p if h >= p */
189     c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
190     g0 &= c;
191     g1 &= c;
192     g2 &= c;
193     c  = ~c;
194     h0 = (h0 & c) | g0;
195     h1 = (h1 & c) | g1;
196     h2 = (h2 & c) | g2;
197 
198     /* h = (h + pad) */
199     t0 = st->pad[0];
200     t1 = st->pad[1];
201 
202     h0 += ((t0) &0xfffffffffff);
203     c = (h0 >> 44);
204     h0 &= 0xfffffffffff;
205     h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
206     c = (h1 >> 44);
207     h1 &= 0xfffffffffff;
208     h2 += (((t1 >> 24)) & 0x3ffffffffff) + c;
209     h2 &= 0x3ffffffffff;
210 
211     /* mac = h % (2^128) */
212     h0 = ((h0) | (h1 << 44));
213     h1 = ((h1 >> 20) | (h2 << 24));
214 
215     STORE64_LE(&mac[0], h0);
216     STORE64_LE(&mac[8], h1);
217 
218     /* zero out the state */
219     sodium_memzero((void *) st, sizeof *st);
220 }
221