10ac341f1SConrad Meyer /*
20ac341f1SConrad Meyer    poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication
30ac341f1SConrad Meyer    and 128 bit addition
40ac341f1SConrad Meyer */
50ac341f1SConrad Meyer 
60ac341f1SConrad Meyer #include "private/common.h"
70ac341f1SConrad Meyer 
/*
 * Helpers for the 128-bit intermediate arithmetic used by this file.
 * uint128_t is not declared here; it is presumably provided by
 * "private/common.h" (confirm against that header).
 *
 * Every argument and every expansion is fully parenthesized so that compound
 * expressions can be passed without operator-precedence surprises, e.g.
 * MUL(out, a + b, c) multiplies (a + b) by c instead of computing a + b * c.
 */

/* out = x * y; x is widened to 128 bits first so the product is not truncated */
#define MUL(out, x, y) ((out) = ((uint128_t) (x) * (y)))
/* out += in */
#define ADD(out, in) ((out) += (in))
/* out += in; same expansion as ADD in this implementation */
#define ADDLO(out, in) ((out) += (in))
/* (in >> shift), truncated to unsigned long long */
#define SHR(in, shift) ((unsigned long long) ((in) >> (shift)))
/* in, truncated to unsigned long long */
#define LO(in) ((unsigned long long) (in))
130ac341f1SConrad Meyer 
/*
 * POLY1305_NOINLINE: function qualifier asking the compiler not to inline.
 * MSVC and GCC/Clang spell it differently; on any other compiler the macro
 * expands to nothing.
 */
#ifdef _MSC_VER
# define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__clang__) || defined(__GNUC__)
# define POLY1305_NOINLINE __attribute__((noinline))
#else
# define POLY1305_NOINLINE
#endif
210ac341f1SConrad Meyer 
/* Number of message bytes absorbed per Poly1305 block. */
#define poly1305_block_size 16

/*
 * Internal Poly1305 state.
 * Size: 17 + sizeof(unsigned long long) + 8*sizeof(unsigned long long)
 * (before any padding the compiler adds).
 */
struct poly1305_state_internal_t {
    /* clamped first key half, split by poly1305_init into three limbs */
    unsigned long long r[3];
    /* accumulator, kept as 44/44/42-bit limbs */
    unsigned long long h[3];
    /* second key half, added to the accumulator by poly1305_finish */
    unsigned long long pad[2];
    /* count of bytes waiting in buffer; poly1305_finish uses it as the index
       of the first free byte */
    unsigned long long leftover;
    /* storage for a partial block */
    unsigned char      buffer[poly1305_block_size];
    /* set to 1 by poly1305_finish before the padded last block is absorbed;
       makes poly1305_blocks skip the 2^128 marker bit */
    unsigned char      final;
};

typedef struct poly1305_state_internal_t poly1305_state_internal_t;
330ac341f1SConrad Meyer 
340ac341f1SConrad Meyer static void
poly1305_init(poly1305_state_internal_t * st,const unsigned char key[32])350ac341f1SConrad Meyer poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32])
360ac341f1SConrad Meyer {
370ac341f1SConrad Meyer     unsigned long long t0, t1;
380ac341f1SConrad Meyer 
390ac341f1SConrad Meyer     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
400ac341f1SConrad Meyer     t0 = LOAD64_LE(&key[0]);
410ac341f1SConrad Meyer     t1 = LOAD64_LE(&key[8]);
420ac341f1SConrad Meyer 
430ac341f1SConrad Meyer     /* wiped after finalization */
440ac341f1SConrad Meyer     st->r[0] = (t0) &0xffc0fffffff;
450ac341f1SConrad Meyer     st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
460ac341f1SConrad Meyer     st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f;
470ac341f1SConrad Meyer 
480ac341f1SConrad Meyer     /* h = 0 */
490ac341f1SConrad Meyer     st->h[0] = 0;
500ac341f1SConrad Meyer     st->h[1] = 0;
510ac341f1SConrad Meyer     st->h[2] = 0;
520ac341f1SConrad Meyer 
530ac341f1SConrad Meyer     /* save pad for later */
540ac341f1SConrad Meyer     st->pad[0] = LOAD64_LE(&key[16]);
550ac341f1SConrad Meyer     st->pad[1] = LOAD64_LE(&key[24]);
560ac341f1SConrad Meyer 
570ac341f1SConrad Meyer     st->leftover = 0;
580ac341f1SConrad Meyer     st->final    = 0;
590ac341f1SConrad Meyer }
600ac341f1SConrad Meyer 
610ac341f1SConrad Meyer static void
poly1305_blocks(poly1305_state_internal_t * st,const unsigned char * m,unsigned long long bytes)620ac341f1SConrad Meyer poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
630ac341f1SConrad Meyer                 unsigned long long bytes)
640ac341f1SConrad Meyer {
650ac341f1SConrad Meyer     const unsigned long long hibit =
660ac341f1SConrad Meyer         (st->final) ? 0ULL : (1ULL << 40); /* 1 << 128 */
670ac341f1SConrad Meyer     unsigned long long r0, r1, r2;
680ac341f1SConrad Meyer     unsigned long long s1, s2;
690ac341f1SConrad Meyer     unsigned long long h0, h1, h2;
700ac341f1SConrad Meyer     unsigned long long c;
710ac341f1SConrad Meyer     uint128_t          d0, d1, d2, d;
720ac341f1SConrad Meyer 
730ac341f1SConrad Meyer     r0 = st->r[0];
740ac341f1SConrad Meyer     r1 = st->r[1];
750ac341f1SConrad Meyer     r2 = st->r[2];
760ac341f1SConrad Meyer 
770ac341f1SConrad Meyer     h0 = st->h[0];
780ac341f1SConrad Meyer     h1 = st->h[1];
790ac341f1SConrad Meyer     h2 = st->h[2];
800ac341f1SConrad Meyer 
810ac341f1SConrad Meyer     s1 = r1 * (5 << 2);
820ac341f1SConrad Meyer     s2 = r2 * (5 << 2);
830ac341f1SConrad Meyer 
840ac341f1SConrad Meyer     while (bytes >= poly1305_block_size) {
850ac341f1SConrad Meyer         unsigned long long t0, t1;
860ac341f1SConrad Meyer 
870ac341f1SConrad Meyer         /* h += m[i] */
880ac341f1SConrad Meyer         t0 = LOAD64_LE(&m[0]);
890ac341f1SConrad Meyer         t1 = LOAD64_LE(&m[8]);
900ac341f1SConrad Meyer 
910ac341f1SConrad Meyer         h0 += ((t0) &0xfffffffffff);
920ac341f1SConrad Meyer         h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
930ac341f1SConrad Meyer         h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
940ac341f1SConrad Meyer 
950ac341f1SConrad Meyer         /* h *= r */
960ac341f1SConrad Meyer         MUL(d0, h0, r0);
970ac341f1SConrad Meyer         MUL(d, h1, s2);
980ac341f1SConrad Meyer         ADD(d0, d);
990ac341f1SConrad Meyer         MUL(d, h2, s1);
1000ac341f1SConrad Meyer         ADD(d0, d);
1010ac341f1SConrad Meyer         MUL(d1, h0, r1);
1020ac341f1SConrad Meyer         MUL(d, h1, r0);
1030ac341f1SConrad Meyer         ADD(d1, d);
1040ac341f1SConrad Meyer         MUL(d, h2, s2);
1050ac341f1SConrad Meyer         ADD(d1, d);
1060ac341f1SConrad Meyer         MUL(d2, h0, r2);
1070ac341f1SConrad Meyer         MUL(d, h1, r1);
1080ac341f1SConrad Meyer         ADD(d2, d);
1090ac341f1SConrad Meyer         MUL(d, h2, r0);
1100ac341f1SConrad Meyer         ADD(d2, d);
1110ac341f1SConrad Meyer 
1120ac341f1SConrad Meyer         /* (partial) h %= p */
1130ac341f1SConrad Meyer         c  = SHR(d0, 44);
1140ac341f1SConrad Meyer         h0 = LO(d0) & 0xfffffffffff;
1150ac341f1SConrad Meyer         ADDLO(d1, c);
1160ac341f1SConrad Meyer         c  = SHR(d1, 44);
1170ac341f1SConrad Meyer         h1 = LO(d1) & 0xfffffffffff;
1180ac341f1SConrad Meyer         ADDLO(d2, c);
1190ac341f1SConrad Meyer         c  = SHR(d2, 42);
1200ac341f1SConrad Meyer         h2 = LO(d2) & 0x3ffffffffff;
1210ac341f1SConrad Meyer         h0 += c * 5;
1220ac341f1SConrad Meyer         c  = (h0 >> 44);
1230ac341f1SConrad Meyer         h0 = h0 & 0xfffffffffff;
1240ac341f1SConrad Meyer         h1 += c;
1250ac341f1SConrad Meyer 
1260ac341f1SConrad Meyer         m += poly1305_block_size;
1270ac341f1SConrad Meyer         bytes -= poly1305_block_size;
1280ac341f1SConrad Meyer     }
1290ac341f1SConrad Meyer 
1300ac341f1SConrad Meyer     st->h[0] = h0;
1310ac341f1SConrad Meyer     st->h[1] = h1;
1320ac341f1SConrad Meyer     st->h[2] = h2;
1330ac341f1SConrad Meyer }
1340ac341f1SConrad Meyer 
1350ac341f1SConrad Meyer static POLY1305_NOINLINE void
poly1305_finish(poly1305_state_internal_t * st,unsigned char mac[16])1360ac341f1SConrad Meyer poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
1370ac341f1SConrad Meyer {
1380ac341f1SConrad Meyer     unsigned long long h0, h1, h2, c;
1390ac341f1SConrad Meyer     unsigned long long g0, g1, g2;
1400ac341f1SConrad Meyer     unsigned long long t0, t1;
1410ac341f1SConrad Meyer 
1420ac341f1SConrad Meyer     /* process the remaining block */
1430ac341f1SConrad Meyer     if (st->leftover) {
1440ac341f1SConrad Meyer         unsigned long long i = st->leftover;
1450ac341f1SConrad Meyer 
1460ac341f1SConrad Meyer         st->buffer[i] = 1;
1470ac341f1SConrad Meyer 
1480ac341f1SConrad Meyer         for (i = i + 1; i < poly1305_block_size; i++) {
1490ac341f1SConrad Meyer             st->buffer[i] = 0;
1500ac341f1SConrad Meyer         }
1510ac341f1SConrad Meyer         st->final = 1;
1520ac341f1SConrad Meyer         poly1305_blocks(st, st->buffer, poly1305_block_size);
1530ac341f1SConrad Meyer     }
1540ac341f1SConrad Meyer 
1550ac341f1SConrad Meyer     /* fully carry h */
1560ac341f1SConrad Meyer     h0 = st->h[0];
1570ac341f1SConrad Meyer     h1 = st->h[1];
1580ac341f1SConrad Meyer     h2 = st->h[2];
1590ac341f1SConrad Meyer 
1600ac341f1SConrad Meyer     c = (h1 >> 44);
1610ac341f1SConrad Meyer     h1 &= 0xfffffffffff;
1620ac341f1SConrad Meyer     h2 += c;
1630ac341f1SConrad Meyer     c = (h2 >> 42);
1640ac341f1SConrad Meyer     h2 &= 0x3ffffffffff;
1650ac341f1SConrad Meyer     h0 += c * 5;
1660ac341f1SConrad Meyer     c = (h0 >> 44);
1670ac341f1SConrad Meyer     h0 &= 0xfffffffffff;
1680ac341f1SConrad Meyer     h1 += c;
1690ac341f1SConrad Meyer     c = (h1 >> 44);
1700ac341f1SConrad Meyer     h1 &= 0xfffffffffff;
1710ac341f1SConrad Meyer     h2 += c;
1720ac341f1SConrad Meyer     c = (h2 >> 42);
1730ac341f1SConrad Meyer     h2 &= 0x3ffffffffff;
1740ac341f1SConrad Meyer     h0 += c * 5;
1750ac341f1SConrad Meyer     c = (h0 >> 44);
1760ac341f1SConrad Meyer     h0 &= 0xfffffffffff;
1770ac341f1SConrad Meyer     h1 += c;
1780ac341f1SConrad Meyer 
1790ac341f1SConrad Meyer     /* compute h + -p */
1800ac341f1SConrad Meyer     g0 = h0 + 5;
1810ac341f1SConrad Meyer     c  = (g0 >> 44);
1820ac341f1SConrad Meyer     g0 &= 0xfffffffffff;
1830ac341f1SConrad Meyer     g1 = h1 + c;
1840ac341f1SConrad Meyer     c  = (g1 >> 44);
1850ac341f1SConrad Meyer     g1 &= 0xfffffffffff;
1860ac341f1SConrad Meyer     g2 = h2 + c - (1ULL << 42);
1870ac341f1SConrad Meyer 
1880ac341f1SConrad Meyer     /* select h if h < p, or h + -p if h >= p */
1890ac341f1SConrad Meyer     c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
1900ac341f1SConrad Meyer     g0 &= c;
1910ac341f1SConrad Meyer     g1 &= c;
1920ac341f1SConrad Meyer     g2 &= c;
1930ac341f1SConrad Meyer     c  = ~c;
1940ac341f1SConrad Meyer     h0 = (h0 & c) | g0;
1950ac341f1SConrad Meyer     h1 = (h1 & c) | g1;
1960ac341f1SConrad Meyer     h2 = (h2 & c) | g2;
1970ac341f1SConrad Meyer 
1980ac341f1SConrad Meyer     /* h = (h + pad) */
1990ac341f1SConrad Meyer     t0 = st->pad[0];
2000ac341f1SConrad Meyer     t1 = st->pad[1];
2010ac341f1SConrad Meyer 
2020ac341f1SConrad Meyer     h0 += ((t0) &0xfffffffffff);
2030ac341f1SConrad Meyer     c = (h0 >> 44);
2040ac341f1SConrad Meyer     h0 &= 0xfffffffffff;
2050ac341f1SConrad Meyer     h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
2060ac341f1SConrad Meyer     c = (h1 >> 44);
2070ac341f1SConrad Meyer     h1 &= 0xfffffffffff;
2080ac341f1SConrad Meyer     h2 += (((t1 >> 24)) & 0x3ffffffffff) + c;
2090ac341f1SConrad Meyer     h2 &= 0x3ffffffffff;
2100ac341f1SConrad Meyer 
2110ac341f1SConrad Meyer     /* mac = h % (2^128) */
2120ac341f1SConrad Meyer     h0 = ((h0) | (h1 << 44));
2130ac341f1SConrad Meyer     h1 = ((h1 >> 20) | (h2 << 24));
2140ac341f1SConrad Meyer 
2150ac341f1SConrad Meyer     STORE64_LE(&mac[0], h0);
2160ac341f1SConrad Meyer     STORE64_LE(&mac[8], h1);
2170ac341f1SConrad Meyer 
2180ac341f1SConrad Meyer     /* zero out the state */
2190ac341f1SConrad Meyer     sodium_memzero((void *) st, sizeof *st);
2200ac341f1SConrad Meyer }
221