/*
   poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication
   and 128 bit addition
*/
50ac341f1SConrad Meyer
60ac341f1SConrad Meyer #include "private/common.h"
70ac341f1SConrad Meyer
/*
 * 128-bit arithmetic helpers used by the block function.
 *
 * uint128_t is not defined here; it is expected to come from
 * "private/common.h" (NOTE(review): confirm).
 *
 * Every argument and every expansion is parenthesized so the macros stay
 * correct when handed a compound expression or used inside a larger one.
 *
 *   MUL(out, x, y)  out = x * y, computed as a full 128-bit product
 *   ADD(out, in)    out += in
 *   ADDLO(out, in)  out += in (used to add a 64-bit carry)
 *   SHR(in, shift)  (in >> shift), truncated to unsigned long long
 *   LO(in)          in, truncated to unsigned long long
 */
#define MUL(out, x, y) ((out) = ((uint128_t) (x) * (y)))
#define ADD(out, in) ((out) += (in))
#define ADDLO(out, in) ((out) += (in))
#define SHR(in, shift) ((unsigned long long) ((in) >> (shift)))
#define LO(in) ((unsigned long long) (in))

/*
 * POLY1305_NOINLINE: function specifier asking the compiler not to inline
 * the function it is attached to. Expands to nothing on compilers that are
 * neither MSVC nor clang/GCC.
 */
#if defined(_MSC_VER)
# define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__clang__) || defined(__GNUC__)
# define POLY1305_NOINLINE __attribute__((noinline))
#else
# define POLY1305_NOINLINE
#endif
210ac341f1SConrad Meyer
/* Number of message bytes consumed per Poly1305 block. */
#define poly1305_block_size 16

/*
 * Internal Poly1305 state.
 *
 * Size before any compiler padding:
 * 17 + sizeof(unsigned long long) + 8*sizeof(unsigned long long)
 * (17 = buffer + final, one word = leftover, eight words = r + h + pad).
 */
typedef struct poly1305_state_internal_t {
    /* multiplier r, split by poly1305_init into limbs at bit 0, 44 and 88 */
    unsigned long long r[3];
    /* accumulator h, limbs of 44, 44 and 42 bits (130 bits total) */
    unsigned long long h[3];
    /* key bytes 16..31 as two 64-bit words, added to h by poly1305_finish */
    unsigned long long pad[2];
    /*
     * Index of the first unused byte in buffer; poly1305_finish treats a
     * nonzero value as "a partial block is pending". Presumably maintained
     * by an update routine outside this view.
     */
    unsigned long long leftover;
    /* storage for a partial block */
    unsigned char      buffer[poly1305_block_size];
    /* nonzero: poly1305_blocks must not add the implicit 2^128 bit */
    unsigned char      final;
} poly1305_state_internal_t;
330ac341f1SConrad Meyer
340ac341f1SConrad Meyer static void
poly1305_init(poly1305_state_internal_t * st,const unsigned char key[32])350ac341f1SConrad Meyer poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32])
360ac341f1SConrad Meyer {
370ac341f1SConrad Meyer unsigned long long t0, t1;
380ac341f1SConrad Meyer
390ac341f1SConrad Meyer /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
400ac341f1SConrad Meyer t0 = LOAD64_LE(&key[0]);
410ac341f1SConrad Meyer t1 = LOAD64_LE(&key[8]);
420ac341f1SConrad Meyer
430ac341f1SConrad Meyer /* wiped after finalization */
440ac341f1SConrad Meyer st->r[0] = (t0) &0xffc0fffffff;
450ac341f1SConrad Meyer st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
460ac341f1SConrad Meyer st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f;
470ac341f1SConrad Meyer
480ac341f1SConrad Meyer /* h = 0 */
490ac341f1SConrad Meyer st->h[0] = 0;
500ac341f1SConrad Meyer st->h[1] = 0;
510ac341f1SConrad Meyer st->h[2] = 0;
520ac341f1SConrad Meyer
530ac341f1SConrad Meyer /* save pad for later */
540ac341f1SConrad Meyer st->pad[0] = LOAD64_LE(&key[16]);
550ac341f1SConrad Meyer st->pad[1] = LOAD64_LE(&key[24]);
560ac341f1SConrad Meyer
570ac341f1SConrad Meyer st->leftover = 0;
580ac341f1SConrad Meyer st->final = 0;
590ac341f1SConrad Meyer }
600ac341f1SConrad Meyer
610ac341f1SConrad Meyer static void
poly1305_blocks(poly1305_state_internal_t * st,const unsigned char * m,unsigned long long bytes)620ac341f1SConrad Meyer poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
630ac341f1SConrad Meyer unsigned long long bytes)
640ac341f1SConrad Meyer {
650ac341f1SConrad Meyer const unsigned long long hibit =
660ac341f1SConrad Meyer (st->final) ? 0ULL : (1ULL << 40); /* 1 << 128 */
670ac341f1SConrad Meyer unsigned long long r0, r1, r2;
680ac341f1SConrad Meyer unsigned long long s1, s2;
690ac341f1SConrad Meyer unsigned long long h0, h1, h2;
700ac341f1SConrad Meyer unsigned long long c;
710ac341f1SConrad Meyer uint128_t d0, d1, d2, d;
720ac341f1SConrad Meyer
730ac341f1SConrad Meyer r0 = st->r[0];
740ac341f1SConrad Meyer r1 = st->r[1];
750ac341f1SConrad Meyer r2 = st->r[2];
760ac341f1SConrad Meyer
770ac341f1SConrad Meyer h0 = st->h[0];
780ac341f1SConrad Meyer h1 = st->h[1];
790ac341f1SConrad Meyer h2 = st->h[2];
800ac341f1SConrad Meyer
810ac341f1SConrad Meyer s1 = r1 * (5 << 2);
820ac341f1SConrad Meyer s2 = r2 * (5 << 2);
830ac341f1SConrad Meyer
840ac341f1SConrad Meyer while (bytes >= poly1305_block_size) {
850ac341f1SConrad Meyer unsigned long long t0, t1;
860ac341f1SConrad Meyer
870ac341f1SConrad Meyer /* h += m[i] */
880ac341f1SConrad Meyer t0 = LOAD64_LE(&m[0]);
890ac341f1SConrad Meyer t1 = LOAD64_LE(&m[8]);
900ac341f1SConrad Meyer
910ac341f1SConrad Meyer h0 += ((t0) &0xfffffffffff);
920ac341f1SConrad Meyer h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
930ac341f1SConrad Meyer h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
940ac341f1SConrad Meyer
950ac341f1SConrad Meyer /* h *= r */
960ac341f1SConrad Meyer MUL(d0, h0, r0);
970ac341f1SConrad Meyer MUL(d, h1, s2);
980ac341f1SConrad Meyer ADD(d0, d);
990ac341f1SConrad Meyer MUL(d, h2, s1);
1000ac341f1SConrad Meyer ADD(d0, d);
1010ac341f1SConrad Meyer MUL(d1, h0, r1);
1020ac341f1SConrad Meyer MUL(d, h1, r0);
1030ac341f1SConrad Meyer ADD(d1, d);
1040ac341f1SConrad Meyer MUL(d, h2, s2);
1050ac341f1SConrad Meyer ADD(d1, d);
1060ac341f1SConrad Meyer MUL(d2, h0, r2);
1070ac341f1SConrad Meyer MUL(d, h1, r1);
1080ac341f1SConrad Meyer ADD(d2, d);
1090ac341f1SConrad Meyer MUL(d, h2, r0);
1100ac341f1SConrad Meyer ADD(d2, d);
1110ac341f1SConrad Meyer
1120ac341f1SConrad Meyer /* (partial) h %= p */
1130ac341f1SConrad Meyer c = SHR(d0, 44);
1140ac341f1SConrad Meyer h0 = LO(d0) & 0xfffffffffff;
1150ac341f1SConrad Meyer ADDLO(d1, c);
1160ac341f1SConrad Meyer c = SHR(d1, 44);
1170ac341f1SConrad Meyer h1 = LO(d1) & 0xfffffffffff;
1180ac341f1SConrad Meyer ADDLO(d2, c);
1190ac341f1SConrad Meyer c = SHR(d2, 42);
1200ac341f1SConrad Meyer h2 = LO(d2) & 0x3ffffffffff;
1210ac341f1SConrad Meyer h0 += c * 5;
1220ac341f1SConrad Meyer c = (h0 >> 44);
1230ac341f1SConrad Meyer h0 = h0 & 0xfffffffffff;
1240ac341f1SConrad Meyer h1 += c;
1250ac341f1SConrad Meyer
1260ac341f1SConrad Meyer m += poly1305_block_size;
1270ac341f1SConrad Meyer bytes -= poly1305_block_size;
1280ac341f1SConrad Meyer }
1290ac341f1SConrad Meyer
1300ac341f1SConrad Meyer st->h[0] = h0;
1310ac341f1SConrad Meyer st->h[1] = h1;
1320ac341f1SConrad Meyer st->h[2] = h2;
1330ac341f1SConrad Meyer }
1340ac341f1SConrad Meyer
1350ac341f1SConrad Meyer static POLY1305_NOINLINE void
poly1305_finish(poly1305_state_internal_t * st,unsigned char mac[16])1360ac341f1SConrad Meyer poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
1370ac341f1SConrad Meyer {
1380ac341f1SConrad Meyer unsigned long long h0, h1, h2, c;
1390ac341f1SConrad Meyer unsigned long long g0, g1, g2;
1400ac341f1SConrad Meyer unsigned long long t0, t1;
1410ac341f1SConrad Meyer
1420ac341f1SConrad Meyer /* process the remaining block */
1430ac341f1SConrad Meyer if (st->leftover) {
1440ac341f1SConrad Meyer unsigned long long i = st->leftover;
1450ac341f1SConrad Meyer
1460ac341f1SConrad Meyer st->buffer[i] = 1;
1470ac341f1SConrad Meyer
1480ac341f1SConrad Meyer for (i = i + 1; i < poly1305_block_size; i++) {
1490ac341f1SConrad Meyer st->buffer[i] = 0;
1500ac341f1SConrad Meyer }
1510ac341f1SConrad Meyer st->final = 1;
1520ac341f1SConrad Meyer poly1305_blocks(st, st->buffer, poly1305_block_size);
1530ac341f1SConrad Meyer }
1540ac341f1SConrad Meyer
1550ac341f1SConrad Meyer /* fully carry h */
1560ac341f1SConrad Meyer h0 = st->h[0];
1570ac341f1SConrad Meyer h1 = st->h[1];
1580ac341f1SConrad Meyer h2 = st->h[2];
1590ac341f1SConrad Meyer
1600ac341f1SConrad Meyer c = (h1 >> 44);
1610ac341f1SConrad Meyer h1 &= 0xfffffffffff;
1620ac341f1SConrad Meyer h2 += c;
1630ac341f1SConrad Meyer c = (h2 >> 42);
1640ac341f1SConrad Meyer h2 &= 0x3ffffffffff;
1650ac341f1SConrad Meyer h0 += c * 5;
1660ac341f1SConrad Meyer c = (h0 >> 44);
1670ac341f1SConrad Meyer h0 &= 0xfffffffffff;
1680ac341f1SConrad Meyer h1 += c;
1690ac341f1SConrad Meyer c = (h1 >> 44);
1700ac341f1SConrad Meyer h1 &= 0xfffffffffff;
1710ac341f1SConrad Meyer h2 += c;
1720ac341f1SConrad Meyer c = (h2 >> 42);
1730ac341f1SConrad Meyer h2 &= 0x3ffffffffff;
1740ac341f1SConrad Meyer h0 += c * 5;
1750ac341f1SConrad Meyer c = (h0 >> 44);
1760ac341f1SConrad Meyer h0 &= 0xfffffffffff;
1770ac341f1SConrad Meyer h1 += c;
1780ac341f1SConrad Meyer
1790ac341f1SConrad Meyer /* compute h + -p */
1800ac341f1SConrad Meyer g0 = h0 + 5;
1810ac341f1SConrad Meyer c = (g0 >> 44);
1820ac341f1SConrad Meyer g0 &= 0xfffffffffff;
1830ac341f1SConrad Meyer g1 = h1 + c;
1840ac341f1SConrad Meyer c = (g1 >> 44);
1850ac341f1SConrad Meyer g1 &= 0xfffffffffff;
1860ac341f1SConrad Meyer g2 = h2 + c - (1ULL << 42);
1870ac341f1SConrad Meyer
1880ac341f1SConrad Meyer /* select h if h < p, or h + -p if h >= p */
1890ac341f1SConrad Meyer c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
1900ac341f1SConrad Meyer g0 &= c;
1910ac341f1SConrad Meyer g1 &= c;
1920ac341f1SConrad Meyer g2 &= c;
1930ac341f1SConrad Meyer c = ~c;
1940ac341f1SConrad Meyer h0 = (h0 & c) | g0;
1950ac341f1SConrad Meyer h1 = (h1 & c) | g1;
1960ac341f1SConrad Meyer h2 = (h2 & c) | g2;
1970ac341f1SConrad Meyer
1980ac341f1SConrad Meyer /* h = (h + pad) */
1990ac341f1SConrad Meyer t0 = st->pad[0];
2000ac341f1SConrad Meyer t1 = st->pad[1];
2010ac341f1SConrad Meyer
2020ac341f1SConrad Meyer h0 += ((t0) &0xfffffffffff);
2030ac341f1SConrad Meyer c = (h0 >> 44);
2040ac341f1SConrad Meyer h0 &= 0xfffffffffff;
2050ac341f1SConrad Meyer h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
2060ac341f1SConrad Meyer c = (h1 >> 44);
2070ac341f1SConrad Meyer h1 &= 0xfffffffffff;
2080ac341f1SConrad Meyer h2 += (((t1 >> 24)) & 0x3ffffffffff) + c;
2090ac341f1SConrad Meyer h2 &= 0x3ffffffffff;
2100ac341f1SConrad Meyer
2110ac341f1SConrad Meyer /* mac = h % (2^128) */
2120ac341f1SConrad Meyer h0 = ((h0) | (h1 << 44));
2130ac341f1SConrad Meyer h1 = ((h1 >> 20) | (h2 << 24));
2140ac341f1SConrad Meyer
2150ac341f1SConrad Meyer STORE64_LE(&mac[0], h0);
2160ac341f1SConrad Meyer STORE64_LE(&mac[8], h1);
2170ac341f1SConrad Meyer
2180ac341f1SConrad Meyer /* zero out the state */
2190ac341f1SConrad Meyer sodium_memzero((void *) st, sizeof *st);
2200ac341f1SConrad Meyer }
221