1 /* 2 poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication 3 and 128 bit addition 4 */ 5 6 #include "private/common.h" 7 8 #define MUL(out, x, y) out = ((uint128_t) x * y) 9 #define ADD(out, in) out += in 10 #define ADDLO(out, in) out += in 11 #define SHR(in, shift) (unsigned long long) (in >> (shift)) 12 #define LO(in) (unsigned long long) (in) 13 14 #if defined(_MSC_VER) 15 # define POLY1305_NOINLINE __declspec(noinline) 16 #elif defined(__clang__) || defined(__GNUC__) 17 # define POLY1305_NOINLINE __attribute__((noinline)) 18 #else 19 # define POLY1305_NOINLINE 20 #endif 21 22 #define poly1305_block_size 16 23 24 /* 17 + sizeof(unsigned long long) + 8*sizeof(unsigned long long) */ 25 typedef struct poly1305_state_internal_t { 26 unsigned long long r[3]; 27 unsigned long long h[3]; 28 unsigned long long pad[2]; 29 unsigned long long leftover; 30 unsigned char buffer[poly1305_block_size]; 31 unsigned char final; 32 } poly1305_state_internal_t; 33 34 static void 35 poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32]) 36 { 37 unsigned long long t0, t1; 38 39 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 40 t0 = LOAD64_LE(&key[0]); 41 t1 = LOAD64_LE(&key[8]); 42 43 /* wiped after finalization */ 44 st->r[0] = (t0) &0xffc0fffffff; 45 st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; 46 st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f; 47 48 /* h = 0 */ 49 st->h[0] = 0; 50 st->h[1] = 0; 51 st->h[2] = 0; 52 53 /* save pad for later */ 54 st->pad[0] = LOAD64_LE(&key[16]); 55 st->pad[1] = LOAD64_LE(&key[24]); 56 57 st->leftover = 0; 58 st->final = 0; 59 } 60 61 static void 62 poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, 63 unsigned long long bytes) 64 { 65 const unsigned long long hibit = 66 (st->final) ? 0ULL : (1ULL << 40); /* 1 << 128 */ 67 unsigned long long r0, r1, r2; 68 unsigned long long s1, s2; 69 unsigned long long h0, h1, h2; 70 unsigned long long c; 71 uint128_t d0, d1, d2, d; 72 73 r0 = st->r[0]; 74 r1 = st->r[1]; 75 r2 = st->r[2]; 76 77 h0 = st->h[0]; 78 h1 = st->h[1]; 79 h2 = st->h[2]; 80 81 s1 = r1 * (5 << 2); 82 s2 = r2 * (5 << 2); 83 84 while (bytes >= poly1305_block_size) { 85 unsigned long long t0, t1; 86 87 /* h += m[i] */ 88 t0 = LOAD64_LE(&m[0]); 89 t1 = LOAD64_LE(&m[8]); 90 91 h0 += ((t0) &0xfffffffffff); 92 h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); 93 h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit; 94 95 /* h *= r */ 96 MUL(d0, h0, r0); 97 MUL(d, h1, s2); 98 ADD(d0, d); 99 MUL(d, h2, s1); 100 ADD(d0, d); 101 MUL(d1, h0, r1); 102 MUL(d, h1, r0); 103 ADD(d1, d); 104 MUL(d, h2, s2); 105 ADD(d1, d); 106 MUL(d2, h0, r2); 107 MUL(d, h1, r1); 108 ADD(d2, d); 109 MUL(d, h2, r0); 110 ADD(d2, d); 111 112 /* (partial) h %= p */ 113 c = SHR(d0, 44); 114 h0 = LO(d0) & 0xfffffffffff; 115 ADDLO(d1, c); 116 c = SHR(d1, 44); 117 h1 = LO(d1) & 0xfffffffffff; 118 ADDLO(d2, c); 119 c = SHR(d2, 42); 120 h2 = LO(d2) & 0x3ffffffffff; 121 h0 += c * 5; 122 c = (h0 >> 44); 123 h0 = h0 & 0xfffffffffff; 124 h1 += c; 125 126 m += poly1305_block_size; 127 bytes -= poly1305_block_size; 128 } 129 130 st->h[0] = h0; 131 st->h[1] = h1; 132 st->h[2] = h2; 133 } 134 135 static POLY1305_NOINLINE void 136 poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16]) 137 { 138 unsigned long long h0, h1, h2, c; 139 unsigned long long g0, g1, g2; 140 unsigned long long t0, t1; 141 142 /* process the remaining block */ 143 if (st->leftover) { 144 unsigned long long i = st->leftover; 145 146 st->buffer[i] = 1; 147 148 for (i = i + 1; i < poly1305_block_size; i++) { 149 st->buffer[i] = 0; 150 } 151 st->final = 1; 152 poly1305_blocks(st, st->buffer, poly1305_block_size); 153 } 154 155 /* fully carry h */ 156 h0 = st->h[0]; 157 h1 = st->h[1]; 158 h2 = st->h[2]; 159 160 c = (h1 >> 44); 161 h1 &= 0xfffffffffff; 162 h2 += c; 163 c = (h2 >> 42); 164 h2 &= 0x3ffffffffff; 165 h0 += c * 5; 166 c = (h0 >> 44); 167 h0 &= 0xfffffffffff; 168 h1 += c; 169 c = (h1 >> 44); 170 h1 &= 0xfffffffffff; 171 h2 += c; 172 c = (h2 >> 42); 173 h2 &= 0x3ffffffffff; 174 h0 += c * 5; 175 c = (h0 >> 44); 176 h0 &= 0xfffffffffff; 177 h1 += c; 178 179 /* compute h + -p */ 180 g0 = h0 + 5; 181 c = (g0 >> 44); 182 g0 &= 0xfffffffffff; 183 g1 = h1 + c; 184 c = (g1 >> 44); 185 g1 &= 0xfffffffffff; 186 g2 = h2 + c - (1ULL << 42); 187 188 /* select h if h < p, or h + -p if h >= p */ 189 c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1; 190 g0 &= c; 191 g1 &= c; 192 g2 &= c; 193 c = ~c; 194 h0 = (h0 & c) | g0; 195 h1 = (h1 & c) | g1; 196 h2 = (h2 & c) | g2; 197 198 /* h = (h + pad) */ 199 t0 = st->pad[0]; 200 t1 = st->pad[1]; 201 202 h0 += ((t0) &0xfffffffffff); 203 c = (h0 >> 44); 204 h0 &= 0xfffffffffff; 205 h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; 206 c = (h1 >> 44); 207 h1 &= 0xfffffffffff; 208 h2 += (((t1 >> 24)) & 0x3ffffffffff) + c; 209 h2 &= 0x3ffffffffff; 210 211 /* mac = h % (2^128) */ 212 h0 = ((h0) | (h1 << 44)); 213 h1 = ((h1 >> 20) | (h2 << 24)); 214 215 STORE64_LE(&mac[0], h0); 216 STORE64_LE(&mac[8], h1); 217 218 /* zero out the state */ 219 sodium_memzero((void *) st, sizeof *st); 220 } 221