1 /* 2 poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication 3 and 64 bit addition 4 */ 5 6 #if defined(_MSC_VER) 7 # define POLY1305_NOINLINE __declspec(noinline) 8 #elif defined(__clang__) || defined(__GNUC__) 9 # define POLY1305_NOINLINE __attribute__((noinline)) 10 #else 11 # define POLY1305_NOINLINE 12 #endif 13 14 #include "private/common.h" 15 16 #define poly1305_block_size 16 17 18 /* 17 + sizeof(unsigned long long) + 14*sizeof(unsigned long) */ 19 typedef struct poly1305_state_internal_t { 20 unsigned long r[5]; 21 unsigned long h[5]; 22 unsigned long pad[4]; 23 unsigned long long leftover; 24 unsigned char buffer[poly1305_block_size]; 25 unsigned char final; 26 } poly1305_state_internal_t; 27 28 static void 29 poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32]) 30 { 31 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff - wiped after finalization */ 32 st->r[0] = (LOAD32_LE(&key[0])) & 0x3ffffff; 33 st->r[1] = (LOAD32_LE(&key[3]) >> 2) & 0x3ffff03; 34 st->r[2] = (LOAD32_LE(&key[6]) >> 4) & 0x3ffc0ff; 35 st->r[3] = (LOAD32_LE(&key[9]) >> 6) & 0x3f03fff; 36 st->r[4] = (LOAD32_LE(&key[12]) >> 8) & 0x00fffff; 37 38 /* h = 0 */ 39 st->h[0] = 0; 40 st->h[1] = 0; 41 st->h[2] = 0; 42 st->h[3] = 0; 43 st->h[4] = 0; 44 45 /* save pad for later */ 46 st->pad[0] = LOAD32_LE(&key[16]); 47 st->pad[1] = LOAD32_LE(&key[20]); 48 st->pad[2] = LOAD32_LE(&key[24]); 49 st->pad[3] = LOAD32_LE(&key[28]); 50 51 st->leftover = 0; 52 st->final = 0; 53 } 54 55 static void 56 poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, 57 unsigned long long bytes) 58 { 59 const unsigned long hibit = (st->final) ? 0UL : (1UL << 24); /* 1 << 128 */ 60 unsigned long r0, r1, r2, r3, r4; 61 unsigned long s1, s2, s3, s4; 62 unsigned long h0, h1, h2, h3, h4; 63 unsigned long long d0, d1, d2, d3, d4; 64 unsigned long c; 65 66 r0 = st->r[0]; 67 r1 = st->r[1]; 68 r2 = st->r[2]; 69 r3 = st->r[3]; 70 r4 = st->r[4]; 71 72 s1 = r1 * 5; 73 s2 = r2 * 5; 74 s3 = r3 * 5; 75 s4 = r4 * 5; 76 77 h0 = st->h[0]; 78 h1 = st->h[1]; 79 h2 = st->h[2]; 80 h3 = st->h[3]; 81 h4 = st->h[4]; 82 83 while (bytes >= poly1305_block_size) { 84 /* h += m[i] */ 85 h0 += (LOAD32_LE(m + 0)) & 0x3ffffff; 86 h1 += (LOAD32_LE(m + 3) >> 2) & 0x3ffffff; 87 h2 += (LOAD32_LE(m + 6) >> 4) & 0x3ffffff; 88 h3 += (LOAD32_LE(m + 9) >> 6) & 0x3ffffff; 89 h4 += (LOAD32_LE(m + 12) >> 8) | hibit; 90 91 /* h *= r */ 92 d0 = ((unsigned long long) h0 * r0) + ((unsigned long long) h1 * s4) + 93 ((unsigned long long) h2 * s3) + ((unsigned long long) h3 * s2) + 94 ((unsigned long long) h4 * s1); 95 d1 = ((unsigned long long) h0 * r1) + ((unsigned long long) h1 * r0) + 96 ((unsigned long long) h2 * s4) + ((unsigned long long) h3 * s3) + 97 ((unsigned long long) h4 * s2); 98 d2 = ((unsigned long long) h0 * r2) + ((unsigned long long) h1 * r1) + 99 ((unsigned long long) h2 * r0) + ((unsigned long long) h3 * s4) + 100 ((unsigned long long) h4 * s3); 101 d3 = ((unsigned long long) h0 * r3) + ((unsigned long long) h1 * r2) + 102 ((unsigned long long) h2 * r1) + ((unsigned long long) h3 * r0) + 103 ((unsigned long long) h4 * s4); 104 d4 = ((unsigned long long) h0 * r4) + ((unsigned long long) h1 * r3) + 105 ((unsigned long long) h2 * r2) + ((unsigned long long) h3 * r1) + 106 ((unsigned long long) h4 * r0); 107 108 /* (partial) h %= p */ 109 c = (unsigned long) (d0 >> 26); 110 h0 = (unsigned long) d0 & 0x3ffffff; 111 d1 += c; 112 c = (unsigned long) (d1 >> 26); 113 h1 = (unsigned long) d1 & 0x3ffffff; 114 d2 += c; 115 c = (unsigned long) (d2 >> 26); 116 h2 = (unsigned long) d2 & 0x3ffffff; 117 d3 += c; 118 c = (unsigned long) (d3 >> 26); 119 h3 = (unsigned long) d3 & 0x3ffffff; 120 d4 += c; 121 c = (unsigned long) (d4 >> 26); 122 h4 = (unsigned long) d4 & 0x3ffffff; 123 h0 += c * 5; 124 c = (h0 >> 26); 125 h0 = h0 & 0x3ffffff; 126 h1 += c; 127 128 m += poly1305_block_size; 129 bytes -= poly1305_block_size; 130 } 131 132 st->h[0] = h0; 133 st->h[1] = h1; 134 st->h[2] = h2; 135 st->h[3] = h3; 136 st->h[4] = h4; 137 } 138 139 static POLY1305_NOINLINE void 140 poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16]) 141 { 142 unsigned long h0, h1, h2, h3, h4, c; 143 unsigned long g0, g1, g2, g3, g4; 144 unsigned long long f; 145 unsigned long mask; 146 147 /* process the remaining block */ 148 if (st->leftover) { 149 unsigned long long i = st->leftover; 150 151 st->buffer[i++] = 1; 152 for (; i < poly1305_block_size; i++) { 153 st->buffer[i] = 0; 154 } 155 st->final = 1; 156 poly1305_blocks(st, st->buffer, poly1305_block_size); 157 } 158 159 /* fully carry h */ 160 h0 = st->h[0]; 161 h1 = st->h[1]; 162 h2 = st->h[2]; 163 h3 = st->h[3]; 164 h4 = st->h[4]; 165 166 c = h1 >> 26; 167 h1 = h1 & 0x3ffffff; 168 h2 += c; 169 c = h2 >> 26; 170 h2 = h2 & 0x3ffffff; 171 h3 += c; 172 c = h3 >> 26; 173 h3 = h3 & 0x3ffffff; 174 h4 += c; 175 c = h4 >> 26; 176 h4 = h4 & 0x3ffffff; 177 h0 += c * 5; 178 c = h0 >> 26; 179 h0 = h0 & 0x3ffffff; 180 h1 += c; 181 182 /* compute h + -p */ 183 g0 = h0 + 5; 184 c = g0 >> 26; 185 g0 &= 0x3ffffff; 186 g1 = h1 + c; 187 c = g1 >> 26; 188 g1 &= 0x3ffffff; 189 g2 = h2 + c; 190 c = g2 >> 26; 191 g2 &= 0x3ffffff; 192 g3 = h3 + c; 193 c = g3 >> 26; 194 g3 &= 0x3ffffff; 195 g4 = h4 + c - (1UL << 26); 196 197 /* select h if h < p, or h + -p if h >= p */ 198 mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1; 199 g0 &= mask; 200 g1 &= mask; 201 g2 &= mask; 202 g3 &= mask; 203 g4 &= mask; 204 mask = ~mask; 205 206 h0 = (h0 & mask) | g0; 207 h1 = (h1 & mask) | g1; 208 h2 = (h2 & mask) | g2; 209 h3 = (h3 & mask) | g3; 210 h4 = (h4 & mask) | g4; 211 212 /* h = h % (2^128) */ 213 h0 = ((h0) | (h1 << 26)) & 0xffffffff; 214 h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; 215 h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; 216 h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; 217 218 /* mac = (h + pad) % (2^128) */ 219 f = (unsigned long long) h0 + st->pad[0]; 220 h0 = (unsigned long) f; 221 f = (unsigned long long) h1 + st->pad[1] + (f >> 32); 222 h1 = (unsigned long) f; 223 f = (unsigned long long) h2 + st->pad[2] + (f >> 32); 224 h2 = (unsigned long) f; 225 f = (unsigned long long) h3 + st->pad[3] + (f >> 32); 226 h3 = (unsigned long) f; 227 228 STORE32_LE(mac + 0, (uint32_t) h0); 229 STORE32_LE(mac + 4, (uint32_t) h1); 230 STORE32_LE(mac + 8, (uint32_t) h2); 231 STORE32_LE(mac + 12, (uint32_t) h3); 232 233 /* zero out the state */ 234 sodium_memzero((void *) st, sizeof *st); 235 } 236