1 // gcm.cpp - originally written and placed in the public domain by Wei Dai.
2 // ARM and Aarch64 added by Jeffrey Walton. The ARM carryless
3 // multiply routines are less efficient because they shadow x86.
4 // The precomputed key table integration makes it tricky to use the
5 // more efficient ARMv8 implementation of the multiply and reduce.
6
7 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM gcm.cpp" to generate MASM code
8
9 #include "pch.h"
10 #include "config.h"
11
12 #ifndef CRYPTOPP_IMPORTS
13 #ifndef CRYPTOPP_GENERATE_X64_MASM
14
15 // Visual Studio .Net 2003 compiler crash
16 #if defined(_MSC_VER) && (_MSC_VER < 1400)
17 # pragma optimize("", off)
18 #endif
19
20 #include "gcm.h"
21 #include "cpu.h"
22
23 #if defined(CRYPTOPP_DISABLE_GCM_ASM)
24 # undef CRYPTOPP_X86_ASM_AVAILABLE
25 # undef CRYPTOPP_X32_ASM_AVAILABLE
26 # undef CRYPTOPP_X64_ASM_AVAILABLE
27 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
28 #endif
29
30 NAMESPACE_BEGIN(CryptoPP)
31
32 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
33 // Different assemblers accept different mnemonics: 'movd eax, xmm0' vs
34 // 'movd rax, xmm0' vs 'mov eax, xmm0' vs 'mov rax, xmm0'
35 #if defined(CRYPTOPP_DISABLE_MIXED_ASM)
36 // 'movd eax, xmm0' only. REG_WORD() macro not used. Clang path.
37 # define USE_MOVD_REG32 1
38 #elif defined(__GNUC__) || defined(_MSC_VER)
39 // 'movd eax, xmm0' or 'movd rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
40 # define USE_MOVD_REG32_OR_REG64 1
41 #else
42 // 'mov eax, xmm0' or 'mov rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
43 # define USE_MOV_REG32_OR_REG64 1
44 #endif
45 #endif // CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
46
47 // Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
48 #define M128_CAST(x) ((__m128i *)(void *)(x))
49 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
50
51 word16 GCM_Base::s_reductionTable[256];
52 volatile bool GCM_Base::s_reductionTableInitialized = false;
53
IncrementCounterBy256()54 void GCM_Base::GCTR::IncrementCounterBy256()
55 {
56 IncrementCounterByOne(m_counterArray+BlockSize()-4, 3);
57 }
58
Xor16(byte * a,const byte * b,const byte * c)59 static inline void Xor16(byte *a, const byte *b, const byte *c)
60 {
61 CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
62 CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
63 CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
64 ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
65 ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
66 }
67
68 #if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
69 // SunCC 5.10-5.11 compiler crash. Move GCM_Xor16_SSE2 out-of-line, and place in
70 // a source file with a SSE architecture switch. Also see GH #226 and GH #284.
71 extern void GCM_Xor16_SSE2(byte *a, const byte *b, const byte *c);
72 #endif // SSE2
73
74 #if CRYPTOPP_ARM_NEON_AVAILABLE
75 extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
76 #endif
77
78 #if CRYPTOPP_POWER8_AVAILABLE
79 extern void GCM_Xor16_POWER8(byte *a, const byte *b, const byte *c);
80 #endif
81
82 #if CRYPTOPP_CLMUL_AVAILABLE
83 extern void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
84 extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
85 const unsigned int s_cltableSizeInBlocks = 8;
86 extern void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer);
87 #endif // CRYPTOPP_CLMUL_AVAILABLE
88
89 #if CRYPTOPP_ARM_PMULL_AVAILABLE
90 extern void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
91 extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
92 const unsigned int s_cltableSizeInBlocks = 8;
93 extern void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer);
94 #endif // CRYPTOPP_ARM_PMULL_AVAILABLE
95
96 #if CRYPTOPP_POWER8_VMULL_AVAILABLE
97 extern void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
98 extern size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
99 const unsigned int s_cltableSizeInBlocks = 8;
100 extern void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer);
101 #endif // CRYPTOPP_POWER8_VMULL_AVAILABLE
102
SetKeyWithoutResync(const byte * userKey,size_t keylength,const NameValuePairs & params)103 void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms)
104 {
105 BlockCipher &blockCipher = AccessBlockCipher();
106 blockCipher.SetKey(userKey, keylength, params);
107
108 // GCM is only defined for 16-byte block ciphers at the moment.
109 // However, variable blocksize support means we have to defer
110 // blocksize checks to runtime after the key is set. Also see
111 // https://github.com/weidai11/cryptopp/issues/408.
112 const unsigned int blockSize = blockCipher.BlockSize();
113 CRYPTOPP_ASSERT(blockSize == REQUIRED_BLOCKSIZE);
114 if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
115 throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
116
117 int tableSize, i, j, k;
118
119 #if CRYPTOPP_CLMUL_AVAILABLE
120 if (HasCLMUL())
121 {
122 // Avoid "parameter not used" error and suppress Coverity finding
123 (void)params.GetIntValue(Name::TableSize(), tableSize);
124 tableSize = s_cltableSizeInBlocks * blockSize;
125 CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
126 }
127 else
128 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
129 if (HasPMULL())
130 {
131 // Avoid "parameter not used" error and suppress Coverity finding
132 (void)params.GetIntValue(Name::TableSize(), tableSize);
133 tableSize = s_cltableSizeInBlocks * blockSize;
134 CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
135 }
136 else
137 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
138 if (HasPMULL())
139 {
140 // Avoid "parameter not used" error and suppress Coverity finding
141 (void)params.GetIntValue(Name::TableSize(), tableSize);
142 tableSize = s_cltableSizeInBlocks * blockSize;
143 CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
144 }
145 else
146 #endif
147 {
148 if (params.GetIntValue(Name::TableSize(), tableSize))
149 tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
150 else
151 tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;
152
153 //#if defined(_MSC_VER) && (_MSC_VER < 1400)
154 // VC 2003 workaround: compiler generates bad code for 64K tables
155 //tableSize = 2*1024;
156 //#endif
157 }
158
159 m_buffer.resize(3*blockSize + tableSize);
160 byte *mulTable = MulTable();
161 byte *hashKey = HashKey();
162 memset(hashKey, 0, REQUIRED_BLOCKSIZE);
163 blockCipher.ProcessBlock(hashKey);
164
165 #if CRYPTOPP_CLMUL_AVAILABLE
166 if (HasCLMUL())
167 {
168 GCM_SetKeyWithoutResync_CLMUL(hashKey, mulTable, tableSize);
169 return;
170 }
171 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
172 if (HasPMULL())
173 {
174 GCM_SetKeyWithoutResync_PMULL(hashKey, mulTable, tableSize);
175 return;
176 }
177 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
178 if (HasPMULL())
179 {
180 GCM_SetKeyWithoutResync_VMULL(hashKey, mulTable, tableSize);
181 return;
182 }
183 #endif
184
185 word64 V0, V1;
186 typedef BlockGetAndPut<word64, BigEndian> Block;
187 Block::Get(hashKey)(V0)(V1);
188
189 if (tableSize == 64*1024)
190 {
191 for (i=0; i<128; i++)
192 {
193 k = i%8;
194 Block::Put(NULLPTR, mulTable+(i/8)*256*16+(size_t(1)<<(11-k)))(V0)(V1);
195
196 int x = (int)V1 & 1;
197 V1 = (V1>>1) | (V0<<63);
198 V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
199 }
200
201 for (i=0; i<16; i++)
202 {
203 memset(mulTable+i*256*16, 0, 16);
204 #if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
205 if (HasSSE2())
206 for (j=2; j<=0x80; j*=2)
207 for (k=1; k<j; k++)
208 GCM_Xor16_SSE2(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
209 else
210 #elif CRYPTOPP_ARM_NEON_AVAILABLE
211 if (HasNEON())
212 for (j=2; j<=0x80; j*=2)
213 for (k=1; k<j; k++)
214 GCM_Xor16_NEON(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
215 else
216 #elif CRYPTOPP_POWER8_AVAILABLE
217 if (HasPower8())
218 for (j=2; j<=0x80; j*=2)
219 for (k=1; k<j; k++)
220 GCM_Xor16_POWER8(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
221 else
222 #endif
223 for (j=2; j<=0x80; j*=2)
224 for (k=1; k<j; k++)
225 Xor16(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
226 }
227 }
228 else
229 {
230 if (!s_reductionTableInitialized)
231 {
232 s_reductionTable[0] = 0;
233 word16 x = 0x01c2;
234 s_reductionTable[1] = ByteReverse(x);
235 for (unsigned int ii=2; ii<=0x80; ii*=2)
236 {
237 x <<= 1;
238 s_reductionTable[ii] = ByteReverse(x);
239 for (unsigned int jj=1; jj<ii; jj++)
240 s_reductionTable[ii+jj] = s_reductionTable[ii] ^ s_reductionTable[jj];
241 }
242 s_reductionTableInitialized = true;
243 }
244
245 for (i=0; i<128-24; i++)
246 {
247 k = i%32;
248 if (k < 4)
249 Block::Put(NULLPTR, mulTable+1024+(i/32)*256+(size_t(1)<<(7-k)))(V0)(V1);
250 else if (k < 8)
251 Block::Put(NULLPTR, mulTable+(i/32)*256+(size_t(1)<<(11-k)))(V0)(V1);
252
253 int x = (int)V1 & 1;
254 V1 = (V1>>1) | (V0<<63);
255 V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
256 }
257
258 for (i=0; i<4; i++)
259 {
260 memset(mulTable+i*256, 0, 16);
261 memset(mulTable+1024+i*256, 0, 16);
262 #if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
263 if (HasSSE2())
264 for (j=2; j<=8; j*=2)
265 for (k=1; k<j; k++)
266 {
267 GCM_Xor16_SSE2(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
268 GCM_Xor16_SSE2(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
269 }
270 else
271 #elif CRYPTOPP_ARM_NEON_AVAILABLE
272 if (HasNEON())
273 for (j=2; j<=8; j*=2)
274 for (k=1; k<j; k++)
275 {
276 GCM_Xor16_NEON(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
277 GCM_Xor16_NEON(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
278 }
279 else
280 #elif CRYPTOPP_POWER8_AVAILABLE
281 if (HasPower8())
282 for (j=2; j<=8; j*=2)
283 for (k=1; k<j; k++)
284 {
285 GCM_Xor16_POWER8(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
286 GCM_Xor16_POWER8(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
287 }
288 else
289 #endif
290 for (j=2; j<=8; j*=2)
291 for (k=1; k<j; k++)
292 {
293 Xor16(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
294 Xor16(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
295 }
296 }
297 }
298 }
299
ReverseHashBufferIfNeeded()300 inline void GCM_Base::ReverseHashBufferIfNeeded()
301 {
302 #if CRYPTOPP_CLMUL_AVAILABLE
303 if (HasCLMUL())
304 {
305 GCM_ReverseHashBufferIfNeeded_CLMUL(HashBuffer());
306 }
307 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
308 if (HasPMULL())
309 {
310 GCM_ReverseHashBufferIfNeeded_PMULL(HashBuffer());
311 }
312 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
313 if (HasPMULL())
314 {
315 GCM_ReverseHashBufferIfNeeded_VMULL(HashBuffer());
316 }
317 #endif
318 }
319
Resync(const byte * iv,size_t len)320 void GCM_Base::Resync(const byte *iv, size_t len)
321 {
322 BlockCipher &cipher = AccessBlockCipher();
323 byte *hashBuffer = HashBuffer();
324
325 if (len == 12)
326 {
327 memcpy(hashBuffer, iv, len);
328 memset(hashBuffer+len, 0, 3);
329 hashBuffer[len+3] = 1;
330 }
331 else
332 {
333 size_t origLen = len;
334 memset(hashBuffer, 0, HASH_BLOCKSIZE);
335
336 if (len >= HASH_BLOCKSIZE)
337 {
338 len = GCM_Base::AuthenticateBlocks(iv, len);
339 iv += (origLen - len);
340 }
341
342 if (len > 0)
343 {
344 memcpy(m_buffer, iv, len);
345 memset(m_buffer+len, 0, HASH_BLOCKSIZE-len);
346 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
347 }
348
349 PutBlock<word64, BigEndian, true>(NULLPTR, m_buffer)(0)(origLen*8);
350 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
351
352 ReverseHashBufferIfNeeded();
353 }
354
355 if (m_state >= State_IVSet)
356 m_ctr.Resynchronize(hashBuffer, REQUIRED_BLOCKSIZE);
357 else
358 m_ctr.SetCipherWithIV(cipher, hashBuffer);
359
360 m_ctr.Seek(HASH_BLOCKSIZE);
361
362 memset(hashBuffer, 0, HASH_BLOCKSIZE);
363 }
364
OptimalDataAlignment() const365 unsigned int GCM_Base::OptimalDataAlignment() const
366 {
367 return
368 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
369 HasSSE2() ? 16 :
370 #elif CRYPTOPP_ARM_NEON_AVAILABLE
371 HasNEON() ? 4 :
372 #elif CRYPTOPP_POWER8_AVAILABLE
373 HasPower8() ? 16 :
374 #endif
375 GetBlockCipher().OptimalDataAlignment();
376 }
377
378 #if CRYPTOPP_MSC_VERSION
379 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
380 #endif
381
382 #endif // Not CRYPTOPP_GENERATE_X64_MASM
383
384 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
385 extern "C" {
386 void GCM_AuthenticateBlocks_2K_SSE2(const byte *data, size_t blocks, word64 *hashBuffer, const word16 *reductionTable);
387 void GCM_AuthenticateBlocks_64K_SSE2(const byte *data, size_t blocks, word64 *hashBuffer);
388 }
389 #endif
390
391 #ifndef CRYPTOPP_GENERATE_X64_MASM
392
// Folds whole hash blocks from 'data' into the running hash state stored in
// HashBuffer(), using the multiplication table at MulTable(). The return value
// is the number of trailing bytes that were left unprocessed.
// The portable loops are do/while and the assembly loops count down from
// len/16, so every path after the accelerated dispatch expects 'len' to cover
// at least one full block.
AuthenticateBlocks(const byte * data,size_t len)393 size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
394 {
// When a carryless-multiply backend is compiled in and its runtime check
// succeeds, the whole request is delegated to it.
// NOTE(review): the POWER8 VMULL branch tests HasPMULL(), like the ARM branch -- confirm that is intended.
395 #if CRYPTOPP_CLMUL_AVAILABLE
396 if (HasCLMUL())
397 {
398 return GCM_AuthenticateBlocks_CLMUL(data, len, MulTable(), HashBuffer());
399 }
400 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
401 if (HasPMULL())
402 {
403 return GCM_AuthenticateBlocks_PMULL(data, len, MulTable(), HashBuffer());
404 }
405 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
406 if (HasPMULL())
407 {
408 return GCM_AuthenticateBlocks_VMULL(data, len, MulTable(), HashBuffer());
409 }
410 #endif
411
// Table-driven paths. The hash state is accessed as two 64-bit words in
// native byte order.
412 typedef BlockGetAndPut<word64, NativeByteOrder> Block;
413 word64 *hashBuffer = (word64 *)(void *)HashBuffer();
414 CRYPTOPP_ASSERT(IsAlignedOn(hashBuffer,GetAlignmentOf<word64>()));
415
// Selector: 2 is added when m_buffer is at least 64K in size (64K tables), and
// HasSSE2() is added when the SSE2 assembly or MASM routines are compiled in.
416 switch (2*(m_buffer.size()>=64*1024)
417 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
418 + HasSSE2()
419 //#elif CRYPTOPP_ARM_NEON_AVAILABLE
420 // + HasNEON()
421 #endif
422 )
423 {
424 case 0: // non-SSE2 and 2K tables
425 {
426 byte *mulTable = MulTable();
// x0:x1 carries the hash state across iterations and is written back after the loop.
427 word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
428
429 do
430 {
431 word64 y0, y1, a0, a1, b0, b1, c0, c1, d0, d1;
// XOR the next data block into the state, then advance past it.
432 Block::Get(data)(y0)(y1);
433 x0 ^= y0;
434 x1 ^= y1;
435
436 data += HASH_BLOCKSIZE;
437 len -= HASH_BLOCKSIZE;
438
// The macros below read 64-bit words out of the 2K table, indexed by 4-bit
// nibbles of the state. The variants differ by host byte order and by whether
// the state is first split into 32-bit words (CRYPTOPP_BOOL_SLOW_WORD64).
439 #define READ_TABLE_WORD64_COMMON(a, b, c, d) *(word64 *)(void *)(mulTable+(a*1024)+(b*256)+c+d*8)
440
441 #if (CRYPTOPP_LITTLE_ENDIAN)
442 #if CRYPTOPP_BOOL_SLOW_WORD64
443 word32 z0 = (word32)x0;
444 word32 z1 = (word32)(x0>>32);
445 word32 z2 = (word32)x1;
446 word32 z3 = (word32)(x1>>32);
447 #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, (d?(z##c>>((d?d-1:0)*4))&0xf0:(z##c&0xf)<<4), e)
448 #else
449 #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, ((d+8*b)?(x##a>>(((d+8*b)?(d+8*b)-1:1)*4))&0xf0:(x##a&0xf)<<4), e)
450 #endif
451 #define GF_MOST_SIG_8BITS(a) (a##1 >> 7*8)
452 #define GF_SHIFT_8(a) a##1 = (a##1 << 8) ^ (a##0 >> 7*8); a##0 <<= 8;
453 #else
454 #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((1-d%2), c, ((15-d-8*b)?(x##a>>(((15-d-8*b)?(15-d-8*b)-1:0)*4))&0xf0:(x##a&0xf)<<4), e)
455 #define GF_MOST_SIG_8BITS(a) (a##1 & 0xff)
456 #define GF_SHIFT_8(a) a##1 = (a##1 >> 8) ^ (a##0 << 7*8); a##0 >>= 8;
457 #endif
458
459 #define GF_MUL_32BY128(op, a, b, c) \
460 a0 op READ_TABLE_WORD64(a, b, c, 0, 0) ^ READ_TABLE_WORD64(a, b, c, 1, 0); \
461 a1 op READ_TABLE_WORD64(a, b, c, 0, 1) ^ READ_TABLE_WORD64(a, b, c, 1, 1); \
462 b0 op READ_TABLE_WORD64(a, b, c, 2, 0) ^ READ_TABLE_WORD64(a, b, c, 3, 0); \
463 b1 op READ_TABLE_WORD64(a, b, c, 2, 1) ^ READ_TABLE_WORD64(a, b, c, 3, 1); \
464 c0 op READ_TABLE_WORD64(a, b, c, 4, 0) ^ READ_TABLE_WORD64(a, b, c, 5, 0); \
465 c1 op READ_TABLE_WORD64(a, b, c, 4, 1) ^ READ_TABLE_WORD64(a, b, c, 5, 1); \
466 d0 op READ_TABLE_WORD64(a, b, c, 6, 0) ^ READ_TABLE_WORD64(a, b, c, 7, 0); \
467 d1 op READ_TABLE_WORD64(a, b, c, 6, 1) ^ READ_TABLE_WORD64(a, b, c, 7, 1); \
468
// Four invocations, one per (word, half, sub-table) combination; the first
// assigns the accumulators a..d and the rest XOR into them.
469 GF_MUL_32BY128(=, 0, 0, 0)
470 GF_MUL_32BY128(^=, 0, 1, 1)
471 GF_MUL_32BY128(^=, 1, 0, 2)
472 GF_MUL_32BY128(^=, 1, 1, 3)
473
// Fold d into c, c into b and b into a, shifting each by 8 bits on the way.
// The byte selected by GF_MOST_SIG_8BITS before each shift picks an
// s_reductionTable entry that is collected in r and finally XORed into a0.
474 word32 r = (word32)s_reductionTable[GF_MOST_SIG_8BITS(d)] << 16;
475 GF_SHIFT_8(d)
476 c0 ^= d0; c1 ^= d1;
477 r ^= (word32)s_reductionTable[GF_MOST_SIG_8BITS(c)] << 8;
478 GF_SHIFT_8(c)
479 b0 ^= c0; b1 ^= c1;
480 r ^= s_reductionTable[GF_MOST_SIG_8BITS(b)];
481 GF_SHIFT_8(b)
482 a0 ^= b0; a1 ^= b1;
483 a0 ^= ConditionalByteReverse<word64>(LITTLE_ENDIAN_ORDER, r);
484 x0 = a0; x1 = a1;
485 }
486 while (len >= HASH_BLOCKSIZE);
487
488 hashBuffer[0] = x0; hashBuffer[1] = x1;
489 return len;
490 }
491
492 case 2: // non-SSE2 and 64K tables
493 {
494 byte *mulTable = MulTable();
495 word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
496
497 do
498 {
499 word64 y0, y1, a0, a1;
500 Block::Get(data)(y0)(y1);
501 x0 ^= y0;
502 x1 ^= y1;
503
504 data += HASH_BLOCKSIZE;
505 len -= HASH_BLOCKSIZE;
506
// Redefine the table-read macros for the 64K layout: sub-tables of 256 16-byte
// entries, selected by c*4+d and indexed by a whole byte of the state.
507 #undef READ_TABLE_WORD64_COMMON
508 #undef READ_TABLE_WORD64
509
510 #define READ_TABLE_WORD64_COMMON(a, c, d) *(word64 *)(void *)(mulTable+(a)*256*16+(c)+(d)*8)
511
512 #if (CRYPTOPP_LITTLE_ENDIAN)
513 #if CRYPTOPP_BOOL_SLOW_WORD64
514 word32 z0 = (word32)x0;
515 word32 z1 = (word32)(x0>>32);
516 word32 z2 = (word32)x1;
517 word32 z3 = (word32)(x1>>32);
518 #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, (d?(z##c>>((d?d:1)*8-4))&0xff0:(z##c&0xff)<<4), e)
519 #else
520 #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((d+4*(c%2))?(x##b>>(((d+4*(c%2))?(d+4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
521 #endif
522 #else
523 #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((7-d-4*(c%2))?(x##b>>(((7-d-4*(c%2))?(7-d-4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
524 #endif
525
526 #define GF_MUL_8BY128(op, b, c, d) \
527 a0 op READ_TABLE_WORD64(b, c, d, 0);\
528 a1 op READ_TABLE_WORD64(b, c, d, 1);\
529
// Sixteen table lookups, one per sub-table, XORed together into a0:a1.
// No separate reduction step appears on this path.
530 GF_MUL_8BY128(=, 0, 0, 0)
531 GF_MUL_8BY128(^=, 0, 0, 1)
532 GF_MUL_8BY128(^=, 0, 0, 2)
533 GF_MUL_8BY128(^=, 0, 0, 3)
534 GF_MUL_8BY128(^=, 0, 1, 0)
535 GF_MUL_8BY128(^=, 0, 1, 1)
536 GF_MUL_8BY128(^=, 0, 1, 2)
537 GF_MUL_8BY128(^=, 0, 1, 3)
538 GF_MUL_8BY128(^=, 1, 2, 0)
539 GF_MUL_8BY128(^=, 1, 2, 1)
540 GF_MUL_8BY128(^=, 1, 2, 2)
541 GF_MUL_8BY128(^=, 1, 2, 3)
542 GF_MUL_8BY128(^=, 1, 3, 0)
543 GF_MUL_8BY128(^=, 1, 3, 1)
544 GF_MUL_8BY128(^=, 1, 3, 2)
545 GF_MUL_8BY128(^=, 1, 3, 3)
546
547 x0 = a0; x1 = a1;
548 }
549 while (len >= HASH_BLOCKSIZE);
550
551 hashBuffer[0] = x0; hashBuffer[1] = x1;
552 return len;
553 }
554 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
555
// MASM build: the SSE2 cases call the externally assembled routines with a
// block count of len/16 and report len%16 as the unprocessed remainder.
556 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
557 case 1: // SSE2 and 2K tables
558 GCM_AuthenticateBlocks_2K_SSE2(data, len/16, hashBuffer, s_reductionTable);
559 return len % 16;
560 case 3: // SSE2 and 64K tables
561 GCM_AuthenticateBlocks_64K_SSE2(data, len/16, hashBuffer);
562 return len % 16;
563 #endif
564
// Inline-assembly build of the same two cases. When CRYPTOPP_GENERATE_X64_MASM
// is defined, the instruction text below is wrapped in PROC/ENDP instead and
// forms the bodies of the MASM procedures.
565 #if CRYPTOPP_SSE2_ASM_AVAILABLE
566 case 1: // SSE2 and 2K tables
567 {
568 #ifdef __GNUC__
569 __asm__ __volatile__
570 (
571 INTEL_NOPREFIX
572 #elif defined(CRYPTOPP_GENERATE_X64_MASM)
573 ALIGN 8
574 GCM_AuthenticateBlocks_2K_SSE2 PROC FRAME
575 rex_push_reg rsi
576 push_reg rdi
577 push_reg rbx
578 .endprolog
579 mov rsi, r8
580 mov r11, r9
581 #else
582 AS2( mov WORD_REG(cx), data )
583 AS2( mov WORD_REG(dx), len )
584 AS2( mov WORD_REG(si), hashBuffer )
585 AS2( shr WORD_REG(dx), 4 )
586 #endif
587
588 #if CRYPTOPP_BOOL_X32
589 AS1(push rbx)
590 AS1(push rbp)
591 #else
592 AS_PUSH_IF86( bx)
593 AS_PUSH_IF86( bp)
594 #endif
595
596 #ifdef __GNUC__
597 AS2( mov AS_REG_7, WORD_REG(di))
598 #elif CRYPTOPP_BOOL_X86
599 AS2( lea AS_REG_7, s_reductionTable)
600 #endif
601
602 AS2( movdqa xmm0, [WORD_REG(si)] )
603
// The two table groups are addressed relative to the hash buffer pointer in
// si (32 and 32+1024 bytes past it); the reduction table pointer is in AS_REG_7.
604 #define MUL_TABLE_0 WORD_REG(si) + 32
605 #define MUL_TABLE_1 WORD_REG(si) + 32 + 1024
606 #define RED_TABLE AS_REG_7
607
// Loop head: load one 16-byte data block with an unaligned move and XOR it
// into the state held in xmm0.
608 ASL(0)
609 AS2( movdqu xmm4, [WORD_REG(cx)] )
610 AS2( pxor xmm0, xmm4 )
611
612 AS2( movd ebx, xmm0 )
613 AS2( mov eax, AS_HEX(f0f0f0f0) )
614 AS2( and eax, ebx )
615 AS2( shl ebx, 4 )
616 AS2( and ebx, AS_HEX(f0f0f0f0) )
617 AS2( movzx edi, ah )
618 AS2( movdqa xmm5, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
619 AS2( movzx edi, al )
620 AS2( movdqa xmm4, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
621 AS2( shr eax, 16 )
622 AS2( movzx edi, ah )
623 AS2( movdqa xmm3, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
624 AS2( movzx edi, al )
625 AS2( movdqa xmm2, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] )
626
627 #define SSE2_MUL_32BITS(i) \
628 AS2( psrldq xmm0, 4 )\
629 AS2( movd eax, xmm0 )\
630 AS2( and eax, AS_HEX(f0f0f0f0) )\
631 AS2( movzx edi, bh )\
632 AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
633 AS2( movzx edi, bl )\
634 AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
635 AS2( shr ebx, 16 )\
636 AS2( movzx edi, bh )\
637 AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
638 AS2( movzx edi, bl )\
639 AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\
640 AS2( movd ebx, xmm0 )\
641 AS2( shl ebx, 4 )\
642 AS2( and ebx, AS_HEX(f0f0f0f0) )\
643 AS2( movzx edi, ah )\
644 AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
645 AS2( movzx edi, al )\
646 AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
647 AS2( shr eax, 16 )\
648 AS2( movzx edi, ah )\
649 AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
650 AS2( movzx edi, al )\
651 AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\
652
653 SSE2_MUL_32BITS(1)
654 SSE2_MUL_32BITS(2)
655 SSE2_MUL_32BITS(3)
656
657 AS2( movzx edi, bh )
658 AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
659 AS2( movzx edi, bl )
660 AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
661 AS2( shr ebx, 16 )
662 AS2( movzx edi, bh )
663 AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
664 AS2( movzx edi, bl )
665 AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] )
666
// Combine the four accumulators with one-byte shifts. The top byte of each
// shifted value (isolated with psrldq 15) indexes RED_TABLE, and the looked-up
// 16-bit entries are gathered in eax.
667 AS2( movdqa xmm0, xmm3 )
668 AS2( pslldq xmm3, 1 )
669 AS2( pxor xmm2, xmm3 )
670 AS2( movdqa xmm1, xmm2 )
671 AS2( pslldq xmm2, 1 )
672 AS2( pxor xmm5, xmm2 )
673
674 AS2( psrldq xmm0, 15 )
675 #if USE_MOVD_REG32
676 AS2( movd edi, xmm0 )
677 #elif USE_MOV_REG32_OR_REG64
678 AS2( mov WORD_REG(di), xmm0 )
679 #else // GNU Assembler
680 AS2( movd WORD_REG(di), xmm0 )
681 #endif
682 AS2( movzx eax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
683 AS2( shl eax, 8 )
684
685 AS2( movdqa xmm0, xmm5 )
686 AS2( pslldq xmm5, 1 )
687 AS2( pxor xmm4, xmm5 )
688
689 AS2( psrldq xmm1, 15 )
690 #if USE_MOVD_REG32
691 AS2( movd edi, xmm1 )
692 #elif USE_MOV_REG32_OR_REG64
693 AS2( mov WORD_REG(di), xmm1 )
694 #else
695 AS2( movd WORD_REG(di), xmm1 )
696 #endif
697 AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
698 AS2( shl eax, 8 )
699
700 AS2( psrldq xmm0, 15 )
701 #if USE_MOVD_REG32
702 AS2( movd edi, xmm0 )
703 #elif USE_MOV_REG32_OR_REG64
704 AS2( mov WORD_REG(di), xmm0 )
705 #else
706 AS2( movd WORD_REG(di), xmm0 )
707 #endif
708 AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] )
709
710 AS2( movd xmm0, eax )
711 AS2( pxor xmm0, xmm4 )
712
// Advance the data pointer by one block and loop until the block counter in dx
// reaches zero, then store the state back to the hash buffer.
713 AS2( add WORD_REG(cx), 16 )
714 AS2( sub WORD_REG(dx), 1 )
715 // ATT_NOPREFIX
716 ASJ( jnz, 0, b )
717 INTEL_NOPREFIX
718 AS2( movdqa [WORD_REG(si)], xmm0 )
719
720 #if CRYPTOPP_BOOL_X32
721 AS1(pop rbp)
722 AS1(pop rbx)
723 #else
724 AS_POP_IF86( bp)
725 AS_POP_IF86( bx)
726 #endif
727
728 #ifdef __GNUC__
729 ATT_PREFIX
730 :
731 : "c" (data), "d" (len/16), "S" (hashBuffer), "D" (s_reductionTable)
732 : "memory", "cc", "%eax"
733 #if CRYPTOPP_BOOL_X64
734 , "%ebx", "%r11"
735 #endif
736 );
737 #elif defined(CRYPTOPP_GENERATE_X64_MASM)
738 pop rbx
739 pop rdi
740 pop rsi
741 ret
742 GCM_AuthenticateBlocks_2K_SSE2 ENDP
743 #endif
744
745 return len%16;
746 }
747 case 3: // SSE2 and 64K tables
748 {
749 #ifdef __GNUC__
750 __asm__ __volatile__
751 (
752 INTEL_NOPREFIX
753 #elif defined(CRYPTOPP_GENERATE_X64_MASM)
754 ALIGN 8
755 GCM_AuthenticateBlocks_64K_SSE2 PROC FRAME
756 rex_push_reg rsi
757 push_reg rdi
758 .endprolog
759 mov rsi, r8
760 #else
761 AS2( mov WORD_REG(cx), data )
762 AS2( mov WORD_REG(dx), len )
763 AS2( mov WORD_REG(si), hashBuffer )
764 AS2( shr WORD_REG(dx), 4 )
765 #endif
766
767 AS2( movdqa xmm0, [WORD_REG(si)] )
768
// Sub-table (i,j) starts 32 + (i*4+j)*256*16 bytes past the hash buffer pointer.
769 #undef MUL_TABLE
770 #define MUL_TABLE(i,j) WORD_REG(si) + 32 + (i*4+j)*256*16
771
// Loop head: xmm1 receives data XOR state, and xmm0 is cleared to accumulate
// the table lookups for this block.
772 ASL(1)
773 AS2( movdqu xmm1, [WORD_REG(cx)] )
774 AS2( pxor xmm1, xmm0 )
775 AS2( pxor xmm0, xmm0 )
776
777 #undef SSE2_MUL_32BITS
778 #define SSE2_MUL_32BITS(i) \
779 AS2( movd eax, xmm1 )\
780 AS2( psrldq xmm1, 4 )\
781 AS2( movzx edi, al )\
782 AS2( add WORD_REG(di), WORD_REG(di) )\
783 AS2( pxor xmm0, [MUL_TABLE(i,0) + WORD_REG(di)*8] )\
784 AS2( movzx edi, ah )\
785 AS2( add WORD_REG(di), WORD_REG(di) )\
786 AS2( pxor xmm0, [MUL_TABLE(i,1) + WORD_REG(di)*8] )\
787 AS2( shr eax, 16 )\
788 AS2( movzx edi, al )\
789 AS2( add WORD_REG(di), WORD_REG(di) )\
790 AS2( pxor xmm0, [MUL_TABLE(i,2) + WORD_REG(di)*8] )\
791 AS2( movzx edi, ah )\
792 AS2( add WORD_REG(di), WORD_REG(di) )\
793 AS2( pxor xmm0, [MUL_TABLE(i,3) + WORD_REG(di)*8] )\
794
795 SSE2_MUL_32BITS(0)
796 SSE2_MUL_32BITS(1)
797 SSE2_MUL_32BITS(2)
798 SSE2_MUL_32BITS(3)
799
800 AS2( add WORD_REG(cx), 16 )
801 AS2( sub WORD_REG(dx), 1 )
802 // ATT_NOPREFIX
803 ASJ( jnz, 1, b )
804 INTEL_NOPREFIX
805 AS2( movdqa [WORD_REG(si)], xmm0 )
806
807 #ifdef __GNUC__
808 ATT_PREFIX
809 :
810 : "c" (data), "d" (len/16), "S" (hashBuffer)
811 : "memory", "cc", "%edi", "%eax"
812 );
813 #elif defined(CRYPTOPP_GENERATE_X64_MASM)
814 pop rdi
815 pop rsi
816 ret
817 GCM_AuthenticateBlocks_64K_SSE2 ENDP
818 #endif
819
820 return len%16;
821 }
822 #endif
823 #ifndef CRYPTOPP_GENERATE_X64_MASM
824 }
825
// Reached only when no case above returned.
826 return len%16;
827 }
828
AuthenticateLastHeaderBlock()829 void GCM_Base::AuthenticateLastHeaderBlock()
830 {
831 if (m_bufferedDataLength > 0)
832 {
833 memset(m_buffer+m_bufferedDataLength, 0, HASH_BLOCKSIZE-m_bufferedDataLength);
834 m_bufferedDataLength = 0;
835 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
836 }
837 }
838
AuthenticateLastConfidentialBlock()839 void GCM_Base::AuthenticateLastConfidentialBlock()
840 {
841 GCM_Base::AuthenticateLastHeaderBlock();
842 PutBlock<word64, BigEndian, true>(NULLPTR, m_buffer)(m_totalHeaderLength*8)(m_totalMessageLength*8);
843 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
844 }
845
AuthenticateLastFooterBlock(byte * mac,size_t macSize)846 void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
847 {
848 m_ctr.Seek(0);
849 ReverseHashBufferIfNeeded();
850 m_ctr.ProcessData(mac, HashBuffer(), macSize);
851 }
852
853 NAMESPACE_END
854
855 #endif // Not CRYPTOPP_GENERATE_X64_MASM
856 #endif
857