1 // gcm.cpp - originally written and placed in the public domain by Wei Dai.
2 //           ARM and Aarch64 added by Jeffrey Walton. The ARM carryless
3 //           multiply routines are less efficient because they shadow x86.
4 //           The precomputed key table integration makes it tricky to use the
5 //           more efficient ARMv8 implementation of the multiply and reduce.
6 
7 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM gcm.cpp" to generate MASM code
8 
9 #include "pch.h"
10 #include "config.h"
11 
12 #ifndef CRYPTOPP_IMPORTS
13 #ifndef CRYPTOPP_GENERATE_X64_MASM
14 
15 // Visual Studio .Net 2003 compiler crash
16 #if defined(_MSC_VER) && (_MSC_VER < 1400)
17 # pragma optimize("", off)
18 #endif
19 
20 #include "gcm.h"
21 #include "cpu.h"
22 
23 #if defined(CRYPTOPP_DISABLE_GCM_ASM)
24 # undef CRYPTOPP_X86_ASM_AVAILABLE
25 # undef CRYPTOPP_X32_ASM_AVAILABLE
26 # undef CRYPTOPP_X64_ASM_AVAILABLE
27 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
28 #endif
29 
30 NAMESPACE_BEGIN(CryptoPP)
31 
32 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
33 // Different assemblers accept different mnemonics: 'movd eax, xmm0' vs
34 //   'movd rax, xmm0' vs 'mov eax, xmm0' vs 'mov rax, xmm0'
35 #if defined(CRYPTOPP_DISABLE_MIXED_ASM)
36 // 'movd eax, xmm0' only. REG_WORD() macro not used. Clang path.
37 # define USE_MOVD_REG32 1
38 #elif defined(__GNUC__) || defined(_MSC_VER)
39 // 'movd eax, xmm0' or 'movd rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
40 # define USE_MOVD_REG32_OR_REG64 1
41 #else
42 // 'mov eax, xmm0' or 'mov rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
43 # define USE_MOV_REG32_OR_REG64 1
44 #endif
45 #endif  // CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
46 
47 // Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
48 #define M128_CAST(x) ((__m128i *)(void *)(x))
49 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
50 
// Shared 256-entry reduction table. It is filled on first use inside
// GCM_Base::SetKeyWithoutResync (2K-table path) and read by
// GCM_Base::AuthenticateBlocks when folding the top byte back into the hash.
51 word16 GCM_Base::s_reductionTable[256];
// Set to true once s_reductionTable has been populated.
// NOTE(review): the flag is only a volatile bool and no lock is visible in this
// file, so concurrent first-time key setups may both fill the table - confirm
// that this is acceptable for the supported threading model.
52 volatile bool GCM_Base::s_reductionTableInitialized = false;
53 
IncrementCounterBy256()54 void GCM_Base::GCTR::IncrementCounterBy256()
55 {
56     IncrementCounterByOne(m_counterArray+BlockSize()-4, 3);
57 }
58 
Xor16(byte * a,const byte * b,const byte * c)59 static inline void Xor16(byte *a, const byte *b, const byte *c)
60 {
61     CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
62     CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
63     CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
64     ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
65     ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
66 }
67 
// Forward declarations for the architecture-specific helpers used by this
// file. Each group is only visible when the matching feature macro is set.
68 #if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
69 // SunCC 5.10-5.11 compiler crash. Move GCM_Xor16_SSE2 out-of-line, and place in
70 // a source file with a SSE architecture switch. Also see GH #226 and GH #284.
// Called in place of Xor16 by SetKeyWithoutResync when HasSSE2() is true.
71 extern void GCM_Xor16_SSE2(byte *a, const byte *b, const byte *c);
72 #endif  // SSE2
73 
74 #if CRYPTOPP_ARM_NEON_AVAILABLE
// Called in place of Xor16 by SetKeyWithoutResync when HasNEON() is true.
75 extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
76 #endif
77 
78 #if CRYPTOPP_POWER8_AVAILABLE
// Called in place of Xor16 by SetKeyWithoutResync when HasPower8() is true.
79 extern void GCM_Xor16_POWER8(byte *a, const byte *b, const byte *c);
80 #endif
81 
// x86 carry-less multiply path. The three externs back SetKeyWithoutResync,
// AuthenticateBlocks and ReverseHashBufferIfNeeded respectively when
// HasCLMUL() is true.
82 #if CRYPTOPP_CLMUL_AVAILABLE
83 extern void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
84 extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
// Table size for this path, in cipher blocks; SetKeyWithoutResync multiplies
// it by the block size to get the byte count.
85 const unsigned int s_cltableSizeInBlocks = 8;
86 extern void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer);
87 #endif  // CRYPTOPP_CLMUL_AVAILABLE
88 
// ARM polynomial multiply path; same roles as the CLMUL group, selected when
// HasPMULL() is true.
89 #if CRYPTOPP_ARM_PMULL_AVAILABLE
90 extern void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
91 extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
// Table size for this path, in cipher blocks.
92 const unsigned int s_cltableSizeInBlocks = 8;
93 extern void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer);
94 #endif  // CRYPTOPP_ARM_PMULL_AVAILABLE
95 
// POWER8 polynomial multiply path; same roles as the CLMUL group. In this
// file the runtime check used for it is also HasPMULL().
96 #if CRYPTOPP_POWER8_VMULL_AVAILABLE
97 extern void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize);
98 extern size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
// Table size for this path, in cipher blocks.
99 const unsigned int s_cltableSizeInBlocks = 8;
100 extern void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer);
101 #endif  // CRYPTOPP_POWER8_VMULL_AVAILABLE
102 
SetKeyWithoutResync(const byte * userKey,size_t keylength,const NameValuePairs & params)103 void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params)
104 {
105     BlockCipher &blockCipher = AccessBlockCipher();
106     blockCipher.SetKey(userKey, keylength, params);
107 
108     // GCM is only defined for 16-byte block ciphers at the moment.
109     // However, variable blocksize support means we have to defer
110     // blocksize checks to runtime after the key is set. Also see
111     // https://github.com/weidai11/cryptopp/issues/408.
112     const unsigned int blockSize = blockCipher.BlockSize();
113     CRYPTOPP_ASSERT(blockSize == REQUIRED_BLOCKSIZE);
114     if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
115         throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
116 
117     int tableSize, i, j, k;
118 
119 #if CRYPTOPP_CLMUL_AVAILABLE
120     if (HasCLMUL())
121     {
122         // Avoid "parameter not used" error and suppress Coverity finding
123         (void)params.GetIntValue(Name::TableSize(), tableSize);
124         tableSize = s_cltableSizeInBlocks * blockSize;
125         CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
126     }
127     else
128 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
129     if (HasPMULL())
130     {
131         // Avoid "parameter not used" error and suppress Coverity finding
132         (void)params.GetIntValue(Name::TableSize(), tableSize);
133         tableSize = s_cltableSizeInBlocks * blockSize;
134         CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
135     }
136     else
137 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
138     if (HasPMULL())
139     {
140         // Avoid "parameter not used" error and suppress Coverity finding
141         (void)params.GetIntValue(Name::TableSize(), tableSize);
142         tableSize = s_cltableSizeInBlocks * blockSize;
143         CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize));
144     }
145     else
146 #endif
147     {
148         if (params.GetIntValue(Name::TableSize(), tableSize))
149             tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
150         else
151             tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;
152 
153         //#if defined(_MSC_VER) && (_MSC_VER < 1400)
154         // VC 2003 workaround: compiler generates bad code for 64K tables
155         //tableSize = 2*1024;
156         //#endif
157     }
158 
159     m_buffer.resize(3*blockSize + tableSize);
160     byte *mulTable = MulTable();
161     byte *hashKey = HashKey();
162     memset(hashKey, 0, REQUIRED_BLOCKSIZE);
163     blockCipher.ProcessBlock(hashKey);
164 
165 #if CRYPTOPP_CLMUL_AVAILABLE
166     if (HasCLMUL())
167     {
168         GCM_SetKeyWithoutResync_CLMUL(hashKey, mulTable, tableSize);
169         return;
170     }
171 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
172     if (HasPMULL())
173     {
174         GCM_SetKeyWithoutResync_PMULL(hashKey, mulTable, tableSize);
175         return;
176     }
177 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
178     if (HasPMULL())
179     {
180         GCM_SetKeyWithoutResync_VMULL(hashKey, mulTable, tableSize);
181         return;
182     }
183 #endif
184 
185     word64 V0, V1;
186     typedef BlockGetAndPut<word64, BigEndian> Block;
187     Block::Get(hashKey)(V0)(V1);
188 
189     if (tableSize == 64*1024)
190     {
191         for (i=0; i<128; i++)
192         {
193             k = i%8;
194             Block::Put(NULLPTR, mulTable+(i/8)*256*16+(size_t(1)<<(11-k)))(V0)(V1);
195 
196             int x = (int)V1 & 1;
197             V1 = (V1>>1) | (V0<<63);
198             V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
199         }
200 
201         for (i=0; i<16; i++)
202         {
203             memset(mulTable+i*256*16, 0, 16);
204 #if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
205             if (HasSSE2())
206                 for (j=2; j<=0x80; j*=2)
207                     for (k=1; k<j; k++)
208                         GCM_Xor16_SSE2(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
209             else
210 #elif CRYPTOPP_ARM_NEON_AVAILABLE
211             if (HasNEON())
212                 for (j=2; j<=0x80; j*=2)
213                     for (k=1; k<j; k++)
214                         GCM_Xor16_NEON(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
215             else
216 #elif CRYPTOPP_POWER8_AVAILABLE
217             if (HasPower8())
218                 for (j=2; j<=0x80; j*=2)
219                     for (k=1; k<j; k++)
220                         GCM_Xor16_POWER8(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
221             else
222 #endif
223                 for (j=2; j<=0x80; j*=2)
224                     for (k=1; k<j; k++)
225                         Xor16(mulTable+i*256*16+(j+k)*16, mulTable+i*256*16+j*16, mulTable+i*256*16+k*16);
226         }
227     }
228     else
229     {
230         if (!s_reductionTableInitialized)
231         {
232             s_reductionTable[0] = 0;
233             word16 x = 0x01c2;
234             s_reductionTable[1] = ByteReverse(x);
235             for (unsigned int ii=2; ii<=0x80; ii*=2)
236             {
237                 x <<= 1;
238                 s_reductionTable[ii] = ByteReverse(x);
239                 for (unsigned int jj=1; jj<ii; jj++)
240                     s_reductionTable[ii+jj] = s_reductionTable[ii] ^ s_reductionTable[jj];
241             }
242             s_reductionTableInitialized = true;
243         }
244 
245         for (i=0; i<128-24; i++)
246         {
247             k = i%32;
248             if (k < 4)
249                 Block::Put(NULLPTR, mulTable+1024+(i/32)*256+(size_t(1)<<(7-k)))(V0)(V1);
250             else if (k < 8)
251                 Block::Put(NULLPTR, mulTable+(i/32)*256+(size_t(1)<<(11-k)))(V0)(V1);
252 
253             int x = (int)V1 & 1;
254             V1 = (V1>>1) | (V0<<63);
255             V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
256         }
257 
258         for (i=0; i<4; i++)
259         {
260             memset(mulTable+i*256, 0, 16);
261             memset(mulTable+1024+i*256, 0, 16);
262 #if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
263             if (HasSSE2())
264                 for (j=2; j<=8; j*=2)
265                     for (k=1; k<j; k++)
266                     {
267                         GCM_Xor16_SSE2(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
268                         GCM_Xor16_SSE2(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
269                     }
270             else
271 #elif CRYPTOPP_ARM_NEON_AVAILABLE
272             if (HasNEON())
273                 for (j=2; j<=8; j*=2)
274                     for (k=1; k<j; k++)
275                     {
276                         GCM_Xor16_NEON(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
277                         GCM_Xor16_NEON(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
278                     }
279             else
280 #elif CRYPTOPP_POWER8_AVAILABLE
281             if (HasPower8())
282                 for (j=2; j<=8; j*=2)
283                     for (k=1; k<j; k++)
284                     {
285                         GCM_Xor16_POWER8(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
286                         GCM_Xor16_POWER8(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
287                     }
288             else
289 #endif
290                 for (j=2; j<=8; j*=2)
291                     for (k=1; k<j; k++)
292                     {
293                         Xor16(mulTable+i*256+(j+k)*16, mulTable+i*256+j*16, mulTable+i*256+k*16);
294                         Xor16(mulTable+1024+i*256+(j+k)*16, mulTable+1024+i*256+j*16, mulTable+1024+i*256+k*16);
295                     }
296         }
297     }
298 }
299 
ReverseHashBufferIfNeeded()300 inline void GCM_Base::ReverseHashBufferIfNeeded()
301 {
302 #if CRYPTOPP_CLMUL_AVAILABLE
303     if (HasCLMUL())
304     {
305         GCM_ReverseHashBufferIfNeeded_CLMUL(HashBuffer());
306     }
307 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
308     if (HasPMULL())
309     {
310         GCM_ReverseHashBufferIfNeeded_PMULL(HashBuffer());
311     }
312 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
313     if (HasPMULL())
314     {
315         GCM_ReverseHashBufferIfNeeded_VMULL(HashBuffer());
316     }
317 #endif
318 }
319 
Resync(const byte * iv,size_t len)320 void GCM_Base::Resync(const byte *iv, size_t len)
321 {
322     BlockCipher &cipher = AccessBlockCipher();
323     byte *hashBuffer = HashBuffer();
324 
325     if (len == 12)
326     {
327         memcpy(hashBuffer, iv, len);
328         memset(hashBuffer+len, 0, 3);
329         hashBuffer[len+3] = 1;
330     }
331     else
332     {
333         size_t origLen = len;
334         memset(hashBuffer, 0, HASH_BLOCKSIZE);
335 
336         if (len >= HASH_BLOCKSIZE)
337         {
338             len = GCM_Base::AuthenticateBlocks(iv, len);
339             iv += (origLen - len);
340         }
341 
342         if (len > 0)
343         {
344             memcpy(m_buffer, iv, len);
345             memset(m_buffer+len, 0, HASH_BLOCKSIZE-len);
346             GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
347         }
348 
349         PutBlock<word64, BigEndian, true>(NULLPTR, m_buffer)(0)(origLen*8);
350         GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
351 
352         ReverseHashBufferIfNeeded();
353     }
354 
355     if (m_state >= State_IVSet)
356         m_ctr.Resynchronize(hashBuffer, REQUIRED_BLOCKSIZE);
357     else
358         m_ctr.SetCipherWithIV(cipher, hashBuffer);
359 
360     m_ctr.Seek(HASH_BLOCKSIZE);
361 
362     memset(hashBuffer, 0, HASH_BLOCKSIZE);
363 }
364 
OptimalDataAlignment() const365 unsigned int GCM_Base::OptimalDataAlignment() const
366 {
367     return
368 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
369         HasSSE2() ? 16 :
370 #elif CRYPTOPP_ARM_NEON_AVAILABLE
371         HasNEON() ? 4 :
372 #elif CRYPTOPP_POWER8_AVAILABLE
373         HasPower8() ? 16 :
374 #endif
375         GetBlockCipher().OptimalDataAlignment();
376 }
377 
378 #if CRYPTOPP_MSC_VERSION
379 # pragma warning(disable: 4731)    // frame pointer register 'ebp' modified by inline assembly code
380 #endif
381 
382 #endif    // Not CRYPTOPP_GENERATE_X64_MASM
383 
384 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
385 extern "C" {
386 void GCM_AuthenticateBlocks_2K_SSE2(const byte *data, size_t blocks, word64 *hashBuffer, const word16 *reductionTable);
387 void GCM_AuthenticateBlocks_64K_SSE2(const byte *data, size_t blocks, word64 *hashBuffer);
388 }
389 #endif
390 
391 #ifndef CRYPTOPP_GENERATE_X64_MASM
392 
AuthenticateBlocks(const byte * data,size_t len)393 size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
394 {
395 #if CRYPTOPP_CLMUL_AVAILABLE
396     if (HasCLMUL())
397     {
398         return GCM_AuthenticateBlocks_CLMUL(data, len, MulTable(), HashBuffer());
399     }
400 #elif CRYPTOPP_ARM_PMULL_AVAILABLE
401     if (HasPMULL())
402     {
403         return GCM_AuthenticateBlocks_PMULL(data, len, MulTable(), HashBuffer());
404     }
405 #elif CRYPTOPP_POWER8_VMULL_AVAILABLE
406     if (HasPMULL())
407     {
408         return GCM_AuthenticateBlocks_VMULL(data, len, MulTable(), HashBuffer());
409     }
410 #endif
411 
412     typedef BlockGetAndPut<word64, NativeByteOrder> Block;
413     word64 *hashBuffer = (word64 *)(void *)HashBuffer();
414     CRYPTOPP_ASSERT(IsAlignedOn(hashBuffer,GetAlignmentOf<word64>()));
415 
416     switch (2*(m_buffer.size()>=64*1024)
417 #if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
418         + HasSSE2()
419 //#elif CRYPTOPP_ARM_NEON_AVAILABLE
420 //      + HasNEON()
421 #endif
422         )
423     {
424     case 0:        // non-SSE2 and 2K tables
425         {
426         byte *mulTable = MulTable();
427         word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
428 
429         do
430         {
431             word64 y0, y1, a0, a1, b0, b1, c0, c1, d0, d1;
432             Block::Get(data)(y0)(y1);
433             x0 ^= y0;
434             x1 ^= y1;
435 
436             data += HASH_BLOCKSIZE;
437             len -= HASH_BLOCKSIZE;
438 
439             #define READ_TABLE_WORD64_COMMON(a, b, c, d)    *(word64 *)(void *)(mulTable+(a*1024)+(b*256)+c+d*8)
440 
441             #if (CRYPTOPP_LITTLE_ENDIAN)
442                 #if CRYPTOPP_BOOL_SLOW_WORD64
443                     word32 z0 = (word32)x0;
444                     word32 z1 = (word32)(x0>>32);
445                     word32 z2 = (word32)x1;
446                     word32 z3 = (word32)(x1>>32);
447                     #define READ_TABLE_WORD64(a, b, c, d, e)    READ_TABLE_WORD64_COMMON((d%2), c, (d?(z##c>>((d?d-1:0)*4))&0xf0:(z##c&0xf)<<4), e)
448                 #else
449                     #define READ_TABLE_WORD64(a, b, c, d, e)    READ_TABLE_WORD64_COMMON((d%2), c, ((d+8*b)?(x##a>>(((d+8*b)?(d+8*b)-1:1)*4))&0xf0:(x##a&0xf)<<4), e)
450                 #endif
451                 #define GF_MOST_SIG_8BITS(a) (a##1 >> 7*8)
452                 #define GF_SHIFT_8(a) a##1 = (a##1 << 8) ^ (a##0 >> 7*8); a##0 <<= 8;
453             #else
454                 #define READ_TABLE_WORD64(a, b, c, d, e)    READ_TABLE_WORD64_COMMON((1-d%2), c, ((15-d-8*b)?(x##a>>(((15-d-8*b)?(15-d-8*b)-1:0)*4))&0xf0:(x##a&0xf)<<4), e)
455                 #define GF_MOST_SIG_8BITS(a) (a##1 & 0xff)
456                 #define GF_SHIFT_8(a) a##1 = (a##1 >> 8) ^ (a##0 << 7*8); a##0 >>= 8;
457             #endif
458 
459             #define GF_MUL_32BY128(op, a, b, c)                                            \
460                 a0 op READ_TABLE_WORD64(a, b, c, 0, 0) ^ READ_TABLE_WORD64(a, b, c, 1, 0); \
461                 a1 op READ_TABLE_WORD64(a, b, c, 0, 1) ^ READ_TABLE_WORD64(a, b, c, 1, 1); \
462                 b0 op READ_TABLE_WORD64(a, b, c, 2, 0) ^ READ_TABLE_WORD64(a, b, c, 3, 0); \
463                 b1 op READ_TABLE_WORD64(a, b, c, 2, 1) ^ READ_TABLE_WORD64(a, b, c, 3, 1); \
464                 c0 op READ_TABLE_WORD64(a, b, c, 4, 0) ^ READ_TABLE_WORD64(a, b, c, 5, 0); \
465                 c1 op READ_TABLE_WORD64(a, b, c, 4, 1) ^ READ_TABLE_WORD64(a, b, c, 5, 1); \
466                 d0 op READ_TABLE_WORD64(a, b, c, 6, 0) ^ READ_TABLE_WORD64(a, b, c, 7, 0); \
467                 d1 op READ_TABLE_WORD64(a, b, c, 6, 1) ^ READ_TABLE_WORD64(a, b, c, 7, 1); \
468 
469             GF_MUL_32BY128(=, 0, 0, 0)
470             GF_MUL_32BY128(^=, 0, 1, 1)
471             GF_MUL_32BY128(^=, 1, 0, 2)
472             GF_MUL_32BY128(^=, 1, 1, 3)
473 
474             word32 r = (word32)s_reductionTable[GF_MOST_SIG_8BITS(d)] << 16;
475             GF_SHIFT_8(d)
476             c0 ^= d0; c1 ^= d1;
477             r ^= (word32)s_reductionTable[GF_MOST_SIG_8BITS(c)] << 8;
478             GF_SHIFT_8(c)
479             b0 ^= c0; b1 ^= c1;
480             r ^= s_reductionTable[GF_MOST_SIG_8BITS(b)];
481             GF_SHIFT_8(b)
482             a0 ^= b0; a1 ^= b1;
483             a0 ^= ConditionalByteReverse<word64>(LITTLE_ENDIAN_ORDER, r);
484             x0 = a0; x1 = a1;
485         }
486         while (len >= HASH_BLOCKSIZE);
487 
488         hashBuffer[0] = x0; hashBuffer[1] = x1;
489         return len;
490         }
491 
492     case 2:        // non-SSE2 and 64K tables
493         {
494         byte *mulTable = MulTable();
495         word64 x0 = hashBuffer[0], x1 = hashBuffer[1];
496 
497         do
498         {
499             word64 y0, y1, a0, a1;
500             Block::Get(data)(y0)(y1);
501             x0 ^= y0;
502             x1 ^= y1;
503 
504             data += HASH_BLOCKSIZE;
505             len -= HASH_BLOCKSIZE;
506 
507             #undef READ_TABLE_WORD64_COMMON
508             #undef READ_TABLE_WORD64
509 
510             #define READ_TABLE_WORD64_COMMON(a, c, d)    *(word64 *)(void *)(mulTable+(a)*256*16+(c)+(d)*8)
511 
512             #if (CRYPTOPP_LITTLE_ENDIAN)
513                 #if CRYPTOPP_BOOL_SLOW_WORD64
514                     word32 z0 = (word32)x0;
515                     word32 z1 = (word32)(x0>>32);
516                     word32 z2 = (word32)x1;
517                     word32 z3 = (word32)(x1>>32);
518                     #define READ_TABLE_WORD64(b, c, d, e)    READ_TABLE_WORD64_COMMON(c*4+d, (d?(z##c>>((d?d:1)*8-4))&0xff0:(z##c&0xff)<<4), e)
519                 #else
520                     #define READ_TABLE_WORD64(b, c, d, e)    READ_TABLE_WORD64_COMMON(c*4+d, ((d+4*(c%2))?(x##b>>(((d+4*(c%2))?(d+4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
521                 #endif
522             #else
523                 #define READ_TABLE_WORD64(b, c, d, e)    READ_TABLE_WORD64_COMMON(c*4+d, ((7-d-4*(c%2))?(x##b>>(((7-d-4*(c%2))?(7-d-4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e)
524             #endif
525 
526             #define GF_MUL_8BY128(op, b, c, d)        \
527                 a0 op READ_TABLE_WORD64(b, c, d, 0);\
528                 a1 op READ_TABLE_WORD64(b, c, d, 1);\
529 
530             GF_MUL_8BY128(=, 0, 0, 0)
531             GF_MUL_8BY128(^=, 0, 0, 1)
532             GF_MUL_8BY128(^=, 0, 0, 2)
533             GF_MUL_8BY128(^=, 0, 0, 3)
534             GF_MUL_8BY128(^=, 0, 1, 0)
535             GF_MUL_8BY128(^=, 0, 1, 1)
536             GF_MUL_8BY128(^=, 0, 1, 2)
537             GF_MUL_8BY128(^=, 0, 1, 3)
538             GF_MUL_8BY128(^=, 1, 2, 0)
539             GF_MUL_8BY128(^=, 1, 2, 1)
540             GF_MUL_8BY128(^=, 1, 2, 2)
541             GF_MUL_8BY128(^=, 1, 2, 3)
542             GF_MUL_8BY128(^=, 1, 3, 0)
543             GF_MUL_8BY128(^=, 1, 3, 1)
544             GF_MUL_8BY128(^=, 1, 3, 2)
545             GF_MUL_8BY128(^=, 1, 3, 3)
546 
547             x0 = a0; x1 = a1;
548         }
549         while (len >= HASH_BLOCKSIZE);
550 
551         hashBuffer[0] = x0; hashBuffer[1] = x1;
552         return len;
553         }
554 #endif    // #ifndef CRYPTOPP_GENERATE_X64_MASM
555 
556 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
557     case 1:        // SSE2 and 2K tables
558         GCM_AuthenticateBlocks_2K_SSE2(data, len/16, hashBuffer, s_reductionTable);
559         return len % 16;
560     case 3:        // SSE2 and 64K tables
561         GCM_AuthenticateBlocks_64K_SSE2(data, len/16, hashBuffer);
562         return len % 16;
563 #endif
564 
565 #if CRYPTOPP_SSE2_ASM_AVAILABLE
566     case 1:        // SSE2 and 2K tables
567         {
568         #ifdef __GNUC__
569             __asm__ __volatile__
570             (
571             INTEL_NOPREFIX
572         #elif defined(CRYPTOPP_GENERATE_X64_MASM)
573             ALIGN   8
574             GCM_AuthenticateBlocks_2K_SSE2    PROC FRAME
575             rex_push_reg rsi
576             push_reg rdi
577             push_reg rbx
578             .endprolog
579             mov rsi, r8
580             mov r11, r9
581         #else
582             AS2(    mov        WORD_REG(cx), data        )
583             AS2(    mov        WORD_REG(dx), len         )
584             AS2(    mov        WORD_REG(si), hashBuffer  )
585             AS2(    shr        WORD_REG(dx), 4           )
586         #endif
587 
588         #if CRYPTOPP_BOOL_X32
589             AS1(push    rbx)
590             AS1(push    rbp)
591         #else
592             AS_PUSH_IF86(    bx)
593             AS_PUSH_IF86(    bp)
594         #endif
595 
596         #ifdef __GNUC__
597             AS2(    mov      AS_REG_7, WORD_REG(di))
598         #elif CRYPTOPP_BOOL_X86
599             AS2(    lea      AS_REG_7, s_reductionTable)
600         #endif
601 
602         AS2(    movdqa   xmm0, [WORD_REG(si)]            )
603 
604         #define MUL_TABLE_0 WORD_REG(si) + 32
605         #define MUL_TABLE_1 WORD_REG(si) + 32 + 1024
606         #define RED_TABLE AS_REG_7
607 
608         ASL(0)
609         AS2(    movdqu   xmm4, [WORD_REG(cx)]            )
610         AS2(    pxor     xmm0, xmm4                      )
611 
612         AS2(    movd     ebx, xmm0                       )
613         AS2(    mov      eax, AS_HEX(f0f0f0f0)           )
614         AS2(    and      eax, ebx                        )
615         AS2(    shl      ebx, 4                          )
616         AS2(    and      ebx, AS_HEX(f0f0f0f0)           )
617         AS2(    movzx    edi, ah                         )
618         AS2(    movdqa   xmm5, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)]    )
619         AS2(    movzx    edi, al                         )
620         AS2(    movdqa   xmm4, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)]    )
621         AS2(    shr      eax, 16                         )
622         AS2(    movzx    edi, ah                         )
623         AS2(    movdqa   xmm3, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)]    )
624         AS2(    movzx    edi, al                         )
625         AS2(    movdqa   xmm2, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)]    )
626 
627         #define SSE2_MUL_32BITS(i)                                                       \
628             AS2(    psrldq  xmm0, 4                                                     )\
629             AS2(    movd    eax, xmm0                                                   )\
630             AS2(    and     eax, AS_HEX(f0f0f0f0)                                       )\
631             AS2(    movzx   edi, bh                                                     )\
632             AS2(    pxor    xmm5, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)]  )\
633             AS2(    movzx   edi, bl                                                     )\
634             AS2(    pxor    xmm4, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)]  )\
635             AS2(    shr     ebx, 16                                                     )\
636             AS2(    movzx   edi, bh                                                     )\
637             AS2(    pxor    xmm3, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)]  )\
638             AS2(    movzx   edi, bl                                                     )\
639             AS2(    pxor    xmm2, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)]  )\
640             AS2(    movd    ebx, xmm0                                                   )\
641             AS2(    shl     ebx, 4                                                      )\
642             AS2(    and     ebx, AS_HEX(f0f0f0f0)                                       )\
643             AS2(    movzx   edi, ah                                                     )\
644             AS2(    pxor    xmm5, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)]      )\
645             AS2(    movzx   edi, al                                                     )\
646             AS2(    pxor    xmm4, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)]      )\
647             AS2(    shr     eax, 16                                                     )\
648             AS2(    movzx   edi, ah                                                     )\
649             AS2(    pxor    xmm3, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)]      )\
650             AS2(    movzx   edi, al                                                     )\
651             AS2(    pxor    xmm2, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)]      )\
652 
653         SSE2_MUL_32BITS(1)
654         SSE2_MUL_32BITS(2)
655         SSE2_MUL_32BITS(3)
656 
657         AS2(    movzx   edi, bh                    )
658         AS2(    pxor    xmm5, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)]    )
659         AS2(    movzx   edi, bl                    )
660         AS2(    pxor    xmm4, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)]    )
661         AS2(    shr     ebx, 16                    )
662         AS2(    movzx   edi, bh                    )
663         AS2(    pxor    xmm3, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)]    )
664         AS2(    movzx   edi, bl                    )
665         AS2(    pxor    xmm2, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)]    )
666 
667         AS2(    movdqa  xmm0, xmm3                 )
668         AS2(    pslldq  xmm3, 1                    )
669         AS2(    pxor    xmm2, xmm3                 )
670         AS2(    movdqa  xmm1, xmm2                 )
671         AS2(    pslldq  xmm2, 1                    )
672         AS2(    pxor    xmm5, xmm2                 )
673 
674         AS2(    psrldq  xmm0, 15                   )
675 #if USE_MOVD_REG32
676         AS2(    movd    edi, xmm0                  )
677 #elif USE_MOV_REG32_OR_REG64
678         AS2(    mov     WORD_REG(di), xmm0         )
679 #else    // GNU Assembler
680         AS2(    movd    WORD_REG(di), xmm0         )
681 #endif
682         AS2(    movzx   eax, WORD PTR [RED_TABLE + WORD_REG(di)*2]  )
683         AS2(    shl     eax, 8                     )
684 
685         AS2(    movdqa  xmm0, xmm5                 )
686         AS2(    pslldq  xmm5, 1                    )
687         AS2(    pxor    xmm4, xmm5                 )
688 
689         AS2(    psrldq  xmm1, 15                   )
690 #if USE_MOVD_REG32
691         AS2(    movd    edi, xmm1                  )
692 #elif USE_MOV_REG32_OR_REG64
693         AS2(    mov     WORD_REG(di), xmm1         )
694 #else
695         AS2(    movd    WORD_REG(di), xmm1         )
696 #endif
697         AS2(    xor     ax, WORD PTR [RED_TABLE + WORD_REG(di)*2]  )
698         AS2(    shl     eax, 8                     )
699 
700         AS2(    psrldq  xmm0, 15                   )
701 #if USE_MOVD_REG32
702         AS2(    movd    edi, xmm0                  )
703 #elif USE_MOV_REG32_OR_REG64
704         AS2(    mov     WORD_REG(di), xmm0         )
705 #else
706         AS2(    movd    WORD_REG(di), xmm0         )
707 #endif
708         AS2(    xor     ax, WORD PTR [RED_TABLE + WORD_REG(di)*2]  )
709 
710         AS2(    movd    xmm0, eax                  )
711         AS2(    pxor    xmm0, xmm4                 )
712 
713         AS2(    add     WORD_REG(cx), 16           )
714         AS2(    sub     WORD_REG(dx), 1            )
715         // ATT_NOPREFIX
716         ASJ(    jnz,    0, b                       )
717         INTEL_NOPREFIX
718         AS2(    movdqa  [WORD_REG(si)], xmm0       )
719 
720         #if CRYPTOPP_BOOL_X32
721             AS1(pop        rbp)
722             AS1(pop        rbx)
723         #else
724             AS_POP_IF86(    bp)
725             AS_POP_IF86(    bx)
726         #endif
727 
728         #ifdef __GNUC__
729                 ATT_PREFIX
730                     :
731                     : "c" (data), "d" (len/16), "S" (hashBuffer), "D" (s_reductionTable)
732                     : "memory", "cc", "%eax"
733             #if CRYPTOPP_BOOL_X64
734                     , "%ebx", "%r11"
735             #endif
736                 );
737         #elif defined(CRYPTOPP_GENERATE_X64_MASM)
738             pop rbx
739             pop rdi
740             pop rsi
741             ret
742             GCM_AuthenticateBlocks_2K_SSE2 ENDP
743         #endif
744 
745         return len%16;
746         }
747     case 3:        // SSE2 and 64K tables
748         {
749         #ifdef __GNUC__
750             __asm__ __volatile__
751             (
752             INTEL_NOPREFIX
753         #elif defined(CRYPTOPP_GENERATE_X64_MASM)
754             ALIGN   8
755             GCM_AuthenticateBlocks_64K_SSE2    PROC FRAME
756             rex_push_reg rsi
757             push_reg rdi
758             .endprolog
759             mov rsi, r8
760         #else
761             AS2(    mov        WORD_REG(cx), data       )
762             AS2(    mov        WORD_REG(dx), len        )
763             AS2(    mov        WORD_REG(si), hashBuffer )
764             AS2(    shr        WORD_REG(dx), 4          )
765         #endif
766 
767         AS2(    movdqa    xmm0, [WORD_REG(si)]          )
768 
769         #undef MUL_TABLE
770         #define MUL_TABLE(i,j) WORD_REG(si) + 32 + (i*4+j)*256*16
771 
772         ASL(1)
773         AS2(    movdqu    xmm1, [WORD_REG(cx)]          )
774         AS2(    pxor    xmm1, xmm0                      )
775         AS2(    pxor    xmm0, xmm0                      )
776 
777         #undef SSE2_MUL_32BITS
778         #define SSE2_MUL_32BITS(i)                                   \
779             AS2(    movd    eax, xmm1                               )\
780             AS2(    psrldq    xmm1, 4                               )\
781             AS2(    movzx    edi, al                                )\
782             AS2(    add        WORD_REG(di), WORD_REG(di)           )\
783             AS2(    pxor    xmm0, [MUL_TABLE(i,0) + WORD_REG(di)*8] )\
784             AS2(    movzx    edi, ah                                )\
785             AS2(    add        WORD_REG(di), WORD_REG(di)           )\
786             AS2(    pxor    xmm0, [MUL_TABLE(i,1) + WORD_REG(di)*8] )\
787             AS2(    shr        eax, 16                              )\
788             AS2(    movzx    edi, al                                )\
789             AS2(    add        WORD_REG(di), WORD_REG(di)           )\
790             AS2(    pxor    xmm0, [MUL_TABLE(i,2) + WORD_REG(di)*8] )\
791             AS2(    movzx    edi, ah                                )\
792             AS2(    add        WORD_REG(di), WORD_REG(di)           )\
793             AS2(    pxor    xmm0, [MUL_TABLE(i,3) + WORD_REG(di)*8] )\
794 
795         SSE2_MUL_32BITS(0)
796         SSE2_MUL_32BITS(1)
797         SSE2_MUL_32BITS(2)
798         SSE2_MUL_32BITS(3)
799 
800         AS2(    add     WORD_REG(cx), 16      )
801         AS2(    sub     WORD_REG(dx), 1       )
802         // ATT_NOPREFIX
803         ASJ(    jnz,    1, b                  )
804         INTEL_NOPREFIX
805         AS2(    movdqa  [WORD_REG(si)], xmm0  )
806 
807         #ifdef __GNUC__
808                 ATT_PREFIX
809                     :
810                     : "c" (data), "d" (len/16), "S" (hashBuffer)
811                     : "memory", "cc", "%edi", "%eax"
812                 );
813         #elif defined(CRYPTOPP_GENERATE_X64_MASM)
814             pop rdi
815             pop rsi
816             ret
817             GCM_AuthenticateBlocks_64K_SSE2 ENDP
818         #endif
819 
820         return len%16;
821         }
822 #endif
823 #ifndef CRYPTOPP_GENERATE_X64_MASM
824     }
825 
826     return len%16;
827 }
828 
AuthenticateLastHeaderBlock()829 void GCM_Base::AuthenticateLastHeaderBlock()
830 {
831     if (m_bufferedDataLength > 0)
832     {
833         memset(m_buffer+m_bufferedDataLength, 0, HASH_BLOCKSIZE-m_bufferedDataLength);
834         m_bufferedDataLength = 0;
835         GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
836     }
837 }
838 
AuthenticateLastConfidentialBlock()839 void GCM_Base::AuthenticateLastConfidentialBlock()
840 {
841     GCM_Base::AuthenticateLastHeaderBlock();
842     PutBlock<word64, BigEndian, true>(NULLPTR, m_buffer)(m_totalHeaderLength*8)(m_totalMessageLength*8);
843     GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
844 }
845 
AuthenticateLastFooterBlock(byte * mac,size_t macSize)846 void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
847 {
848     m_ctr.Seek(0);
849     ReverseHashBufferIfNeeded();
850     m_ctr.ProcessData(mac, HashBuffer(), macSize);
851 }
852 
853 NAMESPACE_END
854 
855 #endif    // Not CRYPTOPP_GENERATE_X64_MASM
856 #endif
857