// vmac.cpp - originally written and placed in the public domain by Wei Dai
// based on Ted Krovetz's public domain vmac.c and draft-krovetz-vmac-01.txt

#include "pch.h"
#include "config.h"

#include "vmac.h"
#include "cpu.h"
#include "argnames.h"
#include "secblock.h"

#if defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
#include <intrin.h>
#endif

#if defined(CRYPTOPP_DISABLE_VMAC_ASM)
# undef CRYPTOPP_X86_ASM_AVAILABLE
# undef CRYPTOPP_X32_ASM_AVAILABLE
# undef CRYPTOPP_X64_ASM_AVAILABLE
# undef CRYPTOPP_SSE2_ASM_AVAILABLE
#endif

#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731)
#endif

ANONYMOUS_NAMESPACE_BEGIN

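// Use a native 128-bit integer type for the NH and polynomial arithmetic when
// one is available and the x64 assembly path is not; otherwise fall back to
// the 64-bit (or 32-bit) limb macros defined later in this file.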
#if defined(CRYPTOPP_WORD128_AVAILABLE) && !defined(CRYPTOPP_X64_ASM_AVAILABLE)
using CryptoPP::word128;
using CryptoPP::word64;
# define VMAC_BOOL_WORD128 1
#else
using CryptoPP::word64;
# define VMAC_BOOL_WORD128 0
#endif

#ifdef __BORLANDC__
#define const	// Turbo C++ 2006 workaround
#endif
const word64 p64   = W64LIT(0xfffffffffffffeff);  /* 2^64 - 257 prime  */
const word64 m62   = W64LIT(0x3fffffffffffffff);  /* 62-bit mask       */
const word64 m63   = W64LIT(0x7fffffffffffffff);  /* 63-bit mask       */
const word64 m64   = W64LIT(0xffffffffffffffff);  /* 64-bit mask       */
const word64 mpoly = W64LIT(0x1fffffff1fffffff);  /* Poly key mask     */
#ifdef __BORLANDC__
#undef const
#endif

#if VMAC_BOOL_WORD128
// workaround GCC Bug 31690: ICE with const __uint128_t and C++ front-end
# if defined(__powerpc__) && defined (CRYPTOPP_GCC_VERSION) && (CRYPTOPP_GCC_VERSION < 50300)
#  define m126				((word128(m62)<<64)|m64)
# else
const word128 m126 = (word128(m62)<<64)|m64;		 /* 126-bit mask      */
# endif
#endif

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

void VMAC_Base::UncheckedSetKey(const byte *userKey, unsigned int keylength, const NameValuePairs &params)
{
	int digestLength = params.GetIntValueWithDefault(Name::DigestSize(), DefaultDigestSize());
	if (digestLength != 8 && digestLength != 16)
		throw InvalidArgument("VMAC: DigestSize must be 8 or 16");
	m_is128 = digestLength == 16;

	m_L1KeyLength = params.GetIntValueWithDefault(Name::L1KeyLength(), 128);
	if (m_L1KeyLength <= 0 || m_L1KeyLength % 128 != 0)
		throw InvalidArgument("VMAC: L1KeyLength must be a positive multiple of 128");

	AllocateBlocks();

	BlockCipher &cipher = AccessCipher();
	cipher.SetKey(userKey, keylength, params);
	const unsigned int blockSize = cipher.BlockSize();
	const unsigned int blockSizeInWords = blockSize / sizeof(word64);
	SecBlock<word64, AllocatorWithCleanup<word64, true> > out(blockSizeInWords);
	AlignedSecByteBlock in;
	in.CleanNew(blockSize);
	size_t i;

	/* Fill nh key */
	in[0] = 0x80;
	cipher.AdvancedProcessBlocks(in, NULLPTR, (byte *)m_nhKey(), m_nhKeySize()*sizeof(word64), cipher.BT_InBlockIsCounter);
	ConditionalByteReverse<word64>(BIG_ENDIAN_ORDER, m_nhKey(), m_nhKey(), m_nhKeySize()*sizeof(word64));

	/* Fill poly key */
	in[0] = 0xC0;
	in[15] = 0;
	for (i = 0; i <= (size_t)m_is128; i++)
	{
		cipher.ProcessBlock(in, out.BytePtr());
		m_polyState()[i*4+2] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()) & mpoly;
		m_polyState()[i*4+3] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8) & mpoly;
		in[15]++;
	}

	/* Fill ip key */
	in[0] = 0xE0;
	in[15] = 0;
	word64 *l3Key = m_l3Key();
	CRYPTOPP_ASSERT(IsAlignedOn(l3Key,GetAlignmentOf<word64>()));

	for (i = 0; i <= (size_t)m_is128; i++)
		do
		{
			cipher.ProcessBlock(in, out.BytePtr());
			l3Key[i*2+0] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr());
			l3Key[i*2+1] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8);
			in[15]++;
		} while ((l3Key[i*2+0] >= p64) || (l3Key[i*2+1] >= p64));

	m_padCached = false;
	size_t nonceLength;
	const byte *nonce = GetIVAndThrowIfInvalid(params, nonceLength);
	Resynchronize(nonce, (int)nonceLength);
}
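
// A minimal keying sketch (illustrative only, compiled out): assumes the
// VMAC<AES, 64> template from vmac.h (digest size in bits), AES from aes.h,
// and SetKeyWithIV() from SimpleKeyingInterface. The helper name is
// hypothetical and not part of the library.
#if 0
static void VMAC_KeyingSketch()
{
	VMAC<AES, 64> mac;
	SecByteBlock key(AES::DEFAULT_KEYLENGTH);	// fill from an RNG or KDF
	byte nonce[16] = {0};						// high bit of nonce[0] must be clear
	mac.SetKeyWithIV(key, key.size(), nonce, sizeof(nonce));
}
#endif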

void VMAC_Base::GetNextIV(RandomNumberGenerator &rng, byte *IV)
{
	SimpleKeyingInterface::GetNextIV(rng, IV);
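	// Clear the high bit of the generated IV; VMAC nonces must stay below
	// 2^(blocksize-1) (the spec reserves the top bit), so a random IV is
	// masked into range here.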
	IV[0] &= 0x7f;
}

void VMAC_Base::Resynchronize(const byte *nonce, int len)
{
	size_t length = ThrowIfInvalidIVLength(len);
	size_t s = IVSize();
	byte *storedNonce = m_nonce();

	if (m_is128)
	{
		memset(storedNonce, 0, s-length);
		memcpy(storedNonce+s-length, nonce, length);
		AccessCipher().ProcessBlock(storedNonce, m_pad());
	}
	else
	{
		if (m_padCached && (storedNonce[s-1] | 1) == (nonce[length-1] | 1))
		{
			m_padCached = VerifyBufsEqual(storedNonce+s-length, nonce, length-1);
			for (size_t i=0; m_padCached && i<s-length; i++)
				m_padCached = (storedNonce[i] == 0);
		}
		if (!m_padCached)
		{
			memset(storedNonce, 0, s-length);
			memcpy(storedNonce+s-length, nonce, length-1);
			storedNonce[s-1] = nonce[length-1] & 0xfe;
			AccessCipher().ProcessBlock(storedNonce, m_pad());
			m_padCached = true;
		}
		storedNonce[s-1] = nonce[length-1];
	}
	m_isFirstBlock = true;
	Restart();
}

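// This IteratedHashBase hook should never be reached: VMAC_Base overrides
// HashMultipleBlocks() and routes all input through VHASH_Update() instead.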
void VMAC_Base::HashEndianCorrectedBlock(const word64 *data)
{
	CRYPTOPP_UNUSED(data);
	CRYPTOPP_ASSERT(false);
	throw NotImplemented("VMAC: HashEndianCorrectedBlock is not implemented");
}

unsigned int VMAC_Base::OptimalDataAlignment() const
{
	return
#if CRYPTOPP_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
		HasSSE2() ? 16 :
#endif
		GetCipher().OptimalDataAlignment();
}

#if CRYPTOPP_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
#endif

CRYPTOPP_NOINLINE
void VMAC_Base::VHASH_Update_SSE2(const word64 *data, size_t blocksRemainingInWord64, int tagPart)
{
	const word64 *nhK = m_nhKey();
	word64 *polyS = (word64*)(void*)m_polyState();
	word32 L1KeyLength = m_L1KeyLength;

	// These are used in the ASM, but some analysis services miss them.
	CRYPTOPP_UNUSED(data); CRYPTOPP_UNUSED(tagPart);
	CRYPTOPP_UNUSED(L1KeyLength);
	CRYPTOPP_UNUSED(blocksRemainingInWord64);

	// This inline ASM is tricky, and downright difficult on 32-bit when
	// PIC is in effect. The ASM uses all the general purpose registers
	// and all the MMX registers on 32-bit machines. When PIC is in effect
	// on a 32-bit machine, GCC uses EBX as a base register for the PLT.
	// Saving EBX with 'mov %%ebx, %0' and restoring EBX with 'mov %0, %%ebx'
	// causes GCC to generate 'mov -0x40(%ebx), %ebx' for the restore. That
	// obviously won't work because EBX is no longer valid. We can push and
	// pop EBX, but that breaks the stack-based references. Attempting to
	// sidestep with clobber lists results in "error: ‘asm’ operand has
	// impossible constraints". Eventually, we found we could save EBX to
	// ESP-20, which is one word below our stack in the frame.
#ifdef __GNUC__
	__asm__ __volatile__
	(
# if CRYPTOPP_BOOL_X86
	// Hack. Save EBX for PIC. Do NOT 'push EBX' here.
	// GCC issues 'mov ESP+8, EBX' to load L1KeyLength.
	// A push breaks the reference to L1KeyLength.
	AS2(	mov 	%%ebx, -20(%%esp))
# endif
	// L1KeyLength into EBX.
	// GCC generates 'mov ESP+8, EBX'.
	AS2(	mov 	%0, %%ebx)
	INTEL_NOPREFIX
#else
	#if defined(__INTEL_COMPILER)
	char isFirstBlock = m_isFirstBlock;
	AS2(	mov 	ebx, [L1KeyLength])
	AS2(	mov 	dl, [isFirstBlock])
	#else
	AS2(	mov 	ecx, this)
	AS2(	mov 	ebx, [ecx+m_L1KeyLength])
	AS2(	mov 	dl, [ecx+m_isFirstBlock])
	#endif
	AS2(	mov 	eax, tagPart)
	AS2(	shl 	eax, 4)
	AS2(	mov 	edi, nhK)
	AS2(	add 	edi, eax)
	AS2(	add 	eax, eax)
	AS2(	add 	eax, polyS)

	AS2(	mov 	esi, data)
	AS2(	mov 	ecx, blocksRemainingInWord64)
#endif

	AS2(	shr 	ebx, 3)
	AS_PUSH_IF86(	bp)
	AS2(	sub 	esp, 12)
	ASL(4)
	AS2(	mov 	ebp, ebx)
	AS2(	cmp 	ecx, ebx)
	AS2(	cmovl	ebp, ecx)
	AS2(	sub 	ecx, ebp)
	AS2(	lea 	ebp, [edi+8*ebp])	// end of nhK
	AS2(	movq	mm6, [esi])
	AS2(	paddq	mm6, [edi])
	AS2(	movq	mm5, [esi+8])
	AS2(	paddq	mm5, [edi+8])
	AS2(	add 	esi, 16)
	AS2(	add 	edi, 16)
	AS2(	movq	mm4, mm6)
	ASS(	pshufw	mm2, mm6, 1, 0, 3, 2)
	AS2(	pmuludq	mm6, mm5)
	ASS(	pshufw	mm3, mm5, 1, 0, 3, 2)
	AS2(	pmuludq	mm5, mm2)
	AS2(	pmuludq	mm2, mm3)
	AS2(	pmuludq	mm3, mm4)
	AS2(	pxor	mm7, mm7)
	AS2(	movd	[esp], mm6)
	AS2(	psrlq	mm6, 32)
	AS2(	movd	[esp+4], mm5)
	AS2(	psrlq	mm5, 32)
	AS2(	cmp 	edi, ebp)
	ASJ(	je,  	1, f)
	ASL(0)
	AS2(	movq	mm0, [esi])
	AS2(	paddq	mm0, [edi])
	AS2(	movq	mm1, [esi+8])
	AS2(	paddq	mm1, [edi+8])
	AS2(	add 	esi, 16)
	AS2(	add 	edi, 16)
	AS2(	movq	mm4, mm0)
	AS2(	paddq	mm5, mm2)
	ASS(	pshufw	mm2, mm0, 1, 0, 3, 2)
	AS2(	pmuludq	mm0, mm1)
	AS2(	movd	[esp+8], mm3)
	AS2(	psrlq	mm3, 32)
	AS2(	paddq	mm5, mm3)
	ASS(	pshufw	mm3, mm1, 1, 0, 3, 2)
	AS2(	pmuludq	mm1, mm2)
	AS2(	pmuludq	mm2, mm3)
	AS2(	pmuludq	mm3, mm4)
	AS2(	movd	mm4, [esp])
	AS2(	paddq	mm7, mm4)
	AS2(	movd	mm4, [esp+4])
	AS2(	paddq	mm6, mm4)
	AS2(	movd	mm4, [esp+8])
	AS2(	paddq	mm6, mm4)
	AS2(	movd	[esp], mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	paddq	mm6, mm0)
	AS2(	movd	[esp+4], mm1)
	AS2(	psrlq	mm1, 32)
	AS2(	paddq	mm5, mm1)
	AS2(	cmp 	edi, ebp)
	ASJ(	jne,	0, b)
	ASL(1)
	AS2(	paddq	mm5, mm2)
	AS2(	movd	[esp+8], mm3)
	AS2(	psrlq	mm3, 32)
	AS2(	paddq	mm5, mm3)
	AS2(	movd	mm4, [esp])
	AS2(	paddq	mm7, mm4)
	AS2(	movd	mm4, [esp+4])
	AS2(	paddq	mm6, mm4)
	AS2(	movd	mm4, [esp+8])
	AS2(	paddq	mm6, mm4)
	AS2(	lea 	ebp, [8*ebx])
	AS2(	sub 	edi, ebp)		// reset edi to start of nhK

	AS2(	movd	[esp], mm7)
	AS2(	psrlq	mm7, 32)
	AS2(	paddq	mm6, mm7)
	AS2(	movd	[esp+4], mm6)
	AS2(	psrlq	mm6, 32)
	AS2(	paddq	mm5, mm6)
	AS2(	psllq	mm5, 2)
	AS2(	psrlq	mm5, 2)

#define a0 [eax+2*4]
#define a1 [eax+3*4]
#define a2 [eax+0*4]
#define a3 [eax+1*4]
#define k0 [eax+2*8+2*4]
#define k1 [eax+2*8+3*4]
#define k2 [eax+2*8+0*4]
#define k3 [eax+2*8+1*4]

	AS2(	test	dl, dl)
	ASJ(	jz,  	2, f)
	AS2(	movd	mm1, k0)
	AS2(	movd	mm0, [esp])
	AS2(	paddq	mm0, mm1)
	AS2(	movd	a0, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	movd	mm1, k1)
	AS2(	movd	mm2, [esp+4])
	AS2(	paddq	mm1, mm2)
	AS2(	paddq	mm0, mm1)
	AS2(	movd	a1, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	paddq	mm5, k2)
	AS2(	paddq	mm0, mm5)
	AS2(	movq	a2, mm0)
	AS2(	xor 	edx, edx)
	ASJ(	jmp,	3, f)
	ASL(2)
	AS2(	movd	mm0, a3)
	AS2(	movq	mm4, mm0)
	AS2(	pmuludq	mm0, k3)		// a3*k3
	AS2(	movd	mm1, a0)
	AS2(	pmuludq	mm1, k2)		// a0*k2
	AS2(	movd	mm2, a1)
	AS2(	movd	mm6, k1)
	AS2(	pmuludq	mm2, mm6)		// a1*k1
	AS2(	movd	mm3, a2)
	AS2(	psllq	mm0, 1)
	AS2(	paddq	mm0, mm5)
	AS2(	movq	mm5, mm3)
	AS2(	movd	mm7, k0)
	AS2(	pmuludq	mm3, mm7)		// a2*k0
	AS2(	pmuludq	mm4, mm7)		// a3*k0
	AS2(	pmuludq	mm5, mm6)		// a2*k1
	AS2(	paddq	mm0, mm1)
	AS2(	movd	mm1, a1)
	AS2(	paddq	mm4, mm5)
	AS2(	movq	mm5, mm1)
	AS2(	pmuludq	mm1, k2)		// a1*k2
	AS2(	paddq	mm0, mm2)
	AS2(	movd	mm2, a0)
	AS2(	paddq	mm0, mm3)
	AS2(	movq	mm3, mm2)
	AS2(	pmuludq	mm2, k3)		// a0*k3
	AS2(	pmuludq	mm3, mm7)		// a0*k0
	AS2(	movd	[esp+8], mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	pmuludq	mm7, mm5)		// a1*k0
	AS2(	pmuludq	mm5, k3)		// a1*k3
	AS2(	paddq	mm0, mm1)
	AS2(	movd	mm1, a2)
	AS2(	pmuludq	mm1, k2)		// a2*k2
	AS2(	paddq	mm0, mm2)
	AS2(	paddq	mm0, mm4)
	AS2(	movq	mm4, mm0)
	AS2(	movd	mm2, a3)
	AS2(	pmuludq	mm2, mm6)		// a3*k1
	AS2(	pmuludq	mm6, a0)		// a0*k1
	AS2(	psrlq	mm0, 31)
	AS2(	paddq	mm0, mm3)
	AS2(	movd	mm3, [esp])
	AS2(	paddq	mm0, mm3)
	AS2(	movd	mm3, a2)
	AS2(	pmuludq	mm3, k3)		// a2*k3
	AS2(	paddq	mm5, mm1)
	AS2(	movd	mm1, a3)
	AS2(	pmuludq	mm1, k2)		// a3*k2
	AS2(	paddq	mm5, mm2)
	AS2(	movd	mm2, [esp+4])
	AS2(	psllq	mm5, 1)
	AS2(	paddq	mm0, mm5)
	AS2(	psllq	mm4, 33)
	AS2(	movd	a0, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	paddq	mm6, mm7)
	AS2(	movd	mm7, [esp+8])
	AS2(	paddq	mm0, mm6)
	AS2(	paddq	mm0, mm2)
	AS2(	paddq	mm3, mm1)
	AS2(	psllq	mm3, 1)
	AS2(	paddq	mm0, mm3)
	AS2(	psrlq	mm4, 1)
	AS2(	movd	a1, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	por 	mm4, mm7)
	AS2(	paddq	mm0, mm4)
	AS2(	movq	a2, mm0)

#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3

	ASL(3)
	AS2(	test	ecx, ecx)
	ASJ(	jnz,	4, b)
	AS2(	add 	esp, 12)
	AS_POP_IF86(	bp)
	AS1(	emms)
#ifdef __GNUC__
	ATT_PREFIX
# if CRYPTOPP_BOOL_X86
	// Restore EBX for PIC
	AS2(	mov 	-20(%%esp), %%ebx)
# endif
		:
		: "m" (L1KeyLength), "c" (blocksRemainingInWord64), "S" (data),
		  "D" (nhK+tagPart*2), "d" (m_isFirstBlock), "a" (polyS+tagPart*4)
		: "memory", "cc"
	);
#endif
}
#endif

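// Portable arithmetic used by VHASH_Update_Template() below. Three backends:
// a native 128-bit type when VMAC_BOOL_WORD128 is set, 64-bit code via x64
// inline assembly or MSVC's _umul128, and a pure 32-bit fallback that keeps
// the NH accumulator in three 64-bit limbs (VMAC_BOOL_32BIT).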
#if VMAC_BOOL_WORD128
	#define DeclareNH(a) word128 a=0
	#define MUL64(rh,rl,i1,i2) {word128 p = word128(i1)*(i2); rh = word64(p>>64); rl = word64(p);}
	#define AccumulateNH(a, b, c) a += word128(b)*(c)
	#define Multiply128(r, i1, i2) r = word128(word64(i1)) * word64(i2)
#else
	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_IA64))
		#define MUL32(a, b) __emulu(word32(a), word32(b))
	#else
		#define MUL32(a, b) ((word64)((word32)(a)) * (word32)(b))
	#endif
	#if defined(CRYPTOPP_X64_ASM_AVAILABLE)
		#define DeclareNH(a)			word64 a##0=0, a##1=0
		#define MUL64(rh,rl,i1,i2)		asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "g"(i2) : "cc");
		#define AccumulateNH(a, b, c)	asm ("mulq %3; addq %%rax, %0; adcq %%rdx, %1" : "+r"(a##0), "+r"(a##1) : "a"(b), "g"(c) : "%rdx", "cc");
		#define ADD128(rh,rl,ih,il)     asm ("addq %3, %1; adcq %2, %0" : "+r"(rh),"+r"(rl) : "r"(ih),"r"(il) : "cc");
	#elif defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
		#define DeclareNH(a) word64 a##0=0, a##1=0
		#define MUL64(rh,rl,i1,i2)   (rl) = _umul128(i1,i2,&(rh));
		#define AccumulateNH(a, b, c)	{\
			word64 ph, pl;\
			pl = _umul128(b,c,&ph);\
			a##0 += pl;\
			a##1 += ph + (a##0 < pl);}
	#else
		#define VMAC_BOOL_32BIT 1
		#define DeclareNH(a) word64 a##0=0, a##1=0, a##2=0
		#define MUL64(rh,rl,i1,i2)                                               \
			{   word64 _i1 = (i1), _i2 = (i2);                                 \
				word64 m1= MUL32(_i1,_i2>>32);                                 \
				word64 m2= MUL32(_i1>>32,_i2);                                 \
				rh         = MUL32(_i1>>32,_i2>>32);                             \
				rl         = MUL32(_i1,_i2);                                     \
				ADD128(rh,rl,(m1 >> 32),(m1 << 32));                             \
				ADD128(rh,rl,(m2 >> 32),(m2 << 32));                             \
			}
		#define AccumulateNH(a, b, c)	{\
			word64 p = MUL32(b, c);\
			a##1 += word32((p)>>32);\
			a##0 += word32(p);\
			p = MUL32((b)>>32, c);\
			a##2 += word32((p)>>32);\
			a##1 += word32(p);\
			p = MUL32((b)>>32, (c)>>32);\
			a##2 += p;\
			p = MUL32(b, (c)>>32);\
			a##1 += word32(p);\
			a##2 += word32(p>>32);}
	#endif
#endif
#ifndef VMAC_BOOL_32BIT
	#define VMAC_BOOL_32BIT 0
#endif
#ifndef ADD128
	#define ADD128(rh,rl,ih,il)                                          \
		{   word64 _il = (il);                                         \
			(rl) += (_il);                                               \
			(rh) += (ih) + ((rl) < (_il));                               \
		}
#endif
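
// A small sanity sketch of the macro contracts (illustrative only, compiled
// out): MUL64 forms the full 128-bit product of two 64-bit words, and ADD128
// adds a 128-bit quantity with carry from the low word into the high word.
// The helper name is hypothetical.
#if 0
static void ADD128_MUL64_Sketch()
{
	word64 rh, rl;
	MUL64(rh, rl, m64, W64LIT(2));			// 2*(2^64-1): rh = 1, rl = 0xff...fe
	ADD128(rh, rl, word64(0), word64(2));	// rl wraps to 0; carry makes rh = 2
}
#endif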

template <bool T_128BitTag>
void VMAC_Base::VHASH_Update_Template(const word64 *data, size_t blocksRemainingInWord64)
{
	CRYPTOPP_ASSERT(IsAlignedOn(m_polyState(),GetAlignmentOf<word64>()));
	CRYPTOPP_ASSERT(IsAlignedOn(m_nhKey(),GetAlignmentOf<word64>()));

	#define INNER_LOOP_ITERATION(j)	{\
		word64 d0 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+0]);\
		word64 d1 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+1]);\
		AccumulateNH(nhA, d0+nhK[i+2*j+0], d1+nhK[i+2*j+1]);\
		if (T_128BitTag)\
			AccumulateNH(nhB, d0+nhK[i+2*j+2], d1+nhK[i+2*j+3]);\
		}

	size_t L1KeyLengthInWord64 = m_L1KeyLength / 8;
	size_t innerLoopEnd = L1KeyLengthInWord64;
	const word64 *nhK = m_nhKey();
	word64 *polyS = (word64*)(void*)m_polyState();
	bool isFirstBlock = true;
	size_t i;

	#if !VMAC_BOOL_32BIT
		#if VMAC_BOOL_WORD128
			word128 a1=0, a2=0;
		#else
			word64 ah1=0, al1=0, ah2=0, al2=0;
		#endif
		word64 kh1, kl1, kh2, kl2;
		kh1=(polyS+0*4+2)[0]; kl1=(polyS+0*4+2)[1];
		if (T_128BitTag)
		{
			kh2=(polyS+1*4+2)[0]; kl2=(polyS+1*4+2)[1];
		}
	#endif

	do
	{
		DeclareNH(nhA);
		DeclareNH(nhB);

		i = 0;
		if (blocksRemainingInWord64 < L1KeyLengthInWord64)
		{
			if (blocksRemainingInWord64 % 8)
			{
				innerLoopEnd = blocksRemainingInWord64 % 8;
				for (; i<innerLoopEnd; i+=2)
					INNER_LOOP_ITERATION(0);
			}
			innerLoopEnd = blocksRemainingInWord64;
		}
		for (; i<innerLoopEnd; i+=8)
		{
			INNER_LOOP_ITERATION(0);
			INNER_LOOP_ITERATION(1);
			INNER_LOOP_ITERATION(2);
			INNER_LOOP_ITERATION(3);
		}
		blocksRemainingInWord64 -= innerLoopEnd;
		data += innerLoopEnd;

		#if VMAC_BOOL_32BIT
			word32 nh0[2], nh1[2];
			word64 nh2[2];

			nh0[0] = word32(nhA0);
			nhA1 += (nhA0 >> 32);
			nh1[0] = word32(nhA1);
			nh2[0] = (nhA2 + (nhA1 >> 32)) & m62;

			if (T_128BitTag)
			{
				nh0[1] = word32(nhB0);
				nhB1 += (nhB0 >> 32);
				nh1[1] = word32(nhB1);
				nh2[1] = (nhB2 + (nhB1 >> 32)) & m62;
			}

			#define a0 (((word32 *)(polyS+i*4))[2+NativeByteOrder::ToEnum()])
			#define a1 (*(((word32 *)(polyS+i*4))+3-NativeByteOrder::ToEnum()))		// workaround for GCC 3.2
			#define a2 (((word32 *)(polyS+i*4))[0+NativeByteOrder::ToEnum()])
			#define a3 (*(((word32 *)(polyS+i*4))+1-NativeByteOrder::ToEnum()))
			#define aHi ((polyS+i*4)[0])
			#define k0 (((word32 *)(polyS+i*4+2))[2+NativeByteOrder::ToEnum()])
			#define k1 (*(((word32 *)(polyS+i*4+2))+3-NativeByteOrder::ToEnum()))
			#define k2 (((word32 *)(polyS+i*4+2))[0+NativeByteOrder::ToEnum()])
			#define k3 (*(((word32 *)(polyS+i*4+2))+1-NativeByteOrder::ToEnum()))
			#define kHi ((polyS+i*4+2)[0])

			if (isFirstBlock)
			{
				isFirstBlock = false;
				if (m_isFirstBlock)
				{
					m_isFirstBlock = false;
					for (i=0; i<=(size_t)T_128BitTag; i++)
					{
						word64 t = (word64)nh0[i] + k0;
						a0 = (word32)t;
						t = (t >> 32) + nh1[i] + k1;
						a1 = (word32)t;
						aHi = (t >> 32) + nh2[i] + kHi;
					}
					continue;
				}
			}
			for (i=0; i<=(size_t)T_128BitTag; i++)
			{
				word64 p, t;
				word32 t2;

				p = MUL32(a3, 2*k3);
				p += nh2[i];
				p += MUL32(a0, k2);
				p += MUL32(a1, k1);
				p += MUL32(a2, k0);
				t2 = (word32)p;
				p >>= 32;
				p += MUL32(a0, k3);
				p += MUL32(a1, k2);
				p += MUL32(a2, k1);
				p += MUL32(a3, k0);
				t = (word64(word32(p) & 0x7fffffff) << 32) | t2;
				p >>= 31;
				p += nh0[i];
				p += MUL32(a0, k0);
				p += MUL32(a1, 2*k3);
				p += MUL32(a2, 2*k2);
				p += MUL32(a3, 2*k1);
				t2 = (word32)p;
				p >>= 32;
				p += nh1[i];
				p += MUL32(a0, k1);
				p += MUL32(a1, k0);
				p += MUL32(a2, 2*k3);
				p += MUL32(a3, 2*k2);
				a0 = t2;
				a1 = (word32)p;
				aHi = (p >> 32) + t;
			}

			#undef a0
			#undef a1
			#undef a2
			#undef a3
			#undef aHi
			#undef k0
			#undef k1
			#undef k2
			#undef k3
			#undef kHi
		#else		// #if VMAC_BOOL_32BIT
			if (isFirstBlock)
			{
				isFirstBlock = false;
				if (m_isFirstBlock)
				{
					m_isFirstBlock = false;
					#if VMAC_BOOL_WORD128
						#define first_poly_step(a, kh, kl, m)	a = (m & m126) + ((word128(kh) << 64) | kl)

						first_poly_step(a1, kh1, kl1, nhA);
						if (T_128BitTag)
							first_poly_step(a2, kh2, kl2, nhB);
					#else
						#define first_poly_step(ah, al, kh, kl, mh, ml)		{\
							mh &= m62;\
							ADD128(mh, ml, kh, kl);	\
							ah = mh; al = ml;}

						first_poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
						if (T_128BitTag)
							first_poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
					#endif
					continue;
				}
				else
				{
					#if VMAC_BOOL_WORD128
						a1 = (word128((polyS+0*4)[0]) << 64) | (polyS+0*4)[1];
					#else
						ah1=(polyS+0*4)[0]; al1=(polyS+0*4)[1];
					#endif
					if (T_128BitTag)
					{
						#if VMAC_BOOL_WORD128
							a2 = (word128((polyS+1*4)[0]) << 64) | (polyS+1*4)[1];
						#else
							ah2=(polyS+1*4)[0]; al2=(polyS+1*4)[1];
						#endif
					}
				}
			}

			#if VMAC_BOOL_WORD128
				#define poly_step(a, kh, kl, m)	\
				{   word128 t1, t2, t3, t4;\
					Multiply128(t2, a>>64, kl);\
					Multiply128(t3, a, kh);\
					Multiply128(t1, a, kl);\
					Multiply128(t4, a>>64, 2*kh);\
					t2 += t3;\
					t4 += t1;\
					t2 += t4>>64;\
					a = (word128(word64(t2)&m63) << 64) | word64(t4);\
					t2 *= 2;\
					a += m & m126;\
					a += t2>>64;}

				poly_step(a1, kh1, kl1, nhA);
				if (T_128BitTag)
					poly_step(a2, kh2, kl2, nhB);
			#else
				#define poly_step(ah, al, kh, kl, mh, ml)					\
				{   word64 t1h, t1l, t2h, t2l, t3h, t3l, z=0;				\
					/* compute ab*cd, put bd into result registers */       \
					MUL64(t2h,t2l,ah,kl);                                   \
					MUL64(t3h,t3l,al,kh);                                   \
					MUL64(t1h,t1l,ah,2*kh);                                 \
					MUL64(ah,al,al,kl);                                     \
					/* add together ad + bc */                              \
					ADD128(t2h,t2l,t3h,t3l);                                \
					/* add 2 * ac to result */                              \
					ADD128(ah,al,t1h,t1l);                                  \
					/* now (ah,al), (t2l,2*t2h) need summing */             \
					/* first add the high registers, carrying into t2h */   \
					ADD128(t2h,ah,z,t2l);                                   \
					/* double t2h and add top bit of ah */                  \
					t2h += t2h + (ah >> 63);                                \
					ah &= m63;                                              \
					/* now add the low registers */                         \
					mh &= m62;												\
					ADD128(ah,al,mh,ml);                                    \
					ADD128(ah,al,z,t2h);                                    \
				}

				poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
				if (T_128BitTag)
					poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
			#endif
		#endif		// #if VMAC_BOOL_32BIT
	} while (blocksRemainingInWord64);

	#if VMAC_BOOL_WORD128
		(polyS+0*4)[0]=word64(a1>>64); (polyS+0*4)[1]=word64(a1);
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=word64(a2>>64); (polyS+1*4)[1]=word64(a2);
		}
	#elif !VMAC_BOOL_32BIT
		(polyS+0*4)[0]=ah1; (polyS+0*4)[1]=al1;
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=ah2; (polyS+1*4)[1]=al2;
		}
	#endif
}

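// Dispatch: the SSE2 path computes one tag half per call (tagPart selects it),
// while the portable template computes both halves of a 128-bit tag in one pass.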
inline void VMAC_Base::VHASH_Update(const word64 *data, size_t blocksRemainingInWord64)
{
#if CRYPTOPP_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
	if (HasSSE2())
	{
		VHASH_Update_SSE2(data, blocksRemainingInWord64, 0);
		if (m_is128)
			VHASH_Update_SSE2(data, blocksRemainingInWord64, 1);
		m_isFirstBlock = false;
	}
	else
#endif
	{
		if (m_is128)
			VHASH_Update_Template<true>(data, blocksRemainingInWord64);
		else
			VHASH_Update_Template<false>(data, blocksRemainingInWord64);
	}
}

size_t VMAC_Base::HashMultipleBlocks(const word64 *data, size_t length)
{
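	// length is in bytes; VHASH consumes word64 blocks, hence the divide by 8.
	// Whole L1-key-length chunks are hashed now; the remainder is returned so
	// the base class can buffer it for the next call or for TruncatedFinal().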
	size_t remaining = ModPowerOf2(length, m_L1KeyLength);
	VHASH_Update(data, (length-remaining)/8);
	return remaining;
}

word64 L3Hash(const word64 *input, const word64 *l3Key, size_t len)
{
	word64 rh, rl, t, z=0;
	word64 p1 = input[0], p2 = input[1];
	word64 k1 = l3Key[0], k2 = l3Key[1];

	/* fully reduce (p1,p2)+(len,0) mod p127 */
	t = p1 >> 63;
	p1 &= m63;
	ADD128(p1, p2, len, t);
	/* At this point, (p1,p2) is at most 2^127+(len<<64) */
	t = (p1 > m63) + ((p1 == m63) & (p2 == m64));
	ADD128(p1, p2, z, t);
	p1 &= m63;

	/* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
	t = p1 + (p2 >> 32);
	t += (t >> 32);
	t += (word32)t > 0xfffffffeU;
	p1 += (t >> 32);
	p2 += (p1 << 32);

	/* compute (p1+k1)%p64 and (p2+k2)%p64 */
	p1 += k1;
	p1 += (0 - (p1 < k1)) & 257;
	p2 += k2;
	p2 += (0 - (p2 < k2)) & 257;

	/* compute (p1+k1)*(p2+k2)%p64 */
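	/* the folds below use 2^64 = 257 (mod p64), so the high product word rh
	   is reduced by adding 257*rh back into the low word */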
	MUL64(rh, rl, p1, p2);
	t = rh >> 56;
	ADD128(t, rl, z, rh);
	rh <<= 8;
	ADD128(t, rl, z, rh);
	t += t << 8;
	rl += t;
	rl += (0 - (rl < t)) & 257;
	rl += (0 - (rl > p64-1)) & 257;
	return rl;
}

void VMAC_Base::TruncatedFinal(byte *mac, size_t size)
{
	CRYPTOPP_ASSERT(IsAlignedOn(DataBuf(),GetAlignmentOf<word64>()));
	CRYPTOPP_ASSERT(IsAlignedOn(m_polyState(),GetAlignmentOf<word64>()));
	size_t len = ModPowerOf2(GetBitCountLo()/8, m_L1KeyLength);

	if (len)
	{
		memset(m_data()+len, 0, (0-len)%16);
		VHASH_Update(DataBuf(), ((len+15)/16)*2);
		len *= 8;	// convert to bits
	}
	else if (m_isFirstBlock)
	{
		// special case for empty string
		m_polyState()[0] = m_polyState()[2];
		m_polyState()[1] = m_polyState()[3];
		if (m_is128)
		{
			m_polyState()[4] = m_polyState()[6];
			m_polyState()[5] = m_polyState()[7];
		}
	}

	if (m_is128)
	{
		word64 t[2];
		t[0] = L3Hash(m_polyState(), m_l3Key(), len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad());
		t[1] = L3Hash(m_polyState()+4, m_l3Key()+2, len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad()+8);
		if (size == 16)
		{
			PutWord(false, BIG_ENDIAN_ORDER, mac, t[0]);
			PutWord(false, BIG_ENDIAN_ORDER, mac+8, t[1]);
		}
		else
		{
			t[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[0]);
			t[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[1]);
			memcpy(mac, t, size);
		}
	}
	else
	{
		word64 t = L3Hash(m_polyState(), m_l3Key(), len);
		t += GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad() + (m_nonce()[IVSize()-1]&1) * 8);
		if (size == 8)
			PutWord(false, BIG_ENDIAN_ORDER, mac, t);
		else
		{
			t = ConditionalByteReverse(BIG_ENDIAN_ORDER, t);
			memcpy(mac, &t, size);
		}
	}
}
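
// A minimal tagging sketch (illustrative only, compiled out) continuing the
// keying example above: each message gets a fresh nonce via Resynchronize()
// before the tag is computed. The helper name is hypothetical.
#if 0
static void VMAC_TaggingSketch(VMAC<AES, 64> &mac, const byte *nonce, int nonceLen,
	const byte *message, size_t messageLen, byte tag[8])
{
	mac.Resynchronize(nonce, nonceLen);
	mac.CalculateDigest(tag, message, messageLen);	// Update() + Final()
}
#endif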

NAMESPACE_END