///////////////////////////////////////////////////////////////////////////////
//
/// \file       sha256.c
/// \brief      SHA-256
///
/// \todo       Crypto++ has x86 ASM optimizations. They use SSE so if they
///             are imported to liblzma, SSE instructions need to be used
///             conditionally to keep the code working on older boxes.
//
//  This code is based on the code from 7-Zip, which has a modified
//  version of the SHA-256 found in Crypto++ <http://www.cryptopp.com/>.
//  The code was modified a little to fit into liblzma.
//
//  Authors:    Kevin Springle
//              Wei Dai
//              Igor Pavlov
//              Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#include "check.h"

// Rotate a uint32_t. GCC can optimize this to a rotate instruction
// at least on x86.
static inline uint32_t
rotr_32(uint32_t num, unsigned amount)
{
	return (num >> amount) | (num << (32 - amount));
}

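// blk0() loads word i of the input block, converting it from big endian
// to native byte order. blk2() expands the message schedule in place:
// W[t] = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16], with W[]
// used as a 16-word circular buffer.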
#define blk0(i) (W[i] = conv32be(data[i]))
#define blk2(i) (W[i & 15] += s1(W[(i - 2) & 15]) + W[(i - 7) & 15] \
		+ s0(W[(i - 15) & 15]))

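// Ch() and Maj() are equivalent to the textbook forms
// (x & y) ^ (~x & z) and (x & y) ^ (x & z) ^ (y & z) but need fewer
// operations. The addition in Maj() is safe because the two terms can
// never have a bit set in the same position.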
#define Ch(x, y, z) (z ^ (x & (y ^ z)))
#define Maj(x, y, z) ((x & (y ^ z)) + (y & z))

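// The eight working variables a-h are accessed as a window that rotates
// over T[]: incrementing i renames the variables instead of moving the
// values, so the round macro R() doesn't need to shuffle them around.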
#define a(i) T[(0 - i) & 7]
#define b(i) T[(1 - i) & 7]
#define c(i) T[(2 - i) & 7]
#define d(i) T[(3 - i) & 7]
#define e(i) T[(4 - i) & 7]
#define f(i) T[(5 - i) & 7]
#define g(i) T[(6 - i) & 7]
#define h(i) T[(7 - i) & 7]

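// One SHA-256 round: R0() is used for the first 16 rounds, where the
// input words are loaded with blk0(); R2() is used for the remaining
// rounds, where the schedule is expanded with blk2().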
#define R(i, j, blk) \
	h(i) += S1(e(i)) + Ch(e(i), f(i), g(i)) + SHA256_K[i + j] + blk; \
	d(i) += h(i); \
	h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
#define R0(i) R(i, 0, blk0(i))
#define R2(i) R(i, j, blk2(i))

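// The big sigma (S0, S1) and small sigma (s0, s1) functions of SHA-256.
// Nesting the rotations lets each big sigma be computed with three
// rotates and each small sigma with two rotates and one shift.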
#define S0(x) rotr_32(x ^ rotr_32(x ^ rotr_32(x, 9), 11), 2)
#define S1(x) rotr_32(x ^ rotr_32(x ^ rotr_32(x, 14), 5), 6)
#define s0(x) (rotr_32(x ^ rotr_32(x, 11), 7) ^ (x >> 3))
#define s1(x) (rotr_32(x ^ rotr_32(x, 2), 17) ^ (x >> 10))


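// The SHA-256 round constants: the first 32 bits of the fractional parts
// of the cube roots of the first 64 prime numbers.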
static const uint32_t SHA256_K[64] = {
	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
	0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
	0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
	0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
	0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
	0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
	0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
	0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
	0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
	0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
	0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
	0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
	0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
	0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
	0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
	0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};


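// Process one 64-byte block. data[] must point to the block as sixteen
// aligned 32-bit words; byte order conversion is done by blk0().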
static void
transform(uint32_t state[8], const uint32_t data[16])
{
	uint32_t W[16];
	uint32_t T[8];

	// Copy state[] to working vars.
	memcpy(T, state, sizeof(T));

	// The first 16 operations unrolled
	R0( 0); R0( 1); R0( 2); R0( 3);
	R0( 4); R0( 5); R0( 6); R0( 7);
	R0( 8); R0( 9); R0(10); R0(11);
	R0(12); R0(13); R0(14); R0(15);

	// The remaining 48 operations partially unrolled
	for (unsigned int j = 16; j < 64; j += 16) {
		R2( 0); R2( 1); R2( 2); R2( 3);
		R2( 4); R2( 5); R2( 6); R2( 7);
		R2( 8); R2( 9); R2(10); R2(11);
		R2(12); R2(13); R2(14); R2(15);
	}

	// Add the working vars back into state[].
	state[0] += a(0);
	state[1] += b(0);
	state[2] += c(0);
	state[3] += d(0);
	state[4] += e(0);
	state[5] += f(0);
	state[6] += g(0);
	state[7] += h(0);
}


static void
process(lzma_check_state *check)
{
	transform(check->state.sha256.state, check->buffer.u32);
	return;
}


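// Initialize the SHA-256 state. The initial hash values are the first
// 32 bits of the fractional parts of the square roots of the first
// eight prime numbers.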
extern void
lzma_sha256_init(lzma_check_state *check)
{
	static const uint32_t s[8] = {
		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
	};

	memcpy(check->state.sha256.state, s, sizeof(s));
	check->state.sha256.size = 0;

	return;
}


extern void
lzma_sha256_update(const uint8_t *buf, size_t size, lzma_check_state *check)
{
	// Copy the input data into a properly aligned temporary buffer.
	// This way we can be called with arbitrarily sized buffers
	// (the size doesn't need to be a multiple of 64 bytes), and the
	// code also works on architectures that don't allow unaligned
	// memory access.
	while (size > 0) {
		const size_t copy_start = check->state.sha256.size & 0x3F;
		size_t copy_size = 64 - copy_start;
		if (copy_size > size)
			copy_size = size;

		memcpy(check->buffer.u8 + copy_start, buf, copy_size);

		buf += copy_size;
		size -= copy_size;
		check->state.sha256.size += copy_size;

		if ((check->state.sha256.size & 0x3F) == 0)
			process(check);
	}

	return;
}


extern void
lzma_sha256_finish(lzma_check_state *check)
{
	// Add padding as described in RFC 3174 (it describes SHA-1 but
	// the same padding style is used for SHA-256 too).
	size_t pos = check->state.sha256.size & 0x3F;
	check->buffer.u8[pos++] = 0x80;

	while (pos != 64 - 8) {
		if (pos == 64) {
			process(check);
			pos = 0;
		}

		check->buffer.u8[pos++] = 0x00;
	}

	// Convert the message size from bytes to bits.
	check->state.sha256.size *= 8;

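	// Store the length in bits at the end of the final block as
	// a 64-bit big endian integer.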
	check->buffer.u64[(64 - 8) / 8] = conv64be(check->state.sha256.size);

	process(check);

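	// Write the resulting big endian hash value to check->buffer.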
	for (size_t i = 0; i < 8; ++i)
		check->buffer.u32[i] = conv32be(check->state.sha256.state[i]);

	return;
}