1 // sosemanuk.cpp - originally written and placed in the public domain by Wei Dai
2 
3 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #if CRYPTOPP_MSC_VERSION
9 # pragma warning(disable: 4702 4731)
10 #endif
11 
12 #ifndef CRYPTOPP_GENERATE_X64_MASM
13 
14 #include "sosemanuk.h"
15 #include "serpentp.h"
16 #include "secblock.h"
17 #include "misc.h"
18 #include "cpu.h"
19 
NAMESPACE_BEGIN(CryptoPP)20 NAMESPACE_BEGIN(CryptoPP)
21 
22 std::string SosemanukPolicy::AlgorithmProvider() const
23 {
24 #ifndef CRYPTOPP_DISABLE_SOSEMANUK_ASM
25 # if CRYPTOPP_SSE2_ASM_AVAILABLE
26 	if (HasSSE2())
27 		return "SSE2";
28 # endif
29 #endif
30 	return "C++";
31 }
32 
CipherSetKey(const NameValuePairs & params,const byte * userKey,size_t keylen)33 void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
34 {
35 	CRYPTOPP_UNUSED(params);
36 	Serpent_KeySchedule(m_key, 24, userKey, keylen);
37 }
38 
// Initializes the 12-word cipher state m_state from an IV.
//
// The IV is read as four little-endian 32-bit words and pushed through three
// passes of eight key-mix / S-box / linear-transform rounds (24 rounds total)
// using the subkeys in m_key. Intermediate values are captured into m_state
// after the 12th round, after the 18th round, and after the final key mix.
//
//   keystreamBuffer - unused.
//   iv              - IV bytes; four little-endian word32 values are read from it.
//   length          - IV length in bytes; checked against 16 by CRYPTOPP_ASSERT only.
//
// NOTE(review): beforeS0/afterSn, KX, S0..S7 and LT are macros that are not defined
// in this file (presumably serpentp.h). They refer to the locals a, b, c, d, e and k
// by name, so those locals must not be renamed or reordered.
CipherResynchronize(byte * keystreamBuffer,const byte * iv,size_t length)39 void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv, size_t length)
40 {
	// iv and length are marked unused although iv is read below and length is
	// used by the assertion; the marking only matters where the assertion
	// compiles to nothing.
41 	CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(iv), CRYPTOPP_UNUSED(length);
42 	CRYPTOPP_ASSERT(length==16);
43 
	// Working variables of the round macros; e is the extra temporary they rotate through.
44 	word32 a, b, c, d, e;
45 
	// Load the IV as four little-endian words into a, b, c, d.
46 	typedef BlockGetAndPut<word32, LittleEndian> Block;
47 	Block::Get(iv)(a)(b)(c)(d);
48 
	// k walks the subkey array; i counts the passes (1..3) of eight rounds each.
49 	const word32 *k = m_key;
50 	unsigned int i=1;
51 
52 	do
53 	{
54 		beforeS0(KX); beforeS0(S0); afterS0(LT);
55 		afterS0(KX); afterS0(S1); afterS1(LT);
56 		if (i == 3)	// after 18th round
57 		{
			// Captured into state words 4, 5, 10 and 11.
58 			m_state[4] = b;
59 			m_state[5] = e;
60 			m_state[10] = c;
61 			m_state[11] = a;
62 		}
63 		afterS1(KX); afterS1(S2); afterS2(LT);
64 		afterS2(KX); afterS2(S3); afterS3(LT);
65 		if (i == 2)	// after 12th round
66 		{
			// Captured into state words 6..9.
67 			m_state[6] = c;
68 			m_state[7] = d;
69 			m_state[8] = b;
70 			m_state[9] = e;
71 		}
72 		afterS3(KX); afterS3(S4); afterS4(LT);
73 		afterS4(KX); afterS4(S5); afterS5(LT);
74 		afterS5(KX); afterS5(S6); afterS6(LT);
75 		afterS6(KX); afterS6(S7); afterS7(LT);
76 
		// The third pass is the last one; leave before the variables are re-mapped.
77 		if (i == 3)
78 			break;
79 
		// Re-map the working variables for the next pass. Order matters: e is
		// overwritten with d before it is copied into a, so a receives the old d.
80 		++i;
81 		c = b;
82 		b = e;
83 		e = d;
84 		d = a;
85 		a = e;
		// Advance to the next 32 subkey words (one pass consumes eight KX steps).
86 		k += 32;
87 	}
88 	while (true);
89 
	// Final key mix after the 24th round, then capture state words 0..3.
90 	afterS7(KX);
91 
92 	m_state[0] = a;
93 	m_state[1] = b;
94 	m_state[2] = e;
95 	m_state[3] = d;
96 
	// XMUX(c, x, y) yields x when bit 0 of c is clear and x ^ y when it is set.
	// The macro is not #undef'd, so it remains visible to the STEP macro in
	// OperateKeystream further down the file.
97 #define XMUX(c, x, y)   (x ^ (y & (0 - (c & 1))))
	// One update of state words 10 and 11: word 11 is incremented by the muxed
	// value, then word 10 is multiplied by 0x54655307 and rotated left by 7.
98 	m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
99 	m_state[10] = rotlConstant<7>(m_state[10] * 0x54655307);
100 }
101 
// Lookup tables for the MUL_A / DIV_A macros in OperateKeystream; the assembly
// path addresses the same array through si (second half at byte offset +1024).
//
//   entries   0..255 - indexed by MUL_A.
//   entries 256..511 - indexed by DIV_A (note its "256 +" offset).
//
// The first half comes in two variants, selected by the same condition that
// selects the definition of MUL_A: the x86/x32/x64 variant pairs with the
// rotate-based MUL_A, the other with the shift-based MUL_A. Spot checks
// (entries 1-4 and 255) show the x86 entries equal the generic ones XORed with
// their own index.
//
// NOTE(review): extern "C" presumably keeps the symbol name unmangled so the
// assembly can reference it by name -- confirm. The array is not declared const.
102 extern "C" {
103 word32 s_sosemanukMulTables[512] = {
104 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
105 	0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836,
106 	0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E,
107 	0x05A7DC90, 0xE4381382, 0x6E30EBB4, 0x8FAF24A6,
108 	0xD320B2D8, 0x32BF7DCA, 0xB8B785FC, 0x59284AEE,
109 	0x0AE71189, 0xEB78DE9B, 0x617026AD, 0x80EFE9BF,
110 	0xDC607FC1, 0x3DFFB0D3, 0xB7F748E5, 0x566887F7,
111 	0x0F40CD19, 0xEEDF020B, 0x64D7FA3D, 0x8548352F,
112 	0xD9C7A351, 0x38586C43, 0xB2509475, 0x53CF5B67,
113 	0x146722BB, 0xF5F8EDA9, 0x7FF0159F, 0x9E6FDA8D,
114 	0xC2E04CF3, 0x237F83E1, 0xA9777BD7, 0x48E8B4C5,
115 	0x11C0FE2B, 0xF05F3139, 0x7A57C90F, 0x9BC8061D,
116 	0xC7479063, 0x26D85F71, 0xACD0A747, 0x4D4F6855,
117 	0x1E803332, 0xFF1FFC20, 0x75170416, 0x9488CB04,
118 	0xC8075D7A, 0x29989268, 0xA3906A5E, 0x420FA54C,
119 	0x1B27EFA2, 0xFAB820B0, 0x70B0D886, 0x912F1794,
120 	0xCDA081EA, 0x2C3F4EF8, 0xA637B6CE, 0x47A879DC,
121 	0x28CE44DF, 0xC9518BCD, 0x435973FB, 0xA2C6BCE9,
122 	0xFE492A97, 0x1FD6E585, 0x95DE1DB3, 0x7441D2A1,
123 	0x2D69984F, 0xCCF6575D, 0x46FEAF6B, 0xA7616079,
124 	0xFBEEF607, 0x1A713915, 0x9079C123, 0x71E60E31,
125 	0x22295556, 0xC3B69A44, 0x49BE6272, 0xA821AD60,
126 	0xF4AE3B1E, 0x1531F40C, 0x9F390C3A, 0x7EA6C328,
127 	0x278E89C6, 0xC61146D4, 0x4C19BEE2, 0xAD8671F0,
128 	0xF109E78E, 0x1096289C, 0x9A9ED0AA, 0x7B011FB8,
129 	0x3CA96664, 0xDD36A976, 0x573E5140, 0xB6A19E52,
130 	0xEA2E082C, 0x0BB1C73E, 0x81B93F08, 0x6026F01A,
131 	0x390EBAF4, 0xD89175E6, 0x52998DD0, 0xB30642C2,
132 	0xEF89D4BC, 0x0E161BAE, 0x841EE398, 0x65812C8A,
133 	0x364E77ED, 0xD7D1B8FF, 0x5DD940C9, 0xBC468FDB,
134 	0xE0C919A5, 0x0156D6B7, 0x8B5E2E81, 0x6AC1E193,
135 	0x33E9AB7D, 0xD276646F, 0x587E9C59, 0xB9E1534B,
136 	0xE56EC535, 0x04F10A27, 0x8EF9F211, 0x6F663D03,
137 	0x50358817, 0xB1AA4705, 0x3BA2BF33, 0xDA3D7021,
138 	0x86B2E65F, 0x672D294D, 0xED25D17B, 0x0CBA1E69,
139 	0x55925487, 0xB40D9B95, 0x3E0563A3, 0xDF9AACB1,
140 	0x83153ACF, 0x628AF5DD, 0xE8820DEB, 0x091DC2F9,
141 	0x5AD2999E, 0xBB4D568C, 0x3145AEBA, 0xD0DA61A8,
142 	0x8C55F7D6, 0x6DCA38C4, 0xE7C2C0F2, 0x065D0FE0,
143 	0x5F75450E, 0xBEEA8A1C, 0x34E2722A, 0xD57DBD38,
144 	0x89F22B46, 0x686DE454, 0xE2651C62, 0x03FAD370,
145 	0x4452AAAC, 0xA5CD65BE, 0x2FC59D88, 0xCE5A529A,
146 	0x92D5C4E4, 0x734A0BF6, 0xF942F3C0, 0x18DD3CD2,
147 	0x41F5763C, 0xA06AB92E, 0x2A624118, 0xCBFD8E0A,
148 	0x97721874, 0x76EDD766, 0xFCE52F50, 0x1D7AE042,
149 	0x4EB5BB25, 0xAF2A7437, 0x25228C01, 0xC4BD4313,
150 	0x9832D56D, 0x79AD1A7F, 0xF3A5E249, 0x123A2D5B,
151 	0x4B1267B5, 0xAA8DA8A7, 0x20855091, 0xC11A9F83,
152 	0x9D9509FD, 0x7C0AC6EF, 0xF6023ED9, 0x179DF1CB,
153 	0x78FBCCC8, 0x996403DA, 0x136CFBEC, 0xF2F334FE,
154 	0xAE7CA280, 0x4FE36D92, 0xC5EB95A4, 0x24745AB6,
155 	0x7D5C1058, 0x9CC3DF4A, 0x16CB277C, 0xF754E86E,
156 	0xABDB7E10, 0x4A44B102, 0xC04C4934, 0x21D38626,
157 	0x721CDD41, 0x93831253, 0x198BEA65, 0xF8142577,
158 	0xA49BB309, 0x45047C1B, 0xCF0C842D, 0x2E934B3F,
159 	0x77BB01D1, 0x9624CEC3, 0x1C2C36F5, 0xFDB3F9E7,
160 	0xA13C6F99, 0x40A3A08B, 0xCAAB58BD, 0x2B3497AF,
161 	0x6C9CEE73, 0x8D032161, 0x070BD957, 0xE6941645,
162 	0xBA1B803B, 0x5B844F29, 0xD18CB71F, 0x3013780D,
163 	0x693B32E3, 0x88A4FDF1, 0x02AC05C7, 0xE333CAD5,
164 	0xBFBC5CAB, 0x5E2393B9, 0xD42B6B8F, 0x35B4A49D,
165 	0x667BFFFA, 0x87E430E8, 0x0DECC8DE, 0xEC7307CC,
166 	0xB0FC91B2, 0x51635EA0, 0xDB6BA696, 0x3AF46984,
167 	0x63DC236A, 0x8243EC78, 0x084B144E, 0xE9D4DB5C,
168 	0xB55B4D22, 0x54C48230, 0xDECC7A06, 0x3F53B514,
169 #else
170 	0x00000000, 0xE19FCF13, 0x6B973726, 0x8A08F835,
171 	0xD6876E4C, 0x3718A15F, 0xBD10596A, 0x5C8F9679,
172 	0x05A7DC98, 0xE438138B, 0x6E30EBBE, 0x8FAF24AD,
173 	0xD320B2D4, 0x32BF7DC7, 0xB8B785F2, 0x59284AE1,
174 	0x0AE71199, 0xEB78DE8A, 0x617026BF, 0x80EFE9AC,
175 	0xDC607FD5, 0x3DFFB0C6, 0xB7F748F3, 0x566887E0,
176 	0x0F40CD01, 0xEEDF0212, 0x64D7FA27, 0x85483534,
177 	0xD9C7A34D, 0x38586C5E, 0xB250946B, 0x53CF5B78,
178 	0x1467229B, 0xF5F8ED88, 0x7FF015BD, 0x9E6FDAAE,
179 	0xC2E04CD7, 0x237F83C4, 0xA9777BF1, 0x48E8B4E2,
180 	0x11C0FE03, 0xF05F3110, 0x7A57C925, 0x9BC80636,
181 	0xC747904F, 0x26D85F5C, 0xACD0A769, 0x4D4F687A,
182 	0x1E803302, 0xFF1FFC11, 0x75170424, 0x9488CB37,
183 	0xC8075D4E, 0x2998925D, 0xA3906A68, 0x420FA57B,
184 	0x1B27EF9A, 0xFAB82089, 0x70B0D8BC, 0x912F17AF,
185 	0xCDA081D6, 0x2C3F4EC5, 0xA637B6F0, 0x47A879E3,
186 	0x28CE449F, 0xC9518B8C, 0x435973B9, 0xA2C6BCAA,
187 	0xFE492AD3, 0x1FD6E5C0, 0x95DE1DF5, 0x7441D2E6,
188 	0x2D699807, 0xCCF65714, 0x46FEAF21, 0xA7616032,
189 	0xFBEEF64B, 0x1A713958, 0x9079C16D, 0x71E60E7E,
190 	0x22295506, 0xC3B69A15, 0x49BE6220, 0xA821AD33,
191 	0xF4AE3B4A, 0x1531F459, 0x9F390C6C, 0x7EA6C37F,
192 	0x278E899E, 0xC611468D, 0x4C19BEB8, 0xAD8671AB,
193 	0xF109E7D2, 0x109628C1, 0x9A9ED0F4, 0x7B011FE7,
194 	0x3CA96604, 0xDD36A917, 0x573E5122, 0xB6A19E31,
195 	0xEA2E0848, 0x0BB1C75B, 0x81B93F6E, 0x6026F07D,
196 	0x390EBA9C, 0xD891758F, 0x52998DBA, 0xB30642A9,
197 	0xEF89D4D0, 0x0E161BC3, 0x841EE3F6, 0x65812CE5,
198 	0x364E779D, 0xD7D1B88E, 0x5DD940BB, 0xBC468FA8,
199 	0xE0C919D1, 0x0156D6C2, 0x8B5E2EF7, 0x6AC1E1E4,
200 	0x33E9AB05, 0xD2766416, 0x587E9C23, 0xB9E15330,
201 	0xE56EC549, 0x04F10A5A, 0x8EF9F26F, 0x6F663D7C,
202 	0x50358897, 0xB1AA4784, 0x3BA2BFB1, 0xDA3D70A2,
203 	0x86B2E6DB, 0x672D29C8, 0xED25D1FD, 0x0CBA1EEE,
204 	0x5592540F, 0xB40D9B1C, 0x3E056329, 0xDF9AAC3A,
205 	0x83153A43, 0x628AF550, 0xE8820D65, 0x091DC276,
206 	0x5AD2990E, 0xBB4D561D, 0x3145AE28, 0xD0DA613B,
207 	0x8C55F742, 0x6DCA3851, 0xE7C2C064, 0x065D0F77,
208 	0x5F754596, 0xBEEA8A85, 0x34E272B0, 0xD57DBDA3,
209 	0x89F22BDA, 0x686DE4C9, 0xE2651CFC, 0x03FAD3EF,
210 	0x4452AA0C, 0xA5CD651F, 0x2FC59D2A, 0xCE5A5239,
211 	0x92D5C440, 0x734A0B53, 0xF942F366, 0x18DD3C75,
212 	0x41F57694, 0xA06AB987, 0x2A6241B2, 0xCBFD8EA1,
213 	0x977218D8, 0x76EDD7CB, 0xFCE52FFE, 0x1D7AE0ED,
214 	0x4EB5BB95, 0xAF2A7486, 0x25228CB3, 0xC4BD43A0,
215 	0x9832D5D9, 0x79AD1ACA, 0xF3A5E2FF, 0x123A2DEC,
216 	0x4B12670D, 0xAA8DA81E, 0x2085502B, 0xC11A9F38,
217 	0x9D950941, 0x7C0AC652, 0xF6023E67, 0x179DF174,
218 	0x78FBCC08, 0x9964031B, 0x136CFB2E, 0xF2F3343D,
219 	0xAE7CA244, 0x4FE36D57, 0xC5EB9562, 0x24745A71,
220 	0x7D5C1090, 0x9CC3DF83, 0x16CB27B6, 0xF754E8A5,
221 	0xABDB7EDC, 0x4A44B1CF, 0xC04C49FA, 0x21D386E9,
222 	0x721CDD91, 0x93831282, 0x198BEAB7, 0xF81425A4,
223 	0xA49BB3DD, 0x45047CCE, 0xCF0C84FB, 0x2E934BE8,
224 	0x77BB0109, 0x9624CE1A, 0x1C2C362F, 0xFDB3F93C,
225 	0xA13C6F45, 0x40A3A056, 0xCAAB5863, 0x2B349770,
226 	0x6C9CEE93, 0x8D032180, 0x070BD9B5, 0xE69416A6,
227 	0xBA1B80DF, 0x5B844FCC, 0xD18CB7F9, 0x301378EA,
228 	0x693B320B, 0x88A4FD18, 0x02AC052D, 0xE333CA3E,
229 	0xBFBC5C47, 0x5E239354, 0xD42B6B61, 0x35B4A472,
230 	0x667BFF0A, 0x87E43019, 0x0DECC82C, 0xEC73073F,
231 	0xB0FC9146, 0x51635E55, 0xDB6BA660, 0x3AF46973,
232 	0x63DC2392, 0x8243EC81, 0x084B14B4, 0xE9D4DBA7,
233 	0xB55B4DDE, 0x54C482CD, 0xDECC7AF8, 0x3F53B5EB,
234 #endif
235 	0x00000000, 0x180F40CD, 0x301E8033, 0x2811C0FE,
236 	0x603CA966, 0x7833E9AB, 0x50222955, 0x482D6998,
237 	0xC078FBCC, 0xD877BB01, 0xF0667BFF, 0xE8693B32,
238 	0xA04452AA, 0xB84B1267, 0x905AD299, 0x88559254,
239 	0x29F05F31, 0x31FF1FFC, 0x19EEDF02, 0x01E19FCF,
240 	0x49CCF657, 0x51C3B69A, 0x79D27664, 0x61DD36A9,
241 	0xE988A4FD, 0xF187E430, 0xD99624CE, 0xC1996403,
242 	0x89B40D9B, 0x91BB4D56, 0xB9AA8DA8, 0xA1A5CD65,
243 	0x5249BE62, 0x4A46FEAF, 0x62573E51, 0x7A587E9C,
244 	0x32751704, 0x2A7A57C9, 0x026B9737, 0x1A64D7FA,
245 	0x923145AE, 0x8A3E0563, 0xA22FC59D, 0xBA208550,
246 	0xF20DECC8, 0xEA02AC05, 0xC2136CFB, 0xDA1C2C36,
247 	0x7BB9E153, 0x63B6A19E, 0x4BA76160, 0x53A821AD,
248 	0x1B854835, 0x038A08F8, 0x2B9BC806, 0x339488CB,
249 	0xBBC11A9F, 0xA3CE5A52, 0x8BDF9AAC, 0x93D0DA61,
250 	0xDBFDB3F9, 0xC3F2F334, 0xEBE333CA, 0xF3EC7307,
251 	0xA492D5C4, 0xBC9D9509, 0x948C55F7, 0x8C83153A,
252 	0xC4AE7CA2, 0xDCA13C6F, 0xF4B0FC91, 0xECBFBC5C,
253 	0x64EA2E08, 0x7CE56EC5, 0x54F4AE3B, 0x4CFBEEF6,
254 	0x04D6876E, 0x1CD9C7A3, 0x34C8075D, 0x2CC74790,
255 	0x8D628AF5, 0x956DCA38, 0xBD7C0AC6, 0xA5734A0B,
256 	0xED5E2393, 0xF551635E, 0xDD40A3A0, 0xC54FE36D,
257 	0x4D1A7139, 0x551531F4, 0x7D04F10A, 0x650BB1C7,
258 	0x2D26D85F, 0x35299892, 0x1D38586C, 0x053718A1,
259 	0xF6DB6BA6, 0xEED42B6B, 0xC6C5EB95, 0xDECAAB58,
260 	0x96E7C2C0, 0x8EE8820D, 0xA6F942F3, 0xBEF6023E,
261 	0x36A3906A, 0x2EACD0A7, 0x06BD1059, 0x1EB25094,
262 	0x569F390C, 0x4E9079C1, 0x6681B93F, 0x7E8EF9F2,
263 	0xDF2B3497, 0xC724745A, 0xEF35B4A4, 0xF73AF469,
264 	0xBF179DF1, 0xA718DD3C, 0x8F091DC2, 0x97065D0F,
265 	0x1F53CF5B, 0x075C8F96, 0x2F4D4F68, 0x37420FA5,
266 	0x7F6F663D, 0x676026F0, 0x4F71E60E, 0x577EA6C3,
267 	0xE18D0321, 0xF98243EC, 0xD1938312, 0xC99CC3DF,
268 	0x81B1AA47, 0x99BEEA8A, 0xB1AF2A74, 0xA9A06AB9,
269 	0x21F5F8ED, 0x39FAB820, 0x11EB78DE, 0x09E43813,
270 	0x41C9518B, 0x59C61146, 0x71D7D1B8, 0x69D89175,
271 	0xC87D5C10, 0xD0721CDD, 0xF863DC23, 0xE06C9CEE,
272 	0xA841F576, 0xB04EB5BB, 0x985F7545, 0x80503588,
273 	0x0805A7DC, 0x100AE711, 0x381B27EF, 0x20146722,
274 	0x68390EBA, 0x70364E77, 0x58278E89, 0x4028CE44,
275 	0xB3C4BD43, 0xABCBFD8E, 0x83DA3D70, 0x9BD57DBD,
276 	0xD3F81425, 0xCBF754E8, 0xE3E69416, 0xFBE9D4DB,
277 	0x73BC468F, 0x6BB30642, 0x43A2C6BC, 0x5BAD8671,
278 	0x1380EFE9, 0x0B8FAF24, 0x239E6FDA, 0x3B912F17,
279 	0x9A34E272, 0x823BA2BF, 0xAA2A6241, 0xB225228C,
280 	0xFA084B14, 0xE2070BD9, 0xCA16CB27, 0xD2198BEA,
281 	0x5A4C19BE, 0x42435973, 0x6A52998D, 0x725DD940,
282 	0x3A70B0D8, 0x227FF015, 0x0A6E30EB, 0x12617026,
283 	0x451FD6E5, 0x5D109628, 0x750156D6, 0x6D0E161B,
284 	0x25237F83, 0x3D2C3F4E, 0x153DFFB0, 0x0D32BF7D,
285 	0x85672D29, 0x9D686DE4, 0xB579AD1A, 0xAD76EDD7,
286 	0xE55B844F, 0xFD54C482, 0xD545047C, 0xCD4A44B1,
287 	0x6CEF89D4, 0x74E0C919, 0x5CF109E7, 0x44FE492A,
288 	0x0CD320B2, 0x14DC607F, 0x3CCDA081, 0x24C2E04C,
289 	0xAC977218, 0xB49832D5, 0x9C89F22B, 0x8486B2E6,
290 	0xCCABDB7E, 0xD4A49BB3, 0xFCB55B4D, 0xE4BA1B80,
291 	0x17566887, 0x0F59284A, 0x2748E8B4, 0x3F47A879,
292 	0x776AC1E1, 0x6F65812C, 0x477441D2, 0x5F7B011F,
293 	0xD72E934B, 0xCF21D386, 0xE7301378, 0xFF3F53B5,
294 	0xB7123A2D, 0xAF1D7AE0, 0x870CBA1E, 0x9F03FAD3,
295 	0x3EA637B6, 0x26A9777B, 0x0EB8B785, 0x16B7F748,
296 	0x5E9A9ED0, 0x4695DE1D, 0x6E841EE3, 0x768B5E2E,
297 	0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
298 	0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
299 };
300 }
301 
302 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
GetAlignment() const303 unsigned int SosemanukPolicy::GetAlignment() const
304 {
305 #if CRYPTOPP_SSE2_ASM_AVAILABLE
306 #ifdef __INTEL_COMPILER
307 	if (HasSSE2() && !IsP4())	// Intel compiler produces faster code for this algorithm on the P4
308 #else
309 	if (HasSSE2())
310 #endif
311 		return 16;
312 	else
313 #endif
314 		return GetAlignmentOf<word32>();
315 }
316 
GetOptimalBlockSize() const317 unsigned int SosemanukPolicy::GetOptimalBlockSize() const
318 {
319 #if CRYPTOPP_SSE2_ASM_AVAILABLE
320 #ifdef __INTEL_COMPILER
321 	if (HasSSE2() && !IsP4())	// Intel compiler produces faster code for this algorithm on the P4
322 #else
323 	if (HasSSE2())
324 #endif
325 		return 4*BYTES_PER_ITERATION;
326 	else
327 #endif
328 		return BYTES_PER_ITERATION;
329 }
330 #endif
331 
332 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
333 extern "C" {
334 void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state);
335 }
336 #endif
337 
// Produces iterationCount keystream blocks of 20 words (80 bytes) each and advances m_state.
//
// Three code paths are selected by the preprocessor and at run time:
//   1. CRYPTOPP_X64_MASM_AVAILABLE: forwards to the external Sosemanuk_OperateKeystream
//      routine and returns.
//   2. CRYPTOPP_SSE2_ASM_AVAILABLE without CRYPTOPP_DISABLE_SOSEMANUK_ASM: the assembly
//      below, taken when HasSSE2() is true (and, with the Intel compiler, not on a P4).
//   3. Otherwise the portable C++ loop at the end of the function.
// When this file is preprocessed with CRYPTOPP_GENERATE_X64_MASM only the assembly
// section is emitted, wrapped in PROC/ENDP, which yields the routine used by path 1.
//
//   operation      - not referenced directly in this body; presumably consumed inside
//                    CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH (defined elsewhere) -- confirm.
//   output         - destination buffer.
//   input          - data combined with the keystream; the assembly tail path tests it
//                    for null and then stores the keystream words unmodified.
//   iterationCount - number of 20-word blocks.
//
// NOTE(review): the assembly loops and the C++ do/while are all bottom-tested, so a
// count of zero does not appear to be supported; callers are assumed to pass >= 1.
OperateKeystream(KeystreamOperation operation,byte * output,const byte * input,size_t iterationCount)338 void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
339 {
340 #endif	// #ifdef CRYPTOPP_GENERATE_X64_MASM
341 
// Path 1: hand everything to the separately assembled x64 routine.
342 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
343 	Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data());
344 	return;
345 #endif
346 
// Path 2: SSE2 assembly (inline, or emitted as a MASM procedure).
347 #if CRYPTOPP_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
348 #ifdef CRYPTOPP_GENERATE_X64_MASM
349 		ALIGN   8
350 	Sosemanuk_OperateKeystream	PROC FRAME
351 		rex_push_reg rsi
352 		push_reg rdi
353 		alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8)
354 		save_xmm128 xmm6, 02f0h
355 		save_xmm128 xmm7, 0300h
356 		.endprolog
357 		mov		rdi, r8
358 		mov		rax, r9
359 #else
360 #ifdef __INTEL_COMPILER
361 	if (HasSSE2() && !IsP4())	// Intel compiler produces faster code for this algorithm on the P4
362 #else
363 	if (HasSSE2())
364 #endif
365 	{
366 #ifdef __GNUC__
367 	#if CRYPTOPP_BOOL_X64
368 		FixedSizeAlignedSecBlock<byte, 80*4*2+12*4+8*WORD_SZ> workspace;
369 	#endif
370 		__asm__ __volatile__
371 		(
372 		INTEL_NOPREFIX
373 		AS_PUSH_IF86(	bx)
374 #else
375 		word32 *state = m_state;
376 		AS2(	mov		WORD_REG(ax), state)
377 		AS2(	mov		WORD_REG(di), output)
378 		AS2(	mov		WORD_REG(dx), input)
379 		AS2(	mov		WORD_REG(cx), iterationCount)
380 #endif
381 #endif	// #ifdef CRYPTOPP_GENERATE_X64_MASM
382 
// Base of the scratch area: asm operand %5 (workspace.data()) for GCC on x64,
// the stack pointer otherwise.
383 #if defined(__GNUC__) && CRYPTOPP_BOOL_X64
384 	#define SSE2_workspace %5
385 #else
386 	#define SSE2_workspace WORD_REG(sp)
387 #endif
388 
// Scratch layout: eight word-sized slots (slot 0 is where the 32-bit x86 prologue
// below stores the original esp, slots 1..7 are the named locals), then a 12-word
// copy of the state, then the u and v arrays of 80 words each.
389 #define SSE2_output			WORD_PTR [SSE2_workspace+1*WORD_SZ]
390 #define SSE2_input			WORD_PTR [SSE2_workspace+2*WORD_SZ]
391 #define SSE2_wordsLeft		WORD_PTR [SSE2_workspace+3*WORD_SZ]
392 #define SSE2_diEnd			WORD_PTR [SSE2_workspace+4*WORD_SZ]
393 #define SSE2_pMulTables		WORD_PTR [SSE2_workspace+5*WORD_SZ]
394 #define SSE2_state			WORD_PTR [SSE2_workspace+6*WORD_SZ]
395 #define SSE2_wordsLeft2		WORD_PTR [SSE2_workspace+7*WORD_SZ]
396 #define SSE2_stateCopy		SSE2_workspace + 8*WORD_SZ
397 #define	SSE2_uvStart		SSE2_stateCopy + 12*4
398 
399 #if (CRYPTOPP_BOOL_X86) && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
400 		AS_PUSH_IF86(	bp)
401 		AS2(	mov		AS_REG_6, esp)
402 		AS2(	and		esp, -16)
403 		AS2(	sub		esp, 80*4*2+12*4+8*WORD_SZ)	// 80 v's, 80 u's, 12 state, 8 locals
404 		AS2(	mov		[esp], AS_REG_6)
405 #endif
406 		AS2(	mov		SSE2_output, WORD_REG(di))
407 		AS2(	mov		SSE2_input, WORD_REG(dx))
408 		AS2(	mov		SSE2_state, WORD_REG(ax))
409 #ifndef _MSC_VER
410 		AS2(	mov		SSE2_pMulTables, WORD_REG(si))
411 #endif
		// cx = 5*iterationCount, si = 4*cx = 20*iterationCount = total words to produce
412 		AS2(	lea		WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
413 		AS2(	lea		WORD_REG(si), [4*WORD_REG(cx)])
414 		AS2(	mov		SSE2_wordsLeft, WORD_REG(si))
415 		AS2(	movdqa	xmm0, [WORD_REG(ax)+0*16])		// copy state to stack to save a register
416 		AS2(	movdqa	[SSE2_stateCopy+0*16], xmm0)
417 		AS2(	movdqa	xmm0, [WORD_REG(ax)+1*16])
418 		AS2(	movdqa	[SSE2_stateCopy+1*16], xmm0)
419 		AS2(	movq	xmm0, QWORD PTR [WORD_REG(ax)+2*16])
420 		AS2(	movq	QWORD PTR [SSE2_stateCopy+2*16], xmm0)
421 		AS2(	psrlq	xmm0, 32)
422 		AS2(	movd	AS_REG_6d, xmm0)				// s(9)
		// state words 10 and 11 live in ecx / edx for the whole routine
423 		AS2(	mov		ecx, [WORD_REG(ax)+10*4])
424 		AS2(	mov		edx, [WORD_REG(ax)+11*4])
425 		AS2(	pcmpeqb	xmm7, xmm7)				// all ones
426 
// s(i): word i (mod 10) of the stack copy of the state.
// u(j), v(j): slot for value j in the u / v arrays; the index is transposed to
// (j mod 4)*20 + j/4 so that the second inner loop can fetch four values of the
// same residue class with one 128-bit load. v sits 80 words after u.
427 #define s(i)	SSE2_stateCopy + ASM_MOD(i,10)*4
428 #define u(j)	WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
429 #define v(j)	WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
430 
// R1j / R2j name the registers holding the two extra state words; for j = 1 the
// two registers swap roles, which is why SSE2_STEP is invoked with j alternating 0, 1.
431 #define R10 ecx
432 #define R11 edx
433 #define R20 edx
434 #define R21 ecx
435 // workaround bug in GAS 2.15
436 #define R20r WORD_REG(dx)
437 #define R21r WORD_REG(cx)
438 
// One cipher step in assembly: stores v(i) and u(i), updates s(i) using both halves
// of the table at si, and updates R1j / R2j (multiply by 0x54655307, rotate left 7).
// NOTE: the macro body runs through the blank line after its last backslash.
439 #define SSE2_STEP(i, j)	\
440 	AS2(	mov		eax, [s(i+0)])\
441 	AS2(	mov		[v(i)], eax)\
442 	AS2(	rol		eax, 8)\
443 	AS2(	lea		AS_REG_7, [AS_REG_6 + R2##j##r])\
444 	AS2(	xor		AS_REG_7d, R1##j)\
445 	AS2(	mov		[u(i)], AS_REG_7d)\
446 	AS2(	mov		AS_REG_7d, 1)\
447 	AS2(	and		AS_REG_7d, R2##j)\
448 	AS1(	neg		AS_REG_7d)\
449 	AS2(	and		AS_REG_7d, AS_REG_6d)\
450 	AS2(	xor		AS_REG_6d, eax)\
451 	AS2(	movzx	eax, al)\
452 	AS2(	xor		AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\
453 	AS2(	mov		eax, [s(i+3)])\
454 	AS2(	xor		AS_REG_7d, [s(i+2)])\
455 	AS2(	add		R1##j, AS_REG_7d)\
456 	AS2(	movzx	AS_REG_7d, al)\
457 	AS2(	shr		eax, 8)\
458 	AS2(	xor		AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\
459 	AS2(	xor		AS_REG_6d, eax)\
460 	AS2(	imul	R2##j, AS_HEX(54655307))\
461 	AS2(	rol		R2##j, 7)\
462 	AS2(	mov		[s(i+0)], AS_REG_6d)\
463 
464 		ASL(2)	// outer loop, each iteration of this processes 80 words
465 		AS2(	lea		WORD_REG(di), [SSE2_uvStart])	// start of v and u
		// si = min(words left, 80) -> wordsLeft2
466 		AS2(	mov		WORD_REG(ax), 80)
467 		AS2(	cmp		WORD_REG(si), 80)
468 		AS2(	cmovg	WORD_REG(si), WORD_REG(ax))
469 		AS2(	mov		SSE2_wordsLeft2, WORD_REG(si))
470 		AS2(	lea		WORD_REG(si), [WORD_REG(di)+WORD_REG(si)])		// use to end first inner loop
471 		AS2(	mov		SSE2_diEnd, WORD_REG(si))
		// si = address of the lookup tables for SSE2_STEP
472 #ifdef _MSC_VER
473 		AS2(	lea		WORD_REG(si), s_sosemanukMulTables)
474 #else
475 		AS2(	mov		WORD_REG(si), SSE2_pMulTables)
476 #endif
477 
478 		ASL(0)	// first inner loop, 20 words each, 4 iterations
479 		SSE2_STEP(0, 0)
480 		SSE2_STEP(1, 1)
481 		SSE2_STEP(2, 0)
482 		SSE2_STEP(3, 1)
483 		SSE2_STEP(4, 0)
484 		SSE2_STEP(5, 1)
485 		SSE2_STEP(6, 0)
486 		SSE2_STEP(7, 1)
487 		SSE2_STEP(8, 0)
488 		SSE2_STEP(9, 1)
489 		SSE2_STEP(10, 0)
490 		SSE2_STEP(11, 1)
491 		SSE2_STEP(12, 0)
492 		SSE2_STEP(13, 1)
493 		SSE2_STEP(14, 0)
494 		SSE2_STEP(15, 1)
495 		SSE2_STEP(16, 0)
496 		SSE2_STEP(17, 1)
497 		SSE2_STEP(18, 0)
498 		SSE2_STEP(19, 1)
499 		// loop
500 		AS2(	add		WORD_REG(di), 5*4)
501 		AS2(	cmp		WORD_REG(di), SSE2_diEnd)
502 		ASJ(	jne,	0, b)
503 
		// reload the I/O pointers and rewind di for the output pass
504 		AS2(	mov		WORD_REG(ax), SSE2_input)
505 		AS2(	mov		AS_REG_7, SSE2_output)
506 		AS2(	lea		WORD_REG(di), [SSE2_uvStart])		// start of v and u
507 		AS2(	mov		WORD_REG(si), SSE2_wordsLeft2)
508 
509 		ASL(1)	// second inner loop, 16 words each, 5 iterations
510 		AS2(	movdqa	xmm0, [WORD_REG(di)+0*20*4])
511 		AS2(	movdqa	xmm2, [WORD_REG(di)+2*20*4])
512 		AS2(	movdqa	xmm3, [WORD_REG(di)+3*20*4])
513 		AS2(	movdqa	xmm1, [WORD_REG(di)+1*20*4])
514 		// S2
515 		AS2(	movdqa	xmm4, xmm0)
516 		AS2(	pand	xmm0, xmm2)
517 		AS2(    pxor	xmm0, xmm3)
518 		AS2(    pxor	xmm2, xmm1)
519  		AS2(	pxor	xmm2, xmm0)
520  		AS2(	por		xmm3, xmm4)
521  		AS2(	pxor	xmm3, xmm1)
522  		AS2(	pxor	xmm4, xmm2)
523  		AS2(	movdqa	xmm1, xmm3)
524  		AS2(	por		xmm3, xmm4)
525  		AS2(	pxor	xmm3, xmm0)
526  		AS2(	pand	xmm0, xmm1)
527  		AS2(	pxor	xmm4, xmm0)
528  		AS2(	pxor	xmm1, xmm3)
529  		AS2(	pxor	xmm1, xmm4)
530 		AS2(	pxor	xmm4, xmm7)
531 		// xor with v
532 		AS2(	pxor	xmm2, [WORD_REG(di)+80*4])
533 		AS2(	pxor	xmm3, [WORD_REG(di)+80*5])
534 		AS2(	pxor	xmm1, [WORD_REG(di)+80*6])
535 		AS2(	pxor	xmm4, [WORD_REG(di)+80*7])
536 		// exit loop early if less than 16 words left to output
537 		// this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
538 		AS2(	cmp		WORD_REG(si), 16)
539 		ASJ(	jl,		4, f)
540 		// unpack
541 		AS2(	movdqa		xmm6, xmm2)
542 		AS2(	punpckldq	xmm2, xmm3)
543 		AS2(	movdqa		xmm5, xmm1)
544 		AS2(	punpckldq	xmm1, xmm4)
545 		AS2(	movdqa		xmm0, xmm2)
546 		AS2(	punpcklqdq	xmm2, xmm1)
547 		AS2(	punpckhqdq	xmm0, xmm1)
548 		AS2(	punpckhdq	xmm6, xmm3)
549 		AS2(	punpckhdq	xmm5, xmm4)
550 		AS2(	movdqa		xmm3, xmm6)
551 		AS2(	punpcklqdq	xmm6, xmm5)
552 		AS2(	punpckhqdq	xmm3, xmm5)
553 
554 		// output keystream
555 		AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4)
556 
557 		// loop
558 		AS2(	add		WORD_REG(di), 4*4)
559 		AS2(	sub		WORD_REG(si), 16)
560 		ASJ(	jnz,	1, b)
561 
562 		// outer loop
563 		AS2(	mov		WORD_REG(si), SSE2_wordsLeft)
564 		AS2(	sub		WORD_REG(si), 80)
565 		ASJ(	jz,		6, f)
566 		AS2(	mov		SSE2_wordsLeft, WORD_REG(si))
567 		AS2(	mov		SSE2_input, WORD_REG(ax))
568 		AS2(	mov		SSE2_output, AS_REG_7)
569 		ASJ(	jmp,	2, b)
570 
571 		ASL(4)	// final output of less than 16 words
		// a null input pointer skips the xor and stores the keystream as is
572 		AS2(	test	WORD_REG(ax), WORD_REG(ax))
573 		ASJ(	jz,		5, f)
574 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+0*4])
575 		AS2(	pxor	xmm2, xmm0)
576 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+1*4])
577 		AS2(	pxor	xmm3, xmm0)
578 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+2*4])
579 		AS2(	pxor	xmm1, xmm0)
580 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+3*4])
581 		AS2(	pxor	xmm4, xmm0)
582 		AS2(	add		WORD_REG(ax), 16)
583 		ASL(5)
		// store four words, then shift the next lane down for the following round
584 		AS2(	movd	dword ptr [AS_REG_7+0*4], xmm2)
585 		AS2(	movd	dword ptr [AS_REG_7+1*4], xmm3)
586 		AS2(	movd	dword ptr [AS_REG_7+2*4], xmm1)
587 		AS2(	movd	dword ptr [AS_REG_7+3*4], xmm4)
588 		AS2(	sub		WORD_REG(si), 4)
589 		ASJ(	jz,		6, f)
590 		AS2(	add		AS_REG_7, 16)
591 		AS2(	psrldq	xmm2, 4)
592 		AS2(	psrldq	xmm3, 4)
593 		AS2(	psrldq	xmm1, 4)
594 		AS2(	psrldq	xmm4, 4)
595 		ASJ(	jmp,	4, b)
596 
597 		ASL(6)	// save state
598 		AS2(	mov		AS_REG_6, SSE2_state)
599 		AS2(	movdqa	xmm0, [SSE2_stateCopy+0*16])
600 		AS2(	movdqa	[AS_REG_6+0*16], xmm0)
601 		AS2(	movdqa	xmm0, [SSE2_stateCopy+1*16])
602 		AS2(	movdqa	[AS_REG_6+1*16], xmm0)
603 		AS2(	movq	xmm0, QWORD PTR [SSE2_stateCopy+2*16])
604 		AS2(	movq	QWORD PTR [AS_REG_6+2*16], xmm0)
605 		AS2(	mov		[AS_REG_6+10*4], ecx)
606 		AS2(	mov		[AS_REG_6+11*4], edx)
607 
608 		AS_POP_IF86(	sp)
609 		AS_POP_IF86(	bp)
610 
611 #ifdef __GNUC__
612 		AS_POP_IF86(	bx)
613 		ATT_PREFIX
614 			:
615 			: "a" (m_state.data()), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input)
616 	#if CRYPTOPP_BOOL_X64
617 			, "r" (workspace.data())
618 			: "memory", "cc", "%r9", "%r10", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
619 	#else
620 			: "memory", "cc"
621 	#endif
622 		);
623 #endif
624 #ifdef CRYPTOPP_GENERATE_X64_MASM
625 	movdqa	xmm6, [rsp + 02f0h]
626 	movdqa	xmm7, [rsp + 0300h]
627 	add		rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8
628 	pop		rdi
629 	pop		rsi
630 	ret
631 	Sosemanuk_OperateKeystream ENDP
632 #else
633 	}
634 	else
635 #endif
636 #endif
// Path 3: portable C++ implementation (also the else-branch of the HasSSE2() test above).
637 #ifndef CRYPTOPP_GENERATE_X64_MASM
638 	{
// MUL_A / DIV_A look up the first / second half of s_sosemanukMulTables. The
// x86 form of MUL_A rotates its argument in place (it assigns to x), so it must
// be given a scratch variable; STEP passes t.
639 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
640 #define MUL_A(x)    (x = (rotlConstant<8>(x)), x ^ s_sosemanukMulTables[byte(x)])
641 #else
642 #define MUL_A(x)    (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24])
643 #endif
644 
645 #define DIV_A(x)    (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)])
646 
// r1 / r2 select reg1 / reg2, swapping roles on odd step indices.
647 #define r1(i) ((i%2) ? reg2 : reg1)
648 #define r2(i) ((i%2) ? reg1 : reg2)
649 
// One cipher step: u receives the step output word, v the outgoing state word
// s<x0>; then s<x0> and the two registers are updated. Only x0, x2, x3 and x9 are
// used by the body; the other index parameters are unused.
// NOTE: the macro body runs through the blank line after its last backslash.
650 #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u)	\
651 		u = (s##x9 + r2(x0)) ^ r1(x0);\
652 		t = v = s##x0;\
653 		s##x0 = MUL_A(t) ^ DIV_A(s##x3) ^ s##x9;\
654 		r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
655 		r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
656 
// Emits four output words; note the u2, u3, u1, u4 ordering after S2.
657 #define SOSEMANUK_OUTPUT(x)	\
658 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
659 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, u3 ^ v1);\
660 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, u1 ^ v2);\
661 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, u4 ^ v3);
662 
// Applies S2 (a macro defined elsewhere) to u0..u3 with u4 as the extra variable,
// then emits 4*4 bytes.
663 #define OUTPUT4	\
664 	S2(0, u0, u1, u2, u3, u4);\
665 	CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SOSEMANUK_OUTPUT, 4*4);
666 
	// Work on local copies: s0..s9 are state words 0..9, reg1/reg2 are words 10/11.
667 	word32 s0 = m_state[0];
668 	word32 s1 = m_state[1];
669 	word32 s2 = m_state[2];
670 	word32 s3 = m_state[3];
671 	word32 s4 = m_state[4];
672 	word32 s5 = m_state[5];
673 	word32 s6 = m_state[6];
674 	word32 s7 = m_state[7];
675 	word32 s8 = m_state[8];
676 	word32 s9 = m_state[9];
677 	word32 reg1 = m_state[10];
678 	word32 reg2 = m_state[11];
679 	word32 t, u0, u1, u2, u3, u4, v0, v1, v2, v3;
680 
	// Each pass performs 20 steps and 5 outputs of 4 words; the index lists
	// rotate by one per step, wrapping around twice so the locals end up in
	// their original positions.
681 	do
682 	{
683 		STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v0, u0)
684 		STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v1, u1)
685 		STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v2, u2)
686 		STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v3, u3)
687 		OUTPUT4
688 		STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v0, u0)
689 		STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v1, u1)
690 		STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v2, u2)
691 		STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v3, u3)
692 		OUTPUT4
693 		STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v0, u0)
694 		STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v1, u1)
695 		STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v2, u2)
696 		STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v3, u3)
697 		OUTPUT4
698 		STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v0, u0)
699 		STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v1, u1)
700 		STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v2, u2)
701 		STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v3, u3)
702 		OUTPUT4
703 		STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v0, u0)
704 		STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v1, u1)
705 		STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v2, u2)
706 		STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v3, u3)
707 		OUTPUT4
708 	}
709 	while (--iterationCount);
710 
	// Write the advanced state back.
711 	m_state[0] = s0;
712 	m_state[1] = s1;
713 	m_state[2] = s2;
714 	m_state[3] = s3;
715 	m_state[4] = s4;
716 	m_state[5] = s5;
717 	m_state[6] = s6;
718 	m_state[7] = s7;
719 	m_state[8] = s8;
720 	m_state[9] = s9;
721 	m_state[10] = reg1;
722 	m_state[11] = reg2;
723 	}
724 }
725 
726 NAMESPACE_END
727 
728 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
729