// sosemanuk.cpp - written and placed in the public domain by Wei Dai
2 
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code
4 
5 #include "pch.h"
6 #include "config.h"
7 
8 #if CRYPTOPP_MSC_VERSION
9 # pragma warning(disable: 4702 4731)
10 #endif
11 
12 #ifndef CRYPTOPP_GENERATE_X64_MASM
13 
14 #include "sosemanuk.h"
15 #include "serpentp.h"
16 #include "secblock.h"
17 #include "misc.h"
18 #include "cpu.h"
19 
NAMESPACE_BEGIN(CryptoPP)
21 
22 void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
23 {
24 	CRYPTOPP_UNUSED(params);
25 	Serpent_KeySchedule(m_key, 24, userKey, keylen);
26 }
27 
CipherResynchronize(byte * keystreamBuffer,const byte * iv,size_t length)28 void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv, size_t length)
29 {
30 	CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(iv), CRYPTOPP_UNUSED(length);
31 	CRYPTOPP_ASSERT(length==16);
32 
33 	word32 a, b, c, d, e;
34 
35 	typedef BlockGetAndPut<word32, LittleEndian> Block;
36 	Block::Get(iv)(a)(b)(c)(d);
37 
38 	const word32 *k = m_key;
39 	unsigned int i=1;
40 
41 	do
42 	{
43 		beforeS0(KX); beforeS0(S0); afterS0(LT);
44 		afterS0(KX); afterS0(S1); afterS1(LT);
45 		if (i == 3)	// after 18th round
46 		{
47 			m_state[4] = b;
48 			m_state[5] = e;
49 			m_state[10] = c;
50 			m_state[11] = a;
51 		}
52 		afterS1(KX); afterS1(S2); afterS2(LT);
53 		afterS2(KX); afterS2(S3); afterS3(LT);
54 		if (i == 2)	// after 12th round
55 		{
56 			m_state[6] = c;
57 			m_state[7] = d;
58 			m_state[8] = b;
59 			m_state[9] = e;
60 		}
61 		afterS3(KX); afterS3(S4); afterS4(LT);
62 		afterS4(KX); afterS4(S5); afterS5(LT);
63 		afterS5(KX); afterS5(S6); afterS6(LT);
64 		afterS6(KX); afterS6(S7); afterS7(LT);
65 
66 		if (i == 3)
67 			break;
68 
69 		++i;
70 		c = b;
71 		b = e;
72 		e = d;
73 		d = a;
74 		a = e;
75 		k += 32;
76 	}
77 	while (true);
78 
79 	afterS7(KX);
80 
81 	m_state[0] = a;
82 	m_state[1] = b;
83 	m_state[2] = e;
84 	m_state[3] = d;
85 
86 #define XMUX(c, x, y)   (x ^ (y & (0 - (c & 1))))
87 	m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
88 	m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
89 }
90 
91 extern "C" {
92 word32 s_sosemanukMulTables[512] = {
93 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
94 	0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836,
95 	0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E,
96 	0x05A7DC90, 0xE4381382, 0x6E30EBB4, 0x8FAF24A6,
97 	0xD320B2D8, 0x32BF7DCA, 0xB8B785FC, 0x59284AEE,
98 	0x0AE71189, 0xEB78DE9B, 0x617026AD, 0x80EFE9BF,
99 	0xDC607FC1, 0x3DFFB0D3, 0xB7F748E5, 0x566887F7,
100 	0x0F40CD19, 0xEEDF020B, 0x64D7FA3D, 0x8548352F,
101 	0xD9C7A351, 0x38586C43, 0xB2509475, 0x53CF5B67,
102 	0x146722BB, 0xF5F8EDA9, 0x7FF0159F, 0x9E6FDA8D,
103 	0xC2E04CF3, 0x237F83E1, 0xA9777BD7, 0x48E8B4C5,
104 	0x11C0FE2B, 0xF05F3139, 0x7A57C90F, 0x9BC8061D,
105 	0xC7479063, 0x26D85F71, 0xACD0A747, 0x4D4F6855,
106 	0x1E803332, 0xFF1FFC20, 0x75170416, 0x9488CB04,
107 	0xC8075D7A, 0x29989268, 0xA3906A5E, 0x420FA54C,
108 	0x1B27EFA2, 0xFAB820B0, 0x70B0D886, 0x912F1794,
109 	0xCDA081EA, 0x2C3F4EF8, 0xA637B6CE, 0x47A879DC,
110 	0x28CE44DF, 0xC9518BCD, 0x435973FB, 0xA2C6BCE9,
111 	0xFE492A97, 0x1FD6E585, 0x95DE1DB3, 0x7441D2A1,
112 	0x2D69984F, 0xCCF6575D, 0x46FEAF6B, 0xA7616079,
113 	0xFBEEF607, 0x1A713915, 0x9079C123, 0x71E60E31,
114 	0x22295556, 0xC3B69A44, 0x49BE6272, 0xA821AD60,
115 	0xF4AE3B1E, 0x1531F40C, 0x9F390C3A, 0x7EA6C328,
116 	0x278E89C6, 0xC61146D4, 0x4C19BEE2, 0xAD8671F0,
117 	0xF109E78E, 0x1096289C, 0x9A9ED0AA, 0x7B011FB8,
118 	0x3CA96664, 0xDD36A976, 0x573E5140, 0xB6A19E52,
119 	0xEA2E082C, 0x0BB1C73E, 0x81B93F08, 0x6026F01A,
120 	0x390EBAF4, 0xD89175E6, 0x52998DD0, 0xB30642C2,
121 	0xEF89D4BC, 0x0E161BAE, 0x841EE398, 0x65812C8A,
122 	0x364E77ED, 0xD7D1B8FF, 0x5DD940C9, 0xBC468FDB,
123 	0xE0C919A5, 0x0156D6B7, 0x8B5E2E81, 0x6AC1E193,
124 	0x33E9AB7D, 0xD276646F, 0x587E9C59, 0xB9E1534B,
125 	0xE56EC535, 0x04F10A27, 0x8EF9F211, 0x6F663D03,
126 	0x50358817, 0xB1AA4705, 0x3BA2BF33, 0xDA3D7021,
127 	0x86B2E65F, 0x672D294D, 0xED25D17B, 0x0CBA1E69,
128 	0x55925487, 0xB40D9B95, 0x3E0563A3, 0xDF9AACB1,
129 	0x83153ACF, 0x628AF5DD, 0xE8820DEB, 0x091DC2F9,
130 	0x5AD2999E, 0xBB4D568C, 0x3145AEBA, 0xD0DA61A8,
131 	0x8C55F7D6, 0x6DCA38C4, 0xE7C2C0F2, 0x065D0FE0,
132 	0x5F75450E, 0xBEEA8A1C, 0x34E2722A, 0xD57DBD38,
133 	0x89F22B46, 0x686DE454, 0xE2651C62, 0x03FAD370,
134 	0x4452AAAC, 0xA5CD65BE, 0x2FC59D88, 0xCE5A529A,
135 	0x92D5C4E4, 0x734A0BF6, 0xF942F3C0, 0x18DD3CD2,
136 	0x41F5763C, 0xA06AB92E, 0x2A624118, 0xCBFD8E0A,
137 	0x97721874, 0x76EDD766, 0xFCE52F50, 0x1D7AE042,
138 	0x4EB5BB25, 0xAF2A7437, 0x25228C01, 0xC4BD4313,
139 	0x9832D56D, 0x79AD1A7F, 0xF3A5E249, 0x123A2D5B,
140 	0x4B1267B5, 0xAA8DA8A7, 0x20855091, 0xC11A9F83,
141 	0x9D9509FD, 0x7C0AC6EF, 0xF6023ED9, 0x179DF1CB,
142 	0x78FBCCC8, 0x996403DA, 0x136CFBEC, 0xF2F334FE,
143 	0xAE7CA280, 0x4FE36D92, 0xC5EB95A4, 0x24745AB6,
144 	0x7D5C1058, 0x9CC3DF4A, 0x16CB277C, 0xF754E86E,
145 	0xABDB7E10, 0x4A44B102, 0xC04C4934, 0x21D38626,
146 	0x721CDD41, 0x93831253, 0x198BEA65, 0xF8142577,
147 	0xA49BB309, 0x45047C1B, 0xCF0C842D, 0x2E934B3F,
148 	0x77BB01D1, 0x9624CEC3, 0x1C2C36F5, 0xFDB3F9E7,
149 	0xA13C6F99, 0x40A3A08B, 0xCAAB58BD, 0x2B3497AF,
150 	0x6C9CEE73, 0x8D032161, 0x070BD957, 0xE6941645,
151 	0xBA1B803B, 0x5B844F29, 0xD18CB71F, 0x3013780D,
152 	0x693B32E3, 0x88A4FDF1, 0x02AC05C7, 0xE333CAD5,
153 	0xBFBC5CAB, 0x5E2393B9, 0xD42B6B8F, 0x35B4A49D,
154 	0x667BFFFA, 0x87E430E8, 0x0DECC8DE, 0xEC7307CC,
155 	0xB0FC91B2, 0x51635EA0, 0xDB6BA696, 0x3AF46984,
156 	0x63DC236A, 0x8243EC78, 0x084B144E, 0xE9D4DB5C,
157 	0xB55B4D22, 0x54C48230, 0xDECC7A06, 0x3F53B514,
158 #else
159 	0x00000000, 0xE19FCF13, 0x6B973726, 0x8A08F835,
160 	0xD6876E4C, 0x3718A15F, 0xBD10596A, 0x5C8F9679,
161 	0x05A7DC98, 0xE438138B, 0x6E30EBBE, 0x8FAF24AD,
162 	0xD320B2D4, 0x32BF7DC7, 0xB8B785F2, 0x59284AE1,
163 	0x0AE71199, 0xEB78DE8A, 0x617026BF, 0x80EFE9AC,
164 	0xDC607FD5, 0x3DFFB0C6, 0xB7F748F3, 0x566887E0,
165 	0x0F40CD01, 0xEEDF0212, 0x64D7FA27, 0x85483534,
166 	0xD9C7A34D, 0x38586C5E, 0xB250946B, 0x53CF5B78,
167 	0x1467229B, 0xF5F8ED88, 0x7FF015BD, 0x9E6FDAAE,
168 	0xC2E04CD7, 0x237F83C4, 0xA9777BF1, 0x48E8B4E2,
169 	0x11C0FE03, 0xF05F3110, 0x7A57C925, 0x9BC80636,
170 	0xC747904F, 0x26D85F5C, 0xACD0A769, 0x4D4F687A,
171 	0x1E803302, 0xFF1FFC11, 0x75170424, 0x9488CB37,
172 	0xC8075D4E, 0x2998925D, 0xA3906A68, 0x420FA57B,
173 	0x1B27EF9A, 0xFAB82089, 0x70B0D8BC, 0x912F17AF,
174 	0xCDA081D6, 0x2C3F4EC5, 0xA637B6F0, 0x47A879E3,
175 	0x28CE449F, 0xC9518B8C, 0x435973B9, 0xA2C6BCAA,
176 	0xFE492AD3, 0x1FD6E5C0, 0x95DE1DF5, 0x7441D2E6,
177 	0x2D699807, 0xCCF65714, 0x46FEAF21, 0xA7616032,
178 	0xFBEEF64B, 0x1A713958, 0x9079C16D, 0x71E60E7E,
179 	0x22295506, 0xC3B69A15, 0x49BE6220, 0xA821AD33,
180 	0xF4AE3B4A, 0x1531F459, 0x9F390C6C, 0x7EA6C37F,
181 	0x278E899E, 0xC611468D, 0x4C19BEB8, 0xAD8671AB,
182 	0xF109E7D2, 0x109628C1, 0x9A9ED0F4, 0x7B011FE7,
183 	0x3CA96604, 0xDD36A917, 0x573E5122, 0xB6A19E31,
184 	0xEA2E0848, 0x0BB1C75B, 0x81B93F6E, 0x6026F07D,
185 	0x390EBA9C, 0xD891758F, 0x52998DBA, 0xB30642A9,
186 	0xEF89D4D0, 0x0E161BC3, 0x841EE3F6, 0x65812CE5,
187 	0x364E779D, 0xD7D1B88E, 0x5DD940BB, 0xBC468FA8,
188 	0xE0C919D1, 0x0156D6C2, 0x8B5E2EF7, 0x6AC1E1E4,
189 	0x33E9AB05, 0xD2766416, 0x587E9C23, 0xB9E15330,
190 	0xE56EC549, 0x04F10A5A, 0x8EF9F26F, 0x6F663D7C,
191 	0x50358897, 0xB1AA4784, 0x3BA2BFB1, 0xDA3D70A2,
192 	0x86B2E6DB, 0x672D29C8, 0xED25D1FD, 0x0CBA1EEE,
193 	0x5592540F, 0xB40D9B1C, 0x3E056329, 0xDF9AAC3A,
194 	0x83153A43, 0x628AF550, 0xE8820D65, 0x091DC276,
195 	0x5AD2990E, 0xBB4D561D, 0x3145AE28, 0xD0DA613B,
196 	0x8C55F742, 0x6DCA3851, 0xE7C2C064, 0x065D0F77,
197 	0x5F754596, 0xBEEA8A85, 0x34E272B0, 0xD57DBDA3,
198 	0x89F22BDA, 0x686DE4C9, 0xE2651CFC, 0x03FAD3EF,
199 	0x4452AA0C, 0xA5CD651F, 0x2FC59D2A, 0xCE5A5239,
200 	0x92D5C440, 0x734A0B53, 0xF942F366, 0x18DD3C75,
201 	0x41F57694, 0xA06AB987, 0x2A6241B2, 0xCBFD8EA1,
202 	0x977218D8, 0x76EDD7CB, 0xFCE52FFE, 0x1D7AE0ED,
203 	0x4EB5BB95, 0xAF2A7486, 0x25228CB3, 0xC4BD43A0,
204 	0x9832D5D9, 0x79AD1ACA, 0xF3A5E2FF, 0x123A2DEC,
205 	0x4B12670D, 0xAA8DA81E, 0x2085502B, 0xC11A9F38,
206 	0x9D950941, 0x7C0AC652, 0xF6023E67, 0x179DF174,
207 	0x78FBCC08, 0x9964031B, 0x136CFB2E, 0xF2F3343D,
208 	0xAE7CA244, 0x4FE36D57, 0xC5EB9562, 0x24745A71,
209 	0x7D5C1090, 0x9CC3DF83, 0x16CB27B6, 0xF754E8A5,
210 	0xABDB7EDC, 0x4A44B1CF, 0xC04C49FA, 0x21D386E9,
211 	0x721CDD91, 0x93831282, 0x198BEAB7, 0xF81425A4,
212 	0xA49BB3DD, 0x45047CCE, 0xCF0C84FB, 0x2E934BE8,
213 	0x77BB0109, 0x9624CE1A, 0x1C2C362F, 0xFDB3F93C,
214 	0xA13C6F45, 0x40A3A056, 0xCAAB5863, 0x2B349770,
215 	0x6C9CEE93, 0x8D032180, 0x070BD9B5, 0xE69416A6,
216 	0xBA1B80DF, 0x5B844FCC, 0xD18CB7F9, 0x301378EA,
217 	0x693B320B, 0x88A4FD18, 0x02AC052D, 0xE333CA3E,
218 	0xBFBC5C47, 0x5E239354, 0xD42B6B61, 0x35B4A472,
219 	0x667BFF0A, 0x87E43019, 0x0DECC82C, 0xEC73073F,
220 	0xB0FC9146, 0x51635E55, 0xDB6BA660, 0x3AF46973,
221 	0x63DC2392, 0x8243EC81, 0x084B14B4, 0xE9D4DBA7,
222 	0xB55B4DDE, 0x54C482CD, 0xDECC7AF8, 0x3F53B5EB,
223 #endif
224 	0x00000000, 0x180F40CD, 0x301E8033, 0x2811C0FE,
225 	0x603CA966, 0x7833E9AB, 0x50222955, 0x482D6998,
226 	0xC078FBCC, 0xD877BB01, 0xF0667BFF, 0xE8693B32,
227 	0xA04452AA, 0xB84B1267, 0x905AD299, 0x88559254,
228 	0x29F05F31, 0x31FF1FFC, 0x19EEDF02, 0x01E19FCF,
229 	0x49CCF657, 0x51C3B69A, 0x79D27664, 0x61DD36A9,
230 	0xE988A4FD, 0xF187E430, 0xD99624CE, 0xC1996403,
231 	0x89B40D9B, 0x91BB4D56, 0xB9AA8DA8, 0xA1A5CD65,
232 	0x5249BE62, 0x4A46FEAF, 0x62573E51, 0x7A587E9C,
233 	0x32751704, 0x2A7A57C9, 0x026B9737, 0x1A64D7FA,
234 	0x923145AE, 0x8A3E0563, 0xA22FC59D, 0xBA208550,
235 	0xF20DECC8, 0xEA02AC05, 0xC2136CFB, 0xDA1C2C36,
236 	0x7BB9E153, 0x63B6A19E, 0x4BA76160, 0x53A821AD,
237 	0x1B854835, 0x038A08F8, 0x2B9BC806, 0x339488CB,
238 	0xBBC11A9F, 0xA3CE5A52, 0x8BDF9AAC, 0x93D0DA61,
239 	0xDBFDB3F9, 0xC3F2F334, 0xEBE333CA, 0xF3EC7307,
240 	0xA492D5C4, 0xBC9D9509, 0x948C55F7, 0x8C83153A,
241 	0xC4AE7CA2, 0xDCA13C6F, 0xF4B0FC91, 0xECBFBC5C,
242 	0x64EA2E08, 0x7CE56EC5, 0x54F4AE3B, 0x4CFBEEF6,
243 	0x04D6876E, 0x1CD9C7A3, 0x34C8075D, 0x2CC74790,
244 	0x8D628AF5, 0x956DCA38, 0xBD7C0AC6, 0xA5734A0B,
245 	0xED5E2393, 0xF551635E, 0xDD40A3A0, 0xC54FE36D,
246 	0x4D1A7139, 0x551531F4, 0x7D04F10A, 0x650BB1C7,
247 	0x2D26D85F, 0x35299892, 0x1D38586C, 0x053718A1,
248 	0xF6DB6BA6, 0xEED42B6B, 0xC6C5EB95, 0xDECAAB58,
249 	0x96E7C2C0, 0x8EE8820D, 0xA6F942F3, 0xBEF6023E,
250 	0x36A3906A, 0x2EACD0A7, 0x06BD1059, 0x1EB25094,
251 	0x569F390C, 0x4E9079C1, 0x6681B93F, 0x7E8EF9F2,
252 	0xDF2B3497, 0xC724745A, 0xEF35B4A4, 0xF73AF469,
253 	0xBF179DF1, 0xA718DD3C, 0x8F091DC2, 0x97065D0F,
254 	0x1F53CF5B, 0x075C8F96, 0x2F4D4F68, 0x37420FA5,
255 	0x7F6F663D, 0x676026F0, 0x4F71E60E, 0x577EA6C3,
256 	0xE18D0321, 0xF98243EC, 0xD1938312, 0xC99CC3DF,
257 	0x81B1AA47, 0x99BEEA8A, 0xB1AF2A74, 0xA9A06AB9,
258 	0x21F5F8ED, 0x39FAB820, 0x11EB78DE, 0x09E43813,
259 	0x41C9518B, 0x59C61146, 0x71D7D1B8, 0x69D89175,
260 	0xC87D5C10, 0xD0721CDD, 0xF863DC23, 0xE06C9CEE,
261 	0xA841F576, 0xB04EB5BB, 0x985F7545, 0x80503588,
262 	0x0805A7DC, 0x100AE711, 0x381B27EF, 0x20146722,
263 	0x68390EBA, 0x70364E77, 0x58278E89, 0x4028CE44,
264 	0xB3C4BD43, 0xABCBFD8E, 0x83DA3D70, 0x9BD57DBD,
265 	0xD3F81425, 0xCBF754E8, 0xE3E69416, 0xFBE9D4DB,
266 	0x73BC468F, 0x6BB30642, 0x43A2C6BC, 0x5BAD8671,
267 	0x1380EFE9, 0x0B8FAF24, 0x239E6FDA, 0x3B912F17,
268 	0x9A34E272, 0x823BA2BF, 0xAA2A6241, 0xB225228C,
269 	0xFA084B14, 0xE2070BD9, 0xCA16CB27, 0xD2198BEA,
270 	0x5A4C19BE, 0x42435973, 0x6A52998D, 0x725DD940,
271 	0x3A70B0D8, 0x227FF015, 0x0A6E30EB, 0x12617026,
272 	0x451FD6E5, 0x5D109628, 0x750156D6, 0x6D0E161B,
273 	0x25237F83, 0x3D2C3F4E, 0x153DFFB0, 0x0D32BF7D,
274 	0x85672D29, 0x9D686DE4, 0xB579AD1A, 0xAD76EDD7,
275 	0xE55B844F, 0xFD54C482, 0xD545047C, 0xCD4A44B1,
276 	0x6CEF89D4, 0x74E0C919, 0x5CF109E7, 0x44FE492A,
277 	0x0CD320B2, 0x14DC607F, 0x3CCDA081, 0x24C2E04C,
278 	0xAC977218, 0xB49832D5, 0x9C89F22B, 0x8486B2E6,
279 	0xCCABDB7E, 0xD4A49BB3, 0xFCB55B4D, 0xE4BA1B80,
280 	0x17566887, 0x0F59284A, 0x2748E8B4, 0x3F47A879,
281 	0x776AC1E1, 0x6F65812C, 0x477441D2, 0x5F7B011F,
282 	0xD72E934B, 0xCF21D386, 0xE7301378, 0xFF3F53B5,
283 	0xB7123A2D, 0xAF1D7AE0, 0x870CBA1E, 0x9F03FAD3,
284 	0x3EA637B6, 0x26A9777B, 0x0EB8B785, 0x16B7F748,
285 	0x5E9A9ED0, 0x4695DE1D, 0x6E841EE3, 0x768B5E2E,
286 	0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
287 	0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
288 };
289 }
290 
291 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
GetAlignment() const292 unsigned int SosemanukPolicy::GetAlignment() const
293 {
294 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
295 #ifdef __INTEL_COMPILER
296 	if (HasSSE2() && !IsP4())	// Intel compiler produces faster code for this algorithm on the P4
297 #else
298 	if (HasSSE2())
299 #endif
300 		return 16;
301 	else
302 #endif
303 		return GetAlignmentOf<word32>();
304 }
305 
GetOptimalBlockSize() const306 unsigned int SosemanukPolicy::GetOptimalBlockSize() const
307 {
308 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
309 #ifdef __INTEL_COMPILER
310 	if (HasSSE2() && !IsP4())	// Intel compiler produces faster code for this algorithm on the P4
311 #else
312 	if (HasSSE2())
313 #endif
314 		return 4*BYTES_PER_ITERATION;
315 	else
316 #endif
317 		return BYTES_PER_ITERATION;
318 }
319 #endif
320 
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
// Prototype of the separately assembled x64 routine. Its body is the MASM
// text emitted from this same file when it is preprocessed with
// CRYPTOPP_GENERATE_X64_MASM defined (see the command at the top of the file).
extern "C" {
void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state);
}
#endif
326 
OperateKeystream(KeystreamOperation operation,byte * output,const byte * input,size_t iterationCount)327 void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
328 {
329 #endif	// #ifdef CRYPTOPP_GENERATE_X64_MASM
330 
331 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
332 	Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data());
333 	return;
334 #endif
335 
336 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
337 #ifdef CRYPTOPP_GENERATE_X64_MASM
338 		ALIGN   8
339 	Sosemanuk_OperateKeystream	PROC FRAME
340 		rex_push_reg rsi
341 		push_reg rdi
342 		alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8)
343 		save_xmm128 xmm6, 02f0h
344 		save_xmm128 xmm7, 0300h
345 		.endprolog
346 		mov		rdi, r8
347 		mov		rax, r9
348 #else
349 #ifdef __INTEL_COMPILER
350 	if (HasSSE2() && !IsP4())	// Intel compiler produces faster code for this algorithm on the P4
351 #else
352 	if (HasSSE2())
353 #endif
354 	{
355 #ifdef __GNUC__
356 	#if CRYPTOPP_BOOL_X64
357 		FixedSizeAlignedSecBlock<byte, 80*4*2+12*4+8*WORD_SZ> workspace;
358 	#endif
359 		__asm__ __volatile__
360 		(
361 		INTEL_NOPREFIX
362 		AS_PUSH_IF86(	bx)
363 #else
364 		word32 *state = m_state;
365 		AS2(	mov		WORD_REG(ax), state)
366 		AS2(	mov		WORD_REG(di), output)
367 		AS2(	mov		WORD_REG(dx), input)
368 		AS2(	mov		WORD_REG(cx), iterationCount)
369 #endif
370 #endif	// #ifdef CRYPTOPP_GENERATE_X64_MASM
371 
372 #if defined(__GNUC__) && CRYPTOPP_BOOL_X64
373 	#define SSE2_workspace %5
374 #else
375 	#define SSE2_workspace WORD_REG(sp)
376 #endif
377 
378 #define SSE2_output			WORD_PTR [SSE2_workspace+1*WORD_SZ]
379 #define SSE2_input			WORD_PTR [SSE2_workspace+2*WORD_SZ]
380 #define SSE2_wordsLeft		WORD_PTR [SSE2_workspace+3*WORD_SZ]
381 #define SSE2_diEnd			WORD_PTR [SSE2_workspace+4*WORD_SZ]
382 #define SSE2_pMulTables		WORD_PTR [SSE2_workspace+5*WORD_SZ]
383 #define SSE2_state			WORD_PTR [SSE2_workspace+6*WORD_SZ]
384 #define SSE2_wordsLeft2		WORD_PTR [SSE2_workspace+7*WORD_SZ]
385 #define SSE2_stateCopy		SSE2_workspace + 8*WORD_SZ
386 #define	SSE2_uvStart		SSE2_stateCopy + 12*4
387 
388 #if (CRYPTOPP_BOOL_X86) && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
389 		AS_PUSH_IF86(	bp)
390 		AS2(	mov		AS_REG_6, esp)
391 		AS2(	and		esp, -16)
392 		AS2(	sub		esp, 80*4*2+12*4+8*WORD_SZ)	// 80 v's, 80 u's, 12 state, 8 locals
393 		AS2(	mov		[esp], AS_REG_6)
394 #endif
395 		AS2(	mov		SSE2_output, WORD_REG(di))
396 		AS2(	mov		SSE2_input, WORD_REG(dx))
397 		AS2(	mov		SSE2_state, WORD_REG(ax))
398 #ifndef _MSC_VER
399 		AS2(	mov		SSE2_pMulTables, WORD_REG(si))
400 #endif
401 		AS2(	lea		WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
402 		AS2(	lea		WORD_REG(si), [4*WORD_REG(cx)])
403 		AS2(	mov		SSE2_wordsLeft, WORD_REG(si))
404 		AS2(	movdqa	xmm0, [WORD_REG(ax)+0*16])		// copy state to stack to save a register
405 		AS2(	movdqa	[SSE2_stateCopy+0*16], xmm0)
406 		AS2(	movdqa	xmm0, [WORD_REG(ax)+1*16])
407 		AS2(	movdqa	[SSE2_stateCopy+1*16], xmm0)
408 		AS2(	movq	xmm0, QWORD PTR [WORD_REG(ax)+2*16])
409 		AS2(	movq	QWORD PTR [SSE2_stateCopy+2*16], xmm0)
410 		AS2(	psrlq	xmm0, 32)
411 		AS2(	movd	AS_REG_6d, xmm0)				// s(9)
412 		AS2(	mov		ecx, [WORD_REG(ax)+10*4])
413 		AS2(	mov		edx, [WORD_REG(ax)+11*4])
414 		AS2(	pcmpeqb	xmm7, xmm7)				// all ones
415 
416 #define s(i)	SSE2_stateCopy + ASM_MOD(i,10)*4
417 #define u(j)	WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
418 #define v(j)	WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
419 
420 #define R10 ecx
421 #define R11 edx
422 #define R20 edx
423 #define R21 ecx
424 // workaround bug in GAS 2.15
425 #define R20r WORD_REG(dx)
426 #define R21r WORD_REG(cx)
427 
428 #define SSE2_STEP(i, j)	\
429 	AS2(	mov		eax, [s(i+0)])\
430 	AS2(	mov		[v(i)], eax)\
431 	AS2(	rol		eax, 8)\
432 	AS2(	lea		AS_REG_7, [AS_REG_6 + R2##j##r])\
433 	AS2(	xor		AS_REG_7d, R1##j)\
434 	AS2(	mov		[u(i)], AS_REG_7d)\
435 	AS2(	mov		AS_REG_7d, 1)\
436 	AS2(	and		AS_REG_7d, R2##j)\
437 	AS1(	neg		AS_REG_7d)\
438 	AS2(	and		AS_REG_7d, AS_REG_6d)\
439 	AS2(	xor		AS_REG_6d, eax)\
440 	AS2(	movzx	eax, al)\
441 	AS2(	xor		AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\
442 	AS2(	mov		eax, [s(i+3)])\
443 	AS2(	xor		AS_REG_7d, [s(i+2)])\
444 	AS2(	add		R1##j, AS_REG_7d)\
445 	AS2(	movzx	AS_REG_7d, al)\
446 	AS2(	shr		eax, 8)\
447 	AS2(	xor		AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\
448 	AS2(	xor		AS_REG_6d, eax)\
449 	AS2(	imul	R2##j, AS_HEX(54655307))\
450 	AS2(	rol		R2##j, 7)\
451 	AS2(	mov		[s(i+0)], AS_REG_6d)\
452 
453 		ASL(2)	// outer loop, each iteration of this processes 80 words
454 		AS2(	lea		WORD_REG(di), [SSE2_uvStart])	// start of v and u
455 		AS2(	mov		WORD_REG(ax), 80)
456 		AS2(	cmp		WORD_REG(si), 80)
457 		AS2(	cmovg	WORD_REG(si), WORD_REG(ax))
458 		AS2(	mov		SSE2_wordsLeft2, WORD_REG(si))
459 		AS2(	lea		WORD_REG(si), [WORD_REG(di)+WORD_REG(si)])		// use to end first inner loop
460 		AS2(	mov		SSE2_diEnd, WORD_REG(si))
461 #ifdef _MSC_VER
462 		AS2(	lea		WORD_REG(si), s_sosemanukMulTables)
463 #else
464 		AS2(	mov		WORD_REG(si), SSE2_pMulTables)
465 #endif
466 
467 		ASL(0)	// first inner loop, 20 words each, 4 iterations
468 		SSE2_STEP(0, 0)
469 		SSE2_STEP(1, 1)
470 		SSE2_STEP(2, 0)
471 		SSE2_STEP(3, 1)
472 		SSE2_STEP(4, 0)
473 		SSE2_STEP(5, 1)
474 		SSE2_STEP(6, 0)
475 		SSE2_STEP(7, 1)
476 		SSE2_STEP(8, 0)
477 		SSE2_STEP(9, 1)
478 		SSE2_STEP(10, 0)
479 		SSE2_STEP(11, 1)
480 		SSE2_STEP(12, 0)
481 		SSE2_STEP(13, 1)
482 		SSE2_STEP(14, 0)
483 		SSE2_STEP(15, 1)
484 		SSE2_STEP(16, 0)
485 		SSE2_STEP(17, 1)
486 		SSE2_STEP(18, 0)
487 		SSE2_STEP(19, 1)
488 		// loop
489 		AS2(	add		WORD_REG(di), 5*4)
490 		AS2(	cmp		WORD_REG(di), SSE2_diEnd)
491 		ASJ(	jne,	0, b)
492 
493 		AS2(	mov		WORD_REG(ax), SSE2_input)
494 		AS2(	mov		AS_REG_7, SSE2_output)
495 		AS2(	lea		WORD_REG(di), [SSE2_uvStart])		// start of v and u
496 		AS2(	mov		WORD_REG(si), SSE2_wordsLeft2)
497 
498 		ASL(1)	// second inner loop, 16 words each, 5 iterations
499 		AS2(	movdqa	xmm0, [WORD_REG(di)+0*20*4])
500 		AS2(	movdqa	xmm2, [WORD_REG(di)+2*20*4])
501 		AS2(	movdqa	xmm3, [WORD_REG(di)+3*20*4])
502 		AS2(	movdqa	xmm1, [WORD_REG(di)+1*20*4])
503 		// S2
504 		AS2(	movdqa	xmm4, xmm0)
505 		AS2(	pand	xmm0, xmm2)
506 		AS2(    pxor	xmm0, xmm3)
507 		AS2(    pxor	xmm2, xmm1)
508  		AS2(	pxor	xmm2, xmm0)
509  		AS2(	por		xmm3, xmm4)
510  		AS2(	pxor	xmm3, xmm1)
511  		AS2(	pxor	xmm4, xmm2)
512  		AS2(	movdqa	xmm1, xmm3)
513  		AS2(	por		xmm3, xmm4)
514  		AS2(	pxor	xmm3, xmm0)
515  		AS2(	pand	xmm0, xmm1)
516  		AS2(	pxor	xmm4, xmm0)
517  		AS2(	pxor	xmm1, xmm3)
518  		AS2(	pxor	xmm1, xmm4)
519 		AS2(	pxor	xmm4, xmm7)
520 		// xor with v
521 		AS2(	pxor	xmm2, [WORD_REG(di)+80*4])
522 		AS2(	pxor	xmm3, [WORD_REG(di)+80*5])
523 		AS2(	pxor	xmm1, [WORD_REG(di)+80*6])
524 		AS2(	pxor	xmm4, [WORD_REG(di)+80*7])
525 		// exit loop early if less than 16 words left to output
526 		// this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
527 		AS2(	cmp		WORD_REG(si), 16)
528 		ASJ(	jl,		4, f)
529 		// unpack
530 		AS2(	movdqa		xmm6, xmm2)
531 		AS2(	punpckldq	xmm2, xmm3)
532 		AS2(	movdqa		xmm5, xmm1)
533 		AS2(	punpckldq	xmm1, xmm4)
534 		AS2(	movdqa		xmm0, xmm2)
535 		AS2(	punpcklqdq	xmm2, xmm1)
536 		AS2(	punpckhqdq	xmm0, xmm1)
537 		AS2(	punpckhdq	xmm6, xmm3)
538 		AS2(	punpckhdq	xmm5, xmm4)
539 		AS2(	movdqa		xmm3, xmm6)
540 		AS2(	punpcklqdq	xmm6, xmm5)
541 		AS2(	punpckhqdq	xmm3, xmm5)
542 
543 		// output keystream
544 		AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4)
545 
546 		// loop
547 		AS2(	add		WORD_REG(di), 4*4)
548 		AS2(	sub		WORD_REG(si), 16)
549 		ASJ(	jnz,	1, b)
550 
551 		// outer loop
552 		AS2(	mov		WORD_REG(si), SSE2_wordsLeft)
553 		AS2(	sub		WORD_REG(si), 80)
554 		ASJ(	jz,		6, f)
555 		AS2(	mov		SSE2_wordsLeft, WORD_REG(si))
556 		AS2(	mov		SSE2_input, WORD_REG(ax))
557 		AS2(	mov		SSE2_output, AS_REG_7)
558 		ASJ(	jmp,	2, b)
559 
560 		ASL(4)	// final output of less than 16 words
561 		AS2(	test	WORD_REG(ax), WORD_REG(ax))
562 		ASJ(	jz,		5, f)
563 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+0*4])
564 		AS2(	pxor	xmm2, xmm0)
565 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+1*4])
566 		AS2(	pxor	xmm3, xmm0)
567 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+2*4])
568 		AS2(	pxor	xmm1, xmm0)
569 		AS2(	movd	xmm0, dword ptr [WORD_REG(ax)+3*4])
570 		AS2(	pxor	xmm4, xmm0)
571 		AS2(	add		WORD_REG(ax), 16)
572 		ASL(5)
573 		AS2(	movd	dword ptr [AS_REG_7+0*4], xmm2)
574 		AS2(	movd	dword ptr [AS_REG_7+1*4], xmm3)
575 		AS2(	movd	dword ptr [AS_REG_7+2*4], xmm1)
576 		AS2(	movd	dword ptr [AS_REG_7+3*4], xmm4)
577 		AS2(	sub		WORD_REG(si), 4)
578 		ASJ(	jz,		6, f)
579 		AS2(	add		AS_REG_7, 16)
580 		AS2(	psrldq	xmm2, 4)
581 		AS2(	psrldq	xmm3, 4)
582 		AS2(	psrldq	xmm1, 4)
583 		AS2(	psrldq	xmm4, 4)
584 		ASJ(	jmp,	4, b)
585 
586 		ASL(6)	// save state
587 		AS2(	mov		AS_REG_6, SSE2_state)
588 		AS2(	movdqa	xmm0, [SSE2_stateCopy+0*16])
589 		AS2(	movdqa	[AS_REG_6+0*16], xmm0)
590 		AS2(	movdqa	xmm0, [SSE2_stateCopy+1*16])
591 		AS2(	movdqa	[AS_REG_6+1*16], xmm0)
592 		AS2(	movq	xmm0, QWORD PTR [SSE2_stateCopy+2*16])
593 		AS2(	movq	QWORD PTR [AS_REG_6+2*16], xmm0)
594 		AS2(	mov		[AS_REG_6+10*4], ecx)
595 		AS2(	mov		[AS_REG_6+11*4], edx)
596 
597 		AS_POP_IF86(	sp)
598 		AS_POP_IF86(	bp)
599 
600 #ifdef __GNUC__
601 		AS_POP_IF86(	bx)
602 		ATT_PREFIX
603 			:
604 			: "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input)
605 	#if CRYPTOPP_BOOL_X64
606 			, "r" (workspace.m_ptr)
607 			: "memory", "cc", "%r9", "%r10", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
608 	#else
609 			: "memory", "cc"
610 	#endif
611 		);
612 #endif
613 #ifdef CRYPTOPP_GENERATE_X64_MASM
614 	movdqa	xmm6, [rsp + 02f0h]
615 	movdqa	xmm7, [rsp + 0300h]
616 	add		rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8
617 	pop		rdi
618 	pop		rsi
619 	ret
620 	Sosemanuk_OperateKeystream ENDP
621 #else
622 	}
623 	else
624 #endif
625 #endif
626 #ifndef CRYPTOPP_GENERATE_X64_MASM
627 	{
628 #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) && !defined(CRYPTOPP_DISABLE_SOSEMANUK_ASM)
629 #define MUL_A(x)    (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)])
630 #else
631 #define MUL_A(x)    (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24])
632 #endif
633 
634 #define DIV_A(x)    (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)])
635 
636 #define r1(i) ((i%2) ? reg2 : reg1)
637 #define r2(i) ((i%2) ? reg1 : reg2)
638 
639 #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u)	\
640 		u = (s##x9 + r2(x0)) ^ r1(x0);\
641 		v = s##x0;\
642 		s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
643 		r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
644 		r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
645 
646 #define SOSEMANUK_OUTPUT(x)	\
647 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
648 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, u3 ^ v1);\
649 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, u1 ^ v2);\
650 	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, u4 ^ v3);
651 
652 #define OUTPUT4	\
653 	S2(0, u0, u1, u2, u3, u4);\
654 	CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SOSEMANUK_OUTPUT, 4*4);
655 
656 	word32 s0 = m_state[0];
657 	word32 s1 = m_state[1];
658 	word32 s2 = m_state[2];
659 	word32 s3 = m_state[3];
660 	word32 s4 = m_state[4];
661 	word32 s5 = m_state[5];
662 	word32 s6 = m_state[6];
663 	word32 s7 = m_state[7];
664 	word32 s8 = m_state[8];
665 	word32 s9 = m_state[9];
666 	word32 reg1 = m_state[10];
667 	word32 reg2 = m_state[11];
668 	word32 u0, u1, u2, u3, u4, v0, v1, v2, v3;
669 
670 	do
671 	{
672 		STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v0, u0)
673 		STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v1, u1)
674 		STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v2, u2)
675 		STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v3, u3)
676 		OUTPUT4
677 		STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v0, u0)
678 		STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v1, u1)
679 		STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v2, u2)
680 		STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v3, u3)
681 		OUTPUT4
682 		STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v0, u0)
683 		STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v1, u1)
684 		STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v2, u2)
685 		STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v3, u3)
686 		OUTPUT4
687 		STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v0, u0)
688 		STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v1, u1)
689 		STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v2, u2)
690 		STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v3, u3)
691 		OUTPUT4
692 		STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v0, u0)
693 		STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v1, u1)
694 		STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v2, u2)
695 		STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v3, u3)
696 		OUTPUT4
697 	}
698 	while (--iterationCount);
699 
700 	m_state[0] = s0;
701 	m_state[1] = s1;
702 	m_state[2] = s2;
703 	m_state[3] = s3;
704 	m_state[4] = s4;
705 	m_state[5] = s5;
706 	m_state[6] = s6;
707 	m_state[7] = s7;
708 	m_state[8] = s8;
709 	m_state[9] = s9;
710 	m_state[10] = reg1;
711 	m_state[11] = reg2;
712 	}
713 }
714 
715 NAMESPACE_END
716 
717 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
718