1/* twofish-arm.S  -  ARM assembly implementation of Twofish cipher
2 *
3 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
4 *
5 * This file is part of Libgcrypt.
6 *
7 * Libgcrypt is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 2.1 of
10 * the License, or (at your option) any later version.
11 *
12 * Libgcrypt is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include <config.h>
22
23#if defined(__ARMEL__)
24#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
25
26.text
27
28.syntax unified
29.arm
30
/* structure of TWOFISH_context: four key-dependent s-box tables of
 * 256 x 32-bit words each (s0..s3), followed by 8 whitening subkeys
 * (w) and the round subkeys (k).  Offsets must match the C struct. */
#define s0 0
#define s1 ((s0) + 4 * 256)
#define s2 ((s1) + 4 * 256)
#define s3 ((s2) + 4 * 256)
#define w  ((s3) + 4 * 256)
#define k  ((w) + 4 * 8)

/* register macros */
#define CTX %r0   /* ctx pointer; since s0 == 0, also base of s0 table */
#define CTXs0 %r0 /* alias of CTX for s0 lookups */
#define CTXs1 %r1 /* CTX + s1; precomputed in the function prologues */
#define CTXs3 %r7 /* CTX + s3; precomputed in the function prologues */

/* the four 32-bit words of cipher state */
#define RA %r3
#define RB %r4
#define RC %r5
#define RD %r6

/* round-function accumulators */
#define RX %r2
#define RY %ip

/* byte-extraction mask, 0xff << 2 (byte pre-scaled to a word offset) */
#define RMASK %lr

/* scratch registers */
#define RT0 %r8
#define RT1 %r9
#define RT2 %r10
#define RT3 %r11
59
60/* helper macros */
/* helper macros */

/* Load a little-endian 32-bit word from a possibly unaligned address
 * (rsrc + offs) into rout, one byte at a time.  Clobbers rtmp. */
#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
	ldrb rout, [rsrc, #((offs) + 0)]; \
	ldrb rtmp, [rsrc, #((offs) + 1)]; \
	orr rout, rout, rtmp, lsl #8; \
	ldrb rtmp, [rsrc, #((offs) + 2)]; \
	orr rout, rout, rtmp, lsl #16; \
	ldrb rtmp, [rsrc, #((offs) + 3)]; \
	orr rout, rout, rtmp, lsl #24;
69
/* Store the 32-bit word rin little-endian to a possibly unaligned
 * address (rdst + offs), one byte at a time.  Clobbers rtmp0, rtmp1;
 * rin is preserved.  Shifts are interleaved with the stores. */
#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
	mov rtmp0, rin, lsr #8; \
	strb rin, [rdst, #((offs) + 0)]; \
	mov rtmp1, rin, lsr #16; \
	strb rtmp0, [rdst, #((offs) + 1)]; \
	mov rtmp0, rin, lsr #24; \
	strb rtmp1, [rdst, #((offs) + 2)]; \
	strb rtmp0, [rdst, #((offs) + 3)];
78
/* host <-> little-endian conversion.  This whole file is guarded by
 * #if defined(__ARMEL__) above, so only the little-endian no-op
 * variant is ever compiled; the 'rev' variant is kept for
 * completeness/symmetry with other implementations. */
#ifndef __ARMEL__
	/* bswap on big-endian */
	#define host_to_le(reg) \
		rev reg, reg;
	#define le_to_host(reg) \
		rev reg, reg;
#else
	/* nop on little-endian */
	#define host_to_le(reg) /*_*/
	#define le_to_host(reg) /*_*/
#endif
90
/* Load a 16-byte block from the word-aligned address rin into a..d,
 * converting each word from little-endian to host order.  Conversions
 * are interleaved between the loads. */
#define ldr_input_aligned_le(rin, a, b, c, d) \
	ldr a, [rin, #0]; \
	ldr b, [rin, #4]; \
	le_to_host(a); \
	ldr c, [rin, #8]; \
	le_to_host(b); \
	ldr d, [rin, #12]; \
	le_to_host(c); \
	le_to_host(d);
100
/* Store a..d as a 16-byte little-endian block to the word-aligned
 * address rout.  a..d are converted in place (clobbered on
 * big-endian; no-ops under __ARMEL__). */
#define str_output_aligned_le(rout, a, b, c, d) \
	le_to_host(a); \
	le_to_host(b); \
	str a, [rout, #0]; \
	le_to_host(c); \
	str b, [rout, #4]; \
	le_to_host(d); \
	str c, [rout, #8]; \
	str d, [rout, #12];
110
/* Load/store one 16-byte block at a possibly unaligned address.
 * When the target supports unaligned word access, plain word
 * accesses are used unconditionally; otherwise alignment is tested
 * at runtime and byte-wise access is used for unaligned pointers. */
#ifdef __ARM_FEATURE_UNALIGNED
	/* unaligned word reads/writes allowed */
	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
		ldr_input_aligned_le(rin, ra, rb, rc, rd)

	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
		str_output_aligned_le(rout, ra, rb, rc, rd)
#else
	/* need to handle unaligned reads/writes by byte reads */
	/* Local numeric labels 1/2 keep the branch targets
	 * macro-private. */
	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
		tst rin, #3; \
		beq 1f; \
			ldr_unaligned_le(ra, rin, 0, rtmp0); \
			ldr_unaligned_le(rb, rin, 4, rtmp0); \
			ldr_unaligned_le(rc, rin, 8, rtmp0); \
			ldr_unaligned_le(rd, rin, 12, rtmp0); \
			b 2f; \
		1:;\
			ldr_input_aligned_le(rin, ra, rb, rc, rd); \
		2:;

	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
		tst rout, #3; \
		beq 1f; \
			str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
			str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
			str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
			str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
			b 2f; \
		1:;\
			str_output_aligned_le(rout, ra, rb, rc, rd); \
		2:;
#endif
144
145/**********************************************************************
146  1-way twofish
147 **********************************************************************/
/* One Twofish encryption round, using subkeys k[2n] and k[2n+1]:
 *
 *   X  = s0[a.byte0] ^ s1[a.byte1] ^ s2[a.byte2] ^ s3[a.byte3]
 *   Y  = s1[b.byte0] ^ s2[b.byte1] ^ s3[b.byte2] ^ s0[b.byte3]
 *   rc ^= X + Y + k[2n]          (the required ror #1 of rc is
 *                                 deferred to the following round)
 *   rd  = rol(rd, 1) ^ (X + 2*Y + k[2n+1])
 *
 * RMASK = 0xff << 2, so each 'and' extracts one byte already scaled
 * to a word-table offset.  The ror #1 owed to 'a' from the previous
 * round is folded in here: adj_a biases the extraction shifts and
 * ror_a() rotates 'a' itself mid-macro (dummy/0 on the first round).
 * Note: "eor rd, RT0, rd, ror #31" computes rd = RT0 ^ rol(rd, 1).
 * s-box loads are interleaved with the address arithmetic. */
#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
	and RT0, RMASK, b, lsr#(8 - 2); \
	and RY, RMASK, b, lsr#(16 - 2); \
	add RT0, RT0, #(s2 - s1); /* b.byte1 -> s2 table */ \
	and RT1, RMASK, b, lsr#(24 - 2); \
	ldr RY, [CTXs3, RY]; /* s3[b.byte2] */ \
	and RT2, RMASK, b, lsl#(2); \
	ldr RT0, [CTXs1, RT0]; \
	and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
	ldr RT1, [CTXs0, RT1]; /* s0[b.byte3] */ \
	and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
	ldr RT2, [CTXs1, RT2]; /* s1[b.byte0] */ \
	add RT3, RT3, #(s2 - s1); /* a.byte2 -> s2 table */ \
	ldr RX, [CTXs1, RX]; /* s1[a.byte1] */ \
	ror_a(a); /* apply deferred rotation of 'a' */ \
	\
	eor RY, RY, RT0; \
	ldr RT3, [CTXs1, RT3]; \
	and RT0, RMASK, a, lsl#(2); \
	eor RY, RY, RT1; \
	and RT1, RMASK, a, lsr#(24 - 2); \
	eor RY, RY, RT2; /* Y complete */ \
	ldr RT0, [CTXs0, RT0]; /* s0[a.byte0] */ \
	eor RX, RX, RT3; \
	ldr RT1, [CTXs3, RT1]; /* s3[a.byte3] */ \
	eor RX, RX, RT0; \
	\
	ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; /* k[2n+1] */ \
	eor RX, RX, RT1; /* X complete */ \
	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; /* k[2n] */ \
	\
	add RT0, RX, RY, lsl #1; /* X + 2*Y (PHT) */ \
	add RX, RX, RY; /* X + Y (PHT) */ \
	add RT0, RT0, RT3; \
	add RX, RX, RT2; \
	eor rd, RT0, rd, ror #31; /* rd = rol(rd,1) ^ (X+2Y+k[2n+1]) */ \
	eor rc, rc, RX; /* rc ^= X+Y+k[2n]; ror #1 deferred */
185
/* no-op stand-in for ror_a/ror_b in the "first" round variants */
#define dummy(x) /*_*/

/* rotate register right by one bit */
#define ror1(r) \
	ror r, r, #1;
190
/* One Twofish decryption round, using subkeys k[2n] and k[2n+1]:
 *
 *   X  = s0[a.byte0] ^ s1[a.byte1] ^ s2[a.byte2] ^ s3[a.byte3]
 *   Y  = s1[b.byte0] ^ s2[b.byte1] ^ s3[b.byte2] ^ s0[b.byte3]
 *   rd ^= X + 2*Y + k[2n+1]      (the required ror #1 of rd is
 *                                 deferred to the following round)
 *   rc  = rol(rc, 1) ^ (X + Y + k[2n])
 *
 * Mirror of encrypt_round: here the deferred ror #1 belongs to 'b'
 * (adj_b biases the shifts, ror_b() rotates mid-macro; dummy/0 on
 * the first round).  Note: "eor rc, RX, rc, ror #31" computes
 * rc = RX ^ rol(rc, 1). */
#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
	and RT3, RMASK, b, lsl#(2 - (adj_b)); \
	and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
	ror_b(b); /* apply deferred rotation of 'b' */ \
	and RT2, RMASK, a, lsl#(2); \
	and RT0, RMASK, a, lsr#(8 - 2); \
	\
	ldr RY, [CTXs1, RT3]; /* s1[b.byte0] */ \
	add RT1, RT1, #(s2 - s1); /* b.byte1 -> s2 table */ \
	ldr RX, [CTXs0, RT2]; /* s0[a.byte0] */ \
	and RT3, RMASK, b, lsr#(16 - 2); \
	ldr RT1, [CTXs1, RT1]; \
	and RT2, RMASK, a, lsr#(16 - 2); \
	ldr RT0, [CTXs1, RT0]; /* s1[a.byte1] */ \
	\
	add RT2, RT2, #(s2 - s1); /* a.byte2 -> s2 table */ \
	ldr RT3, [CTXs3, RT3]; /* s3[b.byte2] */ \
	eor RY, RY, RT1; \
	\
	and RT1, RMASK, b, lsr#(24 - 2); \
	eor RX, RX, RT0; \
	ldr RT2, [CTXs1, RT2]; \
	and RT0, RMASK, a, lsr#(24 - 2); \
	\
	ldr RT1, [CTXs0, RT1]; /* s0[b.byte3] */ \
	\
	eor RY, RY, RT3; \
	ldr RT0, [CTXs3, RT0]; /* s3[a.byte3] */ \
	eor RX, RX, RT2; \
	eor RY, RY, RT1; /* Y complete */ \
	\
	ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; /* k[2n+1] */ \
	eor RX, RX, RT0; /* X complete */ \
	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; /* k[2n] */ \
	\
	add RT0, RX, RY, lsl #1; /* X + 2*Y (PHT) */ \
	add RX, RX, RY; /* X + Y (PHT) */ \
	add RT0, RT0, RT1; \
	add RX, RX, RT2; \
	eor rd, rd, RT0; /* rd ^= X+2Y+k[2n+1]; ror #1 deferred */ \
	eor rc, RX, rc, ror #31; /* rc = rol(rc,1) ^ (X+Y+k[2n]) */
232
/* One cycle = two rounds: an even round on (RA,RB) updating (RC,RD)
 * and an odd round on (RC,RD) updating (RA,RB) — the register swap
 * between rounds is done by swapping the macro arguments instead of
 * moving data.  The "first" variants pass dummy/adj=0 because no
 * deferred rotation is pending yet; ordinary cycles pass ror1/adj=1
 * to apply the rotation deferred by the preceding round; the "last"
 * variants additionally apply the final outstanding ror #1 (on RA
 * after encryption, on RD after decryption). */
#define first_encrypt_cycle(nc) \
	encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);

#define encrypt_cycle(nc) \
	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);

#define last_encrypt_cycle(nc) \
	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
	ror1(RA);

#define first_decrypt_cycle(nc) \
	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);

#define decrypt_cycle(nc) \
	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);

#define last_decrypt_cycle(nc) \
	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
	ror1(RD);
258
.align 3
.globl _gcry_twofish_arm_encrypt_block
.type   _gcry_twofish_arm_encrypt_block,%function;

_gcry_twofish_arm_encrypt_block:
	/* input:
	 *	%r0: ctx
	 *	%r1: dst
	 *	%r2: src
	 *
	 * Encrypts one 16-byte block.  dst (%r1) is pushed first so it
	 * can be popped back after the rounds, which use all registers.
	 */
	push {%r1, %r4-%r11, %ip, %lr};

	add RY, CTXs0, #w; /* RY = &ctx->w (input whitening keys) */

	ldr_input_le(%r2, RA, RB, RC, RD, RT0);

	/* Input whitening: xor plaintext with w[0..3].  s-box base
	 * pointers and the byte mask are set up in between. */
	ldm RY, {RT0, RT1, RT2, RT3};
	add CTXs3, CTXs0, #(s3 - s0);
	add CTXs1, CTXs0, #(s1 - s0);
	mov RMASK, #(0xff << 2);
	eor RA, RA, RT0;
	eor RB, RB, RT1;
	eor RC, RC, RT2;
	eor RD, RD, RT3;

	/* 8 cycles = 16 rounds */
	first_encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	last_encrypt_cycle(7);

	add RY, CTXs3, #(w + 4*4 - s3); /* RY = &ctx->w[4] */
	pop {%r1}; /* dst */

	/* Output whitening with w[4..7]; the (RC,RD,RA,RB) order
	 * undoes the final round's half swap. */
	ldm RY, {RT0, RT1, RT2, RT3};
	eor RC, RC, RT0;
	eor RD, RD, RT1;
	eor RA, RA, RT2;
	eor RB, RB, RT3;

	str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);

	pop {%r4-%r11, %ip, %pc};
.ltorg
.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;
309
.align 3
.globl _gcry_twofish_arm_decrypt_block
.type   _gcry_twofish_arm_decrypt_block,%function;

_gcry_twofish_arm_decrypt_block:
	/* input:
	 *	%r0: ctx
	 *	%r1: dst
	 *	%r2: src
	 *
	 * Decrypts one 16-byte block.  dst (%r1) is pushed first so it
	 * can be popped back after the rounds, which use all registers.
	 */
	push {%r1, %r4-%r11, %ip, %lr};

	add CTXs3, CTXs0, #(s3 - s0);

	/* ldr_input_le clobbers only %r2, the destination registers and
	 * RT0 (%r8), so CTXs3 (%r7) set above stays valid; a redundant
	 * recomputation of CTXs3 that used to follow has been removed. */
	ldr_input_le(%r2, RC, RD, RA, RB, RT0);

	/* Decryption whitens the input with w[4..7] (the encrypt-side
	 * output whitening keys). */
	add RY, CTXs3, #(w + 4*4 - s3);

	/* Input whitening */
	ldm RY, {RT0, RT1, RT2, RT3};
	add CTXs1, CTXs0, #(s1 - s0);
	mov RMASK, #(0xff << 2);
	eor RC, RC, RT0;
	eor RD, RD, RT1;
	eor RA, RA, RT2;
	eor RB, RB, RT3;

	/* 8 cycles = 16 rounds, traversed in reverse key order */
	first_decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	last_decrypt_cycle(0);

	add RY, CTXs0, #w; /* RY = &ctx->w (w[0..3] for output whitening) */
	pop {%r1}; /* dst */

	/* Output whitening */
	ldm RY, {RT0, RT1, RT2, RT3};
	eor RA, RA, RT0;
	eor RB, RB, RT1;
	eor RC, RC, RT2;
	eor RD, RD, RT3;

	str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);

	pop {%r4-%r11, %ip, %pc};
.ltorg
.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;
361
#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
363#endif /*__ARMEL__*/
364