1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
15#include <openssl/arm_arch.h>
16
17#if __ARM_MAX_ARCH__>=7
18.text
19
20
21.code	32
22#undef	__thumb2__
@ Lrcon: constants for the key-expansion code below, which takes the table
@ address with "adr r3,Lrcon".
@   row 0: 0x01 in every 32-bit lane; loaded into q1 as the first round
@          constant and doubled with vshl.u8 after each use.
@   row 1: vtbl.8 byte-index pattern; loaded into q2 (see rotate-n-splat).
@   row 2: 0x1b in every lane; reloaded into q1 once the 128-bit path has
@          finished its eight loop iterations.
23.align	5
24Lrcon:
25.long	0x01,0x01,0x01,0x01
26.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
27.long	0x1b,0x1b,0x1b,0x1b
28
29.text
30
@ _aes_hw_set_encrypt_key (local alias Lenc_key): expand a raw AES key into
@ an encryption key schedule. The AES instructions are emitted as .byte
@ sequences; the mnemonic is given in the trailing comment of each one.
@
@ In:   r0 = pointer to the raw key bytes
@       r1 = key length in bits; only 128, 192 and 256 pass the checks
@       r2 = pointer to the key schedule to fill
@ Out:  r0 = 0 on success, -1 if r0 or r2 is zero, -2 if r1 is rejected
@
@ On success the round keys are stored upward from [r2] and the round count
@ (10, 12 or 14) is stored as a 32-bit word at byte offset 240 of the
@ schedule. On the success path r2 is left pointing at that word and r12
@ holds the round count; _aes_hw_set_decrypt_key depends on both.
@ Registers written: r0-r3, r12, q0-q3, q8-q10, flags.
@ NOTE(review): presumably the C prototype is
@   int aes_hw_set_encrypt_key(const uint8_t *key, int bits, AES_KEY *out)
@ - confirm against the C header.
31.globl	_aes_hw_set_encrypt_key
32.private_extern	_aes_hw_set_encrypt_key
33#ifdef __thumb2__
34.thumb_func	_aes_hw_set_encrypt_key
35#endif
36.align	5
37_aes_hw_set_encrypt_key:
38Lenc_key:
	@ r3 carries the return value until Lenc_key_abort copies it to r0.
39	mov	r3,#-1
40	cmp	r0,#0
41	beq	Lenc_key_abort
42	cmp	r2,#0
43	beq	Lenc_key_abort
44	mov	r3,#-2
	@ Accept only 128 <= bits <= 256 with bits a multiple of 64.
45	cmp	r1,#128
46	blt	Lenc_key_abort
47	cmp	r1,#256
48	bgt	Lenc_key_abort
49	tst	r1,#0x3f
50	bne	Lenc_key_abort
51
52	adr	r3,Lrcon
	@ The flags from this compare are consumed by the blt/beq further down;
	@ nothing in between writes the flags.
53	cmp	r1,#192
54
	@ q0 = all zeroes: used as the zero round key for aese and as the
	@ shift-in source for vext.8.
55	veor	q0,q0,q0
56	vld1.8	{q3},[r0]!
57	mov	r1,#8		@ reuse r1
	@ q1 = round constant (0x01 splat), q2 = rotate-n-splat index mask.
58	vld1.32	{q1,q2},[r3]!
59
60	blt	Loop128
61	beq	L192
62	b	L256
63
	@ 128-bit key: eight loop passes, then two unrolled passes that use the
	@ 0x1b row of Lrcon. Each pass stores the current round key (q3) and
	@ derives the next one from it.
64.align	4
65Loop128:
66	vtbl.8	d20,{q3},d4
67	vtbl.8	d21,{q3},d5
68	vext.8	q9,q0,q3,#12
69	vst1.32	{q3},[r2]!
70.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
71	subs	r1,r1,#1
72
73	veor	q3,q3,q9
74	vext.8	q9,q0,q9,#12
75	veor	q3,q3,q9
76	vext.8	q9,q0,q9,#12
77	veor	q10,q10,q1
78	veor	q3,q3,q9
79	vshl.u8	q1,q1,#1
80	veor	q3,q3,q10
81	bne	Loop128
82
	@ r3 now points at the third row of Lrcon (0x1b splat).
83	vld1.32	{q1},[r3]
84
85	vtbl.8	d20,{q3},d4
86	vtbl.8	d21,{q3},d5
87	vext.8	q9,q0,q3,#12
88	vst1.32	{q3},[r2]!
89.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
90
91	veor	q3,q3,q9
92	vext.8	q9,q0,q9,#12
93	veor	q3,q3,q9
94	vext.8	q9,q0,q9,#12
95	veor	q10,q10,q1
96	veor	q3,q3,q9
97	vshl.u8	q1,q1,#1
98	veor	q3,q3,q10
99
100	vtbl.8	d20,{q3},d4
101	vtbl.8	d21,{q3},d5
102	vext.8	q9,q0,q3,#12
103	vst1.32	{q3},[r2]!
104.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
105
106	veor	q3,q3,q9
107	vext.8	q9,q0,q9,#12
108	veor	q3,q3,q9
109	vext.8	q9,q0,q9,#12
110	veor	q10,q10,q1
111	veor	q3,q3,q9
112	veor	q3,q3,q10
	@ Last round key is stored without post-increment at offset 160, so
	@ adding 0x50 moves r2 to offset 240 for the round-count store.
113	vst1.32	{q3},[r2]
114	add	r2,r2,#0x50
115
116	mov	r12,#10
117	b	Ldone
118
	@ 192-bit key: the remaining 8 key bytes go to d16, and 8 is subtracted
	@ from every byte of the index mask so vtbl.8 indexes into q8.
119.align	4
120L192:
121	vld1.8	{d16},[r0]!
122	vmov.i8	q10,#8			@ borrow q10
123	vst1.32	{q3},[r2]!
124	vsub.i8	q2,q2,q10	@ adjust the mask
125
	@ Eight passes, each storing 8 bytes (d16) plus 16 bytes (q3).
126Loop192:
127	vtbl.8	d20,{q8},d4
128	vtbl.8	d21,{q8},d5
129	vext.8	q9,q0,q3,#12
130	vst1.32	{d16},[r2]!
131.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
132	subs	r1,r1,#1
133
134	veor	q3,q3,q9
135	vext.8	q9,q0,q9,#12
136	veor	q3,q3,q9
137	vext.8	q9,q0,q9,#12
138	veor	q3,q3,q9
139
140	vdup.32	q9,d7[1]
141	veor	q9,q9,q8
142	veor	q10,q10,q1
143	vext.8	q8,q0,q8,#12
144	vshl.u8	q1,q1,#1
145	veor	q8,q8,q9
146	veor	q3,q3,q10
147	veor	q8,q8,q10
148	vst1.32	{q3},[r2]!
149	bne	Loop192
150
	@ 16 + 8*24 = 208 bytes written so far; 0x20 more reaches offset 240.
151	mov	r12,#12
152	add	r2,r2,#0x20
153	b	Ldone
154
	@ 256-bit key: q3 holds the first 16 key bytes, q8 the second 16.
	@ Seven passes; the seventh leaves through the beq to Ldone with r2
	@ already at offset 240 (16 + 7*32).
155.align	4
156L256:
157	vld1.8	{q8},[r0]
158	mov	r1,#7
159	mov	r12,#14
160	vst1.32	{q3},[r2]!
161
162Loop256:
163	vtbl.8	d20,{q8},d4
164	vtbl.8	d21,{q8},d5
165	vext.8	q9,q0,q3,#12
166	vst1.32	{q8},[r2]!
167.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
168	subs	r1,r1,#1
169
170	veor	q3,q3,q9
171	vext.8	q9,q0,q9,#12
172	veor	q3,q3,q9
173	vext.8	q9,q0,q9,#12
174	veor	q10,q10,q1
175	veor	q3,q3,q9
176	vshl.u8	q1,q1,#1
177	veor	q3,q3,q10
178	vst1.32	{q3},[r2]!
179	beq	Ldone
180
	@ Second half of the pass: q10 = top word of q3 in every lane, run
	@ through aese with the all-zero q0 (no vtbl rotation, no round
	@ constant), then folded into q8.
181	vdup.32	q10,d7[1]
182	vext.8	q9,q0,q8,#12
183.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
184
185	veor	q8,q8,q9
186	vext.8	q9,q0,q9,#12
187	veor	q8,q8,q9
188	vext.8	q9,q0,q9,#12
189	veor	q8,q8,q9
190
191	veor	q8,q8,q10
192	b	Loop256
193
	@ r2 points at byte offset 240 of the schedule on every path here.
194Ldone:
195	str	r12,[r2]
196	mov	r3,#0
197
198Lenc_key_abort:
199	mov	r0,r3			@ return value
200
201	bx	lr
202
203
@ _aes_hw_set_decrypt_key: build a decryption key schedule.
@
@ Takes the same r0/r1/r2 arguments as _aes_hw_set_encrypt_key, whose body
@ (Lenc_key) it calls first. If that returns nonzero, the value is returned
@ unchanged. Otherwise the round keys are reversed in place and every round
@ key except the first and the last is passed through aesimc.
@ Out:  r0 = 0 on success, otherwise the Lenc_key error code.
@ r4 and lr are saved on the stack; the return is done by popping pc.
204.globl	_aes_hw_set_decrypt_key
205.private_extern	_aes_hw_set_decrypt_key
206#ifdef __thumb2__
207.thumb_func	_aes_hw_set_decrypt_key
208#endif
209.align	5
210_aes_hw_set_decrypt_key:
211	stmdb	sp!,{r4,lr}
212	bl	Lenc_key
213
214	cmp	r0,#0
215	bne	Ldec_key_abort
216
	@ Lenc_key leaves r2 at schedule+240 and the round count in r12.
217	sub	r2,r2,#240		@ restore original r2
218	mov	r4,#-16
219	add	r0,r2,r12,lsl#4	@ end of key schedule
220
	@ Swap the first and last round keys as they are (no aesimc).
221	vld1.32	{q0},[r2]
222	vld1.32	{q1},[r0]
223	vst1.32	{q0},[r0],r4
224	vst1.32	{q1},[r2]!
225
	@ Walk inward from both ends: r2 moves up by 16, r0 moves down by 16
	@ (post-index by r4 = -16). Each pair is transformed and swapped.
226Loop_imc:
227	vld1.32	{q0},[r2]
228	vld1.32	{q1},[r0]
229.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
230.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
231	vst1.32	{q0},[r0],r4
232	vst1.32	{q1},[r2]!
233	cmp	r0,r2
234	bhi	Loop_imc
235
	@ The middle round key is transformed in place.
236	vld1.32	{q0},[r2]
237.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
238	vst1.32	{q0},[r0]
239
240	eor	r0,r0,r0		@ return value
241Ldec_key_abort:
242	ldmia	sp!,{r4,pc}
243
@ _aes_hw_encrypt: encrypt one 16-byte block.
@
@ In:   r0 = pointer to the 16 input bytes
@       r1 = pointer to the 16 output bytes
@       r2 = key schedule; the round count is read from byte offset 240
@ Registers written: r2, r3, q0-q2, flags. No stack is used.
@ AARCH64_VALID_CALL_TARGET is a macro that is not defined in this file;
@ presumably it comes from openssl/arm_arch.h - confirm.
244.globl	_aes_hw_encrypt
245.private_extern	_aes_hw_encrypt
246#ifdef __thumb2__
247.thumb_func	_aes_hw_encrypt
248#endif
249.align	5
250_aes_hw_encrypt:
251	AARCH64_VALID_CALL_TARGET
252	ldr	r3,[r2,#240]
253	vld1.32	{q0},[r2]!
254	vld1.8	{q2},[r0]
	@ The loop handles two rounds per pass; the final two rounds are
	@ unrolled after it, hence the bias of 2.
255	sub	r3,r3,#2
256	vld1.32	{q1},[r2]!
257
	@ q2 is the block state; q0 and q1 alternate as the next round keys.
258Loop_enc:
259.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
260.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
261	vld1.32	{q0},[r2]!
262	subs	r3,r3,#2
263.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
264.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
265	vld1.32	{q1},[r2]!
266	bgt	Loop_enc
267
	@ Last two rounds: the final aese is not followed by aesmc, and the
	@ last round key is applied with a plain XOR.
268.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
269.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
270	vld1.32	{q0},[r2]
271.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
272	veor	q2,q2,q0
273
274	vst1.8	{q2},[r1]
275	bx	lr
276
@ _aes_hw_decrypt: decrypt one 16-byte block.
@
@ In:   r0 = pointer to the 16 input bytes
@       r1 = pointer to the 16 output bytes
@       r2 = key schedule; the round count is read from byte offset 240
@ Same structure as _aes_hw_encrypt, using aesd/aesimc in place of
@ aese/aesmc. Presumably the schedule must be one produced by
@ _aes_hw_set_decrypt_key - confirm against callers.
@ Registers written: r2, r3, q0-q2, flags. No stack is used.
@ AARCH64_VALID_CALL_TARGET is a macro that is not defined in this file;
@ presumably it comes from openssl/arm_arch.h - confirm.
277.globl	_aes_hw_decrypt
278.private_extern	_aes_hw_decrypt
279#ifdef __thumb2__
280.thumb_func	_aes_hw_decrypt
281#endif
282.align	5
283_aes_hw_decrypt:
284	AARCH64_VALID_CALL_TARGET
285	ldr	r3,[r2,#240]
286	vld1.32	{q0},[r2]!
287	vld1.8	{q2},[r0]
	@ The loop handles two rounds per pass; the final two rounds are
	@ unrolled after it, hence the bias of 2.
288	sub	r3,r3,#2
289	vld1.32	{q1},[r2]!
290
	@ q2 is the block state; q0 and q1 alternate as the next round keys.
291Loop_dec:
292.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
293.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
294	vld1.32	{q0},[r2]!
295	subs	r3,r3,#2
296.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
297.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
298	vld1.32	{q1},[r2]!
299	bgt	Loop_dec
300
	@ Last two rounds: the final aesd is not followed by aesimc, and the
	@ last round key is applied with a plain XOR.
301.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
302.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
303	vld1.32	{q0},[r2]
304.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
305	veor	q2,q2,q0
306
307	vst1.8	{q2},[r1]
308	bx	lr
309
@ _aes_hw_cbc_encrypt: AES-CBC encryption or decryption of whole 16-byte
@ blocks.
@
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = length in bytes. If it is below 16 the function returns
@            without writing the output or the IV; otherwise it is rounded
@            down to a multiple of 16.
@       r3 = key schedule; the round count is read from byte offset 240
@       first stack word  = pointer to the 16-byte IV (loaded into r4)
@       second stack word = direction flag (loaded into r5): zero selects
@                           the decrypt path, nonzero the encrypt path
@ Out:  the final chaining value (q6) is stored back through the IV pointer.
@ r4-r8, lr and d8-d15 are saved on entry and restored on exit.
@ NOTE(review): presumably the C prototype is
@   void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
@                           const AES_KEY *key, uint8_t *ivec, int enc)
@ - confirm against the C header.
310.globl	_aes_hw_cbc_encrypt
311.private_extern	_aes_hw_cbc_encrypt
312#ifdef __thumb2__
313.thumb_func	_aes_hw_cbc_encrypt
314#endif
315.align	5
316_aes_hw_cbc_encrypt:
	@ ip keeps the entry sp so the stack arguments can be read after the
	@ register pushes below.
317	mov	ip,sp
318	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
319	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
320	ldmia	ip,{r4,r5}		@ load remaining args
321	subs	r2,r2,#16
	@ r8 is the post-increment applied when loading input; it is forced to
	@ 0 when no further block follows so r0 does not advance past the end.
322	mov	r8,#16
323	blo	Lcbc_abort
324	moveq	r8,#0
325
	@ The flags from this compare are consumed by "beq Lcbc_dec" below;
	@ nothing in between writes the flags.
326	cmp	r5,#0			@ en- or decrypting?
327	ldr	r5,[r3,#240]
328	and	r2,r2,#-16
	@ q6 = IV / running chaining value, q0 = first input block.
329	vld1.8	{q6},[r4]
330	vld1.8	{q0},[r0],r8
331
332	vld1.32	{q8,q9},[r3]		@ load key schedule...
333	sub	r5,r5,#6
334	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
	@ r5 = rounds - 8, i.e. 2, 4 or 6 for 10, 12 or 14 rounds.
335	sub	r5,r5,#2
	@ q10-q15 and q7 hold the last seven round keys for the whole call.
336	vld1.32	{q10,q11},[r7]!
337	vld1.32	{q12,q13},[r7]!
338	vld1.32	{q14,q15},[r7]!
339	vld1.32	{q7},[r7]
340
341	add	r7,r3,#32
342	mov	r6,r5
343	beq	Lcbc_dec
344
	@ ---- encrypt path ----
345	cmp	r5,#2
346	veor	q0,q0,q6
	@ q5 = first round key XOR last round key. It is XORed into the next
	@ input block so that the aese at the top of the loop both finishes
	@ the previous block and starts the next one.
347	veor	q5,q8,q7
348	beq	Lcbc_enc128
349
	@ 12- and 14-round keys: q2/q3 hold round keys 2 and 3 permanently;
	@ r7, r6, r12, r14 and r3 are turned into pointers to round keys
	@ 1, 4, 5, 6 and 7, which are reloaded through q8/q9 on every block.
350	vld1.32	{q2,q3},[r7]
351	add	r7,r3,#16
352	add	r6,r3,#16*4
353	add	r12,r3,#16*5
354.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
355.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
356	add	r14,r3,#16*6
357	add	r3,r3,#16*7
358	b	Lenter_cbc_enc
359
360.align	4
361Loop_cbc_enc:
362.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
363.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	@ Store the ciphertext block completed on the previous pass.
364	vst1.8	{q6},[r1]!
365Lenter_cbc_enc:
366.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
367.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
368.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
369.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
370	vld1.32	{q8},[r6]
371	cmp	r5,#4
372.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
373.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
374	vld1.32	{q9},[r12]
	@ r5 == 4 means 12 rounds: skip the two extra rounds below.
375	beq	Lcbc_enc192
376
377.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
378.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
379	vld1.32	{q8},[r14]
380.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
381.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
382	vld1.32	{q9},[r3]
383	nop
384
385Lcbc_enc192:
386.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
387.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	@ The flags from this subs drive both the moveq below and the bhs at
	@ the bottom of the loop.
388	subs	r2,r2,#16
389.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
390.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
391	moveq	r8,#0
392.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
393.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
394.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
395.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	@ Load the next input block into q8 and fold q5 into it.
396	vld1.8	{q8},[r0],r8
397.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
398.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
399	veor	q8,q8,q5
400.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
401.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
402	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
403.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
404.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
405.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
	@ q6 = finished ciphertext block (state XOR last round key).
406	veor	q6,q0,q7
407	bhs	Loop_cbc_enc
408
409	vst1.8	{q6},[r1]!
410	b	Lcbc_done
411
	@ 10-round keys: every round key stays resident in registers
	@ (q8, q9, q2, q3, q10-q15, q7), except that q8 is reused for the
	@ next input block.
412.align	5
413Lcbc_enc128:
414	vld1.32	{q2,q3},[r7]
415.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
416.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
417	b	Lenter_cbc_enc128
418Loop_cbc_enc128:
419.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
420.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
421	vst1.8	{q6},[r1]!
422Lenter_cbc_enc128:
423.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
424.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
425	subs	r2,r2,#16
426.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
427.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
428	moveq	r8,#0
429.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
430.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
431.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
432.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
433.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
434.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
435	vld1.8	{q8},[r0],r8
436.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
437.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
438.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
439.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
440.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
441.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
442	veor	q8,q8,q5
443.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
444	veor	q6,q0,q7
445	bhs	Loop_cbc_enc128
446
447	vst1.8	{q6},[r1]!
448	b	Lcbc_done
	@ ---- decrypt path ----
	@ Blocks are processed three at a time in q0, q1 and q10. Copies of
	@ the ciphertext are kept in q2, q3 and q11 because each one is the
	@ chaining value for the block after it.
449.align	5
450Lcbc_dec:
451	vld1.8	{q10},[r0]!
452	subs	r2,r2,#32		@ bias
453	add	r6,r5,#2
454	vorr	q3,q0,q0
455	vorr	q1,q0,q0
456	vorr	q11,q10,q10
	@ Fewer than three blocks in total: go straight to the tail.
457	blo	Lcbc_dec_tail
458
459	vorr	q1,q10,q10
460	vld1.8	{q10},[r0]!
461	vorr	q2,q0,q0
462	vorr	q3,q1,q1
463	vorr	q11,q10,q10
464
	@ Inner loop: two rounds per pass on all three blocks, streaming the
	@ round keys through q8/q9 from r7.
465Loop3x_cbc_dec:
466.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
467.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
468.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
469.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
470.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
471.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
472	vld1.32	{q8},[r7]!
473	subs	r6,r6,#2
474.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
475.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
476.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
477.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
478.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
479.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
480	vld1.32	{q9},[r7]!
481	bgt	Loop3x_cbc_dec
482
483.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
484.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
485.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
486.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
487.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
488.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	@ q4, q5 and q9 become (chaining value XOR last round key) for the
	@ three blocks, so one veor per block finishes the decryption.
489	veor	q4,q6,q7
	@ The flags from this subs are consumed by the movlo below and by the
	@ bhs at the bottom of the outer loop.
490	subs	r2,r2,#0x30
491	veor	q5,q2,q7
492	movlo	r6,r2			@ r6 is zero at this point
493.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
494.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
495.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
496.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
497.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
498.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
499	veor	q9,q3,q7
500	add	r0,r0,r6		@ r0 is adjusted in such way that
501					@ at exit from the loop q1 and q10
502					@ are loaded with the last blocks
	@ q6 = last ciphertext block of this group = next chaining value.
503	vorr	q6,q11,q11
504	mov	r7,r3
505.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
506.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
507.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
508.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
509.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
510.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
511	vld1.8	{q2},[r0]!
512.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
513.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
514.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
515.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
516.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
517.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
518	vld1.8	{q3},[r0]!
519.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
520.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
521.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
522.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
523.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
524.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
525	vld1.8	{q11},[r0]!
526.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
527.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
528.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
529	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
530	add	r6,r5,#2
531	veor	q4,q4,q0
532	veor	q5,q5,q1
533	veor	q10,q10,q9
534	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
535	vst1.8	{q4},[r1]!
536	vorr	q0,q2,q2
537	vst1.8	{q5},[r1]!
538	vorr	q1,q3,q3
539	vst1.8	{q10},[r1]!
540	vorr	q10,q11,q11
541	bhs	Loop3x_cbc_dec
542
	@ r2 == -0x30 here means nothing is left over.
543	cmn	r2,#0x30
544	beq	Lcbc_done
545	nop
546
	@ Tail: one or two remaining blocks, held in q10 (one block) or in
	@ q1 and q10 (two blocks). Both registers are always processed.
547Lcbc_dec_tail:
548.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
549.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
550.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
551.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
552	vld1.32	{q8},[r7]!
553	subs	r6,r6,#2
554.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
555.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
556.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
557.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
558	vld1.32	{q9},[r7]!
559	bgt	Lcbc_dec_tail
560
561.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
562.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
563.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
564.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
565.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
566.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
567.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
568.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
569.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
570.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
571.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
572.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	@ r2 == -0x20 means exactly one block remains; the flags are consumed
	@ by "beq Lcbc_dec_one" below.
573	cmn	r2,#0x20
574.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
575.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
576.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
577.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
578	veor	q5,q6,q7
579.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
580.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
581.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
582.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
583	veor	q9,q3,q7
584.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
585.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
586	beq	Lcbc_dec_one
	@ Two blocks remain.
587	veor	q5,q5,q1
588	veor	q9,q9,q10
589	vorr	q6,q11,q11
590	vst1.8	{q5},[r1]!
591	vst1.8	{q9},[r1]!
592	b	Lcbc_done
593
594Lcbc_dec_one:
595	veor	q5,q5,q10
596	vorr	q6,q11,q11
597	vst1.8	{q5},[r1]!
598
	@ Write the final chaining value back through the IV pointer.
599Lcbc_done:
600	vst1.8	{q6},[r4]
601Lcbc_abort:
602	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
603	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
604
@ _aes_hw_ctr32_encrypt_blocks: AES in counter mode over whole 16-byte
@ blocks, with a 32-bit counter.
@
@ In:   r0 = input pointer
@       r1 = output pointer
@       r2 = number of 16-byte blocks (not bytes)
@       r3 = key schedule; the round count is read from byte offset 240
@       first stack word = pointer to the 16-byte counter block (r4)
@
@ The counter is the 32-bit word at byte offset 12 of the counter block. It
@ is kept in r8 in byte-reversed form (the initial rev is skipped when
@ __ARMEB__ is defined), incremented once per block, and each value is
@ passed through rev again before being inserted into lane d13[1].
@ The counter block in memory is only read; nothing is stored through r4.
@ r4-r10, lr and d8-d15 are saved on entry and restored on exit.
@ NOTE(review): r2 = 0 is not special-cased. Control reaches Lctr32_tail,
@ which then stores two blocks through r1. Callers presumably guarantee
@ r2 > 0 - confirm.
@ NOTE(review): presumably the C prototype is
@   void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
@                                    size_t blocks, const AES_KEY *key,
@                                    const uint8_t ivec[16])
@ - confirm against the C header.
605.globl	_aes_hw_ctr32_encrypt_blocks
606.private_extern	_aes_hw_ctr32_encrypt_blocks
607#ifdef __thumb2__
608.thumb_func	_aes_hw_ctr32_encrypt_blocks
609#endif
610.align	5
611_aes_hw_ctr32_encrypt_blocks:
	@ ip keeps the entry sp so the stack argument can be read after the
	@ register pushes below.
612	mov	ip,sp
613	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
614	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
615	ldr	r4, [ip]		@ load remaining arg
616	ldr	r5,[r3,#240]
617
	@ r8 = counter word, q0 = whole counter block.
618	ldr	r8, [r4, #12]
619	vld1.32	{q0},[r4]
620
621	vld1.32	{q8,q9},[r3]		@ load key schedule...
622	sub	r5,r5,#4
	@ r12 is the input post-increment used in the tail; it is zeroed by the
	@ movlo below when fewer than two blocks were requested.
623	mov	r12,#16
	@ The flags from this compare are consumed by the movlo and the bls
	@ further down; nothing in between writes the flags.
624	cmp	r2,#2
625	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
	@ r5 = rounds - 6: the number of rounds run by the two-round inner loops.
626	sub	r5,r5,#2
	@ q12-q15 and q7 hold the last five round keys for the whole call.
627	vld1.32	{q12,q13},[r7]!
628	vld1.32	{q14,q15},[r7]!
629	vld1.32	{q7},[r7]
630	add	r7,r3,#32
631	mov	r6,r5
632	movlo	r12,#0
633
634	@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
635	@ affected by silicon errata #1742098 [0] and #1655431 [1],
636	@ respectively, where the second instruction of an aese/aesmc
637	@ instruction pair may execute twice if an interrupt is taken right
638	@ after the first instruction consumes an input register of which a
639	@ single 32-bit lane has been updated the last time it was modified.
640	@
641	@ This function uses a counter in one 32-bit lane. The vmov.32
642	@ instructions could write to q1 and q10 directly, but that trips this bug.
643	@ We write to q6 and copy to the final register as a workaround.
644	@
645	@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
646	@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
647#ifndef __ARMEB__
648	rev	r8, r8
649#endif
	@ q0 = counter block as loaded, q1 = counter + 1, q10 = counter + 2.
	@ q6 is the staging register: only q6 receives single-lane writes.
650	add	r10, r8, #1
651	vorr	q6,q0,q0
652	rev	r10, r10
653	vmov.32	d13[1],r10
654	add	r8, r8, #2
655	vorr	q1,q6,q6
	@ One or two blocks requested (r2 <= 2): use the two-block tail.
656	bls	Lctr32_tail
657	rev	r12, r8
658	vmov.32	d13[1],r12
659	sub	r2,r2,#3		@ bias
660	vorr	q10,q6,q6
661	b	Loop3x_ctr32
662
	@ Main loop: three counter blocks (q0, q1, q10) are encrypted in
	@ parallel. The inner loop runs two rounds per pass, streaming round
	@ keys through q8/q9 from r7.
663.align	4
664Loop3x_ctr32:
665.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
666.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
667.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
668.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
669.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
670.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
671	vld1.32	{q8},[r7]!
672	subs	r6,r6,#2
673.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
674.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
675.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
676.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
677.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
678.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
679	vld1.32	{q9},[r7]!
680	bgt	Loop3x_ctr32
681
	@ From here the three states move to q4, q5 and q9, which frees
	@ q0, q1 and q10 to receive the next three counter values while the
	@ remaining rounds are still in flight.
682.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
683.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
684.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
685.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
686	vld1.8	{q2},[r0]!
687	add	r9,r8,#1
688.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
689.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
690	vld1.8	{q3},[r0]!
691	rev	r9,r9
692.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
693.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
694.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
695.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
696	vld1.8	{q11},[r0]!
697	mov	r7,r3
698.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
699.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
700.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
701.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
702.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
703.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
	@ The last round key (q7) is folded into the input blocks q2, q3 and
	@ q11, so a single veor with the state finishes each output block.
704	veor	q2,q2,q7
705	add	r10,r8,#2
706.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
707.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
708	veor	q3,q3,q7
709	add	r8,r8,#3
710.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
711.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
712.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
713.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
714	 @ Note the logic to update q0, q1, and q10 is written to work
715	 @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
716	 @ 32-bit mode. See the comment above.
717	veor	q11,q11,q7
718	vmov.32	d13[1], r9
719.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
720.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
721	vorr	q0,q6,q6
722	rev	r10,r10
723.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
724.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
725	vmov.32	d13[1], r10
726	rev	r12,r8
727.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
728.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
729	vorr	q1,q6,q6
730	vmov.32	d13[1], r12
731.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
732.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
733	vorr	q10,q6,q6
	@ The flags from this subs are consumed by the bhs at the bottom of the
	@ loop; nothing in between writes the flags.
734	subs	r2,r2,#3
735.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
736.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
737.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
738
739	veor	q2,q2,q4
740	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
741	vst1.8	{q2},[r1]!
742	veor	q3,q3,q5
743	mov	r6,r5
744	vst1.8	{q3},[r1]!
745	veor	q11,q11,q9
746	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
747	vst1.8	{q11},[r1]!
748	bhs	Loop3x_ctr32
749
	@ Undo the bias: r2 = blocks left over (0, 1 or 2).
750	adds	r2,r2,#3
751	beq	Lctr32_done
	@ With a single block left, r12 = 0 so the second input load in the
	@ tail re-reads the same block instead of reading past the input.
752	cmp	r2,#1
753	mov	r12,#16
754	moveq	r12,#0
755
	@ Tail: always encrypts two counter blocks (q0, q1); the second output
	@ block is stored only when r2 is not 1.
756Lctr32_tail:
757.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
758.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
759.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
760.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
761	vld1.32	{q8},[r7]!
762	subs	r6,r6,#2
763.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
764.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
765.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
766.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
767	vld1.32	{q9},[r7]!
768	bgt	Lctr32_tail
769
770.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
771.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
772.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
773.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
774.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
775.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
776.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
777.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
778	vld1.8	{q2},[r0],r12
779.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
780.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
781.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
782.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
783	vld1.8	{q3},[r0]
784.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
785.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
786.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
787.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
788	veor	q2,q2,q7
789.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
790.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
791.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
792.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
793	veor	q3,q3,q7
794.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
795.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
796
797	cmp	r2,#1
798	veor	q2,q2,q0
799	veor	q3,q3,q1
800	vst1.8	{q2},[r1]!
801	beq	Lctr32_done
802	vst1.8	{q3},[r1]
803
804Lctr32_done:
805	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
806	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
807
808#endif
809#endif  // !OPENSSL_NO_ASM
810