# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# vs0 - vs15              : buffer for xor
# vs32 - vs47 (v0 - v15)  : 4 "converted" states
# vs48 - vs51 (v16 - v19) : original state
# vs52 - vs55 (v20 - v23) : "converted" constants
# vs56 (v24)              : "converted" counter
# vs57 (v25)              : increment for "converted" counter
# vs60 - vs63 (v28 - v31) : constants for rotate left or vpermxor

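# Layout note: the main loop keeps four 64-byte ChaCha20 states in
# v0 - v15 with one state word per vector register, so every vector
# instruction advances all four blocks at once ("converted" = word i
# of each block gathered into one vector).
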
#define r0	0
#define sp	1
#define r2	2
#define rSIZE	3
#define rDST	4
#define rSRC	5
#define rKEY	6
#define rNONCE	7
#define rCNTR	8
#define r9	9
#define r10	10
#define r11	11
#define r12	12
#define r13	13
#define r14	14
#define r15	15
#define r16	16
#define r17	17
#define r18	18
#define r19	19
#define r20	20
#define r21	21
#define r22	22
#define r23	23
#define r24	24
#define r25	25
#define r26	26
#define r27	27
#define r28	28
#define r29	29
#define r30	30
#define r31	31

#define v0	0
#define v1	1
#define v2	2
#define v3	3
#define v4	4
#define v5	5
#define v6	6
#define v7	7
#define v8	8
#define v9	9
#define v10	10
#define v11	11
#define v12	12
#define v13	13
#define v14	14
#define v15	15
#define v16	16
#define v17	17
#define v18	18
#define v19	19
#define v20	20
#define v21	21
#define v22	22
#define v23	23
#define v24	24
#define v25	25
#define v26	26
#define v27	27
#define v28	28
#define v29	29
#define v30	30
#define v31	31

#define vs0	0
#define vs1	1
#define vs2	2
#define vs3	3
#define vs4	4
#define vs5	5
#define vs6	6
#define vs7	7
#define vs8	8
#define vs9	9
#define vs10	10
#define vs11	11
#define vs12	12
#define vs13	13
#define vs14	14
#define vs15	15
#define vs16	16
#define vs17	17
#define vs18	18
#define vs19	19
#define vs20	20
#define vs21	21
#define vs22	22
#define vs23	23
#define vs24	24
#define vs25	25
#define vs26	26
#define vs27	27
#define vs28	28
#define vs29	29
#define vs30	30
#define vs31	31
#define vs32	32
#define vs33	33
#define vs34	34
#define vs35	35
#define vs36	36
#define vs37	37
#define vs38	38
#define vs39	39
#define vs40	40
#define vs41	41
#define vs42	42
#define vs43	43
#define vs44	44
#define vs45	45
#define vs46	46
#define vs47	47
#define vs48	48
#define vs49	49
#define vs50	50
#define vs51	51
#define vs52	52
#define vs53	53
#define vs54	54
#define vs55	55
#define vs56	56
#define vs57	57
#define vs58	58
#define vs59	59
#define vs60	60
#define vs61	61
#define vs62	62
#define vs63	63

.abiversion 2
.section ".data"
.align 5
lblock:	.skip 256
cnts0:	.long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
cnts1:	.long 0x61707865, 0x61707865, 0x61707865, 0x61707865
cnts2:	.long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e
cnts3:	.long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32
cnts4:	.long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574
st4:	.long 0, 0, 0, 0
cntr:	.long 0, 0, 0, 0
incr:	.long 4, 4, 4, 4
rotl1:	.long 0x22330011, 0x66774455, 0xAABB8899, 0xEEFFCCDD
rotl2:	.long 12, 12, 12, 12
rotl3:	.long 0x11223300, 0x55667744, 0x99AABB88, 0xDDEEFFCC
rotl4:	.long 7, 7, 7, 7
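
# Data layout, relative to cnts0:
#   cnts0 (+0)    : the ChaCha20 constant "expand 32-byte k"
#   cnts1-4 (+16) : each constant word broadcast to all four lanes
#   st4 (+80)     : word 0 stays zero; words 1-3 receive the nonce
#   cntr (+96)    : per-lane block counters, filled in at runtime
#   incr (+112)   : counter step of 4 (four blocks per iteration)
#   rotl1/rotl3   : vpermxor control vectors that fuse "d ^= a" with a
#                   rotate left by 16 and 8 bits respectively
#   rotl2/rotl4   : plain vrlw rotate amounts (12 and 7 bits)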

.section ".text"
.align 5
.globl chacha20vsx
.type chacha20vsx, @function
chacha20vsx:
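	# Judging from the register aliases (r3-r8), the corresponding C
	# prototype is presumably:
	#   void chacha20vsx(size_t size, uint8_t *dst, const uint8_t *src,
	#                    const uint8_t *key, const uint8_t *nonce,
	#                    uint32_t counter);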
	# prologue
	addis	2, r12, .TOC.-chacha20vsx@ha
	addi	2, 2, .TOC.-chacha20vsx@l
	.localentry	chacha20vsx, .-chacha20vsx
	std	r14, -8(sp)
	std	r15, -16(sp)
	std	r16, -24(sp)
	std	r17, -32(sp)
	std	r18, -40(sp)
	std	r19, -48(sp)
	std	r20, -56(sp)
	std	r21, -64(sp)
	std	r22, -72(sp)
	std	r23, -80(sp)
	std	r24, -88(sp)
	std	r25, -96(sp)
	std	r26, -104(sp)
	std	r27, -112(sp)
	std	r28, -120(sp)
	std	r29, -128(sp)
	std	r30, -136(sp)
	std	r31, -144(sp)
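	# Leaf routine: the nonvolatile GPRs above (and vs14/vs15 and
	# v20 - v31 below) are saved in the area below the stack pointer
	# rather than in an allocated stack frame.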

	addi	r14, sp, -160

	li	r16, -16
	li	r17, -32
	li	r18, -48
	li	r19, -64
	li	r20, -80
	li	r21, -96
	li	r22, -112
	li	r23, -128
	li	r24, -144
	li	r25, -160
	li	r26, -176
	li	r27, -192
	li	r28, -208

	# save vs14, vs15 (these overlap the nonvolatile f14, f15)
	stxvw4x	vs14, 0, r14
	stxvw4x	vs15, r16, r14

	# save v20 - v31
	stxvw4x	vs52, r17, r14
	stxvw4x	vs53, r18, r14
	stxvw4x	vs54, r19, r14
	stxvw4x	vs55, r20, r14
	stxvw4x	vs56, r21, r14
	stxvw4x	vs57, r22, r14
	stxvw4x	vs58, r23, r14
	stxvw4x	vs59, r24, r14
	stxvw4x	vs60, r25, r14
	stxvw4x	vs61, r26, r14
	stxvw4x	vs62, r27, r14
	stxvw4x	vs63, r28, r14

	# offsets into src/dst
	li	r17, 16
	li	r18, 32
	li	r19, 48
	li	r20, 64
	li	r21, 80
	li	r22, 96
	li	r23, 112
	li	r24, 128
	li	r25, 144
	li	r26, 160
	li	r27, 176
	li	r28, 192
	li	r29, 208
	li	r30, 224
	li	r31, 240

	# load the constants' address
	addis	r14, 2, cnts0@toc@ha
	addi	r14, r14, cnts0@toc@l

	# save nonce to st4
	lwz	r15, 0(rNONCE)
	stw	r15, 84(r14)
	lwz	r15, 4(rNONCE)
	stw	r15, 88(r14)
	lwz	r15, 8(rNONCE)
	stw	r15, 92(r14)

	# load state to vectors
	lxvw4x	vs48, 0, r14
	lxvw4x	vs49, 0, rKEY
	lxvw4x	vs50, r17, rKEY
	lxvw4x	vs51, r21, r14

	# load consts for x4 rounds
	lxvw4x	vs52, r17, r14
	lxvw4x	vs53, r18, r14
	lxvw4x	vs54, r19, r14
	lxvw4x	vs55, r20, r14

	# counter
	stw	rCNTR, 96(r14)
	addi	rCNTR, rCNTR, 1
	stw	rCNTR, 100(r14)
	addi	rCNTR, rCNTR, 1
	stw	rCNTR, 104(r14)
	addi	rCNTR, rCNTR, 1
	stw	rCNTR, 108(r14)
	lxvw4x	vs56, r22, r14
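	# the four counter lanes now hold rCNTR .. rCNTR+3, so each
	# iteration of the main loop produces four consecutive blocks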

	# load increment
	lxvw4x	vs57, r23, r14

	# load rotl constants to vectors
	lxvw4x	vs60, r24, r14
	lxvw4x	vs61, r25, r14
	lxvw4x	vs62, r26, r14
	lxvw4x	vs63, r27, r14

	# loop count = size / 256 (number of full 4-block chunks)
	li	r15, 256
	divdu.	r16, rSIZE, r15
	beq	lastblock
	mtctr	r16
mainloop:
	# init 16 vectors (4 states x4)
	vor	v0, v20, v20
	vor	v1, v21, v21
	vor	v2, v22, v22
	vor	v3, v23, v23
	vspltw	v4, v17, v0
	vspltw	v5, v17, v1
	vspltw	v6, v17, v2
	vspltw	v7, v17, v3
	vspltw	v8, v18, v0
	vspltw	v9, v18, v1
	vspltw	v10, v18, v2
	vspltw	v11, v18, v3
	vor	v12, v24, v24
	vspltw	v13, v19, v1
	vspltw	v14, v19, v2
	vspltw	v15, v19, v3
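	# v0-v3 copy the broadcast constants, v4-v11 splat the key words
	# from v17/v18, v12 copies the per-lane counters, and v13-v15 splat
	# the nonce words from v19 (the vN aliases are plain numbers, so
	# v0-v3 double as the vspltw word-index immediates)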
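# The round macros compute vector-register numbers arithmetically:
# "a" names the first register of a row (v0, v4, v8, v12) and the macro
# touches all four registers of that row, while "b_y"/"b_x" pick the
# partner row plus a per-column lane rotation, so the same macros serve
# column rounds (b_x = v0, no rotation) and diagonal rounds (b_x = v1,
# each column shifted by one).
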
.macro _plus a b_y b_x
	vadduwm	\a,   \a,   \b_y*4+(\b_x)%4
	vadduwm	\a+1, \a+1, \b_y*4+(\b_x+1)%4
	vadduwm	\a+2, \a+2, \b_y*4+(\b_x+2)%4
	vadduwm	\a+3, \a+3, \b_y*4+(\b_x+3)%4
.endm

.macro _xor a b_y b_x
	vxor	\a,   \a,   \b_y*4+(\b_x)%4
	vxor	\a+1, \a+1, \b_y*4+(\b_x+1)%4
	vxor	\a+2, \a+2, \b_y*4+(\b_x+2)%4
	vxor	\a+3, \a+3, \b_y*4+(\b_x+3)%4
.endm

.macro _rotl a b
	vrlw	\a,   \a,   \b
	vrlw	\a+1, \a+1, \b
	vrlw	\a+2, \a+2, \b
	vrlw	\a+3, \a+3, \b
.endm

.macro _pxor a b_y b_x c
	vpermxor	\a,   \a,   \b_y*4+(\b_x)%4,   \c
	vpermxor	\a+1, \a+1, \b_y*4+(\b_x+1)%4, \c
	vpermxor	\a+2, \a+2, \b_y*4+(\b_x+2)%4, \c
	vpermxor	\a+3, \a+3, \b_y*4+(\b_x+3)%4, \c
.endm

#  00  01  02  03
#  04  05  06  07
#  08  09  10  11
#  12  13  14  15
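# Reference quarter round (scalar form), applied here to four blocks
# per register and four columns/diagonals per macro call:
#   a += b; d ^= a; d <<<= 16;
#   c += d; b ^= c; b <<<= 12;
#   a += b; d ^= a; d <<<= 8;
#   c += d; b ^= c; b <<<= 7;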
.macro doubleround
	# column round
	_plus	v0,  v1, v0       # a+=b
	_pxor	v12, v0, v0, v28  # d^=a; d<<<=16
	_plus	v8,  v3, v0       # c+=d
	_xor	v4,  v2, v0       # b^=c
	_rotl	v4,  v29          # b<<<=12
	_plus	v0,  v1, v0       # a+=b
	_pxor	v12, v0, v0, v30  # d^=a; d<<<=8
	_plus	v8,  v3, v0       # c+=d
	_xor	v4,  v2, v0       # b^=c
	_rotl	v4,  v31          # b<<<=7

	# diagonal round
	_plus	v0,  v1, v1       # a+=b
	_pxor	v12, v0, v1, v28  # d^=a; d<<<=16
	_plus	v8,  v3, v1       # c+=d
	_xor	v4,  v2, v1       # b^=c
	_rotl	v4,  v29          # b<<<=12
	_plus	v0,  v1, v1       # a+=b
	_pxor	v12, v0, v1, v30  # d^=a; d<<<=8
	_plus	v8,  v3, v1       # c+=d
	_xor	v4,  v2, v1       # b^=c
	_rotl	v4,  v31          # b<<<=7
.endm

	doubleround # 1
	doubleround # 2
	doubleround # 3
	doubleround # 4
	doubleround # 5
	doubleround # 6
	doubleround # 7
	doubleround # 8
	doubleround # 9
	doubleround # 10
	# feed-forward for the counter word: st4 word 0 is zero, so the
	# per-lane counters are added here rather than in "addition" below
	vadduwm	v12, v12, v24
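# convert: transpose a 4x4 matrix of 32-bit words back to block order.
# vmrgew/vmrgow interleave the even/odd word lanes (v26/v27, i.e.
# vs58/vs59, serve as scratch), then xxmrghd/xxmrgld merge the high/low
# doublewords, so each of the four registers ends up holding one
# contiguous 16-byte row of one block.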
.macro convert a
	vmrgew	26, 0+\a, 1+\a
	vmrgew	27, 2+\a, 3+\a
	vmrgow	0+\a, 0+\a, 1+\a
	vmrgow	2+\a, 2+\a, 3+\a
	xxmrghd	33+\a, 32+\a, 34+\a
	xxmrgld	35+\a, 32+\a, 34+\a
	xxmrghd	32+\a, 58, 59
	xxmrgld	34+\a, 58, 59
.endm

	convert 0
	convert 4
	convert 8
	convert 12

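# addition: the ChaCha20 feed-forward, adding the original input state
# (v16 - v19) to output block "a" of the four transposed blocks.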
.macro addition a
	vadduwm	0+\a, 0+\a, 16
	vadduwm	4+\a, 4+\a, 17
	vadduwm	8+\a, 8+\a, 18
	vadduwm	12+\a, 12+\a, 19
.endm

	addition 0
	addition 1
	addition 2
	addition 3

	# load text/cipher
	lxvw4x	vs0, 0, rSRC
	lxvw4x	vs1, r17, rSRC
	lxvw4x	vs2, r18, rSRC
	lxvw4x	vs3, r19, rSRC
	lxvw4x	vs4, r20, rSRC
	lxvw4x	vs5, r21, rSRC
	lxvw4x	vs6, r22, rSRC
	lxvw4x	vs7, r23, rSRC
	lxvw4x	vs8, r24, rSRC
	lxvw4x	vs9, r25, rSRC
	lxvw4x	vs10, r26, rSRC
	lxvw4x	vs11, r27, rSRC
	lxvw4x	vs12, r28, rSRC
	lxvw4x	vs13, r29, rSRC
	lxvw4x	vs14, r30, rSRC
	lxvw4x	vs15, r31, rSRC
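	# note the interleaved keystream order: vs32/vs36/vs40/vs44 hold
	# the four rows of block 0, vs33/vs37/vs41/vs45 of block 1, etc.,
	# matching the convert/transpose output above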
	# xor (encrypt/decrypt)
	xxlxor	vs0, vs0, vs32
	xxlxor	vs1, vs1, vs36
	xxlxor	vs2, vs2, vs40
	xxlxor	vs3, vs3, vs44
	xxlxor	vs4, vs4, vs33
	xxlxor	vs5, vs5, vs37
	xxlxor	vs6, vs6, vs41
	xxlxor	vs7, vs7, vs45
	xxlxor	vs8, vs8, vs34
	xxlxor	vs9, vs9, vs38
	xxlxor	vs10, vs10, vs42
	xxlxor	vs11, vs11, vs46
	xxlxor	vs12, vs12, vs35
	xxlxor	vs13, vs13, vs39
	xxlxor	vs14, vs14, vs43
	xxlxor	vs15, vs15, vs47
	# store cipher/text
	stxvw4x	vs0, 0, rDST
	stxvw4x	vs1, r17, rDST
	stxvw4x	vs2, r18, rDST
	stxvw4x	vs3, r19, rDST
	stxvw4x	vs4, r20, rDST
	stxvw4x	vs5, r21, rDST
	stxvw4x	vs6, r22, rDST
	stxvw4x	vs7, r23, rDST
	stxvw4x	vs8, r24, rDST
	stxvw4x	vs9, r25, rDST
	stxvw4x	vs10, r26, rDST
	stxvw4x	vs11, r27, rDST
	stxvw4x	vs12, r28, rDST
	stxvw4x	vs13, r29, rDST
	stxvw4x	vs14, r30, rDST
	stxvw4x	vs15, r31, rDST

	# src/dst increment
	addi	rSRC, rSRC, 256
	addi	rDST, rDST, 256

	# counter increment
	vadduwm	v24, v24, v25

	bdnz	mainloop

lastblock:
	# remainder = size mod 256
	mulld	r16, r16, r15
	subf.	r16, r16, rSIZE

	# check remainder
	beq	exitsub

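	# r14 - 256 points at lblock, a 256-byte scratch buffer: generate
	# one full 4-block keystream chunk there, XOR only the remaining
	# bytes into dst, then wipe the buffer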
	addi	r14, r14, -256
	# last block x4
	# init 16 vectors (4 states x4)
	vor	v0, v20, v20
	vor	v1, v21, v21
	vor	v2, v22, v22
	vor	v3, v23, v23
	vspltw	v4, v17, v0
	vspltw	v5, v17, v1
	vspltw	v6, v17, v2
	vspltw	v7, v17, v3
	vspltw	v8, v18, v0
	vspltw	v9, v18, v1
	vspltw	v10, v18, v2
	vspltw	v11, v18, v3
	vor	v12, v24, v24
	vspltw	v13, v19, v1
	vspltw	v14, v19, v2
	vspltw	v15, v19, v3

	doubleround # 1
	doubleround # 2
	doubleround # 3
	doubleround # 4
	doubleround # 5
	doubleround # 6
	doubleround # 7
	doubleround # 8
	doubleround # 9
	doubleround # 10

	vadduwm	v12, v12, v24

	convert 0
	convert 4
	convert 8
	convert 12

	addition 0
	addition 1
	addition 2
	addition 3

	# store keystream vectors to lblock
	stxvw4x	vs32, 0, r14
	stxvw4x	vs36, r17, r14
	stxvw4x	vs40, r18, r14
	stxvw4x	vs44, r19, r14
	stxvw4x	vs33, r20, r14
	stxvw4x	vs37, r21, r14
	stxvw4x	vs41, r22, r14
	stxvw4x	vs45, r23, r14
	stxvw4x	vs34, r24, r14
	stxvw4x	vs38, r25, r14
	stxvw4x	vs42, r26, r14
	stxvw4x	vs46, r27, r14
	stxvw4x	vs35, r28, r14
	stxvw4x	vs39, r29, r14
	stxvw4x	vs43, r30, r14
	stxvw4x	vs47, r31, r14

	mtctr	r16
	addi	rSIZE, r14, -1
	addi	rSRC, rSRC, -1
	addi	rDST, rDST, -1
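	# byte-wise tail loop: rSIZE now walks the keystream in lblock and
	# rSRC/rDST walk the message; all three use pre-incremented
	# accesses (lbzu/stbu), hence the -1 bias above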
xorlast:
	lbzu	r15, 1(rSIZE)
	lbzu	r16, 1(rSRC)
	xor	r15, r15, r16
	stbu	r15, 1(rDST)
	bdnz	xorlast

	# wipe the keystream from the scratch block
	xxlxor	vs0, vs0, vs0
	stxvw4x	vs0, 0, r14
	stxvw4x	vs0, r17, r14
	stxvw4x	vs0, r18, r14
	stxvw4x	vs0, r19, r14
	stxvw4x	vs0, r20, r14
	stxvw4x	vs0, r21, r14
	stxvw4x	vs0, r22, r14
	stxvw4x	vs0, r23, r14
	stxvw4x	vs0, r24, r14
	stxvw4x	vs0, r25, r14
	stxvw4x	vs0, r26, r14
	stxvw4x	vs0, r27, r14
	stxvw4x	vs0, r28, r14
	stxvw4x	vs0, r29, r14
	stxvw4x	vs0, r30, r14
	stxvw4x	vs0, r31, r14

exitsub:
	# zeroing volatile registers
	xxlxor	vs0, vs0, vs0
	xxlxor	vs1, vs1, vs1
	xxlxor	vs2, vs2, vs2
	xxlxor	vs3, vs3, vs3
	xxlxor	vs4, vs4, vs4
	xxlxor	vs5, vs5, vs5
	xxlxor	vs6, vs6, vs6
	xxlxor	vs7, vs7, vs7
	xxlxor	vs8, vs8, vs8
	xxlxor	vs9, vs9, vs9
	xxlxor	vs10, vs10, vs10
	xxlxor	vs11, vs11, vs11
	xxlxor	vs12, vs12, vs12
	xxlxor	vs13, vs13, vs13

	xxlxor	vs32, vs32, vs32
	xxlxor	vs33, vs33, vs33
	xxlxor	vs34, vs34, vs34
	xxlxor	vs35, vs35, vs35
	xxlxor	vs36, vs36, vs36
	xxlxor	vs37, vs37, vs37
	xxlxor	vs38, vs38, vs38
	xxlxor	vs39, vs39, vs39
	xxlxor	vs40, vs40, vs40
	xxlxor	vs41, vs41, vs41
	xxlxor	vs42, vs42, vs42
	xxlxor	vs43, vs43, vs43
	xxlxor	vs44, vs44, vs44
	xxlxor	vs45, vs45, vs45
	xxlxor	vs46, vs46, vs46
	xxlxor	vs47, vs47, vs47
	xxlxor	vs48, vs48, vs48
	xxlxor	vs49, vs49, vs49
	xxlxor	vs50, vs50, vs50
	xxlxor	vs51, vs51, vs51

	li	rSIZE, 0
	li	rDST, 0
	li	rSRC, 0
	li	rKEY, 0
	li	rNONCE, 0
	li	rCNTR, 0

	# epilogue
	addi	r14, sp, -160

	li	r16, -16
	li	r17, -32
	li	r18, -48
	li	r19, -64
	li	r20, -80
	li	r21, -96
	li	r22, -112
	li	r23, -128
	li	r24, -144
	li	r25, -160
	li	r26, -176
	li	r27, -192
	li	r28, -208

	# restore vs14, vs15 (f14, f15)
	lxvw4x	vs14, 0, r14
	lxvw4x	vs15, r16, r14

	# restore v20 - v31
	lxvw4x	vs52, r17, r14
	lxvw4x	vs53, r18, r14
	lxvw4x	vs54, r19, r14
	lxvw4x	vs55, r20, r14
	lxvw4x	vs56, r21, r14
	lxvw4x	vs57, r22, r14
	lxvw4x	vs58, r23, r14
	lxvw4x	vs59, r24, r14
	lxvw4x	vs60, r25, r14
	lxvw4x	vs61, r26, r14
	lxvw4x	vs62, r27, r14
	lxvw4x	vs63, r28, r14

	ld	r14, -8(sp)
	ld	r15, -16(sp)
	ld	r16, -24(sp)
	ld	r17, -32(sp)
	ld	r18, -40(sp)
	ld	r19, -48(sp)
	ld	r20, -56(sp)
	ld	r21, -64(sp)
	ld	r22, -72(sp)
	ld	r23, -80(sp)
	ld	r24, -88(sp)
	ld	r25, -96(sp)
	ld	r26, -104(sp)
	ld	r27, -112(sp)
	ld	r28, -120(sp)
	ld	r29, -128(sp)
	ld	r30, -136(sp)
	ld	r31, -144(sp)

	blr