// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The Perl script that generates this ppc64 assembly can be found in the
// cryptogams repository at the link below. It is based on the original
// from OpenSSL.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this implementation and the original are due to
// the calling conventions and the initialization of constants.

// +build !gccgo,!purego

#include "textflag.h"

#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE  R16
#define BLOCKS R17

DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0
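// Layout of the consts table above (little-endian doublewords):
//   0x00-0x0f: the ChaCha "expand 32-byte k" constants (loaded once into V16)
//   0x50-0x8f: the same four constant words, each splatted across a full
//              vector (loaded into V0-V3 at the top of every outer iteration)
//   0x90-0x9f: the words {0, 1, 2, 3}, used as per-lane block counter offsets
// The entries at 0x10-0x4f (counter increments and byte-shuffle masks) are
// carried over from the original CRYPTOGAMS module and appear to be unused
// in this VSX-only routine.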

// func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
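// chaCha20_ctr32_vsx XORs len bytes at inp with the ChaCha20 keystream and
// writes the result to out, generating four 64-byte blocks per outer
// iteration with VSX. counter points at the 16-byte counter/nonce block;
// its counter word is advanced by len/64 on return. A matching Go-side
// declaration would presumably look like the sketch below (an assumption,
// not the package's actual source):
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)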
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28
	LXVW4X (CONSTBASE)(R11), VS60

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26
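	// V26 now holds the per-lane block counters, counter+{0,1,2,3} (V28 was
	// loaded with {0,1,2,3} from consts+0x90). The VSLDOI pair above shifted
	// V19 to {0, nonce0, nonce1, nonce2}, zeroing its counter slot since the
	// per-block counters are carried in V26 instead.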

	MOVD $10, R14
	MOVD R14, CTR
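	// CTR = 10: each pass of loop_vsx below performs a column round followed
	// by a diagonal round, so 10 iterations give the full 20 ChaCha20 rounds.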

loop_outer_vsx:
	// V0, V1, V2, V3: the four constant words, one splatted per register
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// copy the per-lane block counters V26 -> V12
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

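	// V27-V30 hold the quarter-round rotate amounts 16, 12, 8 and 7 (VRLW uses
	// only the low five bits of each element, so the $-16 splat rotates by 16).
	// The state is lane-sliced: V0-V15 each hold one of the 16 ChaCha state
	// words for four blocks in parallel, one block per vector lane.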
loop_vsx:
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BC   16, LT, loop_vsx

	VADDUWM V12, V26, V12

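	// The VMRGEW/VMRGOW + XXPERMDI sequence below transposes the lane-sliced
	// state so that V0,V4,V8,V12 end up holding the 16 words of the first
	// block, V1,V5,V9,V13 the second, and so on. The VMRGEW/VMRGOW
	// instructions are emitted as raw WORD opcodes, presumably because the
	// assembler used at the time had no mnemonics for them.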
	WORD $0x13600F8C		// VMRGEW V0, V1, V27
	WORD $0x13821F8C		// VMRGEW V2, V3, V28

	WORD $0x10000E8C		// VMRGOW V0, V1, V0
	WORD $0x10421E8C		// VMRGOW V2, V3, V2

	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
	WORD $0x13C63F8C		// VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C		// VMRGOW V4, V5, V4
	WORD $0x10C63E8C		// VMRGOW V6, V7, V6

	WORD $0x13684F8C		// VMRGEW V8, V9, V27
	WORD $0x138A5F8C		// VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C		// VMRGOW V8, V9, V8
	WORD $0x114A5E8C		// VMRGOW V10, V11, V10

	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14

	VSPLTISW $4, V27
	VADDUWM V26, V27, V26
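	// Advance the per-lane block counters by 4 for the next group of four blocks.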

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12
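	// Add the input state (constants in V16, key in V17/V18, nonce in V19; the
	// block counter was already folded in via V26 above) to finish the
	// keystream for the first 64-byte block. The three unrolled sections below
	// do the same for the remaining blocks of this group.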

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62
	VXOR   V27, V0, V27

	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment the counter by the number of 64-byte blocks processed.
	MOVD (CNT), R14
	ADD  BLOCKS, R14
	MOVD R14, (CNT)
	RET

tail_vsx:
	ADD  $32, R1, R11
	MOVD LEN, CTR
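	// R11 points at scratch space on the stack where the remaining keystream
	// is spilled; CTR holds the number of remaining bytes (LEN < 64 here) and
	// drives the byte-at-a-time loop below.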

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
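	// The pointers are pre-decremented because MOVBZU/MOVBU below use update
	// (pre-increment) addressing with a displacement of 1.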

looptail_vsx:
	// Copy the result to OUT byte by byte.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BC     16, LT, looptail_vsx

	// Clear the keystream values from the stack.
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx