// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates this ppc64 assembler can be found
// in the cryptogams repository at the link below. It is based on the original
// from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences between this and the original implementation are due to
// the calling conventions and the initialization of constants.

//go:build gc && !purego
// +build gc,!purego

#include "textflag.h"

#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE  R16
#define BLOCKS R17

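// consts holds the ChaCha "expand 32-byte k" constant (packed at offset 0x00
// and splatted word-by-word at 0x50-0x88), counter-increment and byte
// permutation vectors, and the per-block counter offsets {0, 1, 2, 3} at 0x90.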
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0

// func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
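//
// A minimal sketch of the matching Go-side declaration, assumed to live in
// the package's Go sources (the //go:noescape directive is typical for such
// assembly-backed functions, but is an assumption here, not taken from this
// file):
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)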
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS

	// V16: the "expand 32-byte k" constant
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load key into V17, V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28: per-block counter offsets {0, 1, 2, 3}
	LXVW4X (CONSTBASE)(R11), VS60

	// Splat the counter word of V19 into V26
	VSPLTW $0, V19, V26

	// Zero the counter slot of V19; the per-block counters are kept in V26
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	// V26: counters for the four parallel blocks
	VADDUWM V26, V28, V26

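	// CTR counts double rounds: 10 iterations of loop_vsx = 20 ChaCha rounds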
	MOVD $10, R14
	MOVD R14, CTR

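// Each pass through the outer loop produces keystream for four 64-byte blocks.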
loop_outer_vsx:
	// V0-V3: the word-splatted "expand 32-byte k" constant
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// Copy the per-block counters into V12
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Splat the rotation amounts (16, 12, 8, 7); VRLW uses only the low
	// five bits, so -16 rotates by 16.
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

loop_vsx:
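	// Each iteration is one ChaCha double round over four blocks in
	// parallel: a column round followed by a diagonal round.
	// Column round: quarter rounds on (V0,V4,V8,V12) ... (V3,V7,V11,V15).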
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

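	// Diagonal round: quarter rounds on (V0,V5,V10,V15), (V1,V6,V11,V12),
	// (V2,V7,V8,V13) and (V3,V4,V9,V14).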
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BC   16, LT, loop_vsx

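	// Feed the original block counters into the counter lane, then regroup
	// the word-sliced state (one state word of four blocks per register)
	// into four per-block states with VMRGEW/VMRGOW and XXPERMDI.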
	VADDUWM V12, V26, V12

	WORD $0x13600F8C		// VMRGEW V0, V1, V27
	WORD $0x13821F8C		// VMRGEW V2, V3, V28

	WORD $0x10000E8C		// VMRGOW V0, V1, V0
	WORD $0x10421E8C		// VMRGOW V2, V3, V2

	WORD $0x13A42F8C		// VMRGEW V4, V5, V29
	WORD $0x13C63F8C		// VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C		// VMRGOW V4, V5, V4
	WORD $0x10C63E8C		// VMRGOW V6, V7, V6

	WORD $0x13684F8C		// VMRGEW V8, V9, V27
	WORD $0x138A5F8C		// VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C		// VMRGOW V8, V9, V8
	WORD $0x114A5E8C		// VMRGOW V10, V11, V10

	WORD $0x13AC6F8C		// VMRGEW V12, V13, V29
	WORD $0x13CE7F8C		// VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C		// VMRGOW V12, V13, V12
	WORD $0x11CE7E8C		// VMRGOW V14, V15, V14

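	// Advance the block counters by 4 for the next group of four blocks.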
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// First of the four 64-byte blocks: xor the keystream with the input
	// and store.
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

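	// Second 64-byte block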
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

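	// Third 64-byte block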
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

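	// Fourth 64-byte block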
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

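	// Reset the double-round counter and continue with the next four
	// blocks if input remains.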
	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment the counter by the number of 64-byte blocks
	MOVD (CNT), R14
	ADD  BLOCKS, R14
	MOVD R14, (CNT)
	RET

tail_vsx:
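	// Fewer than 64 bytes remain: write the current keystream block to the
	// stack and xor it with the input one byte at a time.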
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copy the result to OUT byte by byte.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BC     16, LT, looptail_vsx

	// Clear the keystream from the stack by overwriting it with the
	// public constant held in V16 (VS48)
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx