// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x,go1.11,!gccgo,!appengine

#include "textflag.h"

// Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
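//
// Layout sketch (roughly): the 130-bit accumulator and the key r are kept in
// three limbs of 44, 44 and 42 bits. The two doubleword lanes of a vector
// register usually carry the corresponding limb of two different quantities
// (two interleaved hash streams, or r**2 and r**4). VMSLG multiplies the two
// doubleword lanes of its first two operands, adds both 128-bit products to
// the third operand and stores the 128-bit sum in the last operand, so limb
// products do not overflow and carries only need handling in REDUCE/REDUCE2.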

// constants
#define EX0   V1
#define EX1   V2
#define EX2   V3

// temporaries
#define T_0 V4
#define T_1 V5
#define T_2 V6
#define T_3 V7
#define T_4 V8
#define T_5 V9
#define T_6 V10
#define T_7 V11
#define T_8 V12
#define T_9 V13
#define T_10 V14

// r**2 & r**4
#define R_0  V15
#define R_1  V16
#define R_2  V17
#define R5_1 V18
#define R5_2 V19
// key (r)
#define RSAVE_0 R7
#define RSAVE_1 R8
#define RSAVE_2 R9
#define R5SAVE_1 R10
#define R5SAVE_2 R11

// message block
#define M0 V20
#define M1 V21
#define M2 V22
#define M3 V23
#define M4 V24
#define M5 V25

// accumulator
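// (H0_0, H1_0, H2_0) and (H0_1, H1_1, H2_1) hold the 44/44/42 bit limbs of two
// partial sums that are maintained in parallel and added together at the end.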
#define H0_0 V26
#define H1_0 V27
#define H2_0 V28
#define H0_1 V29
#define H1_1 V30
#define H2_1 V31

GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f

GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100

GLOBL ·constants<>(SB), RODATA, $48
// EX0
DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+8(SB)/8, $0x0000050403020100
// EX1
DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+24(SB)/8, $0x00000a0908070605
// EX2
DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b

GLOBL ·c<>(SB), RODATA, $48
// EX0
DATA ·c<>+0(SB)/8, $0x0000050403020100
DATA ·c<>+8(SB)/8, $0x0000151413121110
// EX1
DATA ·c<>+16(SB)/8, $0x00000a0908070605
DATA ·c<>+24(SB)/8, $0x00001a1918171615
// EX2
DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b

GLOBL ·reduce<>(SB), RODATA, $32
// 44 bit
DATA ·reduce<>+0(SB)/8, $0x0
DATA ·reduce<>+8(SB)/8, $0xfffffffffff
// 42 bit
DATA ·reduce<>+16(SB)/8, $0x0
DATA ·reduce<>+24(SB)/8, $0x3ffffffffff

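// Why the "r*20" values (R5_1, R5_2, r5_1, r5_2) exist: with 44/44/42 bit
// limbs, a product term landing at weight 2**(44+88) = 2**132 wraps around
// modulo 2**130-5 with factor 2**132 mod (2**130-5) = 4*5 = 20. Per lane,
// MULTIPLY below therefore computes (sketch):
//   h0 = f0*g0 + 20*(f1*g2 + f2*g1)
//   h1 = f0*g1 + f1*g0 + 20*(f2*g2)
//   h2 = f0*g2 + f1*g1 + f2*g0
// which is f*g reduced modulo 2**130-5 except for the carries between limbs.
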
// h = (f*g) % (2**130-5) [partial reduction]
// uses T_0...T_9 temporary registers
// input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
// output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
#define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
	\ // Eliminate the dependency for the last 2 VMSLs
	VMSLG m02_0, r_2, m4_2, m4_2                       \
	VMSLG m13_0, r_2, m5_2, m5_2                       \ // 8 VMSLs pipelined
	VMSLG m02_0, r_0, m4_0, m4_0                       \
	VMSLG m02_1, r5_2, V0, T_0                         \
	VMSLG m02_0, r_1, m4_1, m4_1                       \
	VMSLG m02_1, r_0, V0, T_1                          \
	VMSLG m02_1, r_1, V0, T_2                          \
	VMSLG m02_2, r5_1, V0, T_3                         \
	VMSLG m02_2, r5_2, V0, T_4                         \
	VMSLG m13_0, r_0, m5_0, m5_0                       \
	VMSLG m13_1, r5_2, V0, T_5                         \
	VMSLG m13_0, r_1, m5_1, m5_1                       \
	VMSLG m13_1, r_0, V0, T_6                          \
	VMSLG m13_1, r_1, V0, T_7                          \
	VMSLG m13_2, r5_1, V0, T_8                         \
	VMSLG m13_2, r5_2, V0, T_9                         \
	VMSLG m02_2, r_0, m4_2, m4_2                       \
	VMSLG m13_2, r_0, m5_2, m5_2                       \
	VAQ   m4_0, T_0, m02_0                             \
	VAQ   m4_1, T_1, m02_1                             \
	VAQ   m5_0, T_5, m13_0                             \
	VAQ   m5_1, T_6, m13_1                             \
	VAQ   m02_0, T_3, m02_0                            \
	VAQ   m02_1, T_4, m02_1                            \
	VAQ   m13_0, T_8, m13_0                            \
	VAQ   m13_1, T_9, m13_1                            \
	VAQ   m4_2, T_2, m02_2                             \
	VAQ   m5_2, T_7, m13_2                             \

// SQUARE uses three limbs of r and r_2*20 to output the square of r
// uses T_1, T_5 and T_7 temporary registers
// input: r_0, r_1, r_2, r5_2
// temp: TEMP0, TEMP1, TEMP2
// output: p0, p1, p2
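// Note: squaring only needs the cross products r_0*r_1, r_0*r_2 and
// r_1*(20*r_2), each of which occurs twice in the full product, so the macro
// adds TEMP0/TEMP1/TEMP2 twice instead of doubling them with a shift.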
#define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
	VMSLG r_0, r_0, p0, p0     \
	VMSLG r_1, r5_2, V0, TEMP0 \
	VMSLG r_2, r5_2, p1, p1    \
	VMSLG r_0, r_1, V0, TEMP1  \
	VMSLG r_1, r_1, p2, p2     \
	VMSLG r_0, r_2, V0, TEMP2  \
	VAQ   TEMP0, p0, p0        \
	VAQ   TEMP1, p1, p1        \
	VAQ   TEMP2, p2, p2        \
	VAQ   TEMP0, p0, p0        \
	VAQ   TEMP1, p1, p1        \
	VAQ   TEMP2, p2, p2        \

// carry h0->h1->h2->h0 || h3->h4->h5->h3
// uses T_2, T_4, T_5, T_7, T_8, T_9
//       t6,  t7,  t8,  t9, t10, t11
// input: h0, h1, h2, h3, h4, h5
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
// output: h0, h1, h2, h3, h4, h5
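// Sketch: every limb is split into its low 44 (or 42) bits and the overflow
// above them; each overflow is added to the next limb, and the overflow out of
// the top 42-bit limb wraps around to limb 0 multiplied by 5 (shift left by 2,
// then add), since 2**130 = 5 mod 2**130-5. The second half of the macro
// repacks the two limb triples into [h0/1/2, h3/4/5] (one triple per lane of
// h3, h4, h5) and runs one more carry pass on both lanes at once.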
#define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
	VLM    (R12), t6, t7  \ // 44 and 42 bit clear mask
	VLEIB  $7, $0x28, t10 \ // 5 byte shift mask
	VREPIB $4, t8         \ // 4 bit shift mask
	VREPIB $2, t11        \ // 2 bit shift mask
	VSRLB  t10, h0, t0    \ // h0 byte shift
	VSRLB  t10, h1, t1    \ // h1 byte shift
	VSRLB  t10, h2, t2    \ // h2 byte shift
	VSRLB  t10, h3, t3    \ // h3 byte shift
	VSRLB  t10, h4, t4    \ // h4 byte shift
	VSRLB  t10, h5, t5    \ // h5 byte shift
	VSRL   t8, t0, t0     \ // h0 bit shift
	VSRL   t8, t1, t1     \ // h1 bit shift
	VSRL   t11, t2, t2    \ // h2 bit shift
	VSRL   t8, t3, t3     \ // h3 bit shift
	VSRL   t8, t4, t4     \ // h4 bit shift
	VESLG  $2, t2, t9     \ // h2 carry x5
	VSRL   t11, t5, t5    \ // h5 bit shift
	VN     t6, h0, h0     \ // h0 clear carry
	VAQ    t2, t9, t2     \ // h2 carry x5
	VESLG  $2, t5, t9     \ // h5 carry x5
	VN     t6, h1, h1     \ // h1 clear carry
	VN     t7, h2, h2     \ // h2 clear carry
	VAQ    t5, t9, t5     \ // h5 carry x5
	VN     t6, h3, h3     \ // h3 clear carry
	VN     t6, h4, h4     \ // h4 clear carry
	VN     t7, h5, h5     \ // h5 clear carry
	VAQ    t0, h1, h1     \ // h0->h1
	VAQ    t3, h4, h4     \ // h3->h4
	VAQ    t1, h2, h2     \ // h1->h2
	VAQ    t4, h5, h5     \ // h4->h5
	VAQ    t2, h0, h0     \ // h2->h0
	VAQ    t5, h3, h3     \ // h5->h3
	VREPG  $1, t6, t6     \ // 44 and 42 bit masks across both halves
	VREPG  $1, t7, t7     \
	VSLDB  $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
	VSLDB  $8, h1, h1, h1 \
	VSLDB  $8, h2, h2, h2 \
	VO     h0, h3, h3     \
	VO     h1, h4, h4     \
	VO     h2, h5, h5     \
	VESRLG $44, h3, t0    \ // 44 bit shift right
	VESRLG $44, h4, t1    \
	VESRLG $42, h5, t2    \
	VN     t6, h3, h3     \ // clear carry bits
	VN     t6, h4, h4     \
	VN     t7, h5, h5     \
	VESLG  $2, t2, t9     \ // multiply carry by 5
	VAQ    t9, t2, t2     \
	VAQ    t0, h4, h4     \
	VAQ    t1, h5, h5     \
	VAQ    t2, h3, h3     \

// carry h0->h1->h2->h0
// input: h0, h1, h2
// temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
// output: h0, h1, h2
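// Note: REDUCE2 runs the carry chain twice because the multiply-by-5
// wrap-around of the first pass can push limb 0 back over 44 bits; the second
// pass brings the limbs back into range before they are multiplied again.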
#define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
	VLEIB  $7, $0x28, t3 \ // 5 byte shift mask
	VREPIB $4, t4        \ // 4 bit shift mask
	VREPIB $2, t7        \ // 2 bit shift mask
	VGBM   $0x003F, t5   \ // mask to clear carry bits
	VSRLB  t3, h0, t0    \
	VSRLB  t3, h1, t1    \
	VSRLB  t3, h2, t2    \
	VESRLG $4, t5, t5    \ // 44 bit clear mask
	VSRL   t4, t0, t0    \
	VSRL   t4, t1, t1    \
	VSRL   t7, t2, t2    \
	VESRLG $2, t5, t6    \ // 42 bit clear mask
	VESLG  $2, t2, t8    \
	VAQ    t8, t2, t2    \
	VN     t5, h0, h0    \
	VN     t5, h1, h1    \
	VN     t6, h2, h2    \
	VAQ    t0, h1, h1    \
	VAQ    t1, h2, h2    \
	VAQ    t2, h0, h0    \
	VSRLB  t3, h0, t0    \
	VSRLB  t3, h1, t1    \
	VSRLB  t3, h2, t2    \
	VSRL   t4, t0, t0    \
	VSRL   t4, t1, t1    \
	VSRL   t7, t2, t2    \
	VN     t5, h0, h0    \
	VN     t5, h1, h1    \
	VESLG  $2, t2, t8    \
	VN     t6, h2, h2    \
	VAQ    t0, h1, h1    \
	VAQ    t8, t2, t2    \
	VAQ    t1, h2, h2    \
	VAQ    t2, h0, h0    \

// expands two message blocks into the lower halves of the d registers
// moves the contents of the d registers into the upper halves
// input: in1, in2, d0, d1, d2, d3, d4, d5
// temp: TEMP0, TEMP1, TEMP2, TEMP3
// output: d0, d1, d2, d3, d4, d5
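// Sketch: the EX0/EX1/EX2 tables drive VPERM so that bytes 0-5, 5-10 and 11-15
// of a little-endian 16-byte block land in separate registers; the masks and
// 4-bit shifts then cut them into the 44/44/42 bit limbs. The 2**128 padding
// bit of a full block is not set here; callers add it with VLEIB $2/$10, $1
// (bit 40 of the top limb, i.e. bit 128 of the block).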
#define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
	VGBM   $0xff3f, TEMP0      \
	VGBM   $0xff1f, TEMP1      \
	VESLG  $4, d1, TEMP2       \
	VESLG  $4, d4, TEMP3       \
	VESRLG $4, TEMP0, TEMP0    \
	VPERM  in1, d0, EX0, d0    \
	VPERM  in2, d3, EX0, d3    \
	VPERM  in1, d2, EX2, d2    \
	VPERM  in2, d5, EX2, d5    \
	VPERM  in1, TEMP2, EX1, d1 \
	VPERM  in2, TEMP3, EX1, d4 \
	VN     TEMP0, d0, d0       \
	VN     TEMP0, d3, d3       \
	VESRLG $4, d1, d1          \
	VESRLG $4, d4, d4          \
	VN     TEMP1, d2, d2       \
	VN     TEMP1, d5, d5       \
	VN     TEMP0, d1, d1       \
	VN     TEMP0, d4, d4       \

// expands one message block into the lower halves of the d registers
// moves the contents of the d registers into the upper halves
// input: in, d0, d1, d2
// temp: TEMP0, TEMP1, TEMP2
// output: d0, d1, d2
#define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
	VGBM   $0xff3f, TEMP0     \
	VESLG  $4, d1, TEMP2      \
	VGBM   $0xff1f, TEMP1     \
	VPERM  in, d0, EX0, d0    \
	VESRLG $4, TEMP0, TEMP0   \
	VPERM  in, d2, EX2, d2    \
	VPERM  in, TEMP2, EX1, d1 \
	VN     TEMP0, d0, d0      \
	VN     TEMP1, d2, d2      \
	VESRLG $4, d1, d1         \
	VN     TEMP0, d1, d1      \

// pack h2:h0 into h1:h0 (no carry)
// input: h0, h1, h2
// output: h0, h1, h2
#define PACK(h0, h1, h2) \
	VMRLG  h1, h2, h2  \ // copy h1 to upper half of h2
	VESLG  $44, h1, h1 \ // shift limb 1 by 44 bits, leaving 20
	VO     h0, h1, h0  \ // combine h0 with 20 bits from limb 1
	VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
	VLEIG  $1, $0, h1  \ // clear the h2 bits from the lower half of h1
	VO     h0, h1, h0  \ // h0 now has 88 bits (limb 0 and 1)
	VLEIG  $0, $0, h2  \ // clear upper half of h2
	VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
	VLEIB  $7, $88, h1 \ // for byte shift (11 bytes)
	VSLB   h1, h2, h2  \ // shift h2 11 bytes to the left
	VO     h0, h2, h0  \ // combine h0 with limb 2 (bits 88-127)
	VLEIG  $0, $0, h1  \ // clear upper half of h1

// if h > 2**130-5 then h -= 2**130-5
// input: h0, h1
// temp: t0, t1, t2
// output: h0
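// Sketch: MOD adds 5 to h and propagates the carry into the two high bits held
// in h1; the carry out of bit 130 indicates h >= 2**130-5, and the resulting
// all-ones/all-zero mask in t1 selects either the original h0 or h0+5 (which
// equals h - (2**130-5) modulo 2**128) as the final 128-bit result.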
#define MOD(h0, h1, t0, t1, t2) \
	VZERO t0          \
	VLEIG $1, $5, t0  \
	VACCQ h0, t0, t1  \
	VAQ   h0, t0, t0  \
	VONE  t2          \
	VLEIG $1, $-4, t2 \
	VAQ   t2, t1, t1  \
	VACCQ h1, t1, t1  \
	VONE  t2          \
	VAQ   t2, t1, t1  \
	VN    h0, t1, t2  \
	VNC   t0, t1, t1  \
	VO    t1, t2, h0  \

// func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
TEXT ·poly1305vmsl(SB), $0-32
	// This code processes 6 blocks (96 bytes) up front and then up to
	// 4 blocks (64 bytes) per loop iteration, using the algorithm described in:
	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
	// and as modified for VMSL as described in:
	// Accelerating Poly1305 Cryptographic Message Authentication on the z14
	// O'Farrell et al, CASCON 2017, p. 48-55
	// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
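	//
	// Rough overall flow:
	//   - expand the key into 44/44/42 bit limbs and precompute r**2, r**4
	//     and the corresponding *20 values,
	//   - consume 6 blocks to prime two interleaved accumulator sets,
	//   - fold 4 blocks per main-loop iteration,
	//   - handle the remaining (possibly partial) blocks at b4/b3/b2/b1,
	//   - combine the streams, reduce, add s and store the 16-byte tag.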

	LMG   out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
	VZERO V0                // c

	// load EX0, EX1 and EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), EX0, EX2        // c

	// setup r
	VL    (R4), T_0
	MOVD  $·keyMask<>(SB), R6
	VL    (R6), T_1
	VN    T_0, T_1, T_0
	VZERO T_2                 // limbs for r
	VZERO T_3
	VZERO T_4
	EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)

	// T_2, T_3, T_4: [0, r]

	// setup r*20
	VLEIG $0, $0, T_0
	VLEIG $1, $20, T_0       // T_0: [0, 20]
	VZERO T_5
	VZERO T_6
	VMSLG T_0, T_3, T_5, T_5
	VMSLG T_0, T_4, T_6, T_6

	// store r for final block in GR
	VLGVG $1, T_2, RSAVE_0  // c
	VLGVG $1, T_3, RSAVE_1  // c
	VLGVG $1, T_4, RSAVE_2  // c
	VLGVG $1, T_5, R5SAVE_1 // c
	VLGVG $1, T_6, R5SAVE_2 // c

	// initialize h
	VZERO H0_0
	VZERO H1_0
	VZERO H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	// initialize pointer for reduce constants
	MOVD $·reduce<>(SB), R12

	// calculate r**2 and 20*(r**2)
	VZERO R_0
	VZERO R_1
	VZERO R_2
	SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
	REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
	VZERO R5_1
	VZERO R5_2
	VMSLG T_0, R_1, R5_1, R5_1
	VMSLG T_0, R_2, R5_2, R5_2

	// skip r**4 calculation if 3 blocks or less
	CMPBLE R3, $48, b4

	// calculate r**4 and 20*(r**4)
	VZERO T_8
	VZERO T_9
	VZERO T_10
	SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
	REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
	VZERO T_2
	VZERO T_3
	VMSLG T_0, T_9, T_2, T_2
	VMSLG T_0, T_10, T_3, T_3

	// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
	VSLDB $8, T_8, T_8, T_8
	VSLDB $8, T_9, T_9, T_9
	VSLDB $8, T_10, T_10, T_10
	VSLDB $8, T_2, T_2, T_2
	VSLDB $8, T_3, T_3, T_3

	VO T_8, R_0, R_0
	VO T_9, R_1, R_1
	VO T_10, R_2, R_2
	VO T_2, R5_1, R5_1
	VO T_3, R5_2, R5_2
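
	// With r**4 in the high doubleword and r**2 in the low doubleword of
	// R_0..R5_2, each VMSLG in MULTIPLY sums a product by r**4 from one lane
	// with a product by r**2 from the other, so the four blocks loaded per
	// main-loop iteration below can be folded in with a single MULTIPLY/REDUCE
	// pair.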

	CMPBLE R3, $80, load // less than or equal to 5 blocks in message

	// 6(or 5+1) blocks
	SUB    $81, R3
	VLM    (R2), M0, M4
	VLL    R3, 80(R2), M5
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBGE R3, $16, 2(PC)
	VLVGB  R3, R0, M5
	MOVD   $96(R2), R2
	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	VLEIB  $2, $1, H2_0
	VLEIB  $2, $1, H2_1
	VLEIB  $10, $1, H2_0
	VLEIB  $10, $1, H2_1

	VZERO  M0
	VZERO  M1
	VZERO  M2
	VZERO  M3
	VZERO  T_4
	VZERO  T_10
	EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
	VLR    T_4, M4
	VLEIB  $10, $1, M2
	CMPBLT R3, $16, 2(PC)
	VLEIB  $10, $1, T_10
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1

	SUB    $16, R3
	CMPBLE R3, $0, square

load:
	// load EX0, EX1 and EX2
	MOVD $·c<>(SB), R5
	VLM  (R5), EX0, EX2

loop:
	CMPBLE R3, $64, add // last 4 or fewer blocks left

	// next 4 full blocks
	VLM  (R2), M2, M5
	SUB  $64, R3
	MOVD $64(R2), R2
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)

	// expacc in-lined to create [m2, m3] limbs
	VGBM   $0x3f3f, T_0     // 44 bit clear mask
	VGBM   $0x1f1f, T_1     // 40 bit clear mask
	VPERM  M2, M3, EX0, T_3
	VESRLG $4, T_0, T_0     // 44 bit clear mask ready
	VPERM  M2, M3, EX1, T_4
	VPERM  M2, M3, EX2, T_5
	VN     T_0, T_3, T_3
	VESRLG $4, T_4, T_4
	VN     T_1, T_5, T_5
	VN     T_0, T_4, T_4
	VMRHG  H0_1, T_3, H0_0
	VMRHG  H1_1, T_4, H1_0
	VMRHG  H2_1, T_5, H2_0
	VMRLG  H0_1, T_3, H0_1
	VMRLG  H1_1, T_4, H1_1
	VMRLG  H2_1, T_5, H2_1
	VLEIB  $10, $1, H2_0
	VLEIB  $10, $1, H2_1
	VPERM  M4, M5, EX0, T_3
	VPERM  M4, M5, EX1, T_4
	VPERM  M4, M5, EX2, T_5
	VN     T_0, T_3, T_3
	VESRLG $4, T_4, T_4
	VN     T_1, T_5, T_5
	VN     T_0, T_4, T_4
	VMRHG  V0, T_3, M0
	VMRHG  V0, T_4, M1
	VMRHG  V0, T_5, M2
	VMRLG  V0, T_3, M3
	VMRLG  V0, T_4, M4
	VMRLG  V0, T_5, M5
	VLEIB  $10, $1, M2
	VLEIB  $10, $1, M5

	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	CMPBNE R3, $0, loop
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1

	// load EX0, EX1, EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), EX0, EX2

	// sum vectors
	VAQ H0_0, H0_1, H0_0
	VAQ H1_0, H1_1, H1_0
	VAQ H2_0, H2_1, H2_0

	// h may be >= 2*(2**130-5) so we need to reduce it again
	// M0...M4 are used as temps here
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

next:  // carry h1->h2
	VLEIB  $7, $0x28, T_1
	VREPIB $4, T_2
	VGBM   $0x003F, T_3
	VESRLG $4, T_3

	// byte shift
	VSRLB T_1, H1_0, T_4

	// bit shift
	VSRL T_2, T_4, T_4

	// clear h1 carry bits
	VN T_3, H1_0, H1_0

	// add carry
	VAQ T_4, H2_0, H2_0

	// h is now < 2*(2**130-5)
	// pack h into h1 (hi) and h0 (lo)
	PACK(H0_0, H1_0, H2_0)

	// if h > 2**130-5 then h -= 2**130-5
	MOD(H0_0, H1_0, T_0, T_1, T_2)

	// h += s
	MOVD  $·bswapMask<>(SB), R5
	VL    (R5), T_1
	VL    16(R4), T_0
	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
	VAQ   T_0, H0_0, H0_0
	VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
	VST   H0_0, (R1)
	RET

add:
	// load EX0, EX1, EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), EX0, EX2

	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1
	CMPBLE R3, $64, b4

b4:
	CMPBLE R3, $48, b3 // 3 blocks or less

	// 4(3+1) blocks remaining
	SUB    $49, R3
	VLM    (R2), M0, M2
	VLL    R3, 48(R2), M3
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M3
	MOVD   $64(R2), R2
	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
	VLEIB  $10, $1, H2_0
	VLEIB  $10, $1, H2_1
	VZERO  M0
	VZERO  M1
	VZERO  M4
	VZERO  M5
	VZERO  T_4
	VZERO  T_10
	EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
	VLR    T_4, M2
	VLEIB  $10, $1, M4
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, T_10
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG  V0, H0_1, H0_0
	VMRHG  V0, H1_1, H1_0
	VMRHG  V0, H2_1, H2_0
	VMRLG  V0, H0_1, H0_1
	VMRLG  V0, H1_1, H1_1
	VMRLG  V0, H2_1, H2_1
	SUB    $16, R3
	CMPBLE R3, $0, square // this condition must always hold true!

b3:
	CMPBLE R3, $32, b2

	// 3 blocks remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)

	SUB    $33, R3
	VLM    (R2), M0, M1
	VLL    R3, 32(R2), M2
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M2

	// H += m0
	VZERO T_1
	VZERO T_2
	VZERO T_3
	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
	VLEIB $10, $1, T_3
	VAG   H0_0, T_1, H0_0
	VAG   H1_0, T_2, H1_0
	VAG   H2_0, T_3, H2_0

	VZERO M0
	VZERO M3
	VZERO M4
	VZERO M5
	VZERO T_10

	// (H+m0)*r
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)

	// H += m1
	VZERO V0
	VZERO T_1
	VZERO T_2
	VZERO T_3
	EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
	VLEIB $10, $1, T_3
	VAQ   H0_0, T_1, H0_0
	VAQ   H1_0, T_2, H1_0
	VAQ   H2_0, T_3, H2_0
	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)

	// [H, m2] * [r**2, r]
	EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, H2_0
	VZERO  M0
	VZERO  M1
	VZERO  M2
	VZERO  M3
	VZERO  M4
	VZERO  M5
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
	SUB    $16, R3
	CMPBLE R3, $0, next   // this condition must always hold true!

b2:
	CMPBLE R3, $16, b1

	// 2 blocks remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
	VMRHG V0, H0_1, H0_0
	VMRHG V0, H1_1, H1_0
	VMRHG V0, H2_1, H2_0
	VMRLG V0, H0_1, H0_1
	VMRLG V0, H1_1, H1_1
	VMRLG V0, H2_1, H2_1

	// move h to the left and 0s at the right
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0

	// get message blocks and append 1 to start
	SUB    $17, R3
	VL     (R2), M0
	VLL    R3, 16(R2), M1
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M1
	VZERO  T_6
	VZERO  T_7
	VZERO  T_8
	EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
	EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
	VLEIB  $2, $1, T_8
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, T_8

	// add [m0, m1] to h
	VAG H0_0, T_6, H0_0
	VAG H1_0, T_7, H1_0
	VAG H2_0, T_8, H2_0

	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5
	VZERO T_10
	VZERO M0

	// at this point R_0 .. R5_2 look like [r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
	SUB    $16, R3, R3
	CMPBLE R3, $0, next

b1:
	CMPBLE R3, $0, next

	// 1 block remaining

	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// H*[r**2, r]
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

	// set up [0, m0] limbs
	SUB    $1, R3
	VLL    R3, (R2), M0
	ADD    $1, R3
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)
	VLVGB  R3, R0, M0
	VZERO  T_1
	VZERO  T_2
	VZERO  T_3
	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6) // limbs: [0, m]
	CMPBNE R3, $16, 2(PC)
	VLEIB  $10, $1, T_3

	// h+m0
	VAQ H0_0, T_1, H0_0
	VAQ H1_0, T_2, H1_0
	VAQ H2_0, T_3, H2_0

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)

	BR next

square:
	// setup [r²,r]
	VSLDB $8, R_0, R_0, R_0
	VSLDB $8, R_1, R_1, R_1
	VSLDB $8, R_2, R_2, R_2
	VSLDB $8, R5_1, R5_1, R5_1
	VSLDB $8, R5_2, R5_2, R5_2

	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, R5SAVE_1, R5_1
	VLVGG $1, R5SAVE_2, R5_2

	// setup [h0, h1]
	VSLDB $8, H0_0, H0_0, H0_0
	VSLDB $8, H1_0, H1_0, H1_0
	VSLDB $8, H2_0, H2_0, H2_0
	VO    H0_1, H0_0, H0_0
	VO    H1_1, H1_0, H1_0
	VO    H2_1, H2_0, H2_0
	VZERO H0_1
	VZERO H1_1
	VZERO H2_1

	VZERO M0
	VZERO M1
	VZERO M2
	VZERO M3
	VZERO M4
	VZERO M5

	// (h0*r**2) + (h1*r)
	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
	BR next

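// func hasVMSLFacility() bool
// reports whether the machine provides VMSL. STFLE stores the facility list;
// the AND $0x01 against a byte of the third facility doubleword appears to
// test facility bit 135 (vector-enhancements facility 1, which includes VMSL).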
TEXT ·hasVMSLFacility(SB), NOSPLIT, $24-1
	MOVD  $x-24(SP), R1
	XC    $24, 0(R1), 0(R1) // clear the storage
	MOVD  $2, R0            // R0 is the number of double words stored -1
	WORD  $0xB2B01000       // STFLE 0(R1)
	XOR   R0, R0            // reset the value of R0
	MOVBZ z-8(SP), R1
	AND   $0x01, R1
	BEQ   novmsl

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, novmsl
	MOVB   $1, ret+0(FP)    // have vx
	RET

novmsl:
	MOVB $0, ret+0(FP) // no vx
	RET