/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.


/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_set_vec which exploits
/  the 64x64 -> 128-bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/

/ SysV AMD64 ABI: %rdi = r, %rsi = a, %rdx = len, %rcx = digit.
/ Returns the final carry limb in %rax.
/ Register roles: %r8 = limbs remaining, %r9 = running carry (cy),
/ %r11 = prefetched next a[i]; mulq leaves its 128-bit product in %rdx:%rax.
/ Note: lo(p) + cy cannot overflow past hi(p)+1, so "adcq $0, %rdx" is safe.
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:

	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L17

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

/ Unrolled main loop: process 8 limbs per pass while len >= 8.
.L15:
	cmpq	$8, %r8			/ 8 - len
	jb	.L16			/ fewer than 8 limbs left: do the tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L17			/ len was a multiple of 8: done
	jmp	.L15			/ else another full pass or the tail

/ Tail: handles the remaining 1..7 limbs.  Entered only with 0 < len < 8,
/ so the first limb below is always valid to process.
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17			/ always taken: tail had at most 7 limbs


.L17:
	movq	%r9, %rax		/ return cy
	ret

.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64

/ ------------------------------------------------------------------------
/
/  Implementation of s_mpv_mul_add_vec which exploits
/  the 64x64 -> 128-bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64-bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/

/ SysV AMD64 ABI: %rdi = r, %rsi = a, %rdx = len, %rcx = digit.
/ Returns the final carry limb in %rax.
/ Register roles: %r8 = limbs remaining, %r9 = running carry (cy),
/ %r10 = prefetched r[i] (the accumulate operand), %r11 = prefetched a[i];
/ mulq leaves its 128-bit product in %rdx:%rax.
/ Two separate add/adc pairs fold in r[i] and cy; each can carry into
/ hi(p) at most once, so "adcq $0, %rdx" after each add is sufficient.
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:

	xorq	%rax, %rax		/ if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		/ Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		/ cy = 0

/ Unrolled main loop: process 8 limbs per pass while len >= 8.
.L25:
	cmpq	$8, %r8			/ 8 - len
	jb	.L26			/ fewer than 8 limbs left: do the tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ prefetch a[1]
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ prefetch a[2]
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ prefetch a[3]
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ prefetch a[4]
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ prefetch a[5]
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ prefetch a[6]
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ prefetch a[7]
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L27			/ len was a multiple of 8: done
	jmp	.L25			/ else another full pass or the tail

/ Tail: handles the remaining 1..7 limbs.  Entered only with 0 < len < 8,
/ so the first limb below is always valid to process.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27			/ always taken: tail had at most 7 limbs


.L27:
	movq	%r9, %rax		/ return cy
	ret

.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64