# mp_limb_t mulredc1_6(mp_limb_t * z, const mp_limb_t x, const mp_limb_t * y,
#                 const mp_limb_t *m, mp_limb_t inv_m);
#
# Linux:   z: %rdi, x: %rsi, y: %rdx, m: %rcx, inv_m: %r8
#          Needs %rbx, %rsp, %rbp, %r12-%r15 restored
# Windows: z: %rcx, x: %rdx, y: %r8,  m: %r9, inv_m: 0x28(%rsp) at function
#          entry (40 decimal, i.e. before this function pushes anything)
#          Needs %rbx, %rbp, %rdi, %rsi, %r12-%r15 restored



include(`config.m4')

# Argument locations that differ between the two supported ABIs.
# Y_PARAM is the register in which y arrives. INVM_PARAM is where inv_m
# is read from: a register on Linux, a stack slot on Windows. The Windows
# slot is 0x28(%rsp) at function entry; the function pushes four 8-byte
# registers (rbx, r14, rsi, rdi) before the single use of INVM_PARAM, so
# the offset used there is 40 + 4*8 = 72. Changing the number of pushes
# requires changing this offset.
ifdef(`WINDOWS64_ABI',
`define(`Y_PARAM', `%r8')dnl
define(`INVM_PARAM',`72(%rsp)')dnl'
,
`define(`Y_PARAM', `%rdx')dnl
define(`INVM_PARAM',`%r8')dnl'
)dnl
	TEXT
.p2align 6 # Opteron L1 code cache line is 64 bytes long
	GLOBL GSYM_PREFIX`'mulredc1_6
	TYPE(GSYM_PREFIX`'mulredc1_`'6,`function')

# Implements multiplication and REDC for one input number of 6 words
# and a multiplier of one word
ifdef(`WINDOWS64_ABI', `# Uses Windows ABI', `# Uses Linux ABI')

# Values that are referenced only once in each pass of the loop over j go
# into r8 .. r14. In the inner loop (over j), x, y, m, and u are constant;
# tmp[j], tmp[j+1] and the carry are updated frequently. These 8 values
# stay in registers and are referenced as
# YP = y, MP = m, ZP = z, U = u,
# X = x, T0 = tmp[j], T1 = tmp[j+1], CY = carry

# m4 names for the registers used by the function body.
# T0 and T1 name the two words of the running sum; the body swaps the two
# definitions after every pass instead of moving data between registers.
# T0 (%rsi) and CY (%rcx) are registers in which arguments arrive (see the
# ABI notes at the top of the file), so the body copies those arguments
# out before its first write to either one. CYl and CYb are the 32-bit
# and 8-bit views of CY: the body clears CY once through CYl and afterwards
# only writes CYb, which keeps the upper bits zero.
define(`T0', `%rsi')dnl
define(`T1', `%rbx')dnl
define(`CY', `%rcx')dnl
define(`CYl', `%ecx')dnl
define(`CYb', `%cl')dnl
define(`X', `%r14')dnl		# register that holds x value
define(`U', `%r11')dnl		# register that holds u
define(`YP', `%r9')dnl		# register that points to the y array
define(`MP', `%r10')dnl		# register that points to the m array
define(`ZP', `%rdi')dnl		# register that holds z

`#' Register vars: `T0' = T0, `T1' = T1, `CY' = CY, `X' = X, `U' = U
`#'                `YP' = YP, `MP' = MP

# -----------------------------------------------------------------------
# mp_limb_t mulredc1_6 (z, x, y, m, inv_m)
#
# One-word by six-word multiplication fused with one REDC step:
#   u             = (x * y[0] * inv_m) mod 2^64
#   carry:z[5..0] = (x * y[0..5] + u * m[0..5]) / 2^64
# The lowest word of the sum is never stored; it cancels to zero when
# inv_m = -1/m[0] mod 2^64 (the caller is assumed to pass such a value,
# it is not checked here).
# Returns in %rax the carry (0 or 1) out of the top word.
#
# Register roles (m4 names): X = x, U = u, YP = y, MP = m, ZP = z,
# T0/T1 = the two words of the running sum, CY = carry word.
# Saves and restores %rbx and %r14, plus %rsi and %rdi under the
# Windows ABI. Fully unrolled, no calls, no stack locals.
# The instruction order is hand-scheduled: loads are placed between an
# addq and the adcq that consumes its carry, which is safe because movq
# does not modify the flags.
# -----------------------------------------------------------------------
GSYM_PREFIX`'mulredc1_6:


#########################################################################
# i = 0 pass
#########################################################################

`#' register values at loop entry: YP = y, MP = m

# We need to compute u

	movq	(Y_PARAM), %rax		# rax = y[0] (time critical, do first)
	pushq	%rbx			# callee-saved, used as T0/T1
	pushq	%r14			# callee-saved, used as X
ifdef(`WINDOWS64_ABI',
`	pushq	%rsi			# callee-saved on Windows
	pushq	%rdi			# callee-saved on Windows
	movq	%r9, MP			# store m in MP
	movq	Y_PARAM, YP
	movq	%rcx, ZP
	movq	%rdx, X'
,
`	movq	Y_PARAM, YP		# copy y before mulq overwrites rdx
	movq	%rcx, MP		# copy m before rcx is reused as CY
	movq	%rsi, X			# store x in X before rsi is reused
	# ZP is same as passed in'
)

	xorl	CYl, CYl		# set %CY to 0 (clears all 64 bits)

	mulq	X			# rdx:rax = y[0] * x

	movq	%rax, T0		# Move low word of product to T0
	movq	%rdx, T1		# Move high word of product to T1

	# On Windows INVM_PARAM is relative to rsp and accounts for the
	# four pushes above; no push or pop may be added before this line.
	imulq	INVM_PARAM, %rax	# %rax = (x*y[0] * inv_m) mod 2^64
	movq	%rax, U			# this is the new u value

	mulq	(MP)			# multiply u*m[0]
	addq	%rax, T0		# Low word cancels (see header), not stored
	movq	8(YP), %rax		# Fetch y[1]
	adcq	%rdx, T1		# Add high word with carry to T1
	setc	CYb			# CY <= 1
	# CY:T1:T0 <= 2*(2^64-1)^2 = 2*2^128 - 4*2^64 + 2, hence
	# CY:T1 <= 2*2^64 - 4

# Swap the names T0 and T1 (no data is moved)
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 1
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = low word of the running sum, T1 undefined
# CY = carry word above T0 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# rdx:rax = y[j] * x
	# rdx:rax <= (2^64-1)^2 = 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	8(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U		# rdx:rax = m[j] * u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# %rax = T0 + low word
	movq	%rax, 0(ZP)	# Store that sum in z[1-1]
	movq	16(YP), %rax	# Fetch y[j+1] = y[2] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

# Swap the names T0 and T1 (no data is moved)
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 2
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = low word of the running sum, T1 undefined
# CY = carry word above T0 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# rdx:rax = y[j] * x
	# rdx:rax <= (2^64-1)^2 = 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	16(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U		# rdx:rax = m[j] * u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# %rax = T0 + low word
	movq	%rax, 8(ZP)	# Store that sum in z[2-1]
	movq	24(YP), %rax	# Fetch y[j+1] = y[3] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

# Swap the names T0 and T1 (no data is moved)
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 3
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = low word of the running sum, T1 undefined
# CY = carry word above T0 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# rdx:rax = y[j] * x
	# rdx:rax <= (2^64-1)^2 = 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	24(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U		# rdx:rax = m[j] * u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# %rax = T0 + low word
	movq	%rax, 16(ZP)	# Store that sum in z[3-1]
	movq	32(YP), %rax	# Fetch y[j+1] = y[4] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

# Swap the names T0 and T1 (no data is moved)
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 4
# Register values at entry:
# %rax = y[j], X = x, U = u
# T0 = low word of the running sum, T1 undefined
# CY = carry word above T0 (is <= 1)
# We have CY:T0 <= 2 * 2^64 - 2

	movq	CY, T1		# T1 = CY <= 1

	# Here, T1:T0 <= 2*2^64 - 2
	mulq	X		# rdx:rax = y[j] * x
	# rdx:rax <= (2^64-1)^2 = 2^128 - 2*2^64 + 1
	addq	%rax, T0	# Add low word to T0
	movq	32(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	# T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 2 <= 2^128 - 1, no carry!

	mulq	U		# rdx:rax = m[j] * u
	# rdx:rax <= 2^128 - 2*2^64 + 1, T1:T0 <= 2^128 - 1
	addq	T0, %rax	# %rax = T0 + low word
	movq	%rax, 24(ZP)	# Store that sum in z[4-1]
	movq	40(YP), %rax	# Fetch y[j+1] = y[5] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	setc	CYb		# CY <= 1
	# CY:T1:T0 <= 2^128 - 1 + 2^128 - 2*2^64 + 1 <=
	#             2 * 2^128 - 2*2^64 ==> CY:T1 <= 2 * 2^64 - 2

# Swap the names T0 and T1 (no data is moved)
define(`TT', defn(`T0'))dnl
define(`T0', defn(`T1'))dnl
define(`T1', defn(`TT'))dnl
undefine(`TT')dnl
`#' Now `T0' = T0, `T1' = T1


# Pass for j = 5, the last one: there is no y[j+1] left to fetch.
# Both remaining words are stored and the final carry is returned.

	movq	CY, T1		# T1 = CY <= 1

	mulq	X		# rdx:rax = y[j] * x
	addq	%rax, T0	# Add low word to T0
	movq	40(MP), %rax	# Fetch m[j] into %rax
	adcq	%rdx, T1	# Add high word with carry to T1
	mulq	U		# rdx:rax = m[j] * u
	addq	%rax, T0	# Add low word to T0
	movq	T0, 32(ZP)	# Store T0 in z[j-1] = z[4]
	adcq	%rdx, T1	# Add high word with carry to T1
	movq	T1, 40(ZP)	# Store T1 in z[j] = z[5]
	setc	CYb		# %CY <= 1

	movq	CY, %rax	# use carry as return value
ifdef(`WINDOWS64_ABI',
`	popq	%rdi
	popq	%rsi
') dnl
	popq	%r14
	popq	%rbx
	ret
