1d25e02daSmrgdnl  S/390-32 mpn_add_n and mpn_sub_n.
2d25e02daSmrg
3d25e02daSmrgdnl  Copyright 2011 Free Software Foundation, Inc.
4d25e02daSmrg
5d25e02daSmrgdnl  This file is part of the GNU MP Library.
6*f81b1c5bSmrgdnl
7d25e02daSmrgdnl  The GNU MP Library is free software; you can redistribute it and/or modify
8*f81b1c5bSmrgdnl  it under the terms of either:
9*f81b1c5bSmrgdnl
10*f81b1c5bSmrgdnl    * the GNU Lesser General Public License as published by the Free
11*f81b1c5bSmrgdnl      Software Foundation; either version 3 of the License, or (at your
12*f81b1c5bSmrgdnl      option) any later version.
13*f81b1c5bSmrgdnl
14*f81b1c5bSmrgdnl  or
15*f81b1c5bSmrgdnl
16*f81b1c5bSmrgdnl    * the GNU General Public License as published by the Free Software
17*f81b1c5bSmrgdnl      Foundation; either version 2 of the License, or (at your option) any
18*f81b1c5bSmrgdnl      later version.
19*f81b1c5bSmrgdnl
20*f81b1c5bSmrgdnl  or both in parallel, as here.
21*f81b1c5bSmrgdnl
22d25e02daSmrgdnl  The GNU MP Library is distributed in the hope that it will be useful, but
23d25e02daSmrgdnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24*f81b1c5bSmrgdnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25*f81b1c5bSmrgdnl  for more details.
26*f81b1c5bSmrgdnl
27*f81b1c5bSmrgdnl  You should have received copies of the GNU General Public License and the
28*f81b1c5bSmrgdnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29*f81b1c5bSmrgdnl  see https://www.gnu.org/licenses/.
30d25e02daSmrg
31d25e02daSmrginclude(`../config.m4')
32d25e02daSmrg
33d25e02daSmrgC            cycles/limb
34d25e02daSmrgC z900		 ?
35d25e02daSmrgC z990	      2.75-3		(fast for even n, slow for odd n)
36d25e02daSmrgC z9		 ?
37d25e02daSmrgC z10		 ?
38d25e02daSmrgC z196		 ?
39d25e02daSmrg
40d25e02daSmrgC TODO
41d25e02daSmrgC  * Optimise for small n
42d25e02daSmrgC  * Use r0 and save/restore one less register
43d25e02daSmrgC  * Using logops_n's v1 inner loop operand order makes the loop about 20%
44d25e02daSmrgC    faster, at the expense of highly alignment-dependent performance.
45d25e02daSmrg
46d25e02daSmrgC INPUT PARAMETERS
47d25e02daSmrgdefine(`rp',	`%r2')	C destination: result limbs are stored through rp
48d25e02daSmrgdefine(`up',	`%r3')	C first source: limbs are loaded from up into registers
49d25e02daSmrgdefine(`vp',	`%r4')	C second source: memory operand of the ADSB/ADSBC instructions
50d25e02daSmrgdefine(`n',	`%r5')	C limb count; %r5 is reused for limb data once n is copied to %r1
51d25e02daSmrg
C Operation selection.  Each ifdef below takes effect only when its
C OPERATION_* symbol is defined; NOTE(review): presumably the build defines
C exactly one of them per object file -- confirm against the configure logic.
C
C   ADSB     first-limb operation, no carry/borrow input  (al  / sl)
C   ADSBC    following limbs, carry/borrow input, memory operand  (alc / slb)
C   ADSBCR   register-register form of ADSBC  (alcr / slbr); defined here but
C            not referenced by the code in this file
C   RETVAL   two-instruction sequence that turns the final carry/borrow into
C            the value 0 or 1 in %r2:
C              add:  %r2 = 0, then %r2 = %r2 + %r2 + carry
C              sub:  %r2 = %r2 - %r2 - borrow  (0 or -1), then negated
C   func     name passed to PROLOGUE for the generated entry point
C   func_nc  name for a carry-in variant; defined here but no such entry
C            point is emitted by the visible code
52d25e02daSmrgifdef(`OPERATION_add_n', `
53d25e02daSmrg  define(ADSB,		al)
54d25e02daSmrg  define(ADSBCR,	alcr)
55d25e02daSmrg  define(ADSBC,		alc)
56d25e02daSmrg  define(RETVAL,`dnl
57d25e02daSmrg	lhi	%r2, 0
58d25e02daSmrg	alcr	%r2, %r2')
59d25e02daSmrg  define(func,		mpn_add_n)
60d25e02daSmrg  define(func_nc,	mpn_add_nc)')
61d25e02daSmrgifdef(`OPERATION_sub_n', `
62d25e02daSmrg  define(ADSB,		sl)
63d25e02daSmrg  define(ADSBCR,	slbr)
64d25e02daSmrg  define(ADSBC,		slb)
65d25e02daSmrg  define(RETVAL,`dnl
66d25e02daSmrg	slbr	%r2, %r2
67d25e02daSmrg	lcr	%r2, %r2')
68d25e02daSmrg  define(func,		mpn_sub_n)
69d25e02daSmrg  define(func_nc,	mpn_sub_nc)')
70d25e02daSmrg
C NOTE(review): MULFUNC_PROLOGUE presumably announces to the build which entry
C points this multi-function file can provide -- confirm against the GMP
C configure machinery.
71d25e02daSmrgMULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
72d25e02daSmrg
C func(rp, up, vp, n)
C
C Adds (OPERATION_add_n) or subtracts (OPERATION_sub_n) the n-limb operand at
C vp to/from the n-limb operand at up, 32 bits at a time, and stores the n
C result limbs at rp.  The carry or borrow out of the last limb is turned into
C 0 or 1 in %r2 by RETVAL.
C
C Register use:
C   %r1       pass counter, (n + 3) >> 2, counted down by brct
C   %r5-%r8   limb working registers (%r5 no longer holds n after the lr)
C   %r7       also holds (n + 3) mod 4 during the entry dispatch
C   %r6-%r8   stored at 24(%r15) on entry and reloaded before returning
C
C Structure: the entry dispatch handles the 1, 2 or 3 leading limbs of a count
C that is not a multiple of four (or, for a multiple of four, enters the loop
C at L(m0) with two limbs already done), so every later pass of L(top) handles
C exactly four limbs.  The carry is threaded through the condition code from
C one ADSB/ADSBC to the next; the only instructions placed in between are lm,
C stm, st, la, brct and j, none of which modify the condition code.
C
C NOTE(review): there is no n = 0 path.  With n = 0 the dispatch selects L(b0)
C with %r1 = 0, so limbs would be processed anyway -- callers presumably
C guarantee n >= 1; confirm.
73d25e02daSmrgASM_START()
74d25e02daSmrgPROLOGUE(func)
75d25e02daSmrg	stm	%r6, %r8, 24(%r15)	C save %r6-%r8; reloaded at L(end)
76d25e02daSmrg
77d25e02daSmrg	ahi	n, 3			C n + 3, so the shift below rounds up
78d25e02daSmrg	lhi	%r7, 3			C mask for the two low bits
79d25e02daSmrg	lr	%r1, n
80d25e02daSmrg	srl	%r1, 2			C %r1 = (n + 3) / 4 = number of passes
81d25e02daSmrg	nr	%r7, n			C %r7 = (n + 3) mod 4, n being the original count
82d25e02daSmrg	je	L(b1)			C 0: count = 1 mod 4
83d25e02daSmrg	chi	%r7, 2
84d25e02daSmrg	jl	L(b2)			C 1: count = 2 mod 4
85d25e02daSmrg	jne	L(b0)			C 3: count = 0 mod 4; 2 falls through
86d25e02daSmrg
C Count = 3 mod 4: handle three limbs, then continue with groups of four.
87d25e02daSmrgL(b3):	lm	%r5, %r7, 0(up)
88d25e02daSmrg	la	up, 12(up)
89d25e02daSmrg	ADSB	%r5, 0(vp)		C lowest limb: starts the carry chain
90d25e02daSmrg	ADSBC	%r6, 4(vp)
91d25e02daSmrg	ADSBC	%r7, 8(vp)
92d25e02daSmrg	la	vp, 12(vp)
93d25e02daSmrg	stm	%r5, %r7, 0(rp)
94d25e02daSmrg	la	rp, 12(rp)
95d25e02daSmrg	brct	%r1, L(top)		C taken while groups of four remain
96d25e02daSmrg	j	L(end)
97d25e02daSmrg
C Count = 0 mod 4: do the first two limbs of a group here with ADSB starting
C the chain, then join the loop at L(m0) for limbs three and four.
98d25e02daSmrgL(b0):	lm	%r5, %r8, 0(up)		C These redundant insns are no mistake;
99d25e02daSmrg	la	up, 16(up)		C it is needed to make main loop run
100d25e02daSmrg	ADSB	%r5, 0(vp)		C fast for n = 0 (mod 4).
101d25e02daSmrg	ADSBC	%r6, 4(vp)
102d25e02daSmrg	j	L(m0)
103d25e02daSmrg
C Count = 1 mod 4: handle one limb, then continue with groups of four.
104d25e02daSmrgL(b1):	l	%r5, 0(up)
105d25e02daSmrg	la	up, 4(up)
106d25e02daSmrg	ADSB	%r5, 0(vp)		C lowest limb: starts the carry chain
107d25e02daSmrg	la	vp, 4(vp)
108d25e02daSmrg	st	%r5, 0(rp)
109d25e02daSmrg	la	rp, 4(rp)
110d25e02daSmrg	brct	%r1, L(top)		C taken while groups of four remain
111d25e02daSmrg	j	L(end)
112d25e02daSmrg
C Count = 2 mod 4: handle two limbs, then continue with groups of four.
113d25e02daSmrgL(b2):	lm	%r5, %r6, 0(up)
114d25e02daSmrg	la	up, 8(up)
115d25e02daSmrg	ADSB	%r5, 0(vp)		C lowest limb: starts the carry chain
116d25e02daSmrg	ADSBC	%r6, 4(vp)
117d25e02daSmrg	la	vp, 8(vp)
118d25e02daSmrg	stm	%r5, %r6, 0(rp)
119d25e02daSmrg	la	rp, 8(rp)
120d25e02daSmrg	brct	%r1, L(top)		C taken while groups of four remain
121d25e02daSmrg	j	L(end)
122d25e02daSmrg
C Main loop: four limbs per pass, carry in and out via the condition code.
123d25e02daSmrgL(top):	lm	%r5, %r8, 0(up)
124d25e02daSmrg	la	up, 16(up)
125d25e02daSmrg	ADSBC	%r5, 0(vp)
126d25e02daSmrg	ADSBC	%r6, 4(vp)
127d25e02daSmrgL(m0):	ADSBC	%r7, 8(vp)		C entry point for the L(b0) case
128d25e02daSmrg	ADSBC	%r8, 12(vp)
129d25e02daSmrg	la	vp, 16(vp)
130d25e02daSmrg	stm	%r5, %r8, 0(rp)
131d25e02daSmrg	la	rp, 16(rp)
132d25e02daSmrg	brct	%r1, L(top)		C decrement pass counter, loop until zero
133d25e02daSmrg
C Common exit: RETVAL converts the last carry/borrow into 0 or 1 in %r2.
134d25e02daSmrgL(end):	RETVAL
135d25e02daSmrg	lm	%r6, %r8, 24(%r15)	C reload the registers stored on entry
136d25e02daSmrg	br	%r14			C return via the address in %r14
137d25e02daSmrgEPILOGUE()
138